Coverage for src/crawler/by_source/ams/ams_base_crawler.py: 25% (61 statements)
coverage.py v7.9.0, created at 2025-09-16 12:41 +0000
import json
import os
from urllib.parse import urljoin
from uuid import uuid4

from bs4 import BeautifulSoup
from ptf.model_data import create_abstract
from ptf.utils import execute_cmd

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import skip_generation

class AmsCrawler(BaseCollectionCrawler):
    source_name = "American Mathematical Society"
    source_domain = "AMS"
    source_website = "https://www.ams.org/"

    @classmethod
    def get_view_id(cls):
        return "AMS"
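
    # The AMS collection page embeds its issue list as data inside an inline
    # <script> tag; parse_collection_content pulls that script out, has
    # get_col_issues evaluate it externally, and creates one xissue per entry
    # of the resulting JSON object.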
    @skip_generation
    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues_data_tag = soup.select_one("script[language='JavaScript']:not([src])")
        if issues_data_tag is None:
            raise ValueError("Couldn't find issues data in collection page")
        issues = json.loads(self.get_col_issues(issues_data_tag.text))
        for i in issues:
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issues[i]["url"]),
                    issues[i]["year"],
                    issues[i]["volume"],
                    issues[i].get("number", None),
                )
            )
        return xissues
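
    # get_col_issues hands the inline JavaScript to an external Node.js helper
    # (ams_crawler_col.js, shipped next to this module) through temporary
    # files, retrying a few times, and returns the helper's output, which is
    # expected to be the issue list serialized as JSON.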
    def get_col_issues(self, input_text: str):
        filename = "/tmp/crawler/puppeteer/" + str(uuid4())
        filename_out = filename + "-out"
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, "w") as file:
            file.write(input_text)

        # Run the helper up to 3 times, stopping as soon as it produces an
        # output file.
        content = None
        attempt = 0
        while not content and attempt < 3:
            attempt += 1
            cmd = f"{os.path.dirname(os.path.realpath(__file__))}/ams_crawler_col.js -f {filename} -o {filename_out}"
            execute_cmd(cmd)

            if os.path.isfile(filename_out):
                with open(filename_out) as file_:
                    content = file_.read()

        # Clean up the temporary files; the output file may not exist if
        # every attempt failed, so guard its removal to let the ValueError
        # below surface instead of a FileNotFoundError.
        os.remove(filename)
        if os.path.isfile(filename_out):
            os.remove(filename_out)

        if not content:
            raise ValueError("Couldn't parse collection content")
        return content
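
    # Article pages mostly describe themselves through citation_* <meta>
    # tags, which the generic extractor below handles; only the abstract
    # needs bespoke parsing, since AMS marks it up in two different ways
    # depending on the page template.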
    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "page", "doi", "references", "pdf", "title"]
        )

        # First variant: the abstract block is the parent of an
        # <a id="Abstract"> anchor, which is dropped before keeping the text.
        abstract_subtag = soup.select_one("a#Abstract")
        if abstract_subtag:
            abstract_tag = abstract_subtag.parent
            if abstract_tag:
                abstract_subtag.decompose()
                xarticle.abstracts.append(create_abstract(lang="en", value_tex=abstract_tag.text))
        else:
            # Second variant: the abstract lives in a <section id="Abstract">
            # whose <h2> heading is dropped before keeping the text.
            abstract_tag = soup.select_one("section#Abstract")
            if abstract_tag:
                abstract_header = abstract_tag.select_one("h2")
                if abstract_header:
                    abstract_header.decompose()
                xarticle.abstracts.append(create_abstract(lang="en", value_tex=abstract_tag.text))

        return xarticle
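
# Minimal usage sketch (not part of the module above). How a
# BaseCollectionCrawler subclass is constructed and how pages are fetched
# depends on the surrounding framework; the constructor arguments and the
# download_file helper below are assumptions, shown only to illustrate the
# call order.
#
#   crawler = AmsCrawler(collection_id="...", collection_url="https://www.ams.org/journals/...")
#   html = crawler.download_file(crawler.collection_url)
#   xissues = crawler.parse_collection_content(html)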