Coverage for src/crawler/by_source/ams/ams_base_crawler.py: 23%
58 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-08-29 13:43 +0000
1import json
2import os
3from urllib.parse import urljoin
4from uuid import uuid4
6from bs4 import BeautifulSoup
7from ptf.model_data import create_abstract
8from ptf.utils import execute_cmd
10from crawler.base_crawler import BaseCollectionCrawler
11from crawler.utils import skip_generation
class AmsCrawler(BaseCollectionCrawler):
    """Collection crawler for journals hosted by the American Mathematical Society.

    The AMS collection page embeds its issue index as data inside an inline
    ``<script language='JavaScript'>`` tag; that script text is handed to a
    Node.js helper (``ams_crawler_col.js``) which emits the issue index as
    JSON (see :meth:`get_col_issues`).
    """

    source_name = "American Mathematical Society"
    source_domain = "AMS"
    source_website = "https://www.ams.org/"

    @skip_generation
    def parse_collection_content(self, content):
        """Parse the collection page HTML and return the list of issues.

        Args:
            content: raw HTML of the collection page.

        Returns:
            A list of xissue objects built via ``self.create_xissue``.

        Raises:
            ValueError: if the inline script holding the issue data is missing.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        # The issue index lives in an inline (src-less) script tag.
        issues_data_tag = soup.select_one("script[language='JavaScript']:not([src])")
        if issues_data_tag is None:
            # Fail with a clear message instead of an AttributeError below.
            raise ValueError("Couldn't parse collection content")
        issues = json.loads(self.get_col_issues(issues_data_tag.text))
        for i in issues:
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issues[i]["url"]),
                    issues[i]["year"],
                    issues[i]["volume"],
                    # "number" is optional in the AMS issue index.
                    issues[i].get("number", None),
                )
            )
        return xissues

    def get_col_issues(self, input: str):
        """Run the Node.js helper on the inline script text and return its JSON output.

        The script text is written to a unique temp file, passed to
        ``ams_crawler_col.js`` via ``-f``, and the helper's ``-o`` output file
        is read back. The helper is retried up to 3 times.

        Args:
            input: the inline JavaScript source extracted from the collection page.
                   (Name shadows the builtin but is kept for interface compatibility.)

        Returns:
            The helper's output (a JSON string describing the issues).

        Raises:
            ValueError: if no output was produced after 3 attempts.
        """
        filename = "/tmp/crawler/puppeteer/" + str(uuid4())
        filename_out = filename + "-out"
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, "w") as file:
            file.write(input)

        content = None
        attempt = 0
        while not content and attempt < 3:
            attempt += 1
            # BUG FIX: pass the temp file we just wrote via -f (the original
            # command contained a mangled placeholder instead of the path, so
            # the helper never received its input).
            cmd = f"{os.path.dirname(os.path.realpath(__file__))}/ams_crawler_col.js -f {filename} -o {filename_out}"
            execute_cmd(cmd)

            if os.path.isfile(filename_out):
                with open(filename_out) as file_:
                    content = file_.read()

        os.remove(filename)
        # Only remove the output file if the helper actually created it;
        # otherwise FileNotFoundError would mask the ValueError below.
        if os.path.isfile(filename_out):
            os.remove(filename_out)

        if not content:
            raise ValueError("Couldn't parse collection content")
        return content

    def parse_article_content(self, content, xissue, xarticle, url):
        """Parse an article page: fill citation metadata and the abstract.

        Args:
            content: raw HTML of the article page.
            xissue: issue data object the article belongs to.
            xarticle: article data object to populate (mutated in place).
            url: the article's URL (unused here; part of the crawler interface).

        Returns:
            The populated ``xarticle``.
        """
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "page", "doi", "references", "pdf", "title"]
        )

        # Older page layout: abstract text sits in the parent of an <a id="Abstract">
        # anchor; the anchor itself is decorative and is stripped first.
        abstract_subtag = soup.select_one("a#Abstract")
        if abstract_subtag:
            abstract_tag = abstract_subtag.parent
            if abstract_tag:
                abstract_subtag.decompose()
                xarticle.abstracts.append(create_abstract(lang="en", value_tex=abstract_tag.text))
        else:
            # Newer layout: a <section id="Abstract"> whose <h2> heading is
            # removed so only the abstract body remains.
            abstract_tag = soup.select_one("section#Abstract")
            if abstract_tag:
                abstract_header = abstract_tag.select_one("h2")
                if abstract_header:
                    abstract_header.decompose()
                xarticle.abstracts.append(create_abstract(lang="en", value_tex=abstract_tag.text))

        return xarticle