Coverage for src/crawler/by_source/ams/ams_base_crawler.py: 26%
56 statements
coverage.py v7.8.2, created at 2025-07-07 11:48 +0000
import json
import os
from urllib.parse import urljoin
from uuid import uuid4

from bs4 import BeautifulSoup
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import create_abstract
from ptf.utils import execute_cmd

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import skip_generation
class AmsCrawler(BaseCollectionCrawler):
    source_name = "American Mathematical Society"
    source_domain = "AMS"
    source_website = "https://www.ams.org/"
    @skip_generation
    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        # The AMS collection page keeps its issue index in an inline
        # JavaScript block; hand its body to the Node helper to decode.
        issues_data_tag = soup.select_one("script[language='JavaScript']:not([src])")
        issues = json.loads(self.get_col_issues(issues_data_tag.text))
        for i in issues:
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issues[i]["url"]),
                    issues[i]["year"],
                    issues[i]["volume"],
                    issues[i].get("number", None),
                )
            )
        return xissues
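
    # A minimal sketch of the issues mapping parse_collection_content expects,
    # inferred from the field accesses above; the exact shape emitted by the
    # ams_crawler_col.js helper (and the sample values) are assumptions:
    #
    #   {
    #       "issue-key": {
    #           "url": "2024-152-01/",   # joined onto self.collection_url
    #           "year": "2024",
    #           "volume": "152",
    #           "number": "1",           # optional, hence .get("number", None)
    #       },
    #   }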
    def get_col_issues(self, input: str):
        # Write the script body to a scratch file for the Node helper.
        filename = "/tmp/crawler/puppeteer/" + str(uuid4())
        filename_out = filename + "-out"
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, "w") as file:
            file.write(input)

        content = None
        attempt = 0
        while not content and attempt < 3:
            attempt += 1
            cmd = f"{os.path.dirname(os.path.realpath(__file__))}/ams_crawler_col.js -f {filename} -o {filename_out}"
            print(cmd)
            execute_cmd(cmd)

            if os.path.isfile(filename_out):
                with open(filename_out) as file_:
                    content = file_.read()

                os.remove(filename)
                os.remove(filename_out)

        if not content:
            raise ValueError("Couldn't parse collection content")
        return content
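
    # Assumed contract for the ams_crawler_col.js helper, inferred from the
    # command line above: it reads the saved <script> body from the -f file,
    # evaluates it in a Puppeteer (headless browser) context, and writes the
    # recovered issues JSON to the -o file. The loop retries up to three
    # times because a headless run can transiently fail to produce output.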
    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "page", "doi", "references", "pdf", "title"]
        )
        if len(xarticle.bibitems) > 0:
            xarticle.abstracts.append(JatsBase.compile_refs(xarticle.bibitems))

        # The abstract heading is an <a id="Abstract"> anchor inside the
        # abstract's parent tag; drop the anchor and keep the remaining text.
        abstract_subtag = soup.select_one("a#Abstract")
        if abstract_subtag:
            abstract_tag = abstract_subtag.parent
            if abstract_tag:
                abstract_subtag.decompose()
                xarticle.abstracts.append(
                    create_abstract(lang="en", tag="abstract", value_tex=abstract_tag.text)
                )

        return xarticle
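
# A hypothetical usage sketch; the constructor arguments and the
# download_file helper are assumptions about BaseCollectionCrawler's
# interface, not confirmed by this file:
#
#   crawler = AmsCrawler(
#       collection_id="proc",
#       collection_url="https://www.ams.org/journals/proc/",
#   )
#   content = crawler.download_file(crawler.collection_url)
#   xissues = crawler.parse_collection_content(content)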