Coverage for src/crawler/by_source/ams_crawler.py: 22% (71 statements)
coverage.py v7.8.2, created at 2025-06-03 13:39 +0000
import json
import os
import subprocess
from urllib.parse import urljoin
from uuid import uuid4

from bs4 import BeautifulSoup
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import create_abstract, create_articledata
from ptf.utils import execute_cmd

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import skip_generation
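
# Crawler for journals hosted on www.ams.org. The collection page embeds its
# issue list in an inline <script> block, which a Node helper script
# (ams_crawler_col.js, shipped next to this module) presumably evaluates and
# dumps as JSON; issue and article pages are parsed directly with BeautifulSoup.
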
class AmsCrawler(BaseCollectionCrawler):
    source_name = "American Mathematical Society"
    source_domain = "AMS"
    source_website = "https://www.ams.org/"
    @skip_generation
    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        # The issue list is embedded in an inline (src-less) JavaScript block.
        issues_data_tag = soup.select_one("script[language='JavaScript']:not([src])")
        if issues_data_tag is None:
            raise ValueError("Couldn't find the collection's issue data")
        issues = json.loads(self.get_col_issues(issues_data_tag.text))
        for i in issues:
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issues[i]["url"]),
                    issues[i]["year"],
                    issues[i]["volume"],
                    issues[i].get("number", None),
                )
            )
        return xissues
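
    # Shape assumed from the accesses above (illustrative values only): the JSON
    # returned by get_col_issues() maps arbitrary keys to issue records such as
    #   {"0": {"url": "...", "year": "2024", "volume": "37", "number": "1"}}
    # where "number" may be absent.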
    def get_col_issues(self, input: str):
        # Write the inline script to a temporary file and let the Node helper
        # (ams_crawler_col.js, next to this module) turn it into JSON.
        filename = "/tmp/crawler/puppeteer/" + str(uuid4())
        filename_out = filename + "-out"
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, "w") as file:
            file.write(input)

        content = None
        attempt = 0
        while not content and attempt < 3:
            attempt += 1
            try:
                cmd = (
                    f"{os.path.dirname(os.path.realpath(__file__))}/ams_crawler_col.js"
                    f" -f {filename} -o {filename_out}"
                )
                print(cmd)
                execute_cmd(cmd)

                if os.path.isfile(filename_out):
                    with open(filename_out) as file_:
                        content = file_.read()
                    os.remove(filename_out)
            except subprocess.CalledProcessError:
                # The helper failed: retry (up to three attempts in total).
                pass

        os.remove(filename)
        if not content:
            raise ValueError("Couldn't parse collection content")
        return content
    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        # Each article is linked from a <dd> entry whose anchor text is
        # "Abstract, references and article information" (:-soup-contains-own is
        # a soupsieve pseudo-class matching an element's own text).
        articles = soup.select(
            "dd > a:-soup-contains-own('Abstract, references and article information')"
        )
        for index, a in enumerate(articles):
            article_url = a.get("href")
            if not isinstance(article_url, str):
                raise ValueError("Couldn't parse article url")
            xarticle = create_articledata()
            xarticle.url = urljoin(self.collection_url, article_url)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)
    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        # Most metadata comes from the page's citation_* <meta> tags.
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "page", "doi", "references", "pdf", "title"]
        )
        if len(xarticle.bibitems) > 0:
            # Store the compiled JATS reference list alongside the abstracts.
            xarticle.abstracts.append(JatsBase.compile_refs(xarticle.bibitems))

        # The abstract text is the parent of the <a id="Abstract"> anchor, minus
        # the anchor itself (the "Abstract:" label).
        abstract_subtag = soup.select_one("a#Abstract")
        if abstract_subtag:
            abstract_tag = abstract_subtag.parent
            if abstract_tag:
                abstract_subtag.decompose()
                xarticle.abstracts.append(
                    create_abstract(lang="en", tag="abstract", value_tex=abstract_tag.text)
                )

        return xarticle
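
# Minimal, self-contained sketch (not part of the crawler itself): it exercises
# the two BeautifulSoup selectors used above on fabricated HTML fragments, so
# the markup below is an assumption made up for illustration, not real AMS output.
if __name__ == "__main__":
    issue_html = (
        "<dl><dd><a href='journal-getitem?pii=S0000-0000-00-00000-0'>"
        "Abstract, references and article information</a></dd></dl>"
    )
    issue_soup = BeautifulSoup(issue_html, "html.parser")
    links = issue_soup.select(
        "dd > a:-soup-contains-own('Abstract, references and article information')"
    )
    print([a.get("href") for a in links])

    article_html = "<p><a id='Abstract'>Abstract:</a> We prove a sample theorem.</p>"
    article_soup = BeautifulSoup(article_html, "html.parser")
    anchor = article_soup.select_one("a#Abstract")
    if anchor and anchor.parent:
        paragraph = anchor.parent
        anchor.decompose()  # drop the "Abstract:" label, keep the remaining text
        print(paragraph.text.strip())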