Coverage for src / crawler / by_source / ams_crawler.py: 15%
138 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-06-19 13:33 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-06-19 13:33 +0000
1import html
2import json
3import os
4from urllib.parse import urljoin
5from uuid import uuid4
7from bs4 import BeautifulSoup, Tag
8from opentelemetry import trace
9from ptf.cmds.xml.ckeditor.ckeditor_parser import CkeditorParser
10from ptf.cmds.xml.ckeditor.utils import get_abstract_xml
11from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj
12from ptf.utils import execute_cmd
14from crawler.abstract_crawlers.threaded_crawler import ThreadedCrawler
15from crawler.cmds.mixed_citation import ExtLinkXml, MixedCitation
16from crawler.tests.data_generation.decorators import skip_generation
17from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
class AmsCrawler(ThreadedCrawler):
    """Crawler for journals published by the American Mathematical Society.

    The collection page lists its issues inside an inline JavaScript snippet,
    which is converted to JSON by a Node.js helper script. The articles of an
    issue are then fetched from the ``GetJournalIssueDetail`` endpoint.
    """

    source_name = "American Mathematical Society"
    source_domain = "AMS"
    source_website = "https://www.ams.org/"
    tracer = trace.get_tracer(__name__)

    @classmethod
    def get_view_id(cls):
        """Return the view identifier of this crawler (``"AMS"``)."""
        return "AMS"

    @skip_generation
    def parse_collection_content(self, content):
        """Build the list of issues advertised on the collection page.

        As a side effect, ``self.group_by_year`` and ``self.ams_code`` are set
        from the embedded data; both are read again when issues are processed.

        Args:
            content: HTML of the collection page.

        Returns:
            The issues created through ``self.create_xissue``. When the
            collection is grouped by year, only the first advertised issue of
            each year is kept.

        Raises:
            ValueError: If the inline script holding the issue data is missing
                or could not be converted to JSON.
        """
        soup = BeautifulSoup(content, "html.parser")
        issues_data_tag = soup.select_one(
            ".container main[role='main'] script[type='text/javascript']:not([src])"
        )
        if issues_data_tag is None:
            # Fail with an explicit message instead of an AttributeError on None.
            raise ValueError("Couldn't find the issues data script in collection content")

        data = json.loads(self.get_col_issues(issues_data_tag.text))
        issues = data["issues"]
        self.group_by_year = data["group_by_year"] == "Y"
        self.ams_code = data["ams_code"].lower()

        xissues = []
        for issue in issues:
            number = issue.get("IssueNumber")
            if number:
                number = str(number)
            if self.group_by_year:
                number = None
            # For AMS, xissue.url is NOT a real URL, but the AMS issue ID
            # Issue data is fetched from an API and thus every issue url is the same
            xissues.append(
                self.create_xissue(
                    str(issue["IssueId"]),
                    str(issue["Year"]),
                    str(issue["Volume"]),
                    number,
                )
            )

        if self.group_by_year:
            # We take only the first issue advertised by the website
            # All ignored issues will be present inside the API on the next step anyways
            first_issue_by_year = {}
            for xissue in xissues:
                first_issue_by_year.setdefault(xissue.year, xissue)
            xissues = list(first_issue_by_year.values())
        return xissues

    def get_col_issues(self, input: str) -> str:
        """Convert the inline JavaScript listing the issues into JSON text.

        AMS issues are listed inside an inline JS script, so a Node.js helper
        (``ams_crawler_col.js``, located next to this module) is run on a
        temporary copy of the script. The helper is run up to three times,
        until it produces a non-empty output file. Temporary files are always
        removed, even when the helper fails.

        Args:
            input: Text of the inline script. The name shadows the builtin but
                is kept so existing keyword callers keep working.

        Returns:
            The content written by the helper script.

        Raises:
            ValueError: If no output was produced after three attempts.
        """
        filename = "/tmp/crawler/puppeteer/" + str(uuid4())
        filename_out = filename + "-out"
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        # The command does not change between attempts: build it once.
        cmd = f"{os.path.dirname(os.path.realpath(__file__))}/ams_crawler_col.js -f {filename} -o {filename_out}"

        content = None
        try:
            with open(filename, "w") as file:
                file.write(input)

            for _attempt in range(3):
                execute_cmd(cmd)
                if os.path.isfile(filename_out):
                    with open(filename_out) as file_:
                        content = file_.read()
                if content:
                    break
        finally:
            for path in (filename, filename_out):
                try:
                    os.remove(path)
                except FileNotFoundError:
                    # The output file does not exist when the helper never
                    # succeeded; that case is reported by the ValueError below.
                    pass

        if not content:
            raise ValueError("Couldn't parse collection content")
        return content

    def download_issue_summary(self, issue_id):
        """Fetch the detail of an issue from the AMS API.

        Reads ``self.ams_code``, which is set by ``parse_collection_content``.

        Args:
            issue_id: AMS identifier of the issue.

        Returns:
            The response body as text.
        """
        response = self.session.post(
            "https://pubs.ams.org/product/GetJournalIssueDetail",
            data={"productCode": self.ams_code, "issueId": issue_id},
            headers={
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:149.0) Gecko/20100101 Firefox/149.0"
            },
        )
        return response.text

    def start_process_issue(self, xissue):
        """Download the articles of *xissue* and attach them to it.

        Args:
            xissue: Issue whose ``url`` holds the AMS issue ID (not a real URL).

        Raises:
            ValueError: If the issue has no ID, or if the API response does not
                contain the requested issue.
        """
        issue_url = xissue.url
        if not issue_url:
            raise ValueError("Issue does not have an URL")
        content = self.download_issue_summary(issue_url)
        # API response is somehow a list of issues
        # Currently CAMS somehow puts every article inside a different issue in the list...
        issues = json.loads(content)

        if self.group_by_year:
            articles = [article for issue in issues for article in issue["Articles"]]
        else:
            issue_json = next((i for i in issues if str(i["IssueId"]) == issue_url), None)
            if issue_json is None:
                # A bare next() would leak an uninformative StopIteration here.
                raise ValueError(f"Issue {issue_url} not found in AMS API response")
            articles = issue_json["Articles"]

        with self.tracer.start_as_current_span("parse_issue_content"):
            self.parse_ams_issue_content(articles, xissue)

    def parse_ams_issue_content(self, articles: list[dict], xissue):
        """Convert the API article dictionaries into articles of *xissue*.

        Args:
            articles: Article dictionaries taken from the AMS API response.
            xissue: Issue receiving the parsed articles (appended in place to
                ``xissue.articles``).
        """
        for index, article_dict in enumerate(articles):
            xarticle = create_articledata()
            xarticle.title_tex = article_dict["Title"]
            xarticle.doi = article_dict["DOI"]
            xarticle.pid = f"a_{index}"
            xarticle.fpage = str(article_dict["StartPage"])
            xarticle.lpage = str(article_dict["EndPage"])
            xarticle.date_published = article_dict["PostDate"]

            if article_dict["DocumentType"] == "BOOKREV":
                # Untitled book reviews are named after the first reviewed book.
                if xarticle.title_tex == "":
                    book_title = article_dict["BookReviews"][0]["Title"]
                    xarticle.title_tex = "Book review: " + book_title
            if article_dict["PrimaryMsc"] is not None:
                for msc in article_dict["PrimaryMsc"].split(", "):
                    xarticle.kwds.append(create_subj(type="msc", value=cleanup_str(msc)))

            ckeditor_data = CkeditorParser(
                html_value=article_dict["Abstract"],
                mml_formulas="",
            )
            abstract = create_abstract(
                lang="en",
                value_xml=get_abstract_xml(ckeditor_data.value_xml, lang="en"),
                value_tex=ckeditor_data.value_tex,
                value_html=ckeditor_data.value_html,
            )
            xarticle.abstracts.append(abstract)

            # TODO : UnenhancedReferences
            # TODO : BibliographicInfo

            add_pdf_link_to_xarticle(
                xarticle,
                urljoin("https://www.ams.org/journals/", self.ams_code + article_dict["PdfUrl"]),
            )
            xarticle.url = urljoin(
                self.collection_url,
                self.ams_code + "/" + article_dict["IssueDirectory"] + "/" + article_dict["PII"],
            )
            if article_dict["MRNumber"]:
                xarticle.extids.append(("mr-item-id", article_dict["MRNumber"]))

            for author in article_dict["Authors"]:
                # TODO : AMS Provides Firstname/MiddleName/LastName but we do not have Middlename fields
                # How should we proceed about that ?
                xarticle.contributors.append(
                    create_contributor(
                        role="author",
                        string_name=html.unescape(author["FullName"]),
                        email=html.unescape(author["Email"] or ""),
                        addresses=[html.unescape(author["Affiliation"] or "")],
                    )
                )

            # A null reference list is treated as an empty one, like the
            # nullable author fields above.
            soup = BeautifulSoup(article_dict["EnhancedReferences"] or "", "html5lib")
            for ref in soup.select("ul > li"):
                xarticle.bibitems.append(self.parse_ref(ref))

            xissue.articles.append(xarticle)

    def parse_ref(self, ref: "Tag"):
        """Build a JATS reference from one ``<li>`` of the enhanced references.

        Text nodes are copied as-is, except the standalone "DOI" labels. Links
        whose text starts with ``10.`` become ``https://doi.org/`` links,
        MathSciNet links are kept as external links, and any other tag is
        reduced to its text. ``<a>`` tags without a string ``href`` are dropped.

        Args:
            ref: The ``<li>`` element of the reference. Extracted links are
                removed from it.

        Returns:
            The value of ``MixedCitation.get_jats_ref()`` for the reference.
        """
        citation_builder = MixedCitation()
        # Iterate over a snapshot: decompose() removes the node from ``ref``,
        # and mutating the child list while iterating over it would skip the
        # sibling that follows every extracted link.
        for el in list(ref.children):
            if isinstance(el, str):
                if el in (", DOI ", " DOI ", "DOI"):
                    continue
                citation_builder.elements.append(el)
                continue
            if not isinstance(el, Tag):
                continue

            if el.name == "a":
                if el.text.startswith("10."):
                    extlink = ExtLinkXml(urljoin("https://doi.org/", el.text))
                    citation_builder.elements.append(extlink)
                    el.decompose()
                    continue

                href = el.get("href")
                if not isinstance(href, str):
                    continue
                if href.startswith("https://mathscinet.ams.org/mathscinet-getitem"):
                    extlink = ExtLinkXml(href)
                    citation_builder.elements.append(extlink)
                    el.decompose()
                    continue
            citation_builder.elements.append(el.get_text())
        return citation_builder.get_jats_ref()
219 # def parse_ref(self, ref: "Tag"):
220 # citation_builder = MixedCitation()
221 # # Everything behind the title should be authors
222 # title_element = ref.select_one("em")
223 # if title_element:
224 # authors = list(title_element.previous_siblings)
225 # # if len(authors) != 1:
226 # # self.logger.error("Could not correctly parse reference. Fallback to text")
227 # # citation_builder.elements.append(ref.get_text())
228 # # return citation_builder.get_jats_ref()
229 # # Temporary fix : structured bibitems parsing is sometimes incorrect.
230 # # Better have no data than incorrect data (?)
231 # for el in authors:
232 # citation_builder.elements.append(el.get_text())
233 # for el in authors:
234 # el.extract()
235 # # authors_el = GenericRefElement()
236 # # authors_el.name = "person-group"
237 # # citation_builder.elements.append(authors_el)
238 # # authors_text = authors[0].text
239 # # if authors_text.endswith(", "):
240 # # authors_text = authors_text.removesuffix(", ")
241 # # authors_el.elements.append(authors_text)
242 # # citation_builder.elements.append(", ")
243 # # else:
244 # # authors_el.elements.append(authors_text)
246 # article_title = MixedCitation()
247 # article_title.name = "article-title"
248 # citation_builder.elements.append(article_title)
249 # article_title.elements.append(title_element.text)
250 # title_element.decompose()
252 # # everything before a tag is text
253 # first_link = ref.select_one("a")
254 # if first_link:
255 # texts = list(first_link.previous_siblings)
256 # if len(texts) == 0:
257 # raise ValueError("first_link previous_siblings is empty")
258 # for el in reversed(texts):
259 # citation_builder.elements.append(el.get_text().removesuffix(", Preprint, arXiv:"))
260 # el.extract()
262 # for link in ref.select("a"):
263 # url = link.get("href")
264 # if not isinstance(url, str):
265 # raise ValueError("Citation extlink does not have a valid url")
266 # reflink = ExtLinkXml(url)
267 # citation_builder.elements.append(reflink)
268 # else:
269 # citation_builder.elements.append(ref.get_text())
271 # return citation_builder.get_jats_ref()