Coverage for src/crawler/by_source/amp_crawler.py: 87%
99 statements
coverage.py v7.9.0, created at 2025-09-16 12:41 +0000

import lingua
import requests
from bs4 import BeautifulSoup
from lingua import LanguageDetectorBuilder
from ptf.model_data import IssueData, create_abstract, create_articledata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.types import CitationLiteral


class AmpCrawler(BaseCollectionCrawler):
    source_name = "Annals of Mathematics website"
    source_domain = "AMP"
    source_website = "https://annals.math.princeton.edu"

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH,
        lingua.Language.FRENCH,
    ).build()
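
    # lingua is restricted to English and French here; limiting the candidate
    # set should make detection on short abstract snippets more reliable than
    # letting the detector consider every supported language.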

    def crawl_issue(self, xissue: IssueData):
        try:
            super().crawl_issue(xissue)
        except requests.exceptions.HTTPError:
            self.logger.warning("Got HTTPError while crawling issue. Skipping")

    def parse_collection_content(self, content):
        """
        Parse the HTML page of Annals of Math and return a list of xissues.
        Each xissue has its volume/number/year metadata + its URL.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        # Extract the list of issues
        issue_nodes = soup.find_all("div", {"class": "cat-item-2"})

        for issue_node in issue_nodes:
            issue_link_node = issue_node.find("a")
            if issue_link_node:
                url = issue_link_node.get("href")
                xissue = self.create_amp_xissue(url)
                if xissue:
                    xissues.append(xissue)

        return xissues

    def create_amp_xissue(self, url):
        if url.endswith("/"):
            url = url[:-1]
        parts = url.split("/")

        last_part = parts[-1]
        exceptions = last_part.split("-")
        if len(exceptions) > 2:
            # Hyphenated final segment: its first component doubles as year
            # and volume, the second is the issue number
            year = exceptions[0]
            volume = exceptions[0]
            number = exceptions[1]
        else:
            year = parts[-2]
            if len(year) < 4:
                # The links are different for volumes before 2015
                year = parts[-3]

            volume_number = parts[-1]
            volume_number_parts = volume_number.split("-")
            volume = volume_number_parts[0]
            number = volume_number_parts[1]

        return self.create_xissue(url, year, volume, number)
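
    # Illustrative URL shapes, inferred from the parsing logic above (the
    # example path is an assumption, not an exhaustive survey of the site):
    #   https://annals.math.princeton.edu/2018/187-1
    #       -> year "2018", volume "187", number "1"
    # For pre-2015 links, parts[-2] is not a 4-digit year, so the year is
    # taken from parts[-3] instead.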

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select("h2.entry-title")

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.select_one("a")
            if article_link_node:
                href = article_link_node.get("href")
                if not isinstance(href, str):
                    raise ValueError("Couldn't parse issue: href is not a string")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = href
                xissue.articles.append(xarticle)
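
    # Note: the generated pids ("a0", "a1", ...) simply number the articles
    # in the order they appear on the issue page.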

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData
        """
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")

        what: list[CitationLiteral] = ["author", "abstract", "page"]

        doi_tag = soup.select_one("meta[name='citation_doi']")
        if doi_tag and not doi_tag.text.startswith("https://doi.org/"):
            what.append("pdf")

        title_node = soup.find("h1", {"class": "entry-title"})
        if title_node:
            what.append("title")

        if url != "https://annals.math.princeton.edu/2010/172-3/p06":
            # Exception with Annals of Math: 2 articles have the same DOI!
            # https://annals.math.princeton.edu/2010/172-3/p06 and https://annals.math.princeton.edu/2011/173-1/p14
            # We ignore DOI/ZBMATH/MR for the first one
            what.append("doi")

        self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)

        abstract_node = soup.select_one("div.entry-content")
        if abstract_node is not None:
            abstract_section_node = abstract_node.select_one("p")
            if abstract_section_node:
                abstract = str(abstract_section_node)
                xarticle.abstracts.append(
                    create_abstract(
                        value_tex=abstract, lang=self.detect_language(abstract, xarticle)
                    )
                )

        # ZBMATH
        metadata_header_nodes = soup.find_all("div", {"class": "metadata-headers"})
        for metadata_header_node in metadata_header_nodes:
            text = metadata_header_node.get_text()

            if text == "zbMATH":
                link_node = metadata_header_node.parent.find("a")
                if link_node:
                    zblid = link_node.get("href")
                    pos = zblid.find("?q=an:")
                    if pos > 0:
                        zblid = zblid[pos + 6 :]
                    xarticle.extids.append(("zbl-item-id", zblid))
            elif text == "MR":
                link_node = metadata_header_node.parent.find("a")
                if link_node:
                    mrid = link_node.get("href")
                    pos = mrid.find("?mr=")
                    if pos > 0:
                        mrid = mrid[pos + 4 :]
                    xarticle.extids.append(("mr-item-id", mrid))
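
        # Typical link formats these slices expect (assumed from the query
        # strings above; the example URLs are illustrative, not taken from
        # the crawled pages):
        #   https://zbmath.org/?q=an:05778171 -> zblid "05778171"
        #   https://mathscinet.ams.org/mathscinet-getitem?mr=2680418 -> mrid "2680418"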

        return xarticle
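

# A minimal usage sketch (hypothetical: the constructor arguments and the full
# crawl pipeline live in BaseCollectionCrawler, which is not shown here):
#
#   crawler = AmpCrawler(...)
#   html = requests.get(AmpCrawler.source_website, timeout=30).text
#   for xissue in crawler.parse_collection_content(html):
#       crawler.crawl_issue(xissue)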