Coverage for src/crawler/by_source/amp_crawler.py: 91%
97 statements
coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
  1  import lingua
  2  from bs4 import BeautifulSoup
  3  from lingua import LanguageDetectorBuilder
  4  from ptf.model_data import create_articledata, create_issuedata

  6  from crawler.base_crawler import BaseCollectionCrawler
  7  from crawler.types import CitationLiteral
 10  class AmpCrawler(BaseCollectionCrawler):
 11      source_name = "Annals of Mathematics website"
 12      source_domain = "AMP"
 13      source_website = "https://annals.math.princeton.edu"

 15      language_detector = LanguageDetectorBuilder.from_languages(
 16          lingua.Language.ENGLISH,
 17          lingua.Language.FRENCH,
 18      ).build()
 20      def parse_collection_content(self, content):
 21          """
 22          Parse the HTML page of Annals of Math and return a list of xissues.
 23          Each xissue has its volume/number/year metadata and its URL.
 24          """
 25          soup = BeautifulSoup(content, "html.parser")
 26          xissues = []

 28          # Extract the list of issues
 29          issue_nodes = soup.find_all("div", {"class": "cat-item-2"})

 31          for issue_node in issue_nodes:
 32              issue_link_node = issue_node.find("a")
 33              if issue_link_node:  # 33 ↛ 31: condition always true, never jumped to line 31
 34                  url = issue_link_node.get("href")
 35                  xissue = self.create_amp_xissue(url)
 36                  if xissue:  # 36 ↛ 31: condition always true, never jumped to line 31
 37                      xissues.append(xissue)

 39          return xissues
 41      def create_amp_xissue(self, url):
 42          if url.endswith("/"):
 43              url = url[:-1]
 44          parts = url.split("/")

 46          last_part = parts[-1]
 47          exceptions = last_part.split("-")
 48          if len(exceptions) > 2:
 49              year = exceptions[0]
 50              volume = exceptions[0]
 51              number = exceptions[1]
 52          else:
 53              year = parts[-2]
 54              if len(year) < 4:
 55                  # The links are different with volumes before 2015
 56                  year = parts[-3]

 58              volume_number = parts[-1]
 59              volume_number_parts = volume_number.split("-")
 60              volume = volume_number_parts[0]
 61              number = volume_number_parts[1]

 63          xissue = create_issuedata()
 64          xissue.pid = f"{self.collection_id}_{year}__{volume}_{number}"
 65          xissue.year = year
 66          xissue.volume = volume
 67          xissue.number = number
 68          xissue.url = url

 70          return xissue
 72      def parse_issue_content(self, content, xissue):
 73          soup = BeautifulSoup(content, "html.parser")
 74          article_nodes = soup.find_all("h2", {"class": "entry-title"})

 76          for index_article, article_node in enumerate(article_nodes):
 77              article_link_node = article_node.find("a")
 78              if article_link_node:  # 78 ↛ 76: condition always true, never jumped to line 76
 79                  url = article_link_node.get("href")
 80                  xarticle = create_articledata()
 81                  xarticle.pid = "a" + str(index_article)
 82                  xarticle.url = url
 83                  xissue.articles.append(xarticle)
 85      def parse_article_content(self, content, xissue, xarticle, url):
 86          """
 87          Parse the content with BeautifulSoup and return an ArticleData.
 88          """
 89          xarticle.lang = "en"

 91          soup = BeautifulSoup(content, "html.parser")

 93          what: list[CitationLiteral] = ["author", "abstract", "page"]

 95          doi_tag = soup.select_one("meta[name='citation_doi']")
 96          if not doi_tag.text.startswith("https://doi.org/"):  # 96 ↛ 99: condition always true, never jumped to line 99
 97              what.append("pdf")

 99          title_node = soup.find("h1", {"class": "entry-title"})
100          if title_node:  # 100 ↛ 103: condition always true, never jumped to line 103
101              what.append("title")

103          if url != "https://annals.math.princeton.edu/2010/172-3/p06":  # 103 ↛ 109: condition always true, never jumped to line 109
104              # Exception with Annals of Math: 2 articles have the same DOI !
105              # https://annals.math.princeton.edu/2010/172-3/p06 and https://annals.math.princeton.edu/2011/173-1/p14
106              # we ignore DOI/ZBMATH/MR for the first one
107              what.append("doi")

109          self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)

111          abstract_node = soup.select_one("div.entry-content")
112          if abstract_node is not None:  # 112 ↛ 127: condition always true, never jumped to line 127
113              abstract_section_node = abstract_node.select_one("p")
114              if abstract_section_node:  # 114 ↛ 127: condition always true, never jumped to line 127
115                  abstract = str(abstract_section_node)
116                  xarticle.abstracts.append(
117                      {
118                          "tag": "abstract",
119                          "value_html": "",
120                          "value_tex": abstract,
121                          "value_xml": "",
122                          "lang": self.detect_language(abstract, xarticle),
123                      }
124                  )
126          # ZBMATH
127          metadata_header_nodes = soup.find_all("div", {"class": "metadata-headers"})
128          for metadata_header_node in metadata_header_nodes:
129              text = metadata_header_node.get_text()

131              if text == "zbMATH":
132                  link_node = metadata_header_node.parent.find("a")
133                  if link_node:  # 133 ↛ 128: condition always true, never jumped to line 128
134                      zblid = link_node.get("href")
135                      pos = zblid.find("?q=an:")
136                      if pos > 0:  # 136 ↛ 138: condition always true, never jumped to line 138
137                          zblid = zblid[pos + 6 :]
138                      xarticle.extids.append(("zbl-item-id", zblid))
139              elif text == "MR":
140                  link_node = metadata_header_node.parent.find("a")
141                  if link_node:  # 141 ↛ 128: condition always true, never jumped to line 128
142                      mrid = link_node.get("href")
143                      pos = mrid.find("?mr=")
144                      if pos > 0:  # 144 ↛ 146: condition always true, never jumped to line 146
145                          mrid = mrid[pos + 4 :]
146                      xarticle.extids.append(("mr-item-id", mrid))

148          return xarticle
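
The least obvious part of the listing above is the URL-splitting in create_amp_xissue (source lines 41-70). The sketch below mirrors that logic in a small standalone function so it can be tried on a sample URL. It is illustrative only: split_amp_issue_url is a hypothetical helper, the example URL assumes the post-2015 shape "https://annals.math.princeton.edu/<year>/<volume>-<number>" implied by the code, and the pre-2015 and dash-heavy "exception" URL shapes are not shown in the report, so they are not exercised here.

    # Minimal sketch of the year/volume/number extraction in AmpCrawler.create_amp_xissue.
    # Assumption: issue URLs normally end in "<year>/<volume>-<number>".

    def split_amp_issue_url(url: str) -> tuple[str, str, str]:
        """Return (year, volume, number) for an Annals of Math issue URL (sketch)."""
        if url.endswith("/"):
            url = url[:-1]
        parts = url.split("/")

        last_part = parts[-1]
        pieces = last_part.split("-")
        if len(pieces) > 2:
            # Irregular ("exception") URLs whose last segment carries everything
            year = pieces[0]
            volume = pieces[0]
            number = pieces[1]
        else:
            year = parts[-2]
            if len(year) < 4:
                # Pre-2015 volumes keep the year one segment earlier in the path
                year = parts[-3]
            volume, number = pieces[0], pieces[1]
        return year, volume, number

    if __name__ == "__main__":
        # Hypothetical post-2015 issue URL
        print(split_amp_issue_url("https://annals.math.princeton.edu/2016/183-1"))
        # -> ('2016', '183', '1'), which the crawler turns into a PID of the form
        #    f"{collection_id}_2016__183_1"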