Coverage for src/crawler/by_source/amp_crawler.py: 93% (97 statements)
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_issuedata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.types import CitationLiteral


class AmpCrawler(BaseCollectionCrawler):
    source_name = "Annals of Mathematics website"
    source_domain = "AMP"
    source_website = "https://annals.math.princeton.edu"

    periode_begin = 2003
    periode_end = 2017

    def parse_collection_content(self, content):
        """
        Parse the HTML page of Annals of Math and return a list of xissues.
        Each xissue carries its volume/number/year metadata and its URL.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        # Extract the list of issues
        issue_nodes = soup.find_all("div", {"class": "cat-item-2"})

        for issue_node in issue_nodes:
            issue_link_node = issue_node.find("a")
            if issue_link_node:
                url = issue_link_node.get("href")
                xissue = self.create_amp_xissue(url)
                if xissue:
                    xissues.append(xissue)

        return xissues

    def create_amp_xissue(self, url):
        if url.endswith("/"):
            url = url[:-1]
        parts = url.split("/")

        last_part = parts[-1]
        exceptions = last_part.split("-")
        if len(exceptions) > 2:
            # Assumed URL shape: ".../YEAR-VOLUME-NUMBER"
            year = exceptions[0]
            volume = exceptions[1]
            number = exceptions[2]
        else:
            year = parts[-2]
            if len(year) < 4:
                # The links are different for volumes before 2015
                year = parts[-3]

            volume_number = parts[-1]
            volume_number_parts = volume_number.split("-")
            volume = volume_number_parts[0]
            number = volume_number_parts[1]

        year_int = int(year)
        if self.periode_begin <= year_int <= self.periode_end:
            xissue = create_issuedata()
            xissue.pid = f"{self.collection_id}_{year}__{volume}_{number}"
            xissue.year = year
            xissue.volume = volume
            xissue.number = number
            xissue.url = url
        else:
            xissue = None

        return xissue
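
    # Worked examples for create_amp_xissue. The first URL shape appears verbatim
    # in parse_article_content below; the dash-only shape is an assumption inferred
    # from the len(exceptions) > 2 branch above:
    #   https://annals.math.princeton.edu/2010/172-3  -> year "2010", volume "172", number "3"
    #   https://annals.math.princeton.edu/2004-159-1  -> year "2004", volume "159", number "1"  (assumed shape)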

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("h2", {"class": "entry-title"})

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = url
                xissue.articles.append(xarticle)
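
    # For reference, parse_issue_content above expects table-of-contents entries of
    # the form <h2 class="entry-title"><a href="...">Title</a></h2>; only the href
    # is kept here, and full article metadata is presumably filled in later when
    # parse_article_content runs on each article page.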

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle.pid = pid
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")

        what: list[CitationLiteral] = ["author", "pdf", "abstract", "page"]

        title_node = soup.find("h1", {"class": "entry-title"})
        if title_node:
            what.append("title")

        if url != "https://annals.math.princeton.edu/2010/172-3/p06":
            # Exception for Annals of Math: two articles share the same DOI!
            # https://annals.math.princeton.edu/2010/172-3/p06 and https://annals.math.princeton.edu/2011/173-1/p14
            # We ignore DOI/zbMATH/MR for the first one.
            what.append("doi")

        self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)

        abstract_node = soup.select_one("div.entry-content")
        if abstract_node is not None:
            abstract_section_node = abstract_node.select_one("p")
            if abstract_section_node:
                abstract = str(abstract_section_node)
                xarticle.abstracts.append(
                    {
                        "tag": "abstract",
                        "value_html": "",
                        "value_tex": abstract,
                        "value_xml": "",
                        "lang": self.detect_language(abstract, xarticle),
                    }
                )

        # zbMATH and MR identifiers
        metadata_header_nodes = soup.find_all("div", {"class": "metadata-headers"})
        for metadata_header_node in metadata_header_nodes:
            text = metadata_header_node.get_text()

            if text == "zbMATH":
                link_node = metadata_header_node.parent.find("a")
                if link_node:
                    zblid = link_node.get("href")
                    pos = zblid.find("?q=an:")
                    if pos > 0:
                        zblid = zblid[pos + 6 :]
                    xarticle.extids.append(("zbl-item-id", zblid))
            elif text == "MR":
                link_node = metadata_header_node.parent.find("a")
                if link_node:
                    mrid = link_node.get("href")
                    pos = mrid.find("?mr=")
                    if pos > 0:
                        mrid = mrid[pos + 4 :]
                    xarticle.extids.append(("mr-item-id", mrid))

        return xarticle
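

if __name__ == "__main__":
    # Minimal manual smoke test, as a sketch only: the constructor arguments of
    # BaseCollectionCrawler are not shown in this file, so the keyword below and
    # the "AOM" collection id are hypothetical placeholders.
    import requests

    crawler = AmpCrawler(collection_id="AOM")  # hypothetical constructor call
    html = requests.get(crawler.source_website, timeout=30).text
    for issue in crawler.parse_collection_content(html):
        print(issue.pid, issue.url)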