Coverage for src/crawler/by_source/amp_crawler.py: 94%
92 statements
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_issuedata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.types import CitationLiteral


class AmpCrawler(BaseCollectionCrawler):
    source_name = "Annals of Mathematics website"
    source_domain = "AMP"
    source_website = "https://annals.math.princeton.edu"

    periode_begin = 2003
    periode_end = 2017
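    # Only issues whose year falls within [periode_begin, periode_end] are
    # kept; anything outside that window is skipped in create_amp_xissue below.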

    def parse_collection_content(self, content):
        """
        Parse the HTML page of Annals of Math and return a list of xissues.
        Each xissue has its volume/number/year metadata + its url.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        # Extract the list of issues
        issue_nodes = soup.find_all("div", {"class": "cat-item-2"})

        for issue_node in issue_nodes:
            issue_link_node = issue_node.find("a")
            if issue_link_node:
                url = issue_link_node.get("href")
                xissue = self.create_amp_xissue(url)
                if xissue:
                    xissues.append(xissue)

        return xissues

    def create_amp_xissue(self, url):
        if url.endswith("/"):
            url = url[:-1]
        parts = url.split("/")

        last_part = parts[-1]
        exceptions = last_part.split("-")
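        # Some issue URLs presumably pack year/volume/number into the last
        # path segment (three or more "-"-separated fields). In that case the
        # first field is used as both year and volume, the second as the number.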
        if len(exceptions) > 2:
            year = exceptions[0]
            volume = exceptions[0]
            number = exceptions[1]
        else:
            year = parts[-2]
            if len(year) < 4:
                # The links are different with volumes before 2015
                year = parts[-3]

            volume_number = parts[-1]
            volume_number_parts = volume_number.split("-")
            volume = volume_number_parts[0]
            number = volume_number_parts[1]

        year_int = int(year)
        if self.periode_begin <= year_int <= self.periode_end:
            xissue = create_issuedata()
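            # Example pid: "AMP_2017__185_1", assuming collection_id matches
            # source_domain ("AMP"); collection_id is set outside this file.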
            xissue.pid = f"{self.collection_id}_{year}__{volume}_{number}"
            xissue.year = year
            xissue.volume = volume
            xissue.number = number
            xissue.url = url
        else:
            xissue = None

        return xissue

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("h2", {"class": "entry-title"})
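
        # Each article only gets a positional pid ("a0", "a1", ...) and its
        # URL at this stage; full metadata is parsed in parse_article_content.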
        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = url
                xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle = create_articledata()
        xarticle.pid = pid
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")

        what: list[CitationLiteral] = ["author", "pdf", "abstract", "page"]
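        # "what" presumably selects the citation_* <meta> fields that
        # get_metadata_using_citation_meta (called below) extracts from the page.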

        title_node = soup.find("h1", {"class": "entry-title"})
        if title_node:
            what.append("title")

        if url != "https://annals.math.princeton.edu/2010/172-3/p06":
            # Exception with Annals of Math: 2 articles have the same DOI!
            # https://annals.math.princeton.edu/2010/172-3/p06 and https://annals.math.princeton.edu/2011/173-1/p14
            # we ignore DOI/ZBMATH/MR for the first one
            what.append("doi")

        self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)

        # ZBMATH
        metadata_header_nodes = soup.find_all("div", {"class": "metadata-headers"})
        for metadata_header_node in metadata_header_nodes:
            text = metadata_header_node.get_text()
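
            # The external ids are read from hrefs that presumably look like
            # "https://zbmath.org/?q=an:<id>" and "...?mr=<id>"; everything
            # after the query marker is kept as the id.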
            if text == "zbMATH":
                link_node = metadata_header_node.parent.find("a")
                if link_node:
                    zblid = link_node.get("href")
                    pos = zblid.find("?q=an:")
                    if pos > 0:
                        zblid = zblid[pos + 6 :]
                    xarticle.extids.append(("zbl-item-id", zblid))
            elif text == "MR":
                link_node = metadata_header_node.parent.find("a")
                if link_node:
                    mrid = link_node.get("href")
                    pos = mrid.find("?mr=")
                    if pos > 0:
                        mrid = mrid[pos + 4 :]
                    xarticle.extids.append(("mr-item-id", mrid))

        return xarticle