Coverage for src/crawler/by_source/amp_crawler.py: 92% (97 statements)
from bs4 import BeautifulSoup
from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_types import CitationLiteral

from ptf.model_data import create_articledata
from ptf.model_data import create_issuedata


class AmpCrawler(BaseCollectionCrawler):
    source_name = "Annals of Mathematics Princeton University"
    source_domain = "AMP"
    source_website = "https://annals.math.princeton.edu"

    periode_begin = 2003
    periode_end = 2017
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: create a cols.csv that supersedes cols_eudml.csv with the entire collection catalogue.
        # self.collection_id = "AM"
        # self.collection_url = "https://annals.math.princeton.edu"

        self.source = self.get_or_create_source()

        self.periode = self.get_or_create_periode()
    def parse_collection_content(self, content):
        """
        Parse the HTML page of Annals of Math and return a list of xissues.
        Each xissue has its volume/number/year metadata + its url.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        # Extract the list of issues
        issue_nodes = soup.find_all("div", {"class": "cat-item-2"})

        for issue_node in issue_nodes:
            issue_link_node = issue_node.find("a")
            if issue_link_node:
                url = issue_link_node.get("href")
                xissue = self.create_xissue(url)
                if xissue:
                    xissues.append(xissue)

        return xissues
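
    # Illustrative sketch of the markup parse_collection_content expects (the
    # snippet below is assumed, not copied from the live site); only the
    # div.cat-item-2 class and the nested <a href> are used by the code above:
    #
    #   <div class="cat-item-2">
    #       <a href="https://annals.math.princeton.edu/2010/172-3">Volume 172, Issue 3</a>
    #   </div>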
    def create_xissue(self, url):
        if url.endswith("/"):
            url = url[:-1]
        parts = url.split("/")
        last_part = parts[-1]
        exceptions = last_part.split("-")
        if len(exceptions) > 2:
            # Exceptional URLs whose last segment is <year>-<volume>-<number>
            year = exceptions[0]
            volume = exceptions[1]
            number = exceptions[2]
        else:
            year = parts[-2]
            if len(year) < 4:
                # The links are different for volumes before 2015
                year = parts[-3]

            volume_number = parts[-1]
            volume_number_parts = volume_number.split("-")
            volume = volume_number_parts[0]
            number = volume_number_parts[1]

        year_int = int(year)
        if self.periode_begin <= year_int <= self.periode_end:
            xissue = create_issuedata()
            xissue.pid = f"{self.collection_id}_{year}__{volume}_{number}"
            xissue.year = year
            xissue.volume = volume
            xissue.number = number
            xissue.url = url
        else:
            xissue = None

        return xissue
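
    # Worked example (URL shape taken from the article-level exception handled
    # in parse_article_content below; the values are what the parsing above
    # produces):
    #   https://annals.math.princeton.edu/2010/172-3
    #       -> year="2010", volume="172", number="3"
    #       -> pid f"{collection_id}_2010__172_3"
    # A trailing "/" is stripped first, and years outside 2003-2017 yield None.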
    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("h2", {"class": "entry-title"})

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = url
                xissue.articles.append(xarticle)
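
    # Illustrative sketch of the per-issue markup (assumed, not copied from the
    # live site); each article is an h2.entry-title heading wrapping a link:
    #
    #   <h2 class="entry-title">
    #       <a href="https://annals.math.princeton.edu/2010/172-3/p06">Article title</a>
    #   </h2>
    #
    # Articles get sequential pids "a0", "a1", ... in page order.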
    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle = create_articledata()
        xarticle.pid = pid
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")

        what: list[CitationLiteral] = ["author", "pdf", "abstract", "page"]

        title_node = soup.find("h1", {"class": "entry-title"})
        if title_node:
            what.append("title")

        if url != "https://annals.math.princeton.edu/2010/172-3/p06":
            # Exception with Annals of Math: 2 articles have the same DOI!
            # https://annals.math.princeton.edu/2010/172-3/p06 and https://annals.math.princeton.edu/2011/173-1/p14
            # We ignore DOI/ZBMATH/MR for the first one.
            what.append("doi")

        self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)
        # zbMATH / MR external ids
        metadata_header_nodes = soup.find_all("div", {"class": "metadata-headers"})
        for metadata_header_node in metadata_header_nodes:
            text = metadata_header_node.get_text()

            if text == "zbMATH":
                link_node = metadata_header_node.parent.find("a")
                if link_node:
                    zblid = link_node.get("href")
                    pos = zblid.find("?q=an:")
                    if pos > 0:
                        zblid = zblid[pos + 6 :]
                    xarticle.extids.append(("zbl-item-id", zblid))
            elif text == "MR":
                link_node = metadata_header_node.parent.find("a")
                if link_node:
                    mrid = link_node.get("href")
                    pos = mrid.find("?mr=")
                    if pos > 0:
                        mrid = mrid[pos + 4 :]
                    xarticle.extids.append(("mr-item-id", mrid))

        return xarticle
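

# Minimal standalone sketch of the zbMATH extraction performed in
# parse_article_content above. The sample HTML is assumed (invented id and
# markup shape), not copied from the live site; it only mirrors the
# div.metadata-headers structure the code relies on.
if __name__ == "__main__":
    sample = """
    <div>
        <div class="metadata-headers">zbMATH</div>
        <a href="https://zbmath.org/?q=an:1213.11004">1213.11004</a>
    </div>
    """
    soup = BeautifulSoup(sample, "html.parser")
    for header in soup.find_all("div", {"class": "metadata-headers"}):
        if header.get_text() == "zbMATH":
            link = header.parent.find("a")
            if link:
                href = link.get("href")
                pos = href.find("?q=an:")
                if pos > 0:
                    href = href[pos + 6 :]
                print(("zbl-item-id", href))  # -> ('zbl-item-id', '1213.11004')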