Coverage for src/crawler/by_source/impan_crawler.py: 74%
120 statements
coverage.py v7.6.4, created at 2024-11-20 09:03 +0000
from bs4 import BeautifulSoup
from crawler.base_crawler import BaseCollectionCrawler
from crawler.base_crawler import add_pdf_link_to_xarticle
from crawler.crawler_types import CitationLiteral

from ptf.model_data import AbstractDict
from ptf.model_data import IssueData
from ptf.model_data import create_articledata
from ptf.model_data import create_issuedata


class ImpanCrawler(BaseCollectionCrawler):
    source_name = "Institute of Mathematics Polish Academy of Sciences"
    source_domain = "IMPAN"
    source_website = "https://www.impan.pl"

    periode_end = 2016
    periode_begin = 0

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.source = self.get_or_create_source()

        self.periode = self.get_or_create_periode()

    def parse_collection_content(self, content):
        """
        Parse the IMPAN collection page and build the list of xissues.
        Issues are grouped by year on the page; each xissue gets its year,
        volume/number and URL. The articles themselves are collected later,
        from the individual issue pages.
        """
        if self.collection_id == "APM":
            self.periode_begin = 1955

        if self.collection_id == "DIM":
            self.periode_begin = 2000

        soup = BeautifulSoup(content, "html.parser")
        xissues_dict: dict[str, IssueData] = {}

        # Extract the list of issues
        volume_nodes = soup.select("div.year")

        for volume_node in volume_nodes:
            year = volume_node.get_text()
            year_int = int(year)
            if self.periode_begin > year_int or year_int > self.periode_end:
                continue
            issues_nodes = volume_node.parent
            if issues_nodes is None:
                continue
            issues_nodes = issues_nodes.select("div.issues")

            for issue_node in issues_nodes:
                issues_link_node = issue_node.select("a")
                for issue_link_node in issues_link_node:
                    href = issue_link_node.get("href")
                    if href is None:
                        raise ValueError(
                            f"[{self.source_domain}] {self.collection_id} : Collection href is None"
                        )
                    if isinstance(href, list):
                        raise ValueError(
                            f"[{self.source_domain}] {self.collection_id} : Collection href is an array"
                        )
                    url = self.source_website + href

                    xissue = self.create_xissue(url, year)
                    # Prevent duplicate issues
                    # NOTE : is this needed ?
                    pid = xissue.pid
                    if pid is None:
                        continue
                    if pid in xissues_dict:
                        print(
                            f"[{self.source_domain}] {self.collection_id} : Duplicate issue in connection : {pid}"
                        )
                        continue
                    xissues_dict[pid] = xissue

        return list(xissues_dict.values())
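
    # The page structure assumed by parse_collection_content (a sketch inferred from
    # the selectors above, not copied from impan.pl): each <div class="year"> holds a
    # year, and its parent element also contains <div class="issues"> blocks whose
    # <a href="..."> links point to the issue pages; every href is appended to
    # source_website to form the issue URL passed to create_xissue.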

    def create_xissue(self, url: str, year: str):
        if url.endswith("/"):
            url = url[:-1]
        parts = url.split("/")
        issue_number = parts[-1].replace(",", "-")
        volume_number = parts[-2]

        xissue = create_issuedata()
        if volume_number == "all":
            xissue.pid = f"{self.collection_id}_{year}__{issue_number}"
            xissue.volume = issue_number
        else:
            xissue.pid = f"{self.collection_id}_{year}__{volume_number}_{issue_number}"
            xissue.volume = volume_number

        xissue.year = year
        xissue.number = issue_number.replace(",", "-")
        xissue.url = url

        return xissue
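
    # Worked example for create_xissue (the URL is hypothetical): with
    # url = "https://www.impan.pl/.../85/3,4" and year = "2010", parts[-1] is "3,4"
    # (normalised to "3-4") and parts[-2] is "85", so the issue gets pid
    # "{collection_id}_2010__85_3-4", volume "85" and number "3-4". When the volume
    # segment is "all", the last path segment is used as both the volume and the number.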

    def parse_issue_content(self, content, xissue: IssueData):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select("div.info")
        for index_article, article_node in enumerate(article_nodes):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index_article)

            article_link_node = article_node.select_one("a")
            if article_link_node is None:
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue link is None")
            href = article_link_node.get("href")
            if href is None:
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue href is None")
            if isinstance(href, list):
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue href is a list")
            xissue_url = xissue.url
            if xissue_url is None:
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue url is None")
            xarticle.url = xissue_url + href

            xissue.articles.append(xarticle)
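
    # Note on parse_issue_content: each article URL is built as xissue.url + href, so
    # the <a> hrefs inside the div.info blocks are assumed to be relative suffixes of
    # the issue URL rather than absolute paths (an assumption based on this code, not
    # verified against the live site).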

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the article page with BeautifulSoup and return an ArticleData.
        """

        # Parse the title, the standard citation_* meta tags, the abstract and the PDF link.
        soup = BeautifulSoup(content, "html.parser")

        title_info = soup.select_one("div.info")
        if title_info is None:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
            )
        title_node = title_info.select_one("a")
        if title_node is None:
            title_node = soup.select_one("h2.product-title")
        if title_node is None:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
            )

        title_tex = title_node.get_text()
        xarticle.title_tex = title_tex

        what: list[CitationLiteral] = [
            "author",
            "pdf",
            "page",
            "doi",
            "issn",
            "publisher",
            "keywords",
        ]
        self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)
        # abstract

        abstract_mml_node = soup.select_one("div.details.abstract p")
        if abstract_mml_node is None:
            print(f"[{self.source_domain}] {xarticle.pid} : Abstract not found")
        else:
            abstract_tex = abstract_mml_node.get_text()
            xabstract: AbstractDict = {
                "tag": "abstract",
                "value_html": "",
                "value_tex": abstract_tex,
                "value_xml": "",
                "lang": "en",
            }
            xarticle.abstracts.append(xabstract)

        href_attrib = soup.select_one("div.order a")

        if href_attrib is not None:
            href = href_attrib.get("href")
            if isinstance(href, list):
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href is a list")
            if href is None:
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href not found")
            pdf_url = self.source_website + href
            add_pdf_link_to_xarticle(xarticle, pdf_url)
        if xarticle.title_tex is None or xarticle.title_tex == "":
            print(xarticle)
        return xarticle
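
A minimal usage sketch of how the three parsing methods fit together, assuming pages are fetched with requests. The constructor argument and the collection path below are hypothetical: BaseCollectionCrawler's interface and the real collection URLs are not shown in this file.

    import requests

    crawler = ImpanCrawler(collection_id="APM")  # hypothetical constructor argument
    collection_url = crawler.source_website + "/en/..."  # hypothetical collection path
    xissues = crawler.parse_collection_content(requests.get(collection_url).text)

    for xissue in xissues:
        # Fill in the article stubs (pid + url) for each issue page
        crawler.parse_issue_content(requests.get(xissue.url).text, xissue)
        for xarticle in xissue.articles:
            # Enrich each article with its title, citation metadata, abstract and PDF link
            crawler.parse_article_content(
                requests.get(xarticle.url).text, xissue, xarticle, xarticle.url, xarticle.pid
            )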