Coverage for src/crawler/by_source/impan_crawler.py: 77%
125 statements
coverage.py v7.9.0, created at 2025-07-30 09:47 +0000
1import time
2from datetime import datetime, timedelta
3from urllib.parse import urljoin
5import lingua
6from bs4 import BeautifulSoup
7from lingua import LanguageDetectorBuilder
8from ptf.model_data import (
9 ArticleData,
10 IssueData,
11 create_abstract,
12 create_articledata,
13 create_issuedata,
14)
16from crawler.base_crawler import BaseCollectionCrawler
17from crawler.types import CitationLiteral
20class ImpanCrawler(BaseCollectionCrawler):
21 source_name = "Institute of Mathematics Polish Academy of Sciences"
22 source_domain = "IMPAN"
23 source_website = "https://www.impan.pl/"
25 language_detector = LanguageDetectorBuilder.from_languages(
26 lingua.Language.ENGLISH,
27 lingua.Language.FRENCH,
28 lingua.Language.POLISH,
29 lingua.Language.RUSSIAN,
30 lingua.Language.GERMAN,
31 ).build()
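As a side note, the detector built above is what backs the title-language detection used further down (detect_language). A minimal sketch of how such a lingua detector is typically queried, assuming the lingua-language-detector API and a hypothetical title string (not part of the crawler):

from lingua import Language, LanguageDetectorBuilder

detector = LanguageDetectorBuilder.from_languages(Language.ENGLISH, Language.POLISH).build()
# Hypothetical Polish title; detect_language_of returns None when no language is confident.
language = detector.detect_language_of("O pewnych klasach funkcji")
iso_code = language.iso_code_639_1.name.lower() if language else "und"  # e.g. "pl"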
33 def parse_collection_content(self, content):
34 """
35 Parse the IMPAN collection page to build the list of issues.
36 Volumes are grouped by year in "div.year" nodes; the sibling "div.issues"
37 nodes hold the links to the individual issues.
38 Each xissue gets its year, volume/number and URL; its articles are filled in later by parse_issue_content.
39 """
41 soup = BeautifulSoup(content, "html.parser")
42 xissues_dict: dict[str, IssueData] = {}
44 # Extract the list of issues
45 volume_nodes = soup.select("div.year")
47 for volume_node in volume_nodes:
48 year = volume_node.get_text()
50 issues_nodes = volume_node.parent
51 if issues_nodes is None:  [51 ↛ 52: condition never true]
52 continue
53 issues_nodes = issues_nodes.select("div.issues")
55 for issue_node in issues_nodes:
56 issues_link_node = issue_node.select("a")
57 for issue_link_node in issues_link_node:
58 href = issue_link_node.get("href")
59 if href is None:  [59 ↛ 60: condition never true]
60 raise ValueError(
61 f"[{self.source_domain}] {self.collection_id} : Collection href is None"
62 )
63 if isinstance(href, list):  [63 ↛ 64: condition never true]
64 raise ValueError(
65 f"[{self.source_domain}] {self.collection_id} : Collection href is an array"
66 )
67 url = urljoin(self.source_website, href)
69 xissue = self.create_impan_xissue(url, year)
70 # Prevent duplicate issues
71 # NOTE : is this needed ?
72 pid = xissue.pid
73 if pid is None:  [73 ↛ 74: condition never true]
74 continue
75 if pid in xissues_dict:
76 self.logger.debug("Duplicate issue in collection", extra={"pid": pid})
77 continue
78 xissues_dict[pid] = xissue
80 return list(xissues_dict.values())
82 def create_impan_xissue(self, url: str, year: str):
83 if url.endswith("/"):
84 url = url[:-1]
85 parts = url.split("/")
86 issue_number = parts[-1].replace(",", "-")
87 volume_number = parts[-2]
89 xissue = create_issuedata()
90 if volume_number == "all":
91 xissue.pid = f"{self.collection_id}_{year}__{issue_number}"
92 xissue.volume = issue_number
94 else:
95 xissue.pid = f"{self.collection_id}_{year}__{volume_number}_{issue_number}"
96 xissue.volume = volume_number
98 xissue.year = year
99 xissue.number = issue_number.replace(",", "-")
100 xissue.url = url
102 return xissue
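For reference, a standalone sketch of the pid scheme implemented above, using a hypothetical issue URL and "CM" standing in for collection_id:

# Hypothetical URL and collection id, mirroring create_impan_xissue:
url = "https://www.impan.pl/en/publishing-house/journals-and-series/cm/171/1"
year = "2023"
parts = url.rstrip("/").split("/")
issue_number = parts[-1].replace(",", "-")   # "1"   (a "171,1" segment would become "171-1")
volume_number = parts[-2]                    # "171"
if volume_number == "all":
    pid = f"CM_{year}__{issue_number}"       # yearly "all" listings collapse to the issue part
else:
    pid = f"CM_{year}__{volume_number}_{issue_number}"
# pid == "CM_2023__171_1"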
104 def parse_issue_content(self, content, xissue: IssueData, retries=0):
105 soup = BeautifulSoup(content, "html.parser")
106 article_nodes = soup.select("div.info")
107 if len(article_nodes) == 0 and xissue.url:  [107 ↛ 108: condition never true]
108 if retries > 3:
109 self.logger.info(
110 "Maximum number of retires reached. This issue seems to be empty",
111 {"pid": xissue.pid, "url": xissue.url},
112 )
113 return
114 self.logger.debug("Couldn't find articles... Retrying", extra={"pid": xissue.pid})
115 time.sleep(60)
116 return self.parse_issue_content(
117 self.download_file(xissue.url, force_refresh=True),
118 xissue=xissue,
119 retries=retries + 1,
120 )
121 for index_article, article_node in enumerate(article_nodes):
122 xarticle = create_articledata()
123 xarticle.pid = "a" + str(index_article)
125 article_link_node = article_node.select_one("a")
126 if article_link_node is None:  [126 ↛ 127: condition never true]
127 raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue link is None")
128 href = article_link_node.get("href")
129 if href is None:  [129 ↛ 130: condition never true]
130 raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue href is None")
131 if isinstance(href, list):  [131 ↛ 132: condition never true]
132 raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue href is a list")
133 xissue_url = xissue.url
134 if xissue_url is None:  [134 ↛ 135: condition never true]
135 raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue url is None")
136 xarticle.url = xissue_url + href
138 xissue.articles.append(xarticle)
140 def parse_article_content(self, content, xissue, xarticle, url):
141 """
142 Parse the content with Beautifulsoup and returns an ArticleData
143 """
145 # Parse the IMPAN article page: title, citation metadata and abstract
146 soup = BeautifulSoup(content, "html.parser")
148 title_info = soup.select_one("div.info")
149 if title_info is None:  [149 ↛ 150: condition never true]
150 raise ValueError(
151 f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
152 )
153 title_node = title_info.select_one("a")
154 if title_node is None:  [154 ↛ 156: condition always true]
155 title_node = soup.select_one("h2.product-title")
156 if title_node is None:  [156 ↛ 157: condition never true]
157 raise ValueError(
158 f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
159 )
161 title_tex = title_node.get_text()
162 xarticle.title_tex = title_tex
163 xarticle.lang = self.detect_language(xarticle.title_tex)
165 what: list[CitationLiteral] = [
166 "author",
167 "page",
168 "doi",
169 "publisher",
170 "page",
171 "keywords",
172 ]
174 # If download button has the "buy" class, skip adding the pdf.
175 if not soup.select_one("a.button.download.noborder.buy"):  [175 ↛ 178: condition always true]
176 what.append("pdf")
178 self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)
180 # abstract
181 abstract_mml_node = soup.select_one("div.details.abstract p")
182 if abstract_mml_node is None:
183 self.logger.debug("Abstract not found", extra={"pid": xarticle.pid})
184 else:
185 abstract_tex = abstract_mml_node.get_text()
186 xabstract = create_abstract(tag="abstract", value_tex=abstract_tex, lang=xarticle.lang)
187 xarticle.abstracts.append(xabstract)
189 # href_attrib = soup.select_one("div.order a")
191 # if href_attrib is not None:
192 # href = href_attrib.get("href")
193 # if isinstance(href, list):
194 # raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href is a list")
195 # if href is None:
196 # raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href not found")
197 # pdf_url = urljoin(self.source_website, href)
198 # add_pdf_link_to_xarticle(xarticle, pdf_url)
199 return xarticle
201 def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
202 # TODO : set pid in xarticle here instead of passing it to `parse_article_content`
203 parsed_xarticle = xarticle
204 if hasattr(xarticle, "url") and xarticle.url:  [204 ↛ 223: condition always true]
205 parsed_xarticle = None
206 attempts = 0
207 while parsed_xarticle is None and attempts < 3:
208 try:
209 parsed_xarticle = super().crawl_article(xarticle, xissue)
210 except ValueError as e:
211 self.logger.debug(f"Caught error : {e}", extra={"pid": xarticle.pid})
212 attempts += 1
213 self.logger.debug(
214 f"Retrying in {((attempts) * 15)}mins ({(datetime.now() + timedelta(minutes=attempts * 15)).time()})",
215 {"pid": xarticle.pid},
216 )
217 # 15 mins, 30 mins, 45 mins
218 time.sleep(attempts * 15 * 60)
219 self.download_file(xarticle.url, force_refresh=True)
221 if parsed_xarticle is None:  [221 ↛ 222: condition never true]
222 raise ValueError(f"Couldn't parse article {xarticle.pid}")
223 return parsed_xarticle
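For context, a hedged sketch of how these methods chain together. The constructor arguments and the collection_url attribute are assumptions about BaseCollectionCrawler, not shown in this file; only parse_collection_content, parse_issue_content, crawl_article and download_file appear above.

# Assumed driver loop (hypothetical constructor signature and collection URL):
crawler = ImpanCrawler(collection_id="CM", collection_url="https://www.impan.pl/en/publishing-house/journals-and-series/cm")
collection_html = crawler.download_file(crawler.collection_url)
for xissue in crawler.parse_collection_content(collection_html):
    if xissue.url:
        crawler.parse_issue_content(crawler.download_file(xissue.url), xissue)
    for xarticle in xissue.articles:
        crawler.crawl_article(xarticle, xissue)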