Coverage for src/crawler/by_source/impan_crawler.py: 82%
101 statements
coverage.py v7.12.0, created at 2026-02-02 15:55 +0000
import time
from urllib.parse import urljoin

import lingua
from bs4 import BeautifulSoup
from lingua import LanguageDetectorBuilder
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_issuedata,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.types import CitationLiteral


class ImpanCrawler(BaseCollectionCrawler):
    source_name = "Institute of Mathematics Polish Academy of Sciences"
    source_domain = "IMPAN"
    source_website = "https://www.impan.pl/"

    requests_timeout = 120
    _language_detector_builder = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH,
        lingua.Language.FRENCH,
        lingua.Language.POLISH,
        lingua.Language.RUSSIAN,
        lingua.Language.GERMAN,
    )
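
    # The detector above is limited to the languages expected in IMPAN journal
    # metadata; a smaller candidate set makes lingua more reliable on short
    # strings such as article titles.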

    def parse_collection_content(self, content):
        """
        Parse the IMPAN collection page.
        Issues are grouped by year ("div.year" blocks); every issue link found in the
        "div.issues" blocks becomes an IssueData carrying its year, volume/number and URL.
        Articles are collected later, when each issue page is parsed.
        """

        soup = BeautifulSoup(content, "html.parser")
        xissues_dict: dict[str, IssueData] = {}

        # Extract the list of issues
        volume_nodes = soup.select("div.year")

        for volume_node in volume_nodes:
            year = volume_node.get_text()

            issues_nodes = volume_node.parent
            if issues_nodes is None:
                continue
            issues_nodes = issues_nodes.select("div.issues")

            for issue_node in issues_nodes:
                issues_link_node = issue_node.select("a")
                for issue_link_node in issues_link_node:
                    href = self.get_str_attr(issue_link_node, "href")
                    url = urljoin(self.source_website, href)

                    xissue = self.create_impan_xissue(url, year)
                    # Prevent duplicate issues
                    # NOTE: is this needed?
                    pid = xissue.pid
                    if pid is None:
                        continue
                    if pid in xissues_dict:
                        self.logger.debug("Duplicate issue in collection", extra={"pid": pid})
                        continue
                    xissues_dict[pid] = xissue

        return list(xissues_dict.values())

    def create_impan_xissue(self, url: str, year: str):
        if url.endswith("/"):
            url = url[:-1]
        parts = url.split("/")
        issue_number = parts[-1].replace(",", "-")
        volume_number = parts[-2]

        xissue = create_issuedata()
        if volume_number == "all":
            xissue.pid = f"{self.collection_id}_{year}__{issue_number}"
            xissue.volume = issue_number
        else:
            xissue.pid = f"{self.collection_id}_{year}__{volume_number}_{issue_number}"
            xissue.volume = volume_number

        xissue.year = year
        xissue.number = issue_number.replace(",", "-")
        xissue.url = url

        return xissue
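
    # Illustrative pid layout (hypothetical URLs, assuming collection_id "FM"):
    #   ".../fm/260/1"   with year "2023" -> pid "FM_2023__260_1", volume "260", number "1"
    #   ".../fm/all/1,2" with year "2023" -> pid "FM_2023__1-2",   volume "1-2", number "1-2"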

    def parse_issue_content(self, content, xissue: IssueData, retries=0):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select("div.info")
        if len(article_nodes) == 0 and xissue.url:
            if retries > 3:
                self.logger.info(
                    "Maximum number of retries reached. This issue seems to be empty",
                    extra={"pid": xissue.pid, "url": xissue.url},
                )
                return
            self.logger.debug("Couldn't find articles... Retrying", extra={"pid": xissue.pid})
            time.sleep(60)
            return self.parse_issue_content(
                self.download_file(xissue.url, force_refresh=True),
                xissue=xissue,
                retries=retries + 1,
            )
        for index_article, article_node in enumerate(article_nodes):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index_article)

            article_link_node = article_node.select_one("a")
            if article_link_node is None:
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue link is None")

            href = self.get_str_attr(article_link_node, "href")

            xissue_url = xissue.url
            if xissue_url is None:
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue url is None")
            xarticle.url = xissue_url + href

            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the article page with BeautifulSoup and return an ArticleData
        """

        # Parse the title, bibliographic metadata and abstract from the IMPAN article page
        soup = BeautifulSoup(content, "html.parser")

        title_info = soup.select_one("div.info")
        if title_info is None:
            self.logger.error(
                f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
            )
            return None
        title_node = title_info.select_one("a")
        if title_node is None:
            title_node = soup.select_one("h2.product-title")
            if title_node is None:
                self.logger.error(
                    f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
                )
                return None

        title_tex = title_node.get_text()
        xarticle.title_tex = title_tex
        xarticle.lang = self.detect_language(xarticle.title_tex)
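
        # Fields to harvest from the page's bibliographic <meta> tags (presumably the
        # Highwire "citation_*" tags) via get_metadata_using_citation_meta below.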
        what: list[CitationLiteral] = [
            "author",
            "page",
            "doi",
            "publisher",
            "keywords",
        ]

        # If download button has the "buy" class, skip adding the pdf.
        if not soup.select_one("a.button.download.noborder.buy"):
            what.append("pdf")

        self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)

        # abstract
        abstract_mml_node = soup.select_one("div.details.abstract p")
        if abstract_mml_node is None:
            self.logger.debug("Abstract not found", extra={"pid": xarticle.pid})
        else:
            abstract_tex = abstract_mml_node.get_text()

            xarticle.abstracts.append(create_abstract(value_tex=abstract_tex, lang=xarticle.lang))

        # href_attrib = soup.select_one("div.order a")

        # if href_attrib is not None:
        #     href = href_attrib.get("href")
        #     if isinstance(href, list):
        #         raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href is a list")
        #     if href is None:
        #         raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href not found")
        #     pdf_url = urljoin(self.source_website, href)
        #     add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle
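

# Usage sketch (hypothetical): the constructor and crawl entry points come from
# BaseCollectionCrawler, which is not shown here, so the argument names below
# ("collection_id", "collection_url") are assumptions rather than the confirmed signature.
#
#     crawler = ImpanCrawler(collection_id="FM", collection_url="https://www.impan.pl/en/...")
#     xissues = crawler.parse_collection_content(crawler.download_file(crawler.collection_url))
#     for xissue in xissues:
#         crawler.parse_issue_content(crawler.download_file(xissue.url), xissue)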