Coverage for src/crawler/by_source/impan_crawler.py: 80%
115 statements
coverage.py v7.6.4, created at 2025-01-15 14:09 +0000
  1  from urllib.parse import urljoin
  3  from bs4 import BeautifulSoup
  4  from ptf.model_data import IssueData, create_abstract, create_articledata, create_issuedata
  6  from crawler.base_crawler import BaseCollectionCrawler
  7  from crawler.types import CitationLiteral
  8  from crawler.utils import add_pdf_link_to_xarticle

 11  class ImpanCrawler(BaseCollectionCrawler):
 12      source_name = "Institute of Mathematics Polish Academy of Sciences"
 13      source_domain = "IMPAN"
 14      source_website = "https://www.impan.pl/"
 16      periode_end = 2016
 17      periode_begin = 0
 19      def parse_collection_content(self, content):
 20          """
 21          Parse the IMPAN collection page and build the list of issues.
 22          The page lists one block per year; each year block links to the issues published that year.
 23          One IssueData is created per issue link, with its year and URL.
 24          Issues are keyed by pid in a dict to skip duplicates.
 25          """
 26          if self.collection_id == "APM":  # 26 ↛ 27: condition was never true
 27              self.periode_begin = 1955
 29          if self.collection_id == "DIM":  # 29 ↛ 30: condition was never true
 30              self.periode_begin = 2000
 32          soup = BeautifulSoup(content, "html.parser")
 33          xissues_dict: dict[str, IssueData] = {}

 35          # Extract the list of issues
 36          volume_nodes = soup.select("div.year")

 38          for volume_node in volume_nodes:
 39              year = volume_node.get_text()
 40              year_int = int(year)
 41              if self.periode_begin > year_int or year_int > self.periode_end:
 42                  continue
 43              issues_nodes = volume_node.parent
 44              if issues_nodes is None:  # 44 ↛ 45: condition was never true
 45                  continue
 46              issues_nodes = issues_nodes.select("div.issues")

 48              for issue_node in issues_nodes:
 49                  issues_link_node = issue_node.select("a")
 50                  for issue_link_node in issues_link_node:
 51                      href = issue_link_node.get("href")
 52                      if href is None:  # 52 ↛ 53: condition was never true
 53                          raise ValueError(
 54                              f"[{self.source_domain}] {self.collection_id} : Collection href is None"
 55                          )
 56                      if isinstance(href, list):  # 56 ↛ 57: condition was never true
 57                          raise ValueError(
 58                              f"[{self.source_domain}] {self.collection_id} : Collection href is an array"
 59                          )
 60                      url = urljoin(self.source_website, href)

 62                      xissue = self.create_impan_xissue(url, year)
 63                      # Prevent duplicate issues
 64                      # NOTE : is this needed ?
 65                      pid = xissue.pid
 66                      if pid is None:  # 66 ↛ 67: condition was never true
 67                          continue
 68                      if pid in xissues_dict:
 69                          print(
 70                              f"[{self.source_domain}] {self.collection_id} : Duplicate issue in collection : {pid}"
 71                          )
 72                          continue
 73                      xissues_dict[pid] = xissue

 75          return list(xissues_dict.values())
 77      def create_impan_xissue(self, url: str, year: str):
 78          if url.endswith("/"):  # 78 ↛ 79: condition was never true
 79              url = url[:-1]
 80          parts = url.split("/")
 81          issue_number = parts[-1].replace(",", "-")
 82          volume_number = parts[-2]

 84          xissue = create_issuedata()
 85          if volume_number == "all":
 86              xissue.pid = f"{self.collection_id}_{year}__{issue_number}"
 87              xissue.volume = issue_number
 89          else:
 90              xissue.pid = f"{self.collection_id}_{year}__{volume_number}_{issue_number}"
 91              xissue.volume = volume_number
 93          xissue.year = year
 94          xissue.number = issue_number.replace(",", "-")
 95          xissue.url = url

 97          return xissue
 99      def parse_issue_content(self, content, xissue: IssueData):
100          soup = BeautifulSoup(content, "html.parser")
101          article_nodes = soup.select("div.info")
102          for index_article, article_node in enumerate(article_nodes):
103              xarticle = create_articledata()
104              xarticle.pid = "a" + str(index_article)

106              article_link_node = article_node.select_one("a")
107              if article_link_node is None:  # 107 ↛ 108: condition was never true
108                  raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue link is None")
109              href = article_link_node.get("href")
110              if href is None:  # 110 ↛ 111: condition was never true
111                  raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue href is None")
112              if isinstance(href, list):  # 112 ↛ 113: condition was never true
113                  raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue href is a list")
114              xissue_url = xissue.url
115              if xissue_url is None:  # 115 ↛ 116: condition was never true
116                  raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue url is None")
117              xarticle.url = xissue_url + href

119              xissue.articles.append(xarticle)
121      def parse_article_content(self, content, xissue, xarticle, url, pid):
122          """
123          Parse the article page with BeautifulSoup and return an ArticleData
124          """

126          # Extract the title, citation metadata, abstract and PDF link from the IMPAN article page
127          soup = BeautifulSoup(content, "html.parser")
129          title_info = soup.select_one("div.info")
130          if title_info is None:  # 130 ↛ 131: condition was never true
131              raise ValueError(
132                  f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
133              )
134          title_node = title_info.select_one("a")
135          if title_node is None:  # 135 ↛ 137: condition was always true
136              title_node = soup.select_one("h2.product-title")
137          if title_node is None:  # 137 ↛ 138: condition was never true
138              raise ValueError(
139                  f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
140              )

142          title_tex = title_node.get_text()
143          xarticle.title_tex = title_tex
144          xarticle.lang = self.detect_language(xarticle.title_tex)
146          what: list[CitationLiteral] = [
147              "author",
148              "pdf",
149              "page",
150              "doi",
151              "issn",
152              "publisher",
154              "keywords",
155          ]
156          self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)

158          # abstract
159          abstract_mml_node = soup.select_one("div.details.abstract p")
160          if abstract_mml_node is None:
161              print(f"[{self.source_domain}] {xarticle.pid} : Abstract not found")
162          else:
163              abstract_tex = abstract_mml_node.get_text()
164              xabstract = create_abstract(tag="abstract", value_tex=abstract_tex, lang=xarticle.lang)
165              xarticle.abstracts.append(xabstract)
167          href_attrib = soup.select_one("div.order a")

169          if href_attrib is not None:  # 169 ↛ 177: condition was always true
170              href = href_attrib.get("href")
171              if isinstance(href, list):  # 171 ↛ 172: condition was never true
172                  raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href is a list")
173              if href is None:  # 173 ↛ 174: condition was never true
174                  raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href not found")
175              pdf_url = urljoin(self.source_website, href)
176              add_pdf_link_to_xarticle(xarticle, pdf_url)
177          if xarticle.title_tex is None or xarticle.title_tex == "":  # 177 ↛ 178: condition was never true
178              print(xarticle)
179          return xarticle
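
For reference, below is a minimal standalone sketch of the pid/volume/number derivation performed by create_impan_xissue. The function name sketch_issue_fields and the example URL are hypothetical, chosen only to illustrate the path-splitting logic in the listing above; actual IMPAN issue URLs may differ.

def sketch_issue_fields(collection_id: str, url: str, year: str) -> dict:
    # Mirrors create_impan_xissue: the last two path segments are read as
    # volume and issue number; commas in the issue number become dashes.
    if url.endswith("/"):
        url = url[:-1]
    parts = url.split("/")
    issue_number = parts[-1].replace(",", "-")
    volume_number = parts[-2]
    if volume_number == "all":
        pid = f"{collection_id}_{year}__{issue_number}"
        volume = issue_number
    else:
        pid = f"{collection_id}_{year}__{volume_number}_{issue_number}"
        volume = volume_number
    return {"pid": pid, "volume": volume, "number": issue_number, "year": year, "url": url}

# Hypothetical example (URL shape assumed, not taken from the report):
# sketch_issue_fields("APM", "https://www.impan.pl/en/publishing-house/journals-and-series/annales-polonici-mathematici/all/110/1,2", "2014")
# returns {"pid": "APM_2014__110_1-2", "volume": "110", "number": "1-2", "year": "2014", ...}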