Coverage for src/crawler/by_source/sasa_crawler.py: 81%
172 statements
coverage.py v7.6.4, created at 2024-11-20 09:03 +0000
import re

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    AbstractDict,
    ArticleData,
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
    create_issuedata,
    create_subj,
)
from requests import Response

from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle


class SasaCrawler(BaseCollectionCrawler):
    source_name = "eLibrary of Mathematical Institute of the Serbian Academy of Sciences and Arts"
    source_domain = "SASA"
    source_website = "http://elib.mi.sanu.ac.rs"

    periode_end = float("inf")
    periode_begin = 0

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.source = self.get_or_create_source()

        self.periode = self.get_or_create_periode()

    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues: list[IssueData] = []

        # Extract the list of issues
        # Filter out empty table cells
        volume_nodes = [
            node for node in soup.select("td.issue_cell a.main_link") if node.text != "\xa0"
        ]
        for vol_node in volume_nodes:
            # NOTE: should we parse the year here or in the issue itself?
            href = vol_node.get("href")
            if isinstance(href, list):  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Collection href is an array."
                )
            if href is None:  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Collection href cannot be found"
                )

            # Parse volume and issue numbers
            url = self.source_website + "/pages/" + href
            # Formats like 44_1 / 2024 | Tom XIV / 2024 | Knj. 8 / 1960 | LXIX_1-2 / 2024
            volume_re = list(
                re.finditer(
                    r"(?P<volume>[a-zA-Z0-9 .-]+)(?:_(?P<issue>[\w-]+))? \/ (?P<year>\d+)",
                    vol_node.text,
                )
            )
            if len(volume_re) == 0:
                # Formats like 20(28) / 2022 | 44 (1) / 2024 | (N.S.) 115 (129) / 2024
                volume_re = list(
                    re.finditer(
                        r"(?P<volume>[\.\( \)\w]+)\((?P<issue>\d+)\) \/ (?P<year>\d+)",
                        vol_node.text,
                    )
                )
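            # Illustrative only, not taken from the source: assuming the two regexes above
            # behave as written, the sample formats would parse roughly as
            #   "44_1 / 2024"             -> volume="44", issue="1", year="2024"
            #   "LXIX_1-2 / 2024"         -> volume="LXIX", issue="1-2", year="2024"
            #   "Tom XIV / 2024"          -> volume="Tom XIV", issue=None, year="2024"
            #   "20(28) / 2022"           -> volume="20", issue="28", year="2022" (second regex)
            #   "(N.S.) 115 (129) / 2024" -> volume="(N.S.) 115", issue="129", year="2024" (second regex)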
            if len(volume_re) == 0:  # coverage: condition never true
                raise IndexError(
                    f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
                )
            volume_metadata = volume_re[0].groupdict()
            year = int(volume_metadata["year"])
            if self.periode_begin <= year <= self.periode_end:  # coverage: condition always true
                xissues.append(
                    self.create_xissue(
                        url,
                        volume_metadata["year"],
                        volume_metadata["volume"].strip(),
                        volume_metadata.get("issue", None),
                    )
                )

        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:  # coverage: condition never true
            return xissues
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return xissues
        next_page_href = next_page_node.get("href")
        if isinstance(next_page_href, list):  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Collection next page href is an array."
            )
        if next_page_href is None:  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Collection next page href cannot be found"
            )

        content = self.get_page_content(self.source_website + "/" + next_page_href)
        return xissues + self.parse_collection_content(content)

    def create_xissue(self, url: str, year: str, volume_number: str, issue_number="1"):
        if url.endswith("/"):  # coverage: condition never true
            url = url[:-1]
        xissue = create_issuedata()
        xissue.url = url

        # Replace runs of characters other than ASCII letters, digits and hyphens
        # with an underscore
        xissue.pid = re.sub(
            r"[^a-zA-Z0-9-]+", "_", f"{self.collection_id}_{year}__{volume_number}"
        )
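        # Illustrative only (the collection_id "PUB" is hypothetical): for year "2024" and
        # volume "(N.S.) 115", this yields the pid "PUB_2024_N_S_115"; the issue number,
        # if any, is appended below.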
        xissue.volume = volume_number

        xissue.year = year
        if issue_number is not None:
            xissue.pid += f"_{issue_number}"
            xissue.number = issue_number.replace(",", "-")
        return xissue

    def parse_issue_content(self, content, xissue: IssueData, index: int = 0):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select(".content .result")
        if xissue.pid is None:  # coverage: condition never true
            raise ValueError(
                f"Error in crawler : {self.source_domain} : you must set an issue PID before parsing it"
            )

        # NOTE: publishers aren't implemented yet in base_crawler, but this should work for SASA.
        # issue_publisher_node = soup.select_one(".content>table td.text_cell span.data_text")
        # if (issue_publisher_node is not None):
        #     publisher = issue_publisher_node.text
        #     xpub = create_publisherdata()
        #     xpub.name = publisher.removeprefix("Publisher ")
        #     xissue.publisher = xpub

        for i, art_node in enumerate(article_nodes):
            xarticle = self.parse_sasa_article(i + index, art_node, xissue)
            xissue.articles.append(xarticle)

        index = index + len(article_nodes)
        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:  # coverage: condition never true
            return
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return
        next_page_href = next_page_node.get("href")
        if isinstance(next_page_href, list):  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Issue next page href is an array."
            )
        if next_page_href is None:  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Issue next page href cannot be found"
            )

        content = self.get_page_content(self.source_website + "/" + next_page_href)
        self.parse_issue_content(content, xissue, index)

    def parse_sasa_article(
        self, article_index: int, article_node: Tag, xissue: IssueData
    ) -> ArticleData:
        """
        Since SASA doesn't have a page per article, we parse the article data from the issue page instead.
        """
        title_node = article_node.select_one(".main_link")
        if title_node is None:  # coverage: condition never true
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Title not found")
        href = title_node.get("href")
        if href is None or isinstance(href, list):  # coverage: condition never true
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Article href not found")

        xarticle = create_articledata()

        pages_node = article_node.select_one(".pages")
        if pages_node is not None:  # coverage: condition always true
            xarticle.page_range = pages_node.text
        xarticle.title_tex = title_node.text
        xarticle.title_html = title_node.text
        xarticle.pid = f"{xissue.pid}_a{article_index}"

        if xissue.url is not None:  # coverage: condition always true
            ext_link = create_extlink(
                rel="source", location=xissue.url, metadata=self.source_domain
            )
            xarticle.ext_links.append(ext_link)
            # xarticle.url = xissue.url

        author_node = article_node.select_one(".secondary_text")
        if author_node is not None:  # coverage: condition always true
            authors = re.findall(
                r"(?: and )?((?:(?<!,)(?<! and)[\w. -](?!and ))+)", author_node.text
            )
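            # Illustrative only (the names are made up): an author line such as
            # "John Smith, Jane Doe and Bob Roe" should split into
            # ["John Smith", "Jane Doe", "Bob Roe"].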
            for a in authors:
                author = create_contributor()
                author["role"] = "author"
                author["string_name"] = a
                xarticle.contributors.append(author)
        else:
            print(f"[{self.source_domain}] {xarticle.pid} : Author not found")

        abstract_node = article_node.select_one(".secondary_link")
        if abstract_node is None:
            print(f"[{self.source_domain}] {xarticle.pid} : Abstract not found")
        else:
            abstract_href = abstract_node.get("href")
            if abstract_href is None or isinstance(abstract_href, list):  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {xarticle.pid} : Abstract href not found"
                )

            abstract = self.fetch_sasa_abstract(
                self.source_website + "/" + abstract_href, xarticle.pid
            )
            if abstract is not None:  # coverage: condition always true
                xarticle.abstracts.append(abstract)

        secondary_nodes = article_node.select(".secondary_info_text")
        subjects = []
        keywords = []
        doi = None
        for node in secondary_nodes:
            text = node.text
            if text.startswith("Keywords"):
                keywords = text.removeprefix("Keywords:\xa0").split("; ")
                for kwd in keywords:
                    subject = create_subj()
                    subject["value"] = kwd
                    subject["lang"] = "en"
                    xarticle.kwds.append(subject)
            elif text.startswith("DOI"):
                doi = text.removeprefix("DOI:\xa0")
                if doi is not None:  # coverage: condition always true
                    xarticle.doi = doi
                    xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")
            elif text.startswith("MSC"):
                subjects = text.removeprefix("MSC:\xa0").split("; ")
                for subj in subjects:
                    subject = create_subj()
                    subject["value"] = subj
                    subject["type"] = "msc"
                    subject["lang"] = "en"
                    xarticle.kwds.append(subject)
            elif text.startswith("Zbl:"):  # coverage: condition always true
                zbl_link = node.select_one(".secondary_link")
                if zbl_link is not None:  # coverage: condition always true
                    xarticle.extids.append(("zbl-item-id", zbl_link.text))
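        # Illustrative only (values are made up): a ".secondary_info_text" node reading
        # "Keywords:\xa0finite group; soluble group" would add those two keywords, and
        # "MSC:\xa005C50; 15A18" would add the corresponding MSC subjects.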

        if href.startswith("http"):
            pdf_url = href
        else:
            pdf_url = self.source_website + "/files/" + href

        # Fix for Filomat
        if "www.pmf.ni.ac.rs" in pdf_url:
            pdf_url = pdf_url.replace("www.pmf.ni.ac.rs", "www1.pmf.ni.ac.rs")

        add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle

    def fetch_sasa_abstract(self, abstract_url: str, pid: str):
        content = self.get_page_content(abstract_url)
        soup = BeautifulSoup(content, "html.parser")
        text_node = soup.select_one("p")
        if text_node is not None:  # coverage: condition always true
            text = text_node.text.replace("$$", "$")
            abstract: AbstractDict = {
                "tag": "abstract",
                "value_tex": text,
                "lang": "eng",
            }
            return abstract
        print(f"[{self.source_domain}] {pid} : Abstract page exists, but text not found")

    # NOTE: SASA abstracts are encoded in windows-1250 despite the header and meta tag
    # advertising otherwise. Is it possible to handle this more elegantly?
    # Example: http://elib.mi.sanu.ac.rs/files/journals/bltn/26/1e.htm
    def decode_response(self, response: Response, encoding: str = "utf-8"):
        """Force windows-1250 encoding if we cannot read the abstract."""
        try:
            return super().decode_response(response, encoding)
        except UnicodeDecodeError:
            print(
                f"[{self.source_domain}] cannot parse resource using {encoding} : {response.url}. Attempting windows-1250"
            )
            try:
                return super().decode_response(response, "windows-1250")
            except UnicodeDecodeError:
                raise BufferError(
                    f"[{self.source_domain}] cannot parse resource using windows-1250 : {response.url}. Cannot read"
                )