Coverage for src/crawler/by_source/sasa_crawler.py: 79%
154 statements
coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

import re

from bs4 import BeautifulSoup, Tag
from lingua import Language, LanguageDetectorBuilder
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_extlink,
    create_subj,
)
from requests import Response

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class SasaCrawler(BaseCollectionCrawler):
    source_name = "eLibrary of Mathematical Institute of the Serbian Academy of Sciences and Arts"
    source_domain = "SASA"
    source_website = "http://elib.mi.sanu.ac.rs"

    periode_end = 9999
    periode_begin = 0

    def build_language_detector(self):
        self.language_detector = LanguageDetectorBuilder.from_languages(
            Language.ENGLISH, Language.SERBIAN
        ).build()
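        # The detector above is limited to English and Serbian; it presumably backs
        # detect_language() (inherited from BaseCollectionCrawler), which is used in
        # fetch_sasa_abstract below to tag abstracts with a language.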

    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues: list[IssueData] = []

        # Extract the list of issues
        # Filter out empty table cells
        volume_nodes = [
            node for node in soup.select("td.issue_cell a.main_link") if node.text != "\xa0"
        ]
        for vol_node in volume_nodes:
            # NOTE : should we parse year here or in the issue itself ?
            href = vol_node.get("href")
            if isinstance(href, list):  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Collection href is an array."
                )
            if href is None:  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Collection href cannot be found"
                )

            # Parse Volume and Issue numbers
            url = self.source_website + "/pages/" + href
            # Formats like 44_1 / 2024 | Tom XIV / 2024 | Knj. 8 / 1960 | LXIX_1-2 / 2024
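            # For instance, the regex below splits "44_1 / 2024" into volume="44",
            # issue="1", year="2024" and "LXIX_1-2 / 2024" into volume="LXIX",
            # issue="1-2", year="2024"; "Tom XIV / 2024" leaves the issue group as None.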
            volume_re = list(
                re.finditer(
                    r"(?P<volume>[a-zA-Z0-9 .-]+)(?:_(?P<issue>[\w-]+))? \/ (?P<year>\d+)",
                    vol_node.text,
                )
            )
            if len(volume_re) == 0:  # coverage: condition never true
                # Formats like 20(28) / 2022 | 44 (1) / 2024 | (N.S.) 115 (129) / 2024
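                # For instance, the fallback regex below splits "20(28) / 2022" into
                # volume="20", issue="28", year="2022" and "(N.S.) 115 (129) / 2024" into
                # volume="(N.S.) 115 ", issue="129", year="2024" (the volume is stripped
                # by the caller).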
                volume_re = list(
                    re.finditer(
                        r"(?P<volume>[\.\( \)\w]+)\((?P<issue>\d+)\) \/ (?P<year>\d+)",
                        vol_node.text,
                    )
                )
                if len(volume_re) == 0:  # coverage: condition never true
                    raise IndexError(
                        f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
                    )
            volume_metadata = volume_re[0].groupdict()
            year = int(volume_metadata["year"])
            if self.periode_begin <= year <= self.periode_end:  # coverage: condition always true
                xissues.append(
                    self.create_xissue(
                        url,
                        volume_metadata["year"],
                        volume_metadata["volume"].strip(),
                        volume_metadata.get("issue", None),
                    )
                )

        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:  # coverage: condition never true
            return xissues
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return xissues
        next_page_href = next_page_node.get("href")
        if isinstance(next_page_href, list):  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Collection next page href is an array."
            )
        if next_page_href is None:  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Collection next page href cannot be found"
            )

        content = self.download_file(self.source_website + "/" + next_page_href)
        return xissues + self.parse_collection_content(content)

    def parse_issue_content(self, content, xissue: IssueData, index: int = 0):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select(".content .result")
        if xissue.pid is None:  # coverage: condition never true
            raise ValueError(
                f"Error in crawler : {self.source_domain} : you must set an issue PID before parsing it"
            )

        # NOTE : publishers aren't implemented yet in base_crawler, but this should work for SASA.
        # issue_publisher_node = soup.select_one(".content>table td.text_cell span.data_text")
        # if (issue_publisher_node is not None):
        #     publisher = issue_publisher_node.text
        #     xpub = create_publisherdata()
        #     xpub.name = publisher.removeprefix("Publisher ")
        #     xissue.publisher = xpub

        for i, art_node in enumerate(article_nodes):
            xarticle = self.parse_sasa_article(i + index, art_node, xissue)
            xissue.articles.append(xarticle)

        index = index + len(article_nodes)
        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:  # coverage: condition never true
            return
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return
        next_page_href = next_page_node.get("href")
        if isinstance(next_page_href, list):  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Issue next page href is an array."
            )
        if next_page_href is None:  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Issue next page href cannot be found"
            )

        content = self.download_file(self.source_website + "/" + next_page_href)
        self.parse_issue_content(content, xissue, index)

    def parse_sasa_article(
        self, article_index: int, article_node: Tag, xissue: IssueData
    ) -> ArticleData:
        """
        Since SASA doesn't have a dedicated page per article, we parse the article data
        from the issue page instead.
        """
        title_node = article_node.select_one(".main_link")
        if title_node is None:  # coverage: condition never true
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Title not found")
        href = title_node.get("href")
        if href is None or isinstance(href, list):  # coverage: condition never true
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Article href not found")

        xarticle = create_articledata()

        pages_node = article_node.select_one(".pages")
        if pages_node is not None:  # coverage: condition always true
            self.set_pages(xarticle, pages_node.text)
        xarticle.title_tex = title_node.text
        xarticle.title_html = title_node.text
        xarticle.pid = f"{xissue.pid}_a{article_index}"

        if xissue.url is not None:  # coverage: condition always true
            ext_link = create_extlink(
                rel="source", location=xissue.url, metadata=self.source_domain
            )
            xarticle.ext_links.append(ext_link)
            # xarticle.url = xissue.url

        # Abstract
        abstract_node = article_node.select_one(".secondary_link")

        if abstract_node is None:  # coverage: condition never true
            print(f"[{self.source_domain}] {xarticle.pid} : Abstract not found")
        else:
            abstract_href = abstract_node.get("href")
            if abstract_href is None or isinstance(abstract_href, list):  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {xarticle.pid} : Abstract href not found"
                )

            abstract = self.fetch_sasa_abstract(
                self.source_website + "/" + abstract_href, xarticle.pid
            )
            if abstract is not None:  # coverage: condition always true
                xarticle.abstracts.append(abstract)
                # LANG
                xarticle.lang = abstract["lang"]

        author_node = article_node.select_one(".secondary_text")
        if author_node is not None:  # coverage: condition always true
            authors = re.findall(
                r"(?: and )?((?:(?<!,)(?<! and)[\w. -](?!and ))+)", author_node.text
            )
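            # Splits a comma/"and"-separated author string; for a hypothetical
            # "John Smith, Jane Doe and Bob Lee" this yields
            # ["John Smith", "Jane Doe", "Bob Lee"].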
            for a in authors:
                author = create_contributor(role="author", string_name=a)
                xarticle.contributors.append(author)
        else:
            print(f"[{self.source_domain}] {xarticle.pid} : Author not found")

        secondary_nodes = article_node.select(".secondary_info_text")
        subjects = []
        keywords = []
        doi = None
        for node in secondary_nodes:
            text = node.text
            if text.startswith("Keywords"):
                keywords = text.removeprefix("Keywords:\xa0").split("; ")
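                # e.g. a hypothetical "Keywords:\xa0graph theory; chromatic number"
                # becomes ["graph theory", "chromatic number"]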
                for kwd in keywords:
                    subject = create_subj(value=kwd, lang=xarticle.lang)
                    xarticle.kwds.append(subject)
            elif text.startswith("DOI"):
                doi = text.removeprefix("DOI:\xa0")
                if doi is not None:  # coverage: condition always true
                    xarticle.doi = doi
                    xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")
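                    # e.g. a hypothetical DOI "10.1234/example-doi.1" gives the
                    # pid "10_1234_example_doi_1"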
            elif text.startswith("MSC"):
                subjects = text.removeprefix("MSC:\xa0").split("; ")
                for subj in subjects:
                    subject = create_subj(value=subj, type="msc", lang=xarticle.lang)
                    xarticle.kwds.append(subject)
            elif text.startswith("Zbl:"):  # coverage: condition always true
                zbl_link = node.select_one(".secondary_link")
                if zbl_link is not None:  # coverage: condition always true
                    xarticle.extids.append(("zbl-item-id", zbl_link.text))

        if href.startswith("http"):
            pdf_url = href
        else:
            pdf_url = self.source_website + "/files/" + href

        # Fix for Filomat
        if "www.pmf.ni.ac.rs" in pdf_url:
            pdf_url = pdf_url.replace("www.pmf.ni.ac.rs", "www1.pmf.ni.ac.rs")

        add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle

    def fetch_sasa_abstract(self, abstract_url: str, pid: str):
        content = self.download_file(abstract_url)
        soup = BeautifulSoup(content, "html.parser")
        text_node = soup.select_one("p")
        if text_node is not None:  # coverage: condition always true
            text = text_node.text.replace("$$", "$")
            abstract = create_abstract(
                tag="abstract",
                value_tex=text,
                lang=self.detect_language(text),
            )
            return abstract
        print(f"[{self.source_domain}] {pid} : Abstract page exists, but text not found")

    # NOTE : SASA abstracts are encoded in windows-1250 despite the header and meta tag
    # advertising otherwise. Is it possible to handle this more elegantly ?
    # example : http://elib.mi.sanu.ac.rs/files/journals/bltn/26/1e.htm
    def decode_response(self, response: Response, encoding: str = "utf-8"):
        """Force windows-1250 encoding if we cannot read the abstract"""
        try:
            return super().decode_response(response, encoding)
        except UnicodeDecodeError:
            print(
                f"[{self.source_domain}] cannot parse resource using {encoding} : {response.url}. Attempting windows-1250"
            )
            try:
                return super().decode_response(response, "windows-1250")
            except UnicodeDecodeError:
                raise BufferError(
                    f"[{self.source_domain}] cannot parse resource using windows-1250 : {response.url}. Cannot read"
                )