Coverage for src/crawler/by_source/sasa_crawler.py: 79%
155 statements
coverage.py v7.8.0, created at 2025-04-24 10:35 +0000
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, Tag
from lingua import Language, LanguageDetectorBuilder
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_extlink,
    create_subj,
)
from requests import Response

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class SasaCrawler(BaseCollectionCrawler):
    source_name = "eLibrary of Mathematical Institute of the Serbian Academy of Sciences and Arts"
    source_domain = "SASA"
    source_website = "http://elib.mi.sanu.ac.rs"

    language_detector = LanguageDetectorBuilder.from_languages(
        Language.ENGLISH, Language.SERBIAN
    ).build()

    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues: list[IssueData] = []

        # Extract the list of issues
        # Filter out empty table cells
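        # ("\xa0" is a non-breaking space, which the site appears to use for empty cells)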
        volume_nodes = [
            node for node in soup.select("td.issue_cell a.main_link") if node.text != "\xa0"
        ]
        for vol_node in volume_nodes:
            # NOTE : should we parse year here or in the issue itself ?
            href = vol_node.get("href")
            if isinstance(href, list):  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Collection href is an array."
                )
            if href is None:  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Collection href cannot be found"
                )

            # Parse Volume and Issue numbers
            url = self.source_website + "/pages/" + href
            # Formats like 44_1 / 2024 | Tom XIV / 2024 | Knj. 8 / 1960 | LXIX_1-2 / 2024
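            # e.g. "LXIX_1-2 / 2024" should give volume="LXIX", issue="1-2", year="2024"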
            volume_re = list(
                re.finditer(
                    r"(?P<volume>[a-zA-Z0-9 .-]+)(?:_(?P<issue>[\w-]+))? \/ (?P<year>\d+)",
                    vol_node.text,
                )
            )
            if len(volume_re) == 0:  # coverage: condition never true
                # Formats like 20(28) / 2022 | 44 (1) / 2024 | (N.S.) 115 (129) / 2024 |
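                # e.g. "20(28) / 2022" should give volume="20", issue="28", year="2022"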
                volume_re = list(
                    re.finditer(
                        r"(?P<volume>[\.\( \)\w]+)\((?P<issue>\d+)\) \/ (?P<year>\d+)",
                        vol_node.text,
                    )
                )
                if len(volume_re) == 0:  # coverage: condition never true
                    raise IndexError(
                        f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
                    )
            volume_metadata = volume_re[0].groupdict()

            # HACK : temporary workaround
            # https://gricad-gitlab.univ-grenoble-alpes.fr/mathdoc/ptfs/ptf-app-crawler/-/issues/27
            if url != "http://elib.mi.sanu.ac.rs/pages/browse_issue.php?db=flmt&rbr=95":
                xissues.append(
                    self.create_xissue(
                        url,
                        volume_metadata["year"],
                        volume_metadata["volume"].strip(),
                        volume_metadata.get("issue", None),
                    )
                )

        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:  # coverage: condition never true
            return xissues
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return xissues
        next_page_href = next_page_node.get("href")
        if isinstance(next_page_href, list):  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Collection next page href is an array."
            )
        if next_page_href is None:  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Collection next page href cannot be found"
            )

        content = self.download_file(self.source_website + "/" + next_page_href)
        return xissues + self.parse_collection_content(content)

    def parse_issue_content(self, content, xissue: IssueData, index: int = 0):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select(".content .result")
        if xissue.pid is None:  # coverage: condition never true
            raise ValueError(
                f"Error in crawler : {self.source_domain} : you must set an issue PID before parsing it"
            )

        # NOTE : publishers aren't implemented yet in base_crawler, but this should work for SASA.
        # issue_publisher_node = soup.select_one(".content>table td.text_cell span.data_text")
        # if (issue_publisher_node is not None):
        # publisher = issue_publisher_node.text
        # xpub = create_publisherdata()
        # xpub.name = publisher.removeprefix("Publisher ")
        # xissue.publisher = xpub

        for i, art_node in enumerate(article_nodes):
            xarticle = self.parse_sasa_article(i + index, art_node, xissue)
            xissue.articles.append(xarticle)

        index = index + len(article_nodes)
        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:  # coverage: condition never true
            return
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return
        next_page_href = next_page_node.get("href")
        if isinstance(next_page_href, list):  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Issue next page href is an array."
            )
        if next_page_href is None:  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Issue next page href cannot be found"
            )

        content = self.download_file(self.source_website + "/" + next_page_href)
        self.parse_issue_content(content, xissue, index)

    def parse_sasa_article(
        self, article_index: int, article_node: Tag, xissue: IssueData
    ) -> ArticleData:
151 """
152 Since Sasa doesn't have pages per articles, we parse the article data from the issue page instead
153 """

        title_node = article_node.select_one(".main_link")
        if title_node is None:  # coverage: condition never true
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Title not found")
        href = title_node.get("href")
        if href is None or isinstance(href, list):  # coverage: condition never true
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Article href not found")

        xarticle = create_articledata()

        pages_node = article_node.select_one(".pages")
        if pages_node is not None:  # coverage: condition always true
            self.set_pages(xarticle, pages_node.text)
        xarticle.title_tex = title_node.text
        xarticle.title_html = title_node.text
        xarticle.pid = f"{xissue.pid}_a{article_index}"

        if xissue.url is not None:  # coverage: condition always true
            ext_link = create_extlink(
                rel="source", location=xissue.url, metadata=self.source_domain
            )
            xarticle.ext_links.append(ext_link)
            # xarticle.url = xissue.url

        # Abstract
        abstract_node = article_node.select_one(".secondary_link:-soup-contains-own('Abstract')")

        if abstract_node is None:  # coverage: condition never true
            print(f"[{self.source_domain}] {xarticle.pid} : Abstract not found")
        else:
            abstract_href = abstract_node.get("href")
            if abstract_href is None or isinstance(abstract_href, list):  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {xarticle.pid} : Abstract href not found"
                )

            abstract = self.fetch_sasa_abstract(
                urljoin(self.source_website, abstract_href), xarticle.pid
            )
            if abstract is not None:  # coverage: condition always true
                xarticle.abstracts.append(abstract)
                # LANG
                xarticle.lang = abstract["lang"]

        author_node = article_node.select_one(".secondary_text")
        if author_node is not None:  # coverage: condition always true
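            # e.g. "John Doe, Jane Doe and Bob Smith" should give ["John Doe", "Jane Doe", "Bob Smith"] (illustrative input)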
            authors = re.findall(
                r"(?: and )?((?:(?<!,)(?<! and)[\w. -](?!and ))+)", author_node.text
            )
            for a in authors:
                author = create_contributor(role="author", string_name=a)
                xarticle.contributors.append(author)
        else:
            print(f"[{self.source_domain}] {xarticle.pid} : Author not found")

        secondary_nodes = article_node.select(".secondary_info_text")
        subjects = []
        keywords = []
        doi = None
        for node in secondary_nodes:
            text = node.text
            if text.startswith("Keywords"):
                keywords = text.removeprefix("Keywords:\xa0").split("; ")
                for kwd in keywords:
                    subject = create_subj(value=kwd, lang=xarticle.lang)
                    xarticle.kwds.append(subject)
            elif text.startswith("DOI") and self.collection_id != "YJOR":
                doi = text.removeprefix("DOI:\xa0")
                if doi is not None:  # coverage: condition always true
                    xarticle.doi = doi
                    xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")
            elif text.startswith("MSC"):
                subjects = text.removeprefix("MSC:\xa0").split("; ")
                for subj in subjects:
                    subject = create_subj(value=subj, type="msc", lang=xarticle.lang)
                    xarticle.kwds.append(subject)
            elif text.startswith("Zbl:"):  # coverage: condition always true
                zbl_link = node.select_one(".secondary_link")
                if zbl_link is not None:  # coverage: condition always true
                    xarticle.extids.append(("zbl-item-id", zbl_link.text))

        if href.startswith("http"):
            pdf_url = href
        else:
            pdf_url = self.source_website + "/files/" + href

        # Fix for Filomat
        if "www.pmf.ni.ac.rs" in pdf_url:
            pdf_url = pdf_url.replace("www.pmf.ni.ac.rs", "www1.pmf.ni.ac.rs")

        add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle

    def fetch_sasa_abstract(self, abstract_url: str, pid: str):
        try:
            content = self.download_file(abstract_url)
        except requests.exceptions.HTTPError:
            return None  # without this, `content` below would be unbound
        soup = BeautifulSoup(content, "html.parser")
        text_node = soup.select_one("p")
        if text_node is not None:  # coverage: condition always true
            text = text_node.text.replace("$$", "$")
            abstract = create_abstract(
                tag="abstract",
                value_tex=text,
            )
            return abstract
        print(f"[{self.source_domain}] {pid} : Abstract page exists, but text not found")

    # NOTE : SASA abstracts are encoded in windows-1250 despite the header and meta tag advertising otherwise. Is it possible to handle this more elegantly?
    # example : http://elib.mi.sanu.ac.rs/files/journals/bltn/26/1e.htm
    def decode_response(self, response: Response, encoding: str = "utf-8"):
        """Force windows-1250 encoding if we cannot read the abstract"""
        try:
            return super().decode_response(response, encoding)
        except UnicodeDecodeError:
            print(
                f"[{self.source_domain}] cannot parse resource using {encoding} : {response.url}. Attempting windows-1250"
            )
            try:
                return super().decode_response(response, "windows-1250")
            except UnicodeDecodeError:
                raise BufferError(
                    f"[{self.source_domain}] cannot parse resource using windows-1250 : {response.url}. Cannot read"
                )