Coverage for src/crawler/by_source/sasa_crawler.py: 79% (149 statements)
coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
import re

from bs4 import BeautifulSoup, Tag
from lingua import Language, LanguageDetectorBuilder
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_extlink,
    create_subj,
)
from requests import Response

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class SasaCrawler(BaseCollectionCrawler):
    source_name = "eLibrary of Mathematical Institute of the Serbian Academy of Sciences and Arts"
    source_domain = "SASA"
    source_website = "http://elib.mi.sanu.ac.rs"

    language_detector = LanguageDetectorBuilder.from_languages(
        Language.ENGLISH, Language.SERBIAN
    ).build()
    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues: list[IssueData] = []

        # Extract the list of issues
        # Filter out empty table cells
        volume_nodes = [
            node for node in soup.select("td.issue_cell a.main_link") if node.text != "\xa0"
        ]
        for vol_node in volume_nodes:
            # NOTE : should we parse the year here or in the issue itself?
            href = vol_node.get("href")
            if isinstance(href, list):
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Collection href is an array."
                )
            if href is None:
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Collection href cannot be found"
                )

            # Parse Volume and Issue numbers
            url = self.source_website + "/pages/" + href
            # Formats like 44_1 / 2024 | Tom XIV / 2024 | Knj. 8 / 1960 | LXIX_1-2 / 2024
            volume_re = list(
                re.finditer(
                    r"(?P<volume>[a-zA-Z0-9 .-]+)(?:_(?P<issue>[\w-]+))? \/ (?P<year>\d+)",
                    vol_node.text,
                )
            )
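            # Illustrative splits for the pattern above, using the formats listed
            # in the comment (checked against the regex, not against the live site):
            #   "44_1 / 2024"    -> volume="44",      issue="1",  year="2024"
            #   "Tom XIV / 2024" -> volume="Tom XIV", issue=None, year="2024"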
            if len(volume_re) == 0:
                # Formats like 20(28) / 2022 | 44 (1) / 2024 | (N.S.) 115 (129) / 2024
                volume_re = list(
                    re.finditer(
                        r"(?P<volume>[\.\( \)\w]+)\((?P<issue>\d+)\) \/ (?P<year>\d+)",
                        vol_node.text,
                    )
                )
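                # Illustrative splits for the fallback pattern (same caveat as above);
                # any trailing space in the volume group is stripped further down:
                #   "20(28) / 2022"           -> volume="20",          issue="28",  year="2022"
                #   "(N.S.) 115 (129) / 2024" -> volume="(N.S.) 115 ", issue="129", year="2024"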
                if len(volume_re) == 0:
                    raise IndexError(
                        f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
                    )
            volume_metadata = volume_re[0].groupdict()

            xissues.append(
                self.create_xissue(
                    url,
                    volume_metadata["year"],
                    volume_metadata["volume"].strip(),
                    volume_metadata.get("issue", None),
                )
            )

        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:
            return xissues
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return xissues
        next_page_href = next_page_node.get("href")
        if isinstance(next_page_href, list):
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Collection next page href is an array."
            )
        if next_page_href is None:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Collection next page href cannot be found"
            )

        content = self.download_file(self.source_website + "/" + next_page_href)
        return xissues + self.parse_collection_content(content)
    def parse_issue_content(self, content, xissue: IssueData, index: int = 0):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select(".content .result")
        if xissue.pid is None:
            raise ValueError(
                f"Error in crawler : {self.source_domain} : you must set an issue PID before parsing it"
            )

        # NOTE : publishers aren't implemented yet in base_crawler, but this should work for SASA.
        # issue_publisher_node = soup.select_one(".content>table td.text_cell span.data_text")
        # if (issue_publisher_node is not None):
        #     publisher = issue_publisher_node.text
        #     xpub = create_publisherdata()
        #     xpub.name = publisher.removeprefix("Publisher ")
        #     xissue.publisher = xpub

        for i, art_node in enumerate(article_nodes):
            xarticle = self.parse_sasa_article(i + index, art_node, xissue)
            xissue.articles.append(xarticle)

        index = index + len(article_nodes)
        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:
            return
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return
        next_page_href = next_page_node.get("href")
        if isinstance(next_page_href, list):
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Issue next page href is an array."
            )
        if next_page_href is None:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Issue next page href cannot be found"
            )

        content = self.download_file(self.source_website + "/" + next_page_href)
        self.parse_issue_content(content, xissue, index)
    def parse_sasa_article(
        self, article_index: int, article_node: Tag, xissue: IssueData
    ) -> ArticleData:
        """
        Since SASA doesn't have a separate page per article, we parse the article data
        from the issue page instead.
        """
        title_node = article_node.select_one(".main_link")
        if title_node is None:
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Title not found")
        href = title_node.get("href")
        if href is None or isinstance(href, list):
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Article href not found")

        xarticle = create_articledata()

        pages_node = article_node.select_one(".pages")
        if pages_node is not None:
            self.set_pages(xarticle, pages_node.text)
        xarticle.title_tex = title_node.text
        xarticle.title_html = title_node.text
        xarticle.pid = f"{xissue.pid}_a{article_index}"

        if xissue.url is not None:
            ext_link = create_extlink(
                rel="source", location=xissue.url, metadata=self.source_domain
            )
            xarticle.ext_links.append(ext_link)
            # xarticle.url = xissue.url

        # Abstract
        abstract_node = article_node.select_one(".secondary_link")

        if abstract_node is None:
            print(f"[{self.source_domain}] {xarticle.pid} : Abstract not found")
        else:
            abstract_href = abstract_node.get("href")
            if abstract_href is None or isinstance(abstract_href, list):
                raise ValueError(
                    f"[{self.source_domain}] {xarticle.pid} : Abstract href not found"
                )

            abstract = self.fetch_sasa_abstract(
                self.source_website + "/" + abstract_href, xarticle.pid
            )
            if abstract is not None:
                xarticle.abstracts.append(abstract)
                # LANG
                xarticle.lang = abstract["lang"]

        author_node = article_node.select_one(".secondary_text")
        if author_node is not None:
            authors = re.findall(
                r"(?: and )?((?:(?<!,)(?<! and)[\w. -](?!and ))+)", author_node.text
            )
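            # The pattern above splits a byline on ", " and " and ". For a
            # hypothetical byline "J. Doe, M. Smith and A. Jones" it yields
            # ["J. Doe", "M. Smith", "A. Jones"].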
            for a in authors:
                author = create_contributor(role="author", string_name=a)
                xarticle.contributors.append(author)
        else:
            print(f"[{self.source_domain}] {xarticle.pid} : Author not found")
        secondary_nodes = article_node.select(".secondary_info_text")
        subjects = []
        keywords = []
        doi = None
        for node in secondary_nodes:
            text = node.text
            if text.startswith("Keywords"):
                keywords = text.removeprefix("Keywords:\xa0").split("; ")
                for kwd in keywords:
                    subject = create_subj(value=kwd, lang=xarticle.lang)
                    xarticle.kwds.append(subject)
            elif text.startswith("DOI"):
                doi = text.removeprefix("DOI:\xa0")
                if doi is not None:
                    xarticle.doi = doi
                    xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")
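                    # e.g. a hypothetical DOI "10.2298/ABC2401001X" yields
                    # the pid "10_2298_ABC2401001X"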
            elif text.startswith("MSC"):
                subjects = text.removeprefix("MSC:\xa0").split("; ")
                for subj in subjects:
                    subject = create_subj(value=subj, type="msc", lang=xarticle.lang)
                    xarticle.kwds.append(subject)
            elif text.startswith("Zbl:"):
                zbl_link = node.select_one(".secondary_link")
                if zbl_link is not None:
                    xarticle.extids.append(("zbl-item-id", zbl_link.text))

        if href.startswith("http"):
            pdf_url = href
        else:
            pdf_url = self.source_website + "/files/" + href

        # Fix for Filomat
        if "www.pmf.ni.ac.rs" in pdf_url:
            pdf_url = pdf_url.replace("www.pmf.ni.ac.rs", "www1.pmf.ni.ac.rs")

        add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle
    def fetch_sasa_abstract(self, abstract_url: str, pid: str):
        content = self.download_file(abstract_url)
        soup = BeautifulSoup(content, "html.parser")
        text_node = soup.select_one("p")
        if text_node is not None:
            text = text_node.text.replace("$$", "$")
            abstract = create_abstract(
                tag="abstract",
                value_tex=text,
            )
            return abstract
        print(f"[{self.source_domain}] {pid} : Abstract page exists, but text not found")

    # NOTE : SASA abstracts are encoded in windows-1250 despite the header and meta tag
    # advertising otherwise. Is it possible to handle this more elegantly?
    # example : http://elib.mi.sanu.ac.rs/files/journals/bltn/26/1e.htm
    def decode_response(self, response: Response, encoding: str = "utf-8"):
        """Force windows-1250 encoding if we cannot read the abstract"""
        try:
            return super().decode_response(response, encoding)
        except UnicodeDecodeError:
            print(
                f"[{self.source_domain}] cannot parse resource using {encoding} : {response.url}. Attempting windows-1250"
            )
            try:
                return super().decode_response(response, "windows-1250")
            except UnicodeDecodeError:
                raise BufferError(
                    f"[{self.source_domain}] cannot parse resource using windows-1250 : {response.url}. Cannot read"
                )
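
# A quick standalone check of the mis-advertised encoding mentioned in the NOTE
# above, using plain requests (a sketch, not part of the crawler API):
#
#     import requests
#
#     r = requests.get("http://elib.mi.sanu.ac.rs/files/journals/bltn/26/1e.htm")
#     print(r.encoding)                        # encoding advertised by the headers
#     print(r.content.decode("windows-1250"))  # what actually decodes correctly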