Coverage for src/crawler/by_source/sasa_crawler.py: 83%
141 statements
coverage.py v7.12.0, created at 2026-02-02 15:55 +0000
import re
from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from lingua import Language, LanguageDetectorBuilder
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_extlink,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class SasaCrawler(BaseCollectionCrawler):
    source_name = "eLibrary of Mathematical Institute of the Serbian Academy of Sciences and Arts"
    source_domain = "SASA"
    source_website = "http://elib.mi.sanu.ac.rs"

    _language_detector_builder = LanguageDetectorBuilder.from_languages(
        Language.ENGLISH, Language.SERBIAN
    )

    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues: list[IssueData] = []

        # Extract the list of issues
        # Filter out empty table cells
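        # (an "empty" cell contains only a non-breaking space, "\xa0")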
        volume_nodes = [
            node for node in soup.select("td.issue_cell a.main_link") if node.text != "\xa0"
        ]
        for vol_node in volume_nodes:
            # NOTE : should we parse year here or in the issue itself ?

            href = self.get_str_attr(vol_node, "href")

            # Parse Volume and Issue numbers
            url = self.source_website + "/pages/" + href
            # Formats like 44_1 / 2024 | Tom XIV / 2024 | Knj. 8 / 1960 | LXIX_1-2 / 2024
            volume_re = list(
                re.finditer(
                    r"(?P<volume>[a-zA-Z0-9 .-]+)(?:_(?P<issue>[\w-]+))? \/ (?P<year>\d+)",
                    vol_node.text,
                )
            )
            if len(volume_re) == 0:  # coverage: partial branch, condition never true
                # Formats like 20(28) / 2022 | 44 (1) / 2024 | (N.S.) 115 (129) / 2024 |
                volume_re = list(
                    re.finditer(
                        r"(?P<volume>[\.\( \)\w]+)\((?P<issue>\d+)\) \/ (?P<year>\d+)",
                        vol_node.text,
                    )
                )
            if len(volume_re) == 0:  # coverage: partial branch, condition never true
                raise IndexError(
                    f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
                )
            volume_metadata = volume_re[0].groupdict()
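            # Illustrative examples, using the label formats quoted in the comments above:
            # the primary pattern splits "44_1 / 2024" into volume "44", issue "1",
            # year "2024" and "Tom XIV / 2024" into volume "Tom XIV" with no issue;
            # the fallback pattern reads "20(28) / 2022" as volume "20", issue "28",
            # year "2022".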

            # HACK : temporary workaround
            # https://gricad-gitlab.univ-grenoble-alpes.fr/mathdoc/ptfs/ptf-app-crawler/-/issues/27
            if url != "http://elib.mi.sanu.ac.rs/pages/browse_issue.php?db=flmt&rbr=95":
                xissues.append(
                    self.create_xissue(
                        url,
                        volume_metadata["year"],
                        volume_metadata["volume"].strip(),
                        volume_metadata.get("issue", None),
                    )
                )

        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:  # coverage: partial branch, condition never true
            return xissues
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return xissues

        next_page_href = self.get_str_attr(next_page_node, "href")

        content = self.download_file(self.source_website + "/" + next_page_href)
        return xissues + self.parse_collection_content(content)

    def parse_issue_content(self, content, xissue: IssueData, index: int = 0):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select(".content .result")
        if xissue.pid is None:  # coverage: partial branch, condition never true
            raise ValueError(
                f"Error in crawler : {self.source_domain} : you must set an issue PID before parsing it"
            )

        # NOTE : publishers aren't implemented yet in base_crawler, but this should work for SASA.
        # issue_publisher_node = soup.select_one(".content>table td.text_cell span.data_text")
        # if (issue_publisher_node is not None):
        #     publisher = issue_publisher_node.text
        #     xpub = create_publisherdata()
        #     xpub.name = publisher.removeprefix("Publisher ")
        #     xissue.publisher = xpub

        for i, art_node in enumerate(article_nodes):
            xarticle = self.parse_sasa_article(i + index, art_node, xissue)
            xissue.articles.append(xarticle)
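
        # Carry the running article index over to the next page so article PIDs stay
        # sequential across paginated issue pages.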
        index = index + len(article_nodes)
        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:  # coverage: partial branch, condition never true
            return
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return

        next_page_href = self.get_str_attr(next_page_node, "href")

        content = self.download_file(self.source_website + "/" + next_page_href)
        self.parse_issue_content(content, xissue, index)

    def parse_sasa_article(
        self, article_index: int, article_node: Tag, xissue: IssueData
    ) -> ArticleData:
        """
        Since SASA doesn't have a separate page per article, we parse the article data
        from the issue page instead.
        """

        title_node = article_node.select_one(".main_link")

        if title_node is None:  # coverage: partial branch, condition never true
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Title not found")
        href = self.get_str_attr(title_node, "href")

        xarticle = create_articledata()

        pages_node = article_node.select_one(".pages")
        if pages_node is not None:  # coverage: partial branch, condition always true
            self.set_pages(xarticle, pages_node.text)
        xarticle.title_tex = title_node.text
        xarticle.title_html = title_node.text
        xarticle.pid = f"{xissue.pid}_a{article_index}"

        if xissue.url is not None:  # coverage: partial branch, condition always true
            ext_link = create_extlink(
                rel="source", location=xissue.url, metadata=self.source_domain
            )
            xarticle.ext_links.append(ext_link)
            # xarticle.url = xissue.url

        # Abstract
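        # ":-soup-contains-own" is a soupsieve pseudo-class matching elements whose own
        # text contains the given string, here the secondary link labelled "Abstract".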
        abstract_node = article_node.select_one(".secondary_link:-soup-contains-own('Abstract')")

        if abstract_node is None:  # coverage: partial branch, condition never true
            self.logger.debug("Abstract not found", extra={"pid": xarticle.pid})
        else:
            abstract_href = abstract_node.get("href")
            if abstract_href is None or isinstance(abstract_href, list):  # coverage: partial branch, condition never true
                raise ValueError(
                    f"[{self.source_domain}] {xarticle.pid} : Abstract href not found"
                )

            abstract = self.fetch_sasa_abstract(
                urljoin(self.source_website, abstract_href), xarticle.pid
            )
            if abstract is not None:  # coverage: partial branch, condition always true
                xarticle.abstracts.append(abstract)
                # LANG
                xarticle.lang = abstract["lang"]

        author_node = article_node.select_one(".secondary_text")
        if author_node is not None:  # coverage: partial branch, condition always true
            authors = re.findall(
                r"(?: and )?((?:(?<!,)(?<! and)[\w. -](?!and ))+)", author_node.text
            )
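            # Illustrative behaviour of the pattern above on a hypothetical byline:
            # "A. Author, B. Author and C. Author" yields
            # ["A. Author", "B. Author", "C. Author"].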
            for a in authors:
                author = create_contributor(role="author", string_name=a)
                xarticle.contributors.append(author)
        else:
            self.logger.debug("Author not found", extra={"pid": xarticle.pid})

        secondary_nodes = article_node.select(".secondary_info_text")
        subjects = []
        keywords = []
        doi = None
        for node in secondary_nodes:
            text = node.text
            if text.startswith("Keywords"):
                keywords = text.removeprefix("Keywords:\xa0").split("; ")
                for kwd in keywords:
                    subject = create_subj(value=kwd, lang=xarticle.lang)
                    xarticle.kwds.append(subject)
            elif text.startswith("DOI") and self.collection_id != "YJOR":
                doi = regex.sub(r"DOI:?\s", "", text)
                if doi is not None:  # coverage: partial branch, condition always true
                    doi = cleanup_str(escape(doi))
                    # Fix for badly formatted SASA DOIs
                    # http://elib.mi.sanu.ac.rs/pages/browse_issue.php?db=kjm&rbr=
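                    # e.g. a value scraped as "102298/FIL1234567X" (hypothetical suffix)
                    # would become "10.2298/FIL1234567X" via the dot insertion below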
                    if regex.match(r"(?P<doi>10[0-9]{4,}.+)", doi):  # coverage: partial branch, condition never true
                        doi = doi[:2] + "." + doi[2:]
                    xarticle.doi = doi
                    xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")
            elif text.startswith("MSC"):
                subjects = text.removeprefix("MSC:\xa0").split("; ")
                for subj in subjects:
                    subject = create_subj(value=subj, type="msc", lang=xarticle.lang)
                    xarticle.kwds.append(subject)
            elif text.startswith("Zbl:"):  # coverage: partial branch, condition always true
                zbl_link = node.select_one(".secondary_link")
                if zbl_link is not None:  # coverage: partial branch, condition always true
                    xarticle.extids.append(("zbl-item-id", zbl_link.text))

        if href.startswith("http"):
            pdf_url = href
        else:
            pdf_url = self.source_website + "/files/" + href

        # Fix for Filomat
        if "www.pmf.ni.ac.rs" in pdf_url:
            pdf_url = pdf_url.replace("www.pmf.ni.ac.rs", "www1.pmf.ni.ac.rs")

        add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle

    def fetch_sasa_abstract(self, abstract_url: str, pid: str):
        content = self.download_file(abstract_url)
        soup = BeautifulSoup(content, "html.parser")
        text_node = soup.select_one("p")
        if text_node is not None:  # coverage: partial branch, condition always true
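            # "$$" is collapsed to "$", presumably to normalise doubled TeX delimiters
            # in the scraped abstract before storing it as value_tex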
            text = text_node.text.replace("$$", "$")
            abstract = create_abstract(
                value_tex=text,
            )
            return abstract
        self.logger.debug("Abstract page exists, but text not found", extra={"pid": pid})

    def decode_response(self, response, encoding=None):
        """Attempt to decode the content ourselves before deferring to the default implementation.

        SASA abstracts are encoded in windows-1250 despite the header and meta tag advertising otherwise.
        Example: http://elib.mi.sanu.ac.rs/files/journals/bltn/26/1e.htm
        """
        # Attempt to get encoding using HTML meta charset tag
        soup = BeautifulSoup(response.text, "html5lib")
        charset = soup.select_one("meta[charset]")
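        # Illustrative, with hypothetical markup: for a page containing
        # '<meta charset="windows-1250">', select_one("meta[charset]") returns that tag,
        # .get("charset") yields "windows-1250", and (assuming a requests-style Response)
        # setting response.encoding makes the response.text below re-decode with it.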
        if charset:  # coverage: partial branch, condition never true
            htmlencoding = charset.get("charset")
            if isinstance(htmlencoding, str):
                response.encoding = htmlencoding
                return response.text

        return super().decode_response(response, encoding)