Coverage for src/crawler/by_source/sasa_crawler.py: 83%

141 statements  

coverage.py v7.12.0, created at 2026-02-02 15:55 +0000

import re
from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from lingua import Language, LanguageDetectorBuilder
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_extlink,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

class SasaCrawler(BaseCollectionCrawler):
    source_name = "eLibrary of Mathematical Institute of the Serbian Academy of Sciences and Arts"
    source_domain = "SASA"
    source_website = "http://elib.mi.sanu.ac.rs"

    _language_detector_builder = LanguageDetectorBuilder.from_languages(
        Language.ENGLISH, Language.SERBIAN
    )

    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues: list[IssueData] = []

        # Extract the list of issues
        # Filter out empty table cells
        volume_nodes = [
            node for node in soup.select("td.issue_cell a.main_link") if node.text != "\xa0"
        ]
        for vol_node in volume_nodes:
            # NOTE: should we parse the year here or in the issue itself?

            href = self.get_str_attr(vol_node, "href")

            # Parse Volume and Issue numbers
            url = self.source_website + "/pages/" + href
            # Formats like 44_1 / 2024 | Tom XIV / 2024 | Knj. 8 / 1960 | LXIX_1-2 / 2024
            # (see the regex walkthrough after this listing)
            volume_re = list(
                re.finditer(
                    r"(?P<volume>[a-zA-Z0-9 .-]+)(?:_(?P<issue>[\w-]+))? \/ (?P<year>\d+)",
                    vol_node.text,
                )
            )
            if len(volume_re) == 0:  # coverage: condition never true
                # Formats like 20(28) / 2022 | 44 (1) / 2024 | (N.S.) 115 (129) / 2024
                volume_re = list(
                    re.finditer(
                        r"(?P<volume>[\.\( \)\w]+)\((?P<issue>\d+)\) \/ (?P<year>\d+)",
                        vol_node.text,
                    )
                )
                if len(volume_re) == 0:  # coverage: condition never true
                    raise IndexError(
                        f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
                    )
            volume_metadata = volume_re[0].groupdict()

            # HACK: temporary workaround
            # https://gricad-gitlab.univ-grenoble-alpes.fr/mathdoc/ptfs/ptf-app-crawler/-/issues/27
            if url != "http://elib.mi.sanu.ac.rs/pages/browse_issue.php?db=flmt&rbr=95":
                xissues.append(
                    self.create_xissue(
                        url,
                        volume_metadata["year"],
                        volume_metadata["volume"].strip(),
                        volume_metadata.get("issue", None),
                    )
                )

        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:  # coverage: condition never true
            return xissues
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return xissues

        next_page_href = self.get_str_attr(next_page_node, "href")

        content = self.download_file(self.source_website + "/" + next_page_href)
        return xissues + self.parse_collection_content(content)

    def parse_issue_content(self, content, xissue: IssueData, index: int = 0):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select(".content .result")
        if xissue.pid is None:  # coverage: condition never true
            raise ValueError(
                f"Error in crawler : {self.source_domain} : you must set an issue PID before parsing it"
            )

        # NOTE: publishers aren't implemented yet in base_crawler, but this should work for SASA.
        # issue_publisher_node = soup.select_one(".content>table td.text_cell span.data_text")
        # if (issue_publisher_node is not None):
        #     publisher = issue_publisher_node.text
        #     xpub = create_publisherdata()
        #     xpub.name = publisher.removeprefix("Publisher ")
        #     xissue.publisher = xpub

        for i, art_node in enumerate(article_nodes):
            xarticle = self.parse_sasa_article(i + index, art_node, xissue)
            xissue.articles.append(xarticle)

        index = index + len(article_nodes)
        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:  # coverage: condition never true
            return
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return

        next_page_href = self.get_str_attr(next_page_node, "href")

        content = self.download_file(self.source_website + "/" + next_page_href)
        self.parse_issue_content(content, xissue, index)

    def parse_sasa_article(
        self, article_index: int, article_node: Tag, xissue: IssueData
    ) -> ArticleData:
        """
        Since SASA doesn't have a page per article, we parse the article data from the issue page instead.
        """

        title_node = article_node.select_one(".main_link")

        if title_node is None:  # coverage: condition never true
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Title not found")
        href = self.get_str_attr(title_node, "href")

        xarticle = create_articledata()

        pages_node = article_node.select_one(".pages")
        if pages_node is not None:  # coverage: condition always true
            self.set_pages(xarticle, pages_node.text)
        xarticle.title_tex = title_node.text
        xarticle.title_html = title_node.text
        xarticle.pid = f"{xissue.pid}_a{article_index}"

        if xissue.url is not None:  # coverage: condition always true
            ext_link = create_extlink(
                rel="source", location=xissue.url, metadata=self.source_domain
            )
            xarticle.ext_links.append(ext_link)
            # xarticle.url = xissue.url

        # Abstract
        abstract_node = article_node.select_one(".secondary_link:-soup-contains-own('Abstract')")

        if abstract_node is None:  # coverage: condition never true
            self.logger.debug("Abstract not found", extra={"pid": xarticle.pid})
        else:
            abstract_href = abstract_node.get("href")
            if abstract_href is None or isinstance(abstract_href, list):  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {xarticle.pid} : Abstract href not found"
                )

            abstract = self.fetch_sasa_abstract(
                urljoin(self.source_website, abstract_href), xarticle.pid
            )
            if abstract is not None:  # coverage: condition always true
                xarticle.abstracts.append(abstract)
                # LANG
                xarticle.lang = abstract["lang"]

        author_node = article_node.select_one(".secondary_text")
        if author_node is not None:  # coverage: condition always true
            authors = re.findall(
                r"(?: and )?((?:(?<!,)(?<! and)[\w. -](?!and ))+)", author_node.text
            )
            for a in authors:
                author = create_contributor(role="author", string_name=a)
                xarticle.contributors.append(author)
        else:
            self.logger.debug("Author not found", extra={"pid": xarticle.pid})

        secondary_nodes = article_node.select(".secondary_info_text")
        subjects = []
        keywords = []
        doi = None
        for node in secondary_nodes:
            text = node.text
            if text.startswith("Keywords"):
                keywords = text.removeprefix("Keywords:\xa0").split("; ")
                for kwd in keywords:
                    subject = create_subj(value=kwd, lang=xarticle.lang)
                    xarticle.kwds.append(subject)
            elif text.startswith("DOI") and self.collection_id != "YJOR":
                doi = regex.sub(r"DOI:?\s", "", text)
                if doi is not None:  # coverage: condition always true
                    doi = cleanup_str(escape(doi))
                    # Fix for badly formatted SASA DOIs (see the example after this listing)
                    # http://elib.mi.sanu.ac.rs/pages/browse_issue.php?db=kjm&rbr=
                    if regex.match(r"(?P<doi>10[0-9]{4,}.+)", doi):  # coverage: condition never true
                        doi = doi[:2] + "." + doi[2:]
                    xarticle.doi = doi
                    xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")
            elif text.startswith("MSC"):
                subjects = text.removeprefix("MSC:\xa0").split("; ")
                for subj in subjects:
                    subject = create_subj(value=subj, type="msc", lang=xarticle.lang)
                    xarticle.kwds.append(subject)
            elif text.startswith("Zbl:"):  # coverage: condition always true
                zbl_link = node.select_one(".secondary_link")
                if zbl_link is not None:  # coverage: condition always true
                    xarticle.extids.append(("zbl-item-id", zbl_link.text))

        if href.startswith("http"):
            pdf_url = href
        else:
            pdf_url = self.source_website + "/files/" + href

        # Fix for Filomat
        if "www.pmf.ni.ac.rs" in pdf_url:
            pdf_url = pdf_url.replace("www.pmf.ni.ac.rs", "www1.pmf.ni.ac.rs")

        add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle

    def fetch_sasa_abstract(self, abstract_url: str, pid: str):
        content = self.download_file(abstract_url)
        soup = BeautifulSoup(content, "html.parser")
        text_node = soup.select_one("p")
        if text_node is not None:  # coverage: condition always true
            text = text_node.text.replace("$$", "$")
            abstract = create_abstract(
                value_tex=text,
            )
            return abstract
        self.logger.debug("Abstract page exists, but text not found", extra={"pid": pid})

    def decode_response(self, response, encoding=None):
        """Attempt to decode the content here before falling back to the parent implementation.

        SASA abstracts are encoded in windows-1250 despite the header and meta tag advertising
        otherwise (see the decoding sketch after this listing).
        Example: http://elib.mi.sanu.ac.rs/files/journals/bltn/26/1e.htm
        """
        # Attempt to get the encoding from the HTML meta charset tag
        soup = BeautifulSoup(response.text, "html5lib")
        charset = soup.select_one("meta[charset]")
        if charset:  # coverage: condition never true
            htmlencoding = charset.get("charset")
            if isinstance(htmlencoding, str):
                response.encoding = htmlencoding
                return response.text

        return super().decode_response(response, encoding)
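
Worked example: volume-label parsing. The two patterns in parse_collection_content are easiest to follow on the labels quoted in the comments. This is a standalone sketch using only the standard library; the patterns are copied verbatim from the method, the labels come from the comments above, and the fallback pattern is only tried when the primary one finds nothing.

import re

PRIMARY = r"(?P<volume>[a-zA-Z0-9 .-]+)(?:_(?P<issue>[\w-]+))? \/ (?P<year>\d+)"
FALLBACK = r"(?P<volume>[\.\( \)\w]+)\((?P<issue>\d+)\) \/ (?P<year>\d+)"

for label in ["44_1 / 2024", "Tom XIV / 2024", "LXIX_1-2 / 2024", "20(28) / 2022"]:
    # Try the primary pattern first; fall back to the parenthesised form if it finds nothing.
    matches = list(re.finditer(PRIMARY, label)) or list(re.finditer(FALLBACK, label))
    print(label, "->", matches[0].groupdict())

# Expected groupdicts:
#   44_1 / 2024      -> {'volume': '44', 'issue': '1', 'year': '2024'}
#   Tom XIV / 2024   -> {'volume': 'Tom XIV', 'issue': None, 'year': '2024'}
#   LXIX_1-2 / 2024  -> {'volume': 'LXIX', 'issue': '1-2', 'year': '2024'}
#   20(28) / 2022    -> {'volume': '20', 'issue': '28', 'year': '2022'}   (via the fallback)

As in the crawler, the volume is stripped of surrounding whitespace and a missing issue group simply comes back as None.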
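Worked example: the DOI fix-up in parse_sasa_article inserts a dot when a DOI arrives without one after the "10" prefix. A minimal illustration of that branch; the DOI string below is made up for the example, not taken from the site.

import regex

doi = "105937/SPSUNP_EXAMPLE"  # hypothetical badly formatted DOI, missing the dot after "10"
if regex.match(r"(?P<doi>10[0-9]{4,}.+)", doi):
    doi = doi[:2] + "." + doi[2:]
print(doi)  # 10.5937/SPSUNP_EXAMPLE

# A well-formed DOI such as "10.5937/..." is left untouched: the dot interrupts the digit run,
# so the pattern does not match and no extra dot is inserted.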
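Worked example: why decode_response forces the charset. The abstract pages are served as windows-1250 while the header and meta tag claim otherwise, so a naive decode silently garbles Serbian Latin letters. The sample string below is ours, not taken from the site.

# Serbian Latin letters, encoded the way the SASA abstract pages actually are.
raw = "čćšđž".encode("windows-1250")

print(raw.decode("latin-1"))       # wrong codec: no error, but the letters come out garbled
print(raw.decode("windows-1250"))  # correct codec: round-trips "čćšđž"

In the crawler the same effect is achieved by setting response.encoding before reading response.text, assuming a requests-style response object, which is what the attribute access in decode_response suggests.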
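Worked example: language detection. The class-level _language_detector_builder restricts lingua's detector to English and Serbian, presumably so the base crawler can tag abstracts with a language. A minimal sketch of that lingua API, with sample sentences of our own; the crawler itself only stores the builder and leaves building to the base class.

from lingua import Language, LanguageDetectorBuilder

detector = LanguageDetectorBuilder.from_languages(Language.ENGLISH, Language.SERBIAN).build()

# detect_language_of returns a Language member, or None if the detector cannot decide.
print(detector.detect_language_of("We prove a fixed point theorem for nonexpansive mappings."))  # expected: Language.ENGLISH
print(detector.detect_language_of("Доказујемо теорему о непокретној тачки."))                    # expected: Language.SERBIAN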