Coverage for src/crawler/by_source/sasa_crawler.py: 79%

155 statements  

coverage.py v7.8.0, created at 2025-04-24 10:35 +0000

import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, Tag
from lingua import Language, LanguageDetectorBuilder
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_extlink,
    create_subj,
)
from requests import Response

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class SasaCrawler(BaseCollectionCrawler):
    source_name = "eLibrary of Mathematical Institute of the Serbian Academy of Sciences and Arts"
    source_domain = "SASA"
    source_website = "http://elib.mi.sanu.ac.rs"

    language_detector = LanguageDetectorBuilder.from_languages(
        Language.ENGLISH, Language.SERBIAN
    ).build()

    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues: list[IssueData] = []

        # Extract the list of issues
        # Filter out empty table cells
        volume_nodes = [
            node for node in soup.select("td.issue_cell a.main_link") if node.text != "\xa0"
        ]
        for vol_node in volume_nodes:
            # NOTE : should we parse year here or in the issue itself ?
            href = vol_node.get("href")
            if isinstance(href, list):  # 43 ↛ 44: condition was never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Collection href is an array."
                )
            if href is None:  # 47 ↛ 48: condition was never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Collection href cannot be found"
                )

            # Parse Volume and Issue numbers
            url = self.source_website + "/pages/" + href
            # Formats like 44_1 / 2024 | Tom XIV / 2024 | Knj. 8 / 1960 | LXIX_1-2 / 2024
            volume_re = list(
                re.finditer(
                    r"(?P<volume>[a-zA-Z0-9 .-]+)(?:_(?P<issue>[\w-]+))? \/ (?P<year>\d+)",
                    vol_node.text,
                )
            )
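            # Illustrative captures for this pattern, derived from the formats noted above:
            # "44_1 / 2024" -> volume="44", issue="1", year="2024";
            # "Tom XIV / 2024" -> volume="Tom XIV", issue=None, year="2024".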

            if len(volume_re) == 0:  # 61 ↛ 63: condition was never true
                # Formats like 20(28) / 2022 | 44 (1) / 2024 | (N.S.) 115 (129) / 2024 |
                volume_re = list(
                    re.finditer(
                        r"(?P<volume>[\.\( \)\w]+)\((?P<issue>\d+)\) \/ (?P<year>\d+)",
                        vol_node.text,
                    )
                )
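                # Illustrative capture for the fallback pattern:
                # "20(28) / 2022" -> volume="20", issue="28", year="2022".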

            if len(volume_re) == 0:  # 69 ↛ 70: condition was never true
                raise IndexError(
                    f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
                )
            volume_metadata = volume_re[0].groupdict()

            # HACK : temporary workaround
            # https://gricad-gitlab.univ-grenoble-alpes.fr/mathdoc/ptfs/ptf-app-crawler/-/issues/27
            if url != "http://elib.mi.sanu.ac.rs/pages/browse_issue.php?db=flmt&rbr=95":
                xissues.append(
                    self.create_xissue(
                        url,
                        volume_metadata["year"],
                        volume_metadata["volume"].strip(),
                        volume_metadata.get("issue", None),
                    )
                )

        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:  # 89 ↛ 90: condition was never true
            return xissues
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return xissues
        next_page_href = next_page_node.get("href")
        if isinstance(next_page_href, list):  # 95 ↛ 96: condition was never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Collection next page href is an array."
            )
        if next_page_href is None:  # 99 ↛ 100: condition was never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Collection next page href cannot be found"
            )

        content = self.download_file(self.source_website + "/" + next_page_href)
        return xissues + self.parse_collection_content(content)

    def parse_issue_content(self, content, xissue: IssueData, index: int = 0):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select(".content .result")
        if xissue.pid is None:  # 110 ↛ 111: condition was never true
            raise ValueError(
                f"Error in crawler : {self.source_domain} : you must set an issue PID before parsing it"
            )

        # NOTE : publishers aren't implemented yet in base_crawler, but this should work for SASA.
        # issue_publisher_node = soup.select_one(".content>table td.text_cell span.data_text")
        # if (issue_publisher_node is not None):
        #     publisher = issue_publisher_node.text
        #     xpub = create_publisherdata()
        #     xpub.name = publisher.removeprefix("Publisher ")
        #     xissue.publisher = xpub

        for i, art_node in enumerate(article_nodes):
            xarticle = self.parse_sasa_article(i + index, art_node, xissue)
            xissue.articles.append(xarticle)

        index = index + len(article_nodes)
        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:  # 130 ↛ 131: condition was never true
            return
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return
        next_page_href = next_page_node.get("href")
        if isinstance(next_page_href, list):  # 136 ↛ 137: condition was never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Issue next page href is an array."
            )
        if next_page_href is None:  # 140 ↛ 141: condition was never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Issue next page href cannot be found"
            )

        content = self.download_file(self.source_website + "/" + next_page_href)
        self.parse_issue_content(content, xissue, index)

    def parse_sasa_article(
        self, article_index: int, article_node: Tag, xissue: IssueData
    ) -> ArticleData:
        """
        Since SASA doesn't have per-article pages, we parse the article data from the issue page instead.
        """

        title_node = article_node.select_one(".main_link")
        if title_node is None:  # 156 ↛ 157: condition was never true
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Title not found")
        href = title_node.get("href")
        if href is None or isinstance(href, list):  # 159 ↛ 160: condition was never true
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Article href not found")

        xarticle = create_articledata()

        pages_node = article_node.select_one(".pages")
        if pages_node is not None:  # 165 ↛ 167: condition was always true
            self.set_pages(xarticle, pages_node.text)
        xarticle.title_tex = title_node.text
        xarticle.title_html = title_node.text
        xarticle.pid = f"{xissue.pid}_a{article_index}"

        if xissue.url is not None:  # 171 ↛ 179: condition was always true
            ext_link = create_extlink(
                rel="source", location=xissue.url, metadata=self.source_domain
            )
            xarticle.ext_links.append(ext_link)
            # xarticle.url = xissue.url

        # Abstract
        abstract_node = article_node.select_one(".secondary_link:-soup-contains-own('Abstract')")

        if abstract_node is None:  # 181 ↛ 182: condition was never true
            print(f"[{self.source_domain}] {xarticle.pid} : Abstract not found")
        else:
            abstract_href = abstract_node.get("href")
            if abstract_href is None or isinstance(abstract_href, list):  # 185 ↛ 186: condition was never true
                raise ValueError(
                    f"[{self.source_domain}] {xarticle.pid} : Abstract href not found"
                )

            abstract = self.fetch_sasa_abstract(
                urljoin(self.source_website, abstract_href), xarticle.pid
            )
            if abstract is not None:  # 193 ↛ 198: condition was always true
                xarticle.abstracts.append(abstract)
                # LANG
                xarticle.lang = abstract["lang"]

        author_node = article_node.select_one(".secondary_text")
        if author_node is not None:  # 199 ↛ 207: condition was always true
            authors = re.findall(
                r"(?: and )?((?:(?<!,)(?<! and)[\w. -](?!and ))+)", author_node.text
            )
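            # Illustrative split (hypothetical names): "Jane Doe, John Smith and Ana Marić"
            # -> ["Jane Doe", "John Smith", "Ana Marić"]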

            for a in authors:
                author = create_contributor(role="author", string_name=a)
                xarticle.contributors.append(author)
        else:
            print(f"[{self.source_domain}] {xarticle.pid} : Author not found")

        secondary_nodes = article_node.select(".secondary_info_text")
        subjects = []
        keywords = []
        doi = None
        for node in secondary_nodes:
            text = node.text
            if text.startswith("Keywords"):
                keywords = text.removeprefix("Keywords:\xa0").split("; ")
                for kwd in keywords:
                    subject = create_subj(value=kwd, lang=xarticle.lang)
                    xarticle.kwds.append(subject)
            elif text.startswith("DOI") and self.collection_id != "YJOR":
                doi = text.removeprefix("DOI:\xa0")
                if doi is not None:  # 222 ↛ 213: condition was always true
                    xarticle.doi = doi
                    xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")
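                    # Illustrative: a hypothetical DOI "10.1234/abc-1.2" yields the
                    # pid "10_1234_abc_1_2".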

            elif text.startswith("MSC"):
                subjects = text.removeprefix("MSC:\xa0").split("; ")
                for subj in subjects:
                    subject = create_subj(value=subj, type="msc", lang=xarticle.lang)
                    xarticle.kwds.append(subject)
            elif text.startswith("Zbl:"):  # 230 ↛ 213: condition was always true
                zbl_link = node.select_one(".secondary_link")
                if zbl_link is not None:  # 232 ↛ 213: condition was always true
                    xarticle.extids.append(("zbl-item-id", zbl_link.text))

        if href.startswith("http"):
            pdf_url = href
        else:
            pdf_url = self.source_website + "/files/" + href

        # Fix for Filomat
        if "www.pmf.ni.ac.rs" in pdf_url:
            pdf_url = pdf_url.replace("www.pmf.ni.ac.rs", "www1.pmf.ni.ac.rs")

        add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle

    def fetch_sasa_abstract(self, abstract_url: str, pid: str):
        try:
            content = self.download_file(abstract_url)
        except requests.exceptions.HTTPError:
            return None  # no readable abstract page; avoids using an unbound `content` below
        soup = BeautifulSoup(content, "html.parser")
        text_node = soup.select_one("p")
        if text_node is not None:  # 254 ↛ 261: condition was always true
            text = text_node.text.replace("$$", "$")
            abstract = create_abstract(
                tag="abstract",
                value_tex=text,
            )
            return abstract
        print(f"[{self.source_domain}] {pid} : Abstract page exists, but text not found")

    # NOTE : SASA abstracts are encoded in windows-1250 despite the header and meta tag advertising otherwise. Is it possible to handle this more elegantly ?
    # example : http://elib.mi.sanu.ac.rs/files/journals/bltn/26/1e.htm
    # (a minimal standalone sketch of this fallback follows the listing)
    def decode_response(self, response: Response, encoding: str = "utf-8"):
        """Force windows-1250 encoding if we cannot read the abstract"""
        try:
            return super().decode_response(response, encoding)
        except UnicodeDecodeError:
            print(
                f"[{self.source_domain}] cannot parse resource using {encoding} : {response.url}. Attempting windows-1250"
            )
            try:
                return super().decode_response(response, "windows-1250")
            except UnicodeDecodeError:
                raise BufferError(
                    f"[{self.source_domain}] cannot parse resource using windows-1250 : {response.url}. Cannot read"
                )
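
The decode_response override above boils down to a UTF-8 then windows-1250 retry. The sketch below is a standalone illustration of that fallback, not part of the crawler: the helper name and the sample byte are made up for the example.

def decode_with_fallback(raw: bytes) -> str:
    """Try UTF-8 first, then retry as windows-1250 (illustrative only)."""
    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        return raw.decode("windows-1250")

# 0x9E is "ž" in windows-1250 but an invalid lead byte in UTF-8,
# so the except branch runs and the text is recovered.
assert decode_with_fallback(b"\x9e") == "ž"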