Coverage for src/crawler/by_source/sasa_crawler.py: 81%

172 statements  

import re

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    AbstractDict,
    ArticleData,
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
    create_issuedata,
    create_subj,
)
from requests import Response

from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle


class SasaCrawler(BaseCollectionCrawler):
    source_name = "eLibrary of Mathematical Institute of the Serbian Academy of Sciences and Arts"
    source_domain = "SASA"
    source_website = "http://elib.mi.sanu.ac.rs"

    periode_end = float("inf")
    periode_begin = 0
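    # Assumption: with these defaults (0 to infinity), the year check in
    # parse_collection_content accepts every issue; narrowing the periode restricts the crawl.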

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.source = self.get_or_create_source()

        self.periode = self.get_or_create_periode()

    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues: list[IssueData] = []

        # Extract the list of issues
        # Filter out empty table cells
        volume_nodes = [
            node for node in soup.select("td.issue_cell a.main_link") if node.text != "\xa0"
        ]
        for vol_node in volume_nodes:
            # NOTE: should we parse the year here or in the issue itself?
            href = vol_node.get("href")
            if isinstance(href, list):
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Collection href is an array."
                )
            if href is None:
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Collection href cannot be found"
                )

            # Parse Volume and Issue numbers
            url = self.source_website + "/pages/" + href
            # Formats like 44_1 / 2024 | Tom XIV / 2024 | Knj. 8 / 1960 | LXIX_1-2 / 2024
            volume_re = list(
                re.finditer(
                    r"(?P<volume>[a-zA-Z0-9 .-]+)(?:_(?P<issue>[\w-]+))? \/ (?P<year>\d+)",
                    vol_node.text,
                )
            )
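            # Illustrative captures for the pattern above (assumed, not exhaustive):
            # "LXIX_1-2 / 2024" -> volume="LXIX", issue="1-2", year="2024"
            # "Tom XIV / 2024"  -> volume="Tom XIV", issue=None, year="2024"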

            if len(volume_re) == 0:
                # Formats like 20(28) / 2022 | 44 (1) / 2024 | (N.S.) 115 (129) / 2024 |
                volume_re = list(
                    re.finditer(
                        r"(?P<volume>[\.\( \)\w]+)\((?P<issue>\d+)\) \/ (?P<year>\d+)",
                        vol_node.text,
                    )
                )
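                # Illustrative captures for the fallback pattern (assumed, not exhaustive):
                # "20(28) / 2022"           -> volume="20", issue="28", year="2022"
                # "(N.S.) 115 (129) / 2024" -> volume="(N.S.) 115 ", issue="129", year="2024"
                # (the volume string is stripped below)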

            if len(volume_re) == 0:
                raise IndexError(
                    f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
                )
            volume_metadata = volume_re[0].groupdict()
            year = int(volume_metadata["year"])
            if self.periode_begin <= year <= self.periode_end:
                xissues.append(
                    self.create_xissue(
                        url,
                        volume_metadata["year"],
                        volume_metadata["volume"].strip(),
                        volume_metadata.get("issue", None),
                    )
                )

        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:
            return xissues
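        # The "+" (adjacent sibling) selector below picks the a.page_selector_link that
        # immediately follows the current page's dead link, which we assume is the
        # "next page" link; if it is absent, we are on the last page.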

        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return xissues
        next_page_href = next_page_node.get("href")
        if isinstance(next_page_href, list):
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Collection next page href is an array."
            )
        if next_page_href is None:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Collection next page href cannot be found"
            )

        content = self.get_page_content(self.source_website + "/" + next_page_href)
        return xissues + self.parse_collection_content(content)

    def create_xissue(self, url: str, year: str, volume_number: str, issue_number="1"):
        if url.endswith("/"):
            url = url[:-1]
        xissue = create_issuedata()
        xissue.url = url

        # Replace any character that is not alphanumeric or a hyphen with an underscore
        xissue.pid = re.sub(
            r"[^a-zA-Z0-9-]+", "_", f"{self.collection_id}_{year}__{volume_number}"
        )
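        # Hypothetical example: collection_id "FIL", year "2024", volume "LXIX" would give
        # pid "FIL_2024_LXIX" (runs of disallowed characters, including the double
        # underscore above, collapse into a single "_"); the issue suffix is appended below.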

        xissue.volume = volume_number

        xissue.year = year
        if issue_number is not None:
            xissue.pid += f"_{issue_number}"
            xissue.number = issue_number.replace(",", "-")
        return xissue

    def parse_issue_content(self, content, xissue: IssueData, index: int = 0):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select(".content .result")
        if xissue.pid is None:
            raise ValueError(
                f"Error in crawler : {self.source_domain} : you must set an issue PID before parsing it"
            )

        # NOTE: publishers aren't implemented yet in base_crawler, but this should work for SASA.
        # issue_publisher_node = soup.select_one(".content>table td.text_cell span.data_text")
        # if (issue_publisher_node is not None):
        #     publisher = issue_publisher_node.text
        #     xpub = create_publisherdata()
        #     xpub.name = publisher.removeprefix("Publisher ")
        #     xissue.publisher = xpub

        for i, art_node in enumerate(article_nodes):
            xarticle = self.parse_sasa_article(i + index, art_node, xissue)
            xissue.articles.append(xarticle)

        index = index + len(article_nodes)
        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:
            return
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return
        next_page_href = next_page_node.get("href")
        if isinstance(next_page_href, list):
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Issue next page href is an array."
            )
        if next_page_href is None:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Issue next page href cannot be found"
            )

        content = self.get_page_content(self.source_website + "/" + next_page_href)
        self.parse_issue_content(content, xissue, index)

    def parse_sasa_article(
        self, article_index: int, article_node: Tag, xissue: IssueData
    ) -> ArticleData:
        """
        Since SASA doesn't have a separate page per article, we parse the article data
        from the issue page instead.
        """
        title_node = article_node.select_one(".main_link")
        if title_node is None:
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Title not found")
        href = title_node.get("href")
        if href is None or isinstance(href, list):
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Article href not found")

        xarticle = create_articledata()

        pages_node = article_node.select_one(".pages")
        if pages_node is not None:
            xarticle.page_range = pages_node.text
        xarticle.title_tex = title_node.text
        xarticle.title_html = title_node.text
        xarticle.pid = f"{xissue.pid}_a{article_index}"

        if xissue.url is not None:
            ext_link = create_extlink(
                rel="source", location=xissue.url, metadata=self.source_domain
            )
            xarticle.ext_links.append(ext_link)
            # xarticle.url = xissue.url

        author_node = article_node.select_one(".secondary_text")
        if author_node is not None:
            authors = re.findall(
                r"(?: and )?((?:(?<!,)(?<! and)[\w. -](?!and ))+)", author_node.text
            )
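            # Illustrative behaviour of the pattern above (assumed input format):
            # "John Doe, Jane Roe and Bob Smith" -> ["John Doe", "Jane Roe", "Bob Smith"]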

            for a in authors:
                author = create_contributor()
                author["role"] = "author"
                author["string_name"] = a
                xarticle.contributors.append(author)
        else:
            print(f"[{self.source_domain}] {xarticle.pid} : Author not found")

        abstract_node = article_node.select_one(".secondary_link")
        if abstract_node is None:
            print(f"[{self.source_domain}] {xarticle.pid} : Abstract not found")
        else:
            abstract_href = abstract_node.get("href")
            if abstract_href is None or isinstance(abstract_href, list):
                raise ValueError(
                    f"[{self.source_domain}] {xarticle.pid} : Abstract href not found"
                )

            abstract = self.fetch_sasa_abstract(
                self.source_website + "/" + abstract_href, xarticle.pid
            )
            if abstract is not None:
                xarticle.abstracts.append(abstract)

        secondary_nodes = article_node.select(".secondary_info_text")
        subjects = []
        keywords = []
        doi = None
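        # Assumed label formats, inferred from the parsing below:
        # "Keywords:\xa0kw1; kw2", "DOI:\xa010.xxxx/...", "MSC:\xa0...; ...", "Zbl: ..."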

        for node in secondary_nodes:
            text = node.text
            if text.startswith("Keywords"):
                keywords = text.removeprefix("Keywords:\xa0").split("; ")
                for kwd in keywords:
                    subject = create_subj()
                    subject["value"] = kwd
                    subject["lang"] = "en"
                    xarticle.kwds.append(subject)
            elif text.startswith("DOI"):
                doi = text.removeprefix("DOI:\xa0")
                if doi is not None:
                    xarticle.doi = doi
                    xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")
            elif text.startswith("MSC"):
                subjects = text.removeprefix("MSC:\xa0").split("; ")
                for subj in subjects:
                    subject = create_subj()
                    subject["value"] = subj
                    subject["type"] = "msc"
                    subject["lang"] = "en"
                    xarticle.kwds.append(subject)
            elif text.startswith("Zbl:"):
                zbl_link = node.select_one(".secondary_link")
                if zbl_link is not None:
                    xarticle.extids.append(("zbl-item-id", zbl_link.text))

        if href.startswith("http"):
            pdf_url = href
        else:
            pdf_url = self.source_website + "/files/" + href

        # Fix for Filomat
        if "www.pmf.ni.ac.rs" in pdf_url:
            pdf_url = pdf_url.replace("www.pmf.ni.ac.rs", "www1.pmf.ni.ac.rs")

        add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle

    def fetch_sasa_abstract(self, abstract_url: str, pid: str):
        content = self.get_page_content(abstract_url)
        soup = BeautifulSoup(content, "html.parser")
        text_node = soup.select_one("p")
        if text_node is not None:
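            # Assumption: abstracts wrap TeX in doubled "$$" delimiters, which we collapse to "$".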

            text = text_node.text.replace("$$", "$")
            abstract: AbstractDict = {
                "tag": "abstract",
                "value_tex": text,
                "lang": "eng",
            }
            return abstract
        print(f"[{self.source_domain}] {pid} : Abstract page exists, but text not found")

    # NOTE: SASA abstracts are encoded in windows-1250 despite the header and meta tag
    # advertising otherwise. Is it possible to handle this more elegantly?
    # Example: http://elib.mi.sanu.ac.rs/files/journals/bltn/26/1e.htm
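    # A possible alternative (not used here): consult response.apparent_encoding from
    # requests before falling back to a hard-coded windows-1250.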

    def decode_response(self, response: Response, encoding: str = "utf-8"):
        """Force windows-1250 encoding if we cannot read the abstract"""
        try:
            return super().decode_response(response, encoding)
        except UnicodeDecodeError:
            print(
                f"[{self.source_domain}] cannot parse resource using {encoding} : {response.url}. Attempting windows-1250"
            )
            try:
                return super().decode_response(response, "windows-1250")
            except UnicodeDecodeError:
                raise BufferError(
                    f"[{self.source_domain}] cannot parse resource using windows-1250 : {response.url}. Cannot read"

297 )