Coverage for src/crawler/by_source/sasa_crawler.py: 79%

154 statements  

coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

import re

from bs4 import BeautifulSoup, Tag
from lingua import Language, LanguageDetectorBuilder
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_extlink,
    create_subj,
)
from requests import Response

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class SasaCrawler(BaseCollectionCrawler):
    source_name = "eLibrary of Mathematical Institute of the Serbian Academy of Sciences and Arts"
    source_domain = "SASA"
    source_website = "http://elib.mi.sanu.ac.rs"

    periode_end = 9999
    periode_begin = 0

    def build_language_detector(self):
        self.language_detector = LanguageDetectorBuilder.from_languages(
            Language.ENGLISH, Language.SERBIAN
        ).build()

    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues: list[IssueData] = []

        # Extract the list of issues, filtering out empty table cells
        volume_nodes = [
            node for node in soup.select("td.issue_cell a.main_link") if node.text != "\xa0"
        ]
        for vol_node in volume_nodes:
            # NOTE: should we parse the year here or in the issue itself?
            href = vol_node.get("href")
            if isinstance(href, list):  # coverage: never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Collection href is an array."
                )
            if href is None:  # coverage: never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Collection href cannot be found"
                )

            # Parse volume and issue numbers
            url = self.source_website + "/pages/" + href
            # Formats like 44_1 / 2024 | Tom XIV / 2024 | Knj. 8 / 1960 | LXIX_1-2 / 2024
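            # Illustrative: for "44_1 / 2024" this yields volume="44", issue="1", year="2024";
            # for "Tom XIV / 2024" the issue group is absent (None).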

            volume_re = list(
                re.finditer(
                    r"(?P<volume>[a-zA-Z0-9 .-]+)(?:_(?P<issue>[\w-]+))? \/ (?P<year>\d+)",
                    vol_node.text,
                )
            )
            if len(volume_re) == 0:  # coverage: never true
                # Formats like 20(28) / 2022 | 44 (1) / 2024 | (N.S.) 115 (129) / 2024
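                # Illustrative: for "(N.S.) 115 (129) / 2024" the fallback yields
                # volume="(N.S.) 115 " (stripped below), issue="129", year="2024".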

                volume_re = list(
                    re.finditer(
                        r"(?P<volume>[\.\( \)\w]+)\((?P<issue>\d+)\) \/ (?P<year>\d+)",
                        vol_node.text,
                    )
                )
            if len(volume_re) == 0:  # coverage: never true
                raise IndexError(
                    f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
                )
            volume_metadata = volume_re[0].groupdict()
            year = int(volume_metadata["year"])
            if self.periode_begin <= year <= self.periode_end:  # coverage: always true
                xissues.append(
                    self.create_xissue(
                        url,
                        volume_metadata["year"],
                        volume_metadata["volume"].strip(),
                        volume_metadata.get("issue", None),
                    )
                )

        # Handle pagination
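        # ".page_selector_dead_link+a.page_selector_link" is an adjacent-sibling selector:
        # it picks the link immediately following the dead link (presumably the current page),
        # i.e. the next page, if any.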

        pages_node = soup.select_one(".page_selector")
        if pages_node is None:  # coverage: never true
            return xissues
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return xissues
        next_page_href = next_page_node.get("href")
        if isinstance(next_page_href, list):  # coverage: never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Collection next page href is an array."
            )
        if next_page_href is None:  # coverage: never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Collection next page href cannot be found"
            )

        content = self.download_file(self.source_website + "/" + next_page_href)
        return xissues + self.parse_collection_content(content)

    def parse_issue_content(self, content, xissue: IssueData, index: int = 0):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select(".content .result")
        if xissue.pid is None:  # coverage: never true
            raise ValueError(
                f"Error in crawler : {self.source_domain} : you must set an issue PID before parsing it"
            )

        # NOTE: publishers aren't implemented yet in base_crawler, but this should work for SASA.
        # issue_publisher_node = soup.select_one(".content>table td.text_cell span.data_text")
        # if issue_publisher_node is not None:
        #     publisher = issue_publisher_node.text
        #     xpub = create_publisherdata()
        #     xpub.name = publisher.removeprefix("Publisher ")
        #     xissue.publisher = xpub

        for i, art_node in enumerate(article_nodes):
            xarticle = self.parse_sasa_article(i + index, art_node, xissue)
            xissue.articles.append(xarticle)

        index = index + len(article_nodes)
        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:  # coverage: never true
            return
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return
        next_page_href = next_page_node.get("href")
        if isinstance(next_page_href, list):  # coverage: never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Issue next page href is an array."
            )
        if next_page_href is None:  # coverage: never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Issue next page href cannot be found"
            )

        content = self.download_file(self.source_website + "/" + next_page_href)
        self.parse_issue_content(content, xissue, index)

    def parse_sasa_article(
        self, article_index: int, article_node: Tag, xissue: IssueData
    ) -> ArticleData:
        """
        Since SASA doesn't have a page per article, we parse the article data from the issue page instead.
        """

        title_node = article_node.select_one(".main_link")
        if title_node is None:  # coverage: never true
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Title not found")
        href = title_node.get("href")
        if href is None or isinstance(href, list):  # coverage: never true
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Article href not found")

        xarticle = create_articledata()

        pages_node = article_node.select_one(".pages")
        if pages_node is not None:  # coverage: always true
            self.set_pages(xarticle, pages_node.text)
        xarticle.title_tex = title_node.text
        xarticle.title_html = title_node.text
        xarticle.pid = f"{xissue.pid}_a{article_index}"

        if xissue.url is not None:  # coverage: always true
            ext_link = create_extlink(
                rel="source", location=xissue.url, metadata=self.source_domain
            )
            xarticle.ext_links.append(ext_link)
            # xarticle.url = xissue.url

        # Abstract
        abstract_node = article_node.select_one(".secondary_link")

        if abstract_node is None:  # coverage: never true
            print(f"[{self.source_domain}] {xarticle.pid} : Abstract not found")
        else:
            abstract_href = abstract_node.get("href")
            if abstract_href is None or isinstance(abstract_href, list):  # coverage: never true
                raise ValueError(
                    f"[{self.source_domain}] {xarticle.pid} : Abstract href not found"
                )

            abstract = self.fetch_sasa_abstract(
                self.source_website + "/" + abstract_href, xarticle.pid
            )
            if abstract is not None:  # coverage: always true
                xarticle.abstracts.append(abstract)
                # LANG
                xarticle.lang = abstract["lang"]

        author_node = article_node.select_one(".secondary_text")
        if author_node is not None:  # coverage: always true
            authors = re.findall(
                r"(?: and )?((?:(?<!,)(?<! and)[\w. -](?!and ))+)", author_node.text
            )
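            # Illustrative (not from the source): "Aaa Bbb, Ccc Ddd and Eee Fff"
            # -> ["Aaa Bbb", "Ccc Ddd", "Eee Fff"]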

            for a in authors:
                author = create_contributor(role="author", string_name=a)
                xarticle.contributors.append(author)
        else:
            print(f"[{self.source_domain}] {xarticle.pid} : Author not found")

        secondary_nodes = article_node.select(".secondary_info_text")
        subjects = []
        keywords = []
        doi = None
        for node in secondary_nodes:
            text = node.text
            if text.startswith("Keywords"):
                keywords = text.removeprefix("Keywords:\xa0").split("; ")
                for kwd in keywords:
                    subject = create_subj(value=kwd, lang=xarticle.lang)
                    xarticle.kwds.append(subject)
            elif text.startswith("DOI"):
                doi = text.removeprefix("DOI:\xa0")
                if doi is not None:  # coverage: always true
                    xarticle.doi = doi
                    xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")
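                    # Illustrative: a hypothetical DOI "10.1234/abc.5-6" becomes pid "10_1234_abc_5_6"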

            elif text.startswith("MSC"):
                subjects = text.removeprefix("MSC:\xa0").split("; ")
                for subj in subjects:
                    subject = create_subj(value=subj, type="msc", lang=xarticle.lang)
                    xarticle.kwds.append(subject)
            elif text.startswith("Zbl:"):  # coverage: always true
                zbl_link = node.select_one(".secondary_link")
                if zbl_link is not None:  # coverage: always true
                    xarticle.extids.append(("zbl-item-id", zbl_link.text))

        if href.startswith("http"):
            pdf_url = href
        else:
            pdf_url = self.source_website + "/files/" + href

        # Fix for Filomat
        if "www.pmf.ni.ac.rs" in pdf_url:
            pdf_url = pdf_url.replace("www.pmf.ni.ac.rs", "www1.pmf.ni.ac.rs")

        add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle

    def fetch_sasa_abstract(self, abstract_url: str, pid: str):
        content = self.download_file(abstract_url)
        soup = BeautifulSoup(content, "html.parser")
        text_node = soup.select_one("p")
        if text_node is not None:  # coverage: always true
            text = text_node.text.replace("$$", "$")
            abstract = create_abstract(
                tag="abstract",
                value_tex=text,
                lang=self.detect_language(text),
            )
            return abstract
        print(f"[{self.source_domain}] {pid} : Abstract page exists, but text not found")

    # NOTE: SASA abstracts are encoded in windows-1250 despite the header and meta tag advertising otherwise.
    # Is it possible to handle this more elegantly?
    # Example: http://elib.mi.sanu.ac.rs/files/journals/bltn/26/1e.htm
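    # For instance, 0x8A is "Š" in windows-1250 but is not a valid UTF-8 start byte,
    # so decoding such a page as UTF-8 raises UnicodeDecodeError.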

    def decode_response(self, response: Response, encoding: str = "utf-8"):
        """Force windows-1250 encoding if we cannot read the abstract with the advertised encoding"""
        try:
            return super().decode_response(response, encoding)
        except UnicodeDecodeError:
            print(
                f"[{self.source_domain}] cannot parse resource using {encoding} : {response.url}. Attempting windows-1250"
            )
            try:
                return super().decode_response(response, "windows-1250")
            except UnicodeDecodeError:
                raise BufferError(
                    f"[{self.source_domain}] cannot parse resource using windows-1250 : {response.url}. Cannot read"
                )
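
The snippet below is a minimal, standalone sketch (not part of the crawler) that exercises the two volume/issue regexes and the author-splitting regex on the formats quoted in the comments above; the sample byline and DOI-free strings are illustrative.

import re

VOLUME_RE = r"(?P<volume>[a-zA-Z0-9 .-]+)(?:_(?P<issue>[\w-]+))? \/ (?P<year>\d+)"
FALLBACK_RE = r"(?P<volume>[\.\( \)\w]+)\((?P<issue>\d+)\) \/ (?P<year>\d+)"
AUTHOR_RE = r"(?: and )?((?:(?<!,)(?<! and)[\w. -](?!and ))+)"

# Primary format, as listed in parse_collection_content
for text in ["44_1 / 2024", "Tom XIV / 2024", "Knj. 8 / 1960", "LXIX_1-2 / 2024"]:
    print(text, "->", next(re.finditer(VOLUME_RE, text)).groupdict())
    # e.g. "44_1 / 2024" -> {'volume': '44', 'issue': '1', 'year': '2024'}

# Fallback format
for text in ["20(28) / 2022", "44 (1) / 2024", "(N.S.) 115 (129) / 2024"]:
    print(text, "->", next(re.finditer(FALLBACK_RE, text)).groupdict())
    # e.g. "20(28) / 2022" -> {'volume': '20', 'issue': '28', 'year': '2022'}

# Author splitting on an illustrative byline
print(re.findall(AUTHOR_RE, "Aaa Bbb, Ccc Ddd and Eee Fff"))
# -> ['Aaa Bbb', 'Ccc Ddd', 'Eee Fff']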