Coverage for src/crawler/by_source/sasa_crawler.py: 79%

149 statements  

coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

import re

from bs4 import BeautifulSoup, Tag
from lingua import Language, LanguageDetectorBuilder
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_extlink,
    create_subj,
)
from requests import Response

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle

class SasaCrawler(BaseCollectionCrawler):
    source_name = "eLibrary of Mathematical Institute of the Serbian Academy of Sciences and Arts"
    source_domain = "SASA"
    source_website = "http://elib.mi.sanu.ac.rs"

    language_detector = LanguageDetectorBuilder.from_languages(
        Language.ENGLISH, Language.SERBIAN
    ).build()
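    # The detector above is never called in this file (presumably the base crawler uses it).
    # For reference, an illustrative sketch of the lingua-py calls it relies on; the sample
    # string and the "und" fallback are invented for the example:
    #   lang = language_detector.detect_language_of("On some classes of operators")
    #   iso = lang.iso_code_639_1.name.lower() if lang is not None else "und"  # e.g. "en"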

    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues: list[IssueData] = []

        # Extract the list of issues
        # Filter out empty table cells
        volume_nodes = [
            node for node in soup.select("td.issue_cell a.main_link") if node.text != "\xa0"
        ]
        for vol_node in volume_nodes:
            # NOTE : should we parse year here or in the issue itself ?
            href = vol_node.get("href")
            if isinstance(href, list):  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Collection href is an array."
                )
            if href is None:  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Collection href cannot be found"
                )

            # Parse Volume and Issue numbers
            url = self.source_website + "/pages/" + href
            # Formats like 44_1 / 2024 | Tom XIV / 2024 | Knj. 8 / 1960 | LXIX_1-2 / 2024
            volume_re = list(
                re.finditer(
                    r"(?P<volume>[a-zA-Z0-9 .-]+)(?:_(?P<issue>[\w-]+))? \/ (?P<year>\d+)",
                    vol_node.text,
                )
            )
            if len(volume_re) == 0:  # coverage: condition never true
                # Formats like 20(28) / 2022 | 44 (1) / 2024 | (N.S.) 115 (129) / 2024 |
                volume_re = list(
                    re.finditer(
                        r"(?P<volume>[\.\( \)\w]+)\((?P<issue>\d+)\) \/ (?P<year>\d+)",
                        vol_node.text,
                    )
                )
            if len(volume_re) == 0:  # coverage: condition never true
                raise IndexError(
                    f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
                )
            volume_metadata = volume_re[0].groupdict()
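            # Illustrative walk-through of the two patterns above (values inferred from the
            # formats quoted in the comments, not taken from crawled data):
            #   "44_1 / 2024"     -> volume="44",      issue="1",   year="2024"
            #   "Tom XIV / 2024"  -> volume="Tom XIV", issue=None,  year="2024"
            #   "LXIX_1-2 / 2024" -> volume="LXIX",    issue="1-2", year="2024"
            #   "20(28) / 2022"   -> no match for the first pattern, so the fallback
            #                        yields volume="20", issue="28", year="2022"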

            xissues.append(
                self.create_xissue(
                    url,
                    volume_metadata["year"],
                    volume_metadata["volume"].strip(),
                    volume_metadata.get("issue", None),
                )
            )

        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:  # coverage: condition never true
            return xissues
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return xissues
        next_page_href = next_page_node.get("href")
        if isinstance(next_page_href, list):  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Collection next page href is an array."
            )
        if next_page_href is None:  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Collection next page href cannot be found"
            )

        content = self.download_file(self.source_website + "/" + next_page_href)
        return xissues + self.parse_collection_content(content)

    def parse_issue_content(self, content, xissue: IssueData, index: int = 0):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select(".content .result")
        if xissue.pid is None:  # coverage: condition never true
            raise ValueError(
                f"Error in crawler : {self.source_domain} : you must set an issue PID before parsing it"
            )

        # NOTE : publishers aren't implemented yet in base_crawler, but this should work for SASA.
        # issue_publisher_node = soup.select_one(".content>table td.text_cell span.data_text")
        # if (issue_publisher_node is not None):
        #     publisher = issue_publisher_node.text
        #     xpub = create_publisherdata()
        #     xpub.name = publisher.removeprefix("Publisher ")
        #     xissue.publisher = xpub

        for i, art_node in enumerate(article_nodes):
            xarticle = self.parse_sasa_article(i + index, art_node, xissue)
            xissue.articles.append(xarticle)

        index = index + len(article_nodes)
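        # Illustrative: if this page listed 20 articles, the recursive call below parses the
        # next page with index=20, so article pids keep counting up across paginated pages.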

        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:  # coverage: condition never true
            return
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return
        next_page_href = next_page_node.get("href")
        if isinstance(next_page_href, list):  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Issue next page href is an array."
            )
        if next_page_href is None:  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Issue next page href cannot be found"
            )

        content = self.download_file(self.source_website + "/" + next_page_href)
        self.parse_issue_content(content, xissue, index)

    def parse_sasa_article(
        self, article_index: int, article_node: Tag, xissue: IssueData
    ) -> ArticleData:
        """
        Since SASA doesn't have a page per article, we parse the article data from the issue page instead.
        """

        title_node = article_node.select_one(".main_link")
        if title_node is None:  # coverage: condition never true
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Title not found")
        href = title_node.get("href")
        if href is None or isinstance(href, list):  # coverage: condition never true
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Article href not found")

        xarticle = create_articledata()

        pages_node = article_node.select_one(".pages")
        if pages_node is not None:  # coverage: condition always true
            self.set_pages(xarticle, pages_node.text)
        xarticle.title_tex = title_node.text
        xarticle.title_html = title_node.text
        xarticle.pid = f"{xissue.pid}_a{article_index}"

        if xissue.url is not None:  # coverage: condition always true
            ext_link = create_extlink(
                rel="source", location=xissue.url, metadata=self.source_domain
            )
            xarticle.ext_links.append(ext_link)
            # xarticle.url = xissue.url

        # Abstract
        abstract_node = article_node.select_one(".secondary_link")

        if abstract_node is None:  # coverage: condition never true
            print(f"[{self.source_domain}] {xarticle.pid} : Abstract not found")
        else:
            abstract_href = abstract_node.get("href")
            if abstract_href is None or isinstance(abstract_href, list):  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {xarticle.pid} : Abstract href not found"
                )

            abstract = self.fetch_sasa_abstract(
                self.source_website + "/" + abstract_href, xarticle.pid
            )
            if abstract is not None:  # coverage: condition always true
                xarticle.abstracts.append(abstract)
                # LANG
                xarticle.lang = abstract["lang"]

        author_node = article_node.select_one(".secondary_text")
        if author_node is not None:  # coverage: condition always true
            authors = re.findall(
                r"(?: and )?((?:(?<!,)(?<! and)[\w. -](?!and ))+)", author_node.text
            )
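            # Illustrative only (names invented): for an author string such as
            #   "Ana Petrović, Marko Jovanović and John Smith"
            # the pattern above yields ["Ana Petrović", "Marko Jovanović", "John Smith"],
            # i.e. it splits on the commas and on a final " and ".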

            for a in authors:
                author = create_contributor(role="author", string_name=a)
                xarticle.contributors.append(author)
        else:
            print(f"[{self.source_domain}] {xarticle.pid} : Author not found")

        secondary_nodes = article_node.select(".secondary_info_text")
        subjects = []
        keywords = []
        doi = None
        for node in secondary_nodes:
            text = node.text
            if text.startswith("Keywords"):
                keywords = text.removeprefix("Keywords:\xa0").split("; ")
                for kwd in keywords:
                    subject = create_subj(value=kwd, lang=xarticle.lang)
                    xarticle.kwds.append(subject)
            elif text.startswith("DOI"):
                doi = text.removeprefix("DOI:\xa0")
                if doi is not None:  # coverage: condition always true
                    xarticle.doi = doi
                    xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")
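                    # Illustrative only: a DOI such as "10.1234/abc-5.6" becomes
                    # the pid "10_1234_abc_5_6".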

            elif text.startswith("MSC"):
                subjects = text.removeprefix("MSC:\xa0").split("; ")
                for subj in subjects:
                    subject = create_subj(value=subj, type="msc", lang=xarticle.lang)
                    xarticle.kwds.append(subject)
            elif text.startswith("Zbl:"):  # coverage: condition always true
                zbl_link = node.select_one(".secondary_link")
                if zbl_link is not None:  # coverage: condition always true
                    xarticle.extids.append(("zbl-item-id", zbl_link.text))

        if href.startswith("http"):
            pdf_url = href
        else:
            pdf_url = self.source_website + "/files/" + href

        # Fix for Filomat
        if "www.pmf.ni.ac.rs" in pdf_url:
            pdf_url = pdf_url.replace("www.pmf.ni.ac.rs", "www1.pmf.ni.ac.rs")

        add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle

    def fetch_sasa_abstract(self, abstract_url: str, pid: str):
        content = self.download_file(abstract_url)
        soup = BeautifulSoup(content, "html.parser")
        text_node = soup.select_one("p")
        if text_node is not None:  # coverage: condition always true
            text = text_node.text.replace("$$", "$")
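            # e.g. an abstract stored as "$$x^2$$ is convex" becomes "$x^2$ is convex" (illustrative)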

            abstract = create_abstract(
                tag="abstract",
                value_tex=text,
            )
            return abstract
        print(f"[{self.source_domain}] {pid} : Abstract page exists, but text not found")

    # NOTE : SASA abstracts are encoded in windows-1250 despite the header and meta tag advertising otherwise. Is it possible to handle this more elegantly ?
    # example : http://elib.mi.sanu.ac.rs/files/journals/bltn/26/1e.htm
    def decode_response(self, response: Response, encoding: str = "utf-8"):
        """Force windows-1250 encoding if we cannot read the abstract"""
        try:
            return super().decode_response(response, encoding)
        except UnicodeDecodeError:
            print(
                f"[{self.source_domain}] cannot parse resource using {encoding} : {response.url}. Attempting windows-1250"
            )
            try:
                return super().decode_response(response, "windows-1250")
            except UnicodeDecodeError:
                raise BufferError(
                    f"[{self.source_domain}] cannot parse resource using windows-1250 : {response.url}. Cannot read"
                )
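
# The decode_response override above first tries the advertised encoding and then retries
# with windows-1250, as the NOTE about SASA abstracts calls for. A minimal standalone sketch
# of the same fallback idea over raw bytes (illustrative; the helper name and the bytes-level
# interface are assumptions, the crawler itself delegates to BaseCollectionCrawler.decode_response):
def decode_with_fallback(raw: bytes, encodings: tuple[str, ...] = ("utf-8", "windows-1250")) -> str:
    """Return the first successful decode among the candidate encodings."""
    for enc in encodings:
        try:
            return raw.decode(enc)
        except UnicodeDecodeError:
            continue
    raise ValueError(f"could not decode content with any of {encodings}")


# e.g. decode_with_fallback(b"\xe8abc") == "čabc": 0xE8 is not valid UTF-8 here,
# but maps to "č" in windows-1250.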