Coverage for src/crawler/by_source/sasa_crawler.py: 79%

154 statements  

coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

import re

from bs4 import BeautifulSoup, Tag
from lingua import Language, LanguageDetectorBuilder
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_extlink,
    create_subj,
)
from requests import Response

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class SasaCrawler(BaseCollectionCrawler):
    source_name = "eLibrary of Mathematical Institute of the Serbian Academy of Sciences and Arts"
    source_domain = "SASA"
    source_website = "http://elib.mi.sanu.ac.rs"

    periode_end = 9999
    periode_begin = 0

    def build_language_detector(self):
        self.language_detector = LanguageDetectorBuilder.from_languages(
            Language.ENGLISH, Language.SERBIAN
        ).build()

    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues: list[IssueData] = []

        # Extract the list of issues, filtering out empty table cells
        volume_nodes = [
            node for node in soup.select("td.issue_cell a.main_link") if node.text != "\xa0"
        ]
        for vol_node in volume_nodes:
            # NOTE: should we parse the year here or in the issue itself?
            href = vol_node.get("href")
            if isinstance(href, list):  # coverage: never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Collection href is an array."
                )
            if href is None:  # coverage: never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Collection href cannot be found"
                )

            # Parse volume and issue numbers
            url = self.source_website + "/pages/" + href
            # Formats like 44_1 / 2024 | Tom XIV / 2024 | Knj. 8 / 1960 | LXIX_1-2 / 2024
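            # Illustrative: for "44_1 / 2024" this yields volume="44", issue="1", year="2024";
            # for "Tom XIV / 2024" the issue group is absent (None).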

            volume_re = list(
                re.finditer(
                    r"(?P<volume>[a-zA-Z0-9 .-]+)(?:_(?P<issue>[\w-]+))? \/ (?P<year>\d+)",
                    vol_node.text,
                )
            )
            if len(volume_re) == 0:  # coverage: never true
                # Formats like 20(28) / 2022 | 44 (1) / 2024 | (N.S.) 115 (129) / 2024
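                # Illustrative: for "(N.S.) 115 (129) / 2024" the fallback yields
                # volume="(N.S.) 115 " (stripped below), issue="129", year="2024".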

                volume_re = list(
                    re.finditer(
                        r"(?P<volume>[\.\( \)\w]+)\((?P<issue>\d+)\) \/ (?P<year>\d+)",
                        vol_node.text,
                    )
                )
            if len(volume_re) == 0:  # coverage: never true
                raise IndexError(
                    f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
                )
            volume_metadata = volume_re[0].groupdict()
            year = int(volume_metadata["year"])
            if self.periode_begin <= year <= self.periode_end:  # coverage: always true
                xissues.append(
                    self.create_xissue(
                        url,
                        volume_metadata["year"],
                        volume_metadata["volume"].strip(),
                        volume_metadata.get("issue", None),
                    )
                )

        # Handle pagination
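        # ".page_selector_dead_link+a.page_selector_link" is an adjacent-sibling selector:
        # it picks the link immediately following the dead link (presumably the current page),
        # i.e. the next page, if any.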

        pages_node = soup.select_one(".page_selector")
        if pages_node is None:  # coverage: never true
            return xissues
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return xissues
        next_page_href = next_page_node.get("href")
        if isinstance(next_page_href, list):  # coverage: never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Collection next page href is an array."
            )
        if next_page_href is None:  # coverage: never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Collection next page href cannot be found"
            )

        content = self.download_file(self.source_website + "/" + next_page_href)
        return xissues + self.parse_collection_content(content)

    def parse_issue_content(self, content, xissue: IssueData, index: int = 0):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select(".content .result")
        if xissue.pid is None:  # coverage: never true
            raise ValueError(
                f"Error in crawler : {self.source_domain} : you must set an issue PID before parsing it"
            )

        # NOTE: publishers aren't implemented yet in base_crawler, but this should work for SASA.
        # issue_publisher_node = soup.select_one(".content>table td.text_cell span.data_text")
        # if issue_publisher_node is not None:
        #     publisher = issue_publisher_node.text
        #     xpub = create_publisherdata()
        #     xpub.name = publisher.removeprefix("Publisher ")
        #     xissue.publisher = xpub

        for i, art_node in enumerate(article_nodes):
            xarticle = self.parse_sasa_article(i + index, art_node, xissue)
            xissue.articles.append(xarticle)

        index = index + len(article_nodes)
        # Handle pagination
        pages_node = soup.select_one(".page_selector")
        if pages_node is None:  # coverage: never true
            return
        next_page_node = pages_node.select_one(".page_selector_dead_link+a.page_selector_link")
        if next_page_node is None:
            return
        next_page_href = next_page_node.get("href")
        if isinstance(next_page_href, list):  # coverage: never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Issue next page href is an array."
            )
        if next_page_href is None:  # coverage: never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Issue next page href cannot be found"
            )

        content = self.download_file(self.source_website + "/" + next_page_href)
        self.parse_issue_content(content, xissue, index)

    def parse_sasa_article(
        self, article_index: int, article_node: Tag, xissue: IssueData
    ) -> ArticleData:
        """
        Since SASA doesn't have a page per article, we parse the article data from the issue page instead.
        """

        title_node = article_node.select_one(".main_link")
        if title_node is None:  # coverage: never true
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Title not found")
        href = title_node.get("href")
        if href is None or isinstance(href, list):  # coverage: never true
            raise ValueError(f"[{self.source_domain}] {xissue.pid} : Article href not found")

        xarticle = create_articledata()

        pages_node = article_node.select_one(".pages")
        if pages_node is not None:  # coverage: always true
            self.set_pages(xarticle, pages_node.text)
        xarticle.title_tex = title_node.text
        xarticle.title_html = title_node.text
        xarticle.pid = f"{xissue.pid}_a{article_index}"

        if xissue.url is not None:  # coverage: always true
            ext_link = create_extlink(
                rel="source", location=xissue.url, metadata=self.source_domain
            )
            xarticle.ext_links.append(ext_link)
            # xarticle.url = xissue.url

        # Abstract
        abstract_node = article_node.select_one(".secondary_link")

        if abstract_node is None:  # coverage: never true
            print(f"[{self.source_domain}] {xarticle.pid} : Abstract not found")
        else:
            abstract_href = abstract_node.get("href")
            if abstract_href is None or isinstance(abstract_href, list):  # coverage: never true
                raise ValueError(
                    f"[{self.source_domain}] {xarticle.pid} : Abstract href not found"
                )

            abstract = self.fetch_sasa_abstract(
                self.source_website + "/" + abstract_href, xarticle.pid
            )
            if abstract is not None:  # coverage: always true
                xarticle.abstracts.append(abstract)
                # LANG
                xarticle.lang = abstract["lang"]

        author_node = article_node.select_one(".secondary_text")
        if author_node is not None:  # coverage: always true
            authors = re.findall(
                r"(?: and )?((?:(?<!,)(?<! and)[\w. -](?!and ))+)", author_node.text
            )
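            # Illustrative (not from the source): "Aaa Bbb, Ccc Ddd and Eee Fff"
            # -> ["Aaa Bbb", "Ccc Ddd", "Eee Fff"]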

            for a in authors:
                author = create_contributor(role="author", string_name=a)
                xarticle.contributors.append(author)
        else:
            print(f"[{self.source_domain}] {xarticle.pid} : Author not found")

        secondary_nodes = article_node.select(".secondary_info_text")
        subjects = []
        keywords = []
        doi = None
        for node in secondary_nodes:
            text = node.text
            if text.startswith("Keywords"):
                keywords = text.removeprefix("Keywords:\xa0").split("; ")
                for kwd in keywords:
                    subject = create_subj(value=kwd, lang=xarticle.lang)
                    xarticle.kwds.append(subject)
            elif text.startswith("DOI"):
                doi = text.removeprefix("DOI:\xa0")
                if doi is not None:  # coverage: always true
                    xarticle.doi = doi
                    xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")
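                    # Illustrative: a hypothetical DOI "10.1234/abc.5-6" becomes pid "10_1234_abc_5_6"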

            elif text.startswith("MSC"):
                subjects = text.removeprefix("MSC:\xa0").split("; ")
                for subj in subjects:
                    subject = create_subj(value=subj, type="msc", lang=xarticle.lang)
                    xarticle.kwds.append(subject)
            elif text.startswith("Zbl:"):  # coverage: always true
                zbl_link = node.select_one(".secondary_link")
                if zbl_link is not None:  # coverage: always true
                    xarticle.extids.append(("zbl-item-id", zbl_link.text))

        if href.startswith("http"):
            pdf_url = href
        else:
            pdf_url = self.source_website + "/files/" + href

        # Fix for Filomat
        if "www.pmf.ni.ac.rs" in pdf_url:
            pdf_url = pdf_url.replace("www.pmf.ni.ac.rs", "www1.pmf.ni.ac.rs")

        add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle

    def fetch_sasa_abstract(self, abstract_url: str, pid: str):
        content = self.download_file(abstract_url)
        soup = BeautifulSoup(content, "html.parser")
        text_node = soup.select_one("p")
        if text_node is not None:  # coverage: always true
            text = text_node.text.replace("$$", "$")
            abstract = create_abstract(
                tag="abstract",
                value_tex=text,
                lang=self.detect_language(text),
            )
            return abstract
        print(f"[{self.source_domain}] {pid} : Abstract page exists, but text not found")

    # NOTE: SASA abstracts are encoded in windows-1250 despite the header and meta tag advertising otherwise.
    # Is it possible to handle this more elegantly?
    # Example: http://elib.mi.sanu.ac.rs/files/journals/bltn/26/1e.htm
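    # For instance, 0x8A is "Š" in windows-1250 but is not a valid UTF-8 start byte,
    # so decoding such a page as UTF-8 raises UnicodeDecodeError.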

    def decode_response(self, response: Response, encoding: str = "utf-8"):
        """Force windows-1250 encoding if we cannot read the abstract with the advertised encoding"""
        try:
            return super().decode_response(response, encoding)
        except UnicodeDecodeError:
            print(
                f"[{self.source_domain}] cannot parse resource using {encoding} : {response.url}. Attempting windows-1250"
            )
            try:
                return super().decode_response(response, "windows-1250")
            except UnicodeDecodeError:
                raise BufferError(
                    f"[{self.source_domain}] cannot parse resource using windows-1250 : {response.url}. Cannot read"
                )
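
The snippet below is a minimal, standalone sketch (not part of the crawler) that exercises the two volume/issue regexes and the author-splitting regex on the formats quoted in the comments above; the sample byline and DOI-free strings are illustrative.

import re

VOLUME_RE = r"(?P<volume>[a-zA-Z0-9 .-]+)(?:_(?P<issue>[\w-]+))? \/ (?P<year>\d+)"
FALLBACK_RE = r"(?P<volume>[\.\( \)\w]+)\((?P<issue>\d+)\) \/ (?P<year>\d+)"
AUTHOR_RE = r"(?: and )?((?:(?<!,)(?<! and)[\w. -](?!and ))+)"

# Primary format, as listed in parse_collection_content
for text in ["44_1 / 2024", "Tom XIV / 2024", "Knj. 8 / 1960", "LXIX_1-2 / 2024"]:
    print(text, "->", next(re.finditer(VOLUME_RE, text)).groupdict())
    # e.g. "44_1 / 2024" -> {'volume': '44', 'issue': '1', 'year': '2024'}

# Fallback format
for text in ["20(28) / 2022", "44 (1) / 2024", "(N.S.) 115 (129) / 2024"]:
    print(text, "->", next(re.finditer(FALLBACK_RE, text)).groupdict())
    # e.g. "20(28) / 2022" -> {'volume': '20', 'issue': '28', 'year': '2022'}

# Author splitting on an illustrative byline
print(re.findall(AUTHOR_RE, "Aaa Bbb, Ccc Ddd and Eee Fff"))
# -> ['Aaa Bbb', 'Ccc Ddd', 'Eee Fff']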