Coverage for src/crawler/by_source/elibm_crawler.py: 53% (234 statements)

coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

from collections import OrderedDict

from bs4 import BeautifulSoup
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_utils import article_has_pdf
from crawler.utils import add_pdf_link_to_xarticle


class ElibmCrawler(BaseCollectionCrawler):
    source_name = "Electronic Library of Mathematics"
    source_domain = "ELIBM"
    source_website = "https://www.elibm.org"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.collection_id == "DOCMA":
            self.delimiter_inline_formula = "\\("
            self.delimiter_disp_formula = "\\["
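            # DOCMA appears to embed TeX with \( ... \) and \[ ... \]
            # delimiters, hence this override; the other collections keep the
            # crawler defaults.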

    def parse_collection_content(self, content):
        """
        Parse the HTML page of an eLibM collection and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        # Extract the list of issues
        link_nodes = soup.find_all("a")

        # eLibM puts special issue titles in the volume slot.
        # To create an issue pid, we use S1, S2...
        last_special_issue_number = 0

        for link_node in link_nodes:
            url = link_node.get("href")
            text = link_node.get_text()
            if url and url.startswith("/issue"):
                xissue, last_special_issue_number = self.create_elibm_xissue(
                    url, text, last_special_issue_number
                )

                if xissue:
                    xissues.append(xissue)

        return xissues

    def get_first_year(self, year):
        if "/" in year:
            year = year.split("/")[0]

        return year
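
    # For example, get_first_year("1999/2000") returns "1999";
    # a plain year like "2001" is returned unchanged.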

    def create_elibm_xissue(self, url, text, last_special_issue_number):
        if "(" not in text or ")" not in text:
            return None, None

        parts = text.split("(")

        year = parts[1].split(")")[0]
        year = year.replace("/", "-")

        # The volume might not be an integer: eLibM puts special issue titles
        # in the volume slot.
        volume = parts[0].strip()

        number = ""
        if "No. " in volume:
            parts = volume.split("No. ")
            volume = parts[0].strip()
            number = parts[1].strip()

        try:
            volume_for_pid = int(volume)
        except ValueError:
            last_special_issue_number += 1
            volume_for_pid = f"S{last_special_issue_number}"

        xissue = create_issuedata()
        xissue.pid = f"{self.collection_id}_{year}__{volume_for_pid}_{number}"
        xissue.year = year
        xissue.volume = volume
        xissue.number = number
        xissue.url = self.source_website + url

        return xissue, last_special_issue_number
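
    # Illustrative examples (hypothetical link texts):
    #   "6 (1999/2000)"         -> year "1999-2000", volume "6", number "",
    #                              pid "<collection_id>_1999-2000__6_"
    #   "Mahler Selecta (2019)" -> int("Mahler Selecta") fails, so the pid uses
    #                              "S1" as the volume while xissue.volume keeps
    #                              the special issue title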


    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("div", {"class": "title"})

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = self.source_website + url

                # eLibM lists the articles in reverse order, except for one special issue
                if xissue.volume == "Mahler Selecta":
                    xissue.articles.append(xarticle)
                else:
                    xissue.articles.insert(0, xarticle)

        # If the issue has only one article, eLibM skips the issue page and
        # directly displays the article page.
        if len(xissue.articles) == 0:
            title_node = soup.find("h2", {"class": "document_title"})
            if title_node is not None:
                xarticle = create_articledata()
                xarticle.pid = "a0"
                xarticle.url = xissue.url

                xissue.articles.append(xarticle)
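
    # Example: an issue page listing three articles (newest first) produces
    # pids a0, a1, a2 in page order, while xissue.articles ends up
    # [a2, a1, a0], i.e. oldest first.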


    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")

        # TITLE
        title_node = soup.find("h2", {"class": "document_title"})
        if title_node:
            xarticle.title_tex = title_node.get_text()

        # AUTHORS
        citation_author_node = soup.find("h3", {"class": "document_author"})
        if citation_author_node:
            text = citation_author_node.get_text()
            if text:
                parts = text.split(";")
                for part in parts:
                    text_author = part.strip()

                    role = "author"
                    if "(ed.)" in text_author:
                        role = "editor"
                        text_author = text_author.split("(ed.)")[0].strip()

                    author = create_contributor()
                    author["role"] = role
                    author["string_name"] = text_author

                    xarticle.contributors.append(author)

        # PDF
        link_nodes = soup.find_all("a")
        for link_node in link_nodes:
            href = link_node.get("href")
            if href and href.startswith("/ft/"):
                pdf_url = self.source_website + href
                add_pdf_link_to_xarticle(xarticle, pdf_url)

        panel_nodes = soup.find_all("h3", {"class": "panel-title"})
        for panel_node in panel_nodes:
            text = panel_node.get_text()
            content_node = panel_node.parent.parent.find("div", {"class": "panel-body"})

            if text == "Summary":
                # ABSTRACT
                abstract = content_node.get_text()
                xarticle.abstracts.append(create_abstract(value_tex=abstract, lang=xarticle.lang))

            elif text == "Mathematics Subject Classification":
                # MSC
                subjs = content_node.get_text().split(", ")
                for subj in subjs:
                    subject = create_subj()
                    subject["value"] = subj
                    subject["type"] = "msc"
                    subject["lang"] = "en"
                    xarticle.kwds.append(subject)

            elif text == "Keywords/Phrases":
                # Keywords
                subjs = content_node.get_text().split(", ")
                for subj in subjs:
                    subject = create_subj()
                    subject["value"] = subj
                    subject["lang"] = "en"
                    xarticle.kwds.append(subject)

        # PAGES
        citation_node = soup.find("h5", {"class": "document_source"})
        if citation_node:
            text = citation_node.get_text()
            year = f"({xissue.year})"
            if year in text:
                text = text.split(year)[0]

                if "p." in text:
                    text = text.split("p.")[0].split(",")[-1].strip()
                    xarticle.size = text

                elif "-" in text:
                    parts = text.split("-")
                    first_page = parts[-2].split(" ")[-1]
                    last_page = parts[-1].split(",")[0].split(" ")[0]

                    xarticle.fpage = first_page
                    xarticle.lpage = last_page

            # DOI
            doi_node = citation_node.next_sibling
            if doi_node.name == "div":
                text = doi_node.get_text()
                if text.startswith("DOI: "):
                    doi = text[5:]

                    xarticle.doi = doi
                    xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")

        return xarticle
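
    # Illustrative examples (hypothetical "document_source" strings, with
    # xissue.year == "1999"):
    #   "Doc. Math. 4, 123-145 (1999)" -> fpage "123", lpage "145"
    #   "Doc. Math. 4, 250 p. (1999)"  -> size "250"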


    def crawl_collection(self):
        if self.source is None:
            raise RuntimeError("ERROR: the source is not set")

        content = self.download_file(self.collection_url)
        xissues = self.parse_collection_content(content)

        # Some collections split the same volume across different pages.
        # Ex: Volume 6 (2000) and Volume 6 (1999)
        # We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
        xissues_dict = self.merge_xissues(xissues)

        filtered_xissues = xissues_dict
        # Filter the issues to crawl if start_pid was set in the constructor
        # if self.start_pid is not None:
        #     filtered_xissues = {}
        #     start = False
        #     for pid in xissues_dict:
        #         if pid == self.start_pid:
        #             start = True
        #         if start:
        #             filtered_xissues[pid] = xissues_dict[pid]

        return filtered_xissues


    def merge_xissues(self, xissues: list[IssueData]):
        """
        Some collections split the same volume across different pages.
        Ex: Volume 6 (2000) and Volume 6 (1999)
        We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
        """

        merged_xissues = OrderedDict()

        for xissue in xissues:
            xissues_with_same_volume = [
                item
                for item in xissues
                if xissue.volume == item.volume
                and xissue.number == item.number
                and xissue.vseries == item.vseries
                and (item.volume or item.number)
            ]

            if len(xissues_with_same_volume) < 2:
                if xissue.pid is None:
                    raise ValueError("Issue does not have a PID")
                merged_xissues[xissue.pid] = {"issues": [xissue]}
                first_issue = xissue
                year = xissue.year
            else:
                first_issue = xissues_with_same_volume[0]
                volume = xissues_with_same_volume[0].volume
                number = xissues_with_same_volume[0].number
                vseries = xissues_with_same_volume[0].vseries

                # Compute the year based on all issues with the same volume/number
                begin = end = year = xissues_with_same_volume[0].year
                if not year:
                    raise ValueError("year is not defined")

                if "-" in year:
                    parts = year.split("-")
                    begin = parts[0]
                    end = parts[1]

                for xissue_with_same_volume in xissues_with_same_volume[1:]:
                    new_begin = new_end = xissue_with_same_volume.year

                    if not xissue_with_same_volume.year:
                        raise ValueError("xissue year is not defined")

                    if "-" in xissue_with_same_volume.year:
                        parts = xissue_with_same_volume.year.split("-")
                        new_begin = parts[0]
                        new_end = parts[1]

                    if begin is None or end is None or new_begin is None or new_end is None:
                        continue
                    begin_int = int(begin)
                    end_int = int(end)
                    new_begin_int = int(new_begin)
                    new_end_int = int(new_end)

                    if new_begin_int < begin_int:
                        begin = new_begin
                    if new_end_int > end_int:
                        end = new_end

                if begin != end:
                    year = f"{begin}-{end}"
                else:
                    year = begin

                # We can now set the real pid
                pid = f"{self.collection_id}_{year}_{vseries}_{volume}_{number}"
                for issue in xissues_with_same_volume:
                    issue.pid = pid

                if pid not in merged_xissues:
                    merged_xissues[pid] = {
                        "issues": xissues_with_same_volume,
                    }

            # We can set the year only on the first xissue because it is the one
            # used to collect all the articles.
            # See crawl_issue, where merged_xissue = xissues_to_crawl[0].
            # We use a separate attribute (merged_year) because
            # parse_article_content may rely on the original year.
            first_issue.merged_year = year

        return merged_xissues
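
    # Illustrative example (hypothetical issues): two xissues sharing volume
    # "6", with years "1999" and "2000", are grouped under a single pid
    # embedding the year range "1999-2000"; that range is stored on the first
    # issue as merged_year.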


    def crawl_issue(self, merged_xissues: dict[str, list[IssueData]]):
        """
        Wrapper around crawl_elibm_issue, to handle issues declared in multiple web pages.
        """

        xissues_to_crawl = merged_xissues["issues"]

        merged_xissue = xissues_to_crawl[0]
        self.crawl_elibm_issue(merged_xissue)

        if len(xissues_to_crawl) > 1:
            for raw_xissue in xissues_to_crawl[1:]:
                self.crawl_elibm_issue(raw_xissue)

                merged_xissue.articles = raw_xissue.articles + merged_xissue.articles

                # Update the article pids
                for article_index, xarticle in enumerate(merged_xissue.articles):
                    if raw_xissue.pid in xarticle.pid:
                        xarticle.pid = f"{raw_xissue.pid}_a{article_index}"

        # Now that the issue pages have been downloaded/read, we can set the
        # merged year (merged_year was set in self.merge_xissues).
        merged_xissue.year = merged_xissue.merged_year

        if self.ignore_missing_pdf:
            merged_xissue.articles = [a for a in merged_xissue.articles if article_has_pdf(a)]

        if not self.dry and len(merged_xissue.articles) > 0:
            self.process_resource_metadata(merged_xissue, resource_type="issue")
            self.add_xissue_into_database(merged_xissue)

    def crawl_elibm_issue(self, xissue: IssueData):
        """
        Crawl one web page of an issue.
        - get the HTML content of the issue
        - parse the HTML content with BeautifulSoup to extract the list of articles and/or the issue metadata
        - crawl each article
        """

        # Some sources, like EuDML, do not have a separate HTML page for an
        # issue's table of contents: the list of articles comes directly from
        # the collection HTML page, and the xissue has no url attribute.
        if hasattr(xissue, "url") and xissue.url:
            content = self.download_file(xissue.url)
            self.parse_issue_content(content, xissue)

        xarticles = xissue.articles

        parsed_xarticles = []

        for xarticle in xarticles:
            parsed_xarticle = self.crawl_article(xarticle, xissue)
            if parsed_xarticle is not None:
                parsed_xarticles.append(parsed_xarticle)

        xissue.articles = parsed_xarticles
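

# A minimal driving sketch (assumed usage; constructor arguments and attributes
# such as collection_url and dry come from BaseCollectionCrawler and are not
# shown in this file):
#
#   crawler = ElibmCrawler(...)          # args depend on the base class
#   merged = crawler.crawl_collection()  # OrderedDict: pid -> {"issues": [IssueData, ...]}
#   for merged_issue in merged.values():
#       crawler.crawl_issue(merged_issue)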