Coverage for src/crawler/by_source/elibm_crawler.py: 53%

233 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-08-29 13:43 +0000

1from collections import OrderedDict 

2 

3from bs4 import BeautifulSoup 

4from ptf.model_data import ( 

5 IssueData, 

6 create_abstract, 

7 create_articledata, 

8 create_contributor, 

9 create_issuedata, 

10 create_subj, 

11) 

12 

13from crawler.base_crawler import BaseCollectionCrawler 

14from crawler.utils import add_pdf_link_to_xarticle 

15 

16 

17class ElibmCrawler(BaseCollectionCrawler): 

18 source_name = "Electronic Library of Mathematics" 

19 source_domain = "ELIBM" 

20 source_website = "https://www.elibm.org" 

21 

22 def __init__(self, *args, **kwargs): 

23 super().__init__(*args, **kwargs) 

24 if self.collection_id == "DOCMA": 24 ↛ 25line 24 didn't jump to line 25 because the condition on line 24 was never true

25 self.delimiter_inline_formula = "\\(" 

26 self.delimiter_disp_formula = "\\[" 

27 

28 def parse_collection_content(self, content): 

29 """ 

30 Parse the HTML page of Annals of Math and returns a list of xissue. 

31 Each xissue has its pid/volume/number/year metadata + its url 

32 """ 

33 soup = BeautifulSoup(content, "html.parser") 

34 xissues = [] 

35 

36 # Extract the list of issues 

37 link_nodes = soup.find_all("a") 

38 

39 # eLibM puts special issue titles as volume number 

40 # to create a issue pid, we use S1, S2... 

41 last_special_issue_number = 0 

42 

43 for link_node in link_nodes: 

44 url = link_node.get("href") 

45 text = link_node.get_text() 

46 if url.startswith("/issue"): 

47 xissue, last_special_issue_number = self.create_elibm_xissue( 

48 url, text, last_special_issue_number 

49 ) 

50 

51 if xissue: 51 ↛ 43line 51 didn't jump to line 43 because the condition on line 51 was always true

52 xissues.append(xissue) 

53 

54 return xissues 

55 

56 def get_first_year(self, year): 

57 if "/" in year: 

58 year = year.split("/")[0] 

59 

60 return year 

61 

62 def create_elibm_xissue(self, url, text, last_special_issue_number): 

63 if "(" not in text or ")" not in text: 63 ↛ 64line 63 didn't jump to line 64 because the condition on line 63 was never true

64 return None, None 

65 

66 parts = text.split("(") 

67 

68 year = parts[1].split(")")[0] 

69 year = year.replace("/", "-") 

70 

71 # volume might not be an integer. eLibM puts special issue titles as volume number. 

72 volume = parts[0].strip() 

73 

74 number = "" 

75 if "No. " in volume: 

76 parts = volume.split("No. ") 

77 volume = parts[0].strip() 

78 number = parts[1].strip() 

79 

80 try: 

81 volume_for_pid = int(volume) 

82 except ValueError: 

83 last_special_issue_number += 1 

84 volume_for_pid = f"S{last_special_issue_number}" 

85 

86 xissue = create_issuedata() 

87 xissue.pid = f"{self.collection_id}_{year}__{volume_for_pid}_{number}" 

88 xissue.year = year 

89 xissue.volume = volume 

90 xissue.number = number 

91 xissue.url = self.source_website + url 

92 

93 return xissue, last_special_issue_number 

94 

95 def parse_issue_content(self, content, xissue): 

96 soup = BeautifulSoup(content, "html.parser") 

97 article_nodes = soup.find_all("div", {"class": "title"}) 

98 

99 for index_article, article_node in enumerate(article_nodes): 

100 article_link_node = article_node.find("a") 

101 if article_link_node: 101 ↛ 99line 101 didn't jump to line 99 because the condition on line 101 was always true

102 url = article_link_node.get("href") 

103 xarticle = create_articledata() 

104 xarticle.pid = "a" + str(index_article) 

105 xarticle.url = self.source_website + url 

106 

107 # eLibM lists the articles in the reverse order, except for one special issue 

108 if xissue.volume == "Mahler Selecta": 108 ↛ 109line 108 didn't jump to line 109 because the condition on line 108 was never true

109 xissue.articles.append(xarticle) 

110 else: 

111 xissue.articles.insert(0, xarticle) 

112 

113 # if the issue has only 1 article, eLibM skip the issue page and directly display the article page 

114 if len(xissue.articles) == 0: 

115 title_node = soup.find("h2", {"class": "document_title"}) 

116 if title_node is not None: 116 ↛ exitline 116 didn't return from function 'parse_issue_content' because the condition on line 116 was always true

117 xarticle = create_articledata() 

118 xarticle.pid = "a0" 

119 xarticle.url = xissue.url 

120 

121 xissue.articles.append(xarticle) 

122 

123 def parse_article_content(self, content, xissue, xarticle, url): 

124 """ 

125 Parse the content with Beautifulsoup and returns an ArticleData 

126 """ 

127 xarticle.lang = "en" 

128 

129 soup = BeautifulSoup(content, "html.parser") 

130 

131 # TITLE 

132 title_node = soup.find("h2", {"class": "document_title"}) 

133 if title_node: 133 ↛ 137line 133 didn't jump to line 137 because the condition on line 133 was always true

134 xarticle.title_tex = title_node.get_text() 

135 

136 # AUTHORS 

137 citation_author_node = soup.find("h3", {"class": "document_author"}) 

138 if citation_author_node: 138 ↛ 157line 138 didn't jump to line 157 because the condition on line 138 was always true

139 text = citation_author_node.get_text() 

140 if text: 140 ↛ 157line 140 didn't jump to line 157 because the condition on line 140 was always true

141 parts = text.split(";") 

142 for part in parts: 

143 text_author = part.strip() 

144 

145 role = "author" 

146 if "(ed.)" in text_author: 

147 role = "editor" 

148 text_author = text_author.split("(ed.)")[0].strip() 

149 

150 author = create_contributor() 

151 author["role"] = role 

152 author["string_name"] = text_author 

153 

154 xarticle.contributors.append(author) 

155 

156 # PDF 

157 link_nodes = soup.find_all("a") 

158 for link_node in link_nodes: 

159 url = link_node.get("href") 

160 if url.startswith("/ft/"): 

161 pdf_url = self.source_website + url 

162 add_pdf_link_to_xarticle(xarticle, pdf_url) 

163 

164 panel_nodes = soup.find_all("h3", {"class": "panel-title"}) 

165 for panel_node in panel_nodes: 

166 text = panel_node.get_text() 

167 content_node = panel_node.parent.parent.find("div", {"class": "panel-body"}) 

168 

169 if text == "Summary": 

170 # ABSTRACT 

171 abstract = content_node.get_text() 

172 xarticle.abstracts.append(create_abstract(value_tex=abstract, lang=xarticle.lang)) 

173 

174 elif text == "Mathematics Subject Classification": 

175 # MSC 

176 subjs = content_node.get_text().split(", ") 

177 for subj in subjs: 

178 subject = create_subj() 

179 subject["value"] = subj 

180 subject["type"] = "msc" 

181 subject["lang"] = "en" 

182 xarticle.kwds.append(subject) 

183 

184 elif text == "Keywords/Phrases": 

185 # Keywords 

186 subjs = content_node.get_text().split(", ") 

187 for subj in subjs: 

188 subject = create_subj() 

189 subject["value"] = subj 

190 subject["lang"] = "en" 

191 xarticle.kwds.append(subject) 

192 

193 # PAGES 

194 citation_node = soup.find("h5", {"class": "document_source"}) 

195 if citation_node: 195 ↛ 214line 195 didn't jump to line 214 because the condition on line 195 was always true

196 text = citation_node.get_text() 

197 year = f"({xissue.year})" 

198 if year in text: 198 ↛ 214line 198 didn't jump to line 214 because the condition on line 198 was always true

199 text = text.split(year)[0] 

200 

201 if "p." in text: 

202 text = text.split("p.")[0].split(",")[-1].strip() 

203 xarticle.size = text 

204 

205 elif "-" in text: 

206 parts = text.split("-") 

207 first_page = parts[-2].split(" ")[-1] 

208 last_page = parts[-1].split(",")[0].split(" ")[0] 

209 

210 xarticle.fpage = first_page 

211 xarticle.lpage = last_page 

212 

213 # DOI 

214 doi_node = citation_node.next_sibling 

215 if doi_node.name == "div": 215 ↛ 216line 215 didn't jump to line 216 because the condition on line 215 was never true

216 text = doi_node.get_text() 

217 if text.startswith("DOI: "): 

218 doi = text[5:] 

219 

220 xarticle.doi = doi 

221 xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_") 

222 

223 return xarticle 

224 

225 def crawl_collection(self): 

226 if self.source is None: 

227 raise RuntimeError("ERROR: the source is not set") 

228 

229 content = self.download_file(self.collection_url) 

230 xissues = self.parse_collection_content(content) 

231 

232 """ 

233 Some collections split the same volumes in different pages 

234 Ex: Volume 6 (2000) and Volume 6 (1999) 

235 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000) 

236 """ 

237 xissues_dict = self.merge_xissues(xissues) 

238 

239 filtered_xissues = xissues_dict 

240 # Filter the issues to crawl if start_pid was set in the constructor 

241 # if self.start_pid is not None: 

242 # filtered_xissues = {} 

243 # start = False 

244 # for pid in xissues_dict: 

245 # if pid == self.start_pid: 

246 # start = True 

247 # if start: 

248 # filtered_xissues[pid] = xissues_dict[pid] 

249 

250 return filtered_xissues 

251 

252 def merge_xissues(self, xissues: list[IssueData]): 

253 """ 

254 Some collections split the same volumes in different pages 

255 Ex: Volume 6 (2000) and Volume 6 (1999) 

256 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000) 

257 """ 

258 

259 merged_xissues = OrderedDict() 

260 

261 for xissue in xissues: 

262 xissues_with_same_volume = [ 

263 item 

264 for item in xissues 

265 if xissue.volume == item.volume 

266 and xissue.number == item.number 

267 and xissue.vseries == item.vseries 

268 and (item.volume or item.number) 

269 ] 

270 

271 if len(xissues_with_same_volume) < 2: 

272 if xissue.pid is None: 

273 raise ValueError("Issue does not have a PID") 

274 merged_xissues[xissue.pid] = {"issues": [xissue]} 

275 first_issue = xissue 

276 year = xissue.year 

277 else: 

278 first_issue = xissues_with_same_volume[0] 

279 volume = xissues_with_same_volume[0].volume 

280 number = xissues_with_same_volume[0].number 

281 vseries = xissues_with_same_volume[0].vseries 

282 

283 # Compute the year based on all issues with the same volume/number 

284 begin = end = year = xissues_with_same_volume[0].year 

285 if not year: 

286 raise ValueError("year is not defined") 

287 

288 if "-" in year: 

289 parts = year.split("-") 

290 begin = parts[0] 

291 end = parts[1] 

292 

293 for xissue_with_same_volume in xissues_with_same_volume[1:]: 

294 new_begin = new_end = xissue_with_same_volume.year 

295 

296 if not xissue_with_same_volume.year: 

297 raise ValueError("xissue year is not defined") 

298 

299 if "-" in xissue_with_same_volume.year: 

300 parts = year.split("-") 

301 new_begin = parts[0] 

302 new_end = parts[1] 

303 

304 if begin is None or end is None or new_begin is None or new_end is None: 

305 continue 

306 begin_int = int(begin) 

307 end_int = int(end) 

308 new_begin_int = int(new_begin) 

309 new_end_int = int(new_end) 

310 

311 if new_begin_int < begin_int: 

312 begin = new_begin 

313 if new_end_int > end_int: 

314 end = new_end 

315 

316 if begin != end: 

317 year = f"{begin}-{end}" 

318 else: 

319 year = begin 

320 

321 # We can now set the real pid 

322 pid = f"{self.collection_id}_{year}_{vseries}_{volume}_{number}" 

323 for issue in xissues_with_same_volume: 

324 issue.pid = pid 

325 

326 if pid not in merged_xissues: 

327 merged_xissues[pid] = { 

328 "issues": xissues_with_same_volume, 

329 } 

330 

331 # We can set the year only for the first xissue because it is the one used to collect 

332 # all the articles. 

333 # See crawl_issue with merged_xissue = self.crawl_one_issue_url(xissues_to_crawl[0]) 

334 # But we need to use a separate variable (merged_year) because parse_article_content may rely on the year 

335 first_issue.merged_year = year 

336 

337 return merged_xissues 

338 

339 def crawl_issue(self, merged_xissues: dict[str, list[IssueData]]): 

340 """ 

341 Wrapper around crawl_elibm_issue, to handle issues declared in multiple web pages. 

342 """ 

343 

344 xissues_to_crawl = merged_xissues["issues"] 

345 

346 merged_xissue = xissues_to_crawl[0] 

347 self.crawl_elibm_issue(merged_xissue) 

348 

349 if len(xissues_to_crawl) > 1: 

350 for raw_xissue in xissues_to_crawl[1:]: 

351 self.crawl_elibm_issue(raw_xissue) 

352 

353 merged_xissue.articles = raw_xissue.articles + merged_xissue.articles 

354 

355 # Updates the article pid 

356 for article_index, xarticle in enumerate(merged_xissue): 

357 if raw_xissue.pid in xarticle.pid: 

358 xarticle.pid = f"{raw_xissue.pid}_a{str(article_index)}" 

359 

360 # Now that the issue pages have been downloaded/read, we can set the merged pid 

361 # The merged_year was set in self.merge_xissues 

362 # merged_xissue.pid 

363 merged_xissue.year = merged_xissue.merged_year 

364 

365 if self.ignore_missing_pdf: 

366 merged_xissue.articles = [a for a in merged_xissue.articles if self.article_has_pdf(a)] 

367 

368 if not self.test_mode and len(merged_xissue.articles) > 0: 

369 self.process_resource_metadata(merged_xissue, resource_type="issue") 

370 self.add_xissue_into_database(merged_xissue) 

371 

372 def crawl_elibm_issue(self, xissue: IssueData): 

373 """ 

374 Crawl 1 wag page of an issue. 

375 - get the HTML content of the issue 

376 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata 

377 - crawl each article 

378 """ 

379 

380 # Some source, like EuDML do not have a separate HTML pages for an issue's table of content. 

381 # The list of articles directly come from the collection HTML page: the xissue has no url attribute 

382 if hasattr(xissue, "url") and xissue.url: 

383 content = self.download_file(xissue.url) 

384 self.parse_issue_content(content, xissue) 

385 

386 xarticles = xissue.articles 

387 

388 parsed_xarticles = [] 

389 

390 for xarticle in xarticles: 

391 parsed_xarticle = self.crawl_article(xarticle, xissue) 

392 if parsed_xarticle is not None: 

393 parsed_xarticles.append(parsed_xarticle) 

394 

395 xissue.articles = parsed_xarticles