Coverage for src/crawler/by_source/elibm_crawler.py: 53%

246 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

1 from collections import OrderedDict

2 

3 from bs4 import BeautifulSoup

4 from ptf.model_data import (

5 IssueData, 

6 create_abstract, 

7 create_articledata, 

8 create_contributor, 

9 create_issuedata, 

10 create_subj, 

11 )

12 

13 from crawler.base_crawler import BaseCollectionCrawler

14 from crawler.utils import add_pdf_link_to_xarticle

15 

16 

17 class ElibmCrawler(BaseCollectionCrawler):

18 source_name = "Electronic Library of Mathematics" 

19 source_domain = "ELIBM" 

20 source_website = "https://www.elibm.org" 

21 

22 def __init__(self, *args, **kwargs): 

23 super().__init__(*args, **kwargs) 

24 if self.collection_id == "DOCMA":    # 24 ↛ 25: didn't jump to line 25 because the condition on line 24 was never true

25 self.delimiter_inline_formula = "\\(" 

26 self.delimiter_disp_formula = "\\[" 

27 

28 def parse_collection_content(self, content): 

29 """ 

30 Parse the HTML page of the eLibM collection and return a list of xissues.

31 Each xissue has its pid/volume/number/year metadata + its url 

32 

33 self.periode is set at the end based on the xissue years of the HTML page 

34 """ 

35 soup = BeautifulSoup(content, "html.parser") 

36 xissues = [] 

37 

38 # Extract the list of issues 

39 link_nodes = soup.find_all("a") 

40 

41 # eLibM puts special issue titles as volume number 

42 # to create an issue pid, we use S1, S2...

43 last_special_issue_number = 0 

44 

45 for link_node in link_nodes: 

46 url = link_node.get("href") 

47 text = link_node.get_text() 

48 if url.startswith("/issue"): 

49 xissue, last_special_issue_number = self.create_elibm_xissue( 

50 url, text, last_special_issue_number 

51 ) 

52 

53 # eLibM lists the special issues at the end. 

54 # set periode_begin when the first special issue is found

55 if last_special_issue_number == 1:    # 55 ↛ 56: didn't jump to line 56 because the condition on line 55 was never true

56 self.periode_begin = self.get_first_year(xissues[-1].year) 

57 

58 if xissue:    # 58 ↛ 45: didn't jump to line 45 because the condition on line 58 was always true

59 xissues.append(xissue) 

60 

61 self.periode_end = self.get_first_year(xissues[0].year) 

62 

63 if last_special_issue_number == 0:    # 63 ↛ 66: didn't jump to line 66 because the condition on line 63 was always true

64 self.periode_begin = self.get_first_year(xissues[-1].year) 

65 

66 self.periode = self.get_or_create_periode() 

67 

68 return xissues 
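
A minimal sketch of the link filtering above, assuming issue links of the form "/issue/<id>" labelled "volume (year)" (the HTML fragment is invented for illustration):

from bs4 import BeautifulSoup

html = '<a href="/journals">All journals</a><a href="/issue/123">6 (2000)</a>'
soup = BeautifulSoup(html, "html.parser")
for a in soup.find_all("a"):
    if a.get("href").startswith("/issue"):
        print(a.get("href"), a.get_text())  # prints: /issue/123 6 (2000)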

69 

70 def get_first_year(self, year): 

71 if "/" in year: 71 ↛ 72line 71 didn't jump to line 72 because the condition on line 71 was never true

72 year = year.split("/")[0] 

73 

74 return year 

75 

76 def create_elibm_xissue(self, url, text, last_special_issue_number): 

77 if "(" not in text or ")" not in text: 77 ↛ 78line 77 didn't jump to line 78 because the condition on line 77 was never true

78 return None, last_special_issue_number  # keep the special issue counter unchanged when the text cannot be parsed

79 

80 parts = text.split("(") 

81 

82 year = parts[1].split(")")[0] 

83 year = year.replace("/", "-") 

84 

85 # volume might not be an integer. eLibM puts special issue titles as the volume number.

86 volume = parts[0].strip() 

87 

88 number = "" 

89 if "No. " in volume: 

90 parts = volume.split("No. ") 

91 volume = parts[0].strip() 

92 number = parts[1].strip() 

93 

94 try: 

95 volume_for_pid = int(volume) 

96 except ValueError: 

97 last_special_issue_number += 1 

98 volume_for_pid = f"S{last_special_issue_number}" 

99 

100 xissue = create_issuedata() 

101 xissue.pid = f"{self.collection_id}_{year}__{volume_for_pid}_{number}" 

102 xissue.year = year 

103 xissue.volume = volume 

104 xissue.number = number 

105 xissue.url = self.source_website + url 

106 

107 return xissue, last_special_issue_number 
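
To make the pid scheme concrete, a short trace under assumed inputs (the link texts and the DOCMA collection id are examples, not real eLibM entries):

# "6 No. 2 (2000)"         -> year "2000", volume "6", number "2"
#                             pid "DOCMA_2000__6_2"
# "A Special Title (1998)" -> int("A Special Title") raises ValueError, so
#                             volume_for_pid becomes "S1" and pid "DOCMA_1998__S1_"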

108 

109 def parse_issue_content(self, content, xissue): 

110 soup = BeautifulSoup(content, "html.parser") 

111 article_nodes = soup.find_all("div", {"class": "title"}) 

112 

113 for index_article, article_node in enumerate(article_nodes): 

114 article_link_node = article_node.find("a") 

115 if article_link_node:    # 115 ↛ 113: didn't jump to line 113 because the condition on line 115 was always true

116 url = article_link_node.get("href") 

117 xarticle = create_articledata() 

118 xarticle.pid = "a" + str(index_article) 

119 xarticle.url = self.source_website + url 

120 

121 # eLibM lists the articles in reverse order, except for one special issue

122 if xissue.volume == "Mahler Selecta":    # 122 ↛ 123: didn't jump to line 123 because the condition on line 122 was never true

123 xissue.articles.append(xarticle) 

124 else: 

125 xissue.articles.insert(0, xarticle) 

126 

127 # if the issue has only 1 article, eLibM skips the issue page and directly displays the article page

128 if len(xissue.articles) == 0: 

129 title_node = soup.find("h2", {"class": "document_title"}) 

130 if title_node is not None:    # 130 ↛ exit: didn't return from function 'parse_issue_content' because the condition on line 130 was always true

131 xarticle = create_articledata() 

132 xarticle.pid = "a0" 

133 xarticle.url = xissue.url 

134 

135 xissue.articles.append(xarticle) 
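
Because each article is prepended with insert(0, ...), a page listed newest-first ends up oldest-first in xissue.articles; a minimal illustration:

pids = []
for pid in ("a0", "a1", "a2"):  # order of appearance on the page
    pids.insert(0, pid)
print(pids)  # ['a2', 'a1', 'a0'] -- the page order, reversed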

136 

137 def parse_article_content(self, content, xissue, xarticle, url, pid): 

138 """ 

139 Parse the content with BeautifulSoup and return an ArticleData

140 """ 

141 xarticle.pid = pid 

142 xarticle.lang = "en" 

143 

144 soup = BeautifulSoup(content, "html.parser") 

145 

146 # TITLE 

147 title_node = soup.find("h2", {"class": "document_title"}) 

148 if title_node:    # 148 ↛ 152: didn't jump to line 152 because the condition on line 148 was always true

149 xarticle.title_tex = title_node.get_text() 

150 

151 # AUTHORS 

152 citation_author_node = soup.find("h3", {"class": "document_author"}) 

153 if citation_author_node:    # 153 ↛ 172: didn't jump to line 172 because the condition on line 153 was always true

154 text = citation_author_node.get_text() 

155 if text:    # 155 ↛ 172: didn't jump to line 172 because the condition on line 155 was always true

156 parts = text.split(";") 

157 for part in parts: 

158 text_author = part.strip() 

159 

160 role = "author" 

161 if "(ed.)" in text_author: 

162 role = "editor" 

163 text_author = text_author.split("(ed.)")[0].strip() 

164 

165 author = create_contributor() 

166 author["role"] = role 

167 author["string_name"] = text_author 

168 

169 xarticle.contributors.append(author) 

170 

171 # PDF 

172 link_nodes = soup.find_all("a") 

173 for link_node in link_nodes: 

174 url = link_node.get("href") 

175 if url.startswith("/ft/"): 

176 pdf_url = self.source_website + url 

177 add_pdf_link_to_xarticle(xarticle, pdf_url) 

178 

179 panel_nodes = soup.find_all("h3", {"class": "panel-title"}) 

180 for panel_node in panel_nodes: 

181 text = panel_node.get_text() 

182 content_node = panel_node.parent.parent.find("div", {"class": "panel-body"}) 

183 

184 if text == "Summary": 

185 # ABSTRACT 

186 abstract = content_node.get_text() 

187 xabstract = create_abstract(tag="abstract", value_tex=abstract, lang=xarticle.lang) 

188 xarticle.abstracts.append(xabstract) 

189 

190 elif text == "Mathematics Subject Classification": 

191 # MSC 

192 subjs = content_node.get_text().split(", ") 

193 for subj in subjs: 

194 subject = create_subj() 

195 subject["value"] = subj 

196 subject["type"] = "msc" 

197 subject["lang"] = "en" 

198 xarticle.kwds.append(subject) 

199 

200 elif text == "Keywords/Phrases": 

201 # Keywords 

202 subjs = content_node.get_text().split(", ") 

203 for subj in subjs: 

204 subject = create_subj() 

205 subject["value"] = subj 

206 subject["lang"] = "en" 

207 xarticle.kwds.append(subject) 

208 

209 # PAGES 

210 citation_node = soup.find("h5", {"class": "document_source"}) 

211 if citation_node:    # 211 ↛ 230: didn't jump to line 230 because the condition on line 211 was always true

212 text = citation_node.get_text() 

213 year = f"({xissue.year})" 

214 if year in text:    # 214 ↛ 230: didn't jump to line 230 because the condition on line 214 was always true

215 text = text.split(year)[0] 

216 

217 if "p." in text: 

218 text = text.split("p.")[0].split(",")[-1].strip() 

219 xarticle.size = text 

220 

221 elif "-" in text: 

222 parts = text.split("-") 

223 first_page = parts[-2].split(" ")[-1] 

224 last_page = parts[-1].split(",")[0].split(" ")[0] 

225 

226 xarticle.fpage = first_page 

227 xarticle.lpage = last_page 

228 

229 # DOI 

230 doi_node = citation_node.next_sibling 

231 if doi_node.name == "div":    # 231 ↛ 232: didn't jump to line 232 because the condition on line 231 was never true

232 text = doi_node.get_text() 

233 if text.startswith("DOI: "): 

234 doi = text[5:] 

235 

236 xarticle.doi = doi 

237 xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_") 

238 

239 return xarticle 
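
The page/size extraction above is plain string slicing of the citation line; a short trace with invented citation strings that follow the format the code assumes:

# "Doc. Math. 5, 15-23 (2000)" with xissue.year "2000"
#   -> text "Doc. Math. 5, 15-23 "  -> xarticle.fpage "15", xarticle.lpage "23"
# "Doc. Math. Extra Vol., 120 p. (2000)"
#   -> "p." branch                  -> xarticle.size "120"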

240 

241 def crawl_collection(self): 

242 if self.source is None: 

243 raise RuntimeError("ERROR: the source is not set") 

244 

245 content = self.download_file(self.collection_url) 

246 xissues = self.parse_collection_content(content) 

247 

248 """ 

249 Some collections split the same volume across several pages.

250 Ex: Volume 6 (2000) and Volume 6 (1999)

251 We merge the xissues with the same volume number => Volume 6 (1999-2000)

252 """ 

253 xissues_dict = self.merge_xissues(xissues) 

254 

255 filtered_xissues = xissues_dict 

256 # Filter the issues to crawl if start_pid was set in the constructor 

257 if self.start_pid is not None: 

258 filtered_xissues = {} 

259 start = False 

260 for pid in xissues_dict: 

261 if pid == self.start_pid: 

262 start = True 

263 if start: 

264 filtered_xissues[pid] = xissues_dict[pid] 

265 

266 return filtered_xissues 
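
The start_pid filter keeps every issue from the first matching pid onward; a self-contained sketch with invented pids:

from collections import OrderedDict

xissues_dict = OrderedDict.fromkeys(["DOCMA_1996__1_", "DOCMA_1997__2_", "DOCMA_1998__3_"])
start_pid = "DOCMA_1997__2_"

filtered, start = {}, False
for pid in xissues_dict:
    if pid == start_pid:
        start = True
    if start:
        filtered[pid] = xissues_dict[pid]

print(list(filtered))  # ['DOCMA_1997__2_', 'DOCMA_1998__3_']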

267 

268 def merge_xissues(self, xissues: list[IssueData]): 

269 """ 

270 Some collections split the same volume across several pages.

271 Ex: Volume 6 (2000) and Volume 6 (1999)

272 We merge the xissues with the same volume number => Volume 6 (1999-2000)

273 """ 

274 

275 merged_xissues = OrderedDict() 

276 

277 for xissue in xissues: 

278 xissues_with_same_volume = [ 

279 item 

280 for item in xissues 

281 if xissue.volume == item.volume 

282 and xissue.number == item.number 

283 and xissue.vseries == item.vseries 

284 and (item.volume or item.number) 

285 ] 

286 

287 if len(xissues_with_same_volume) < 2: 

288 if xissue.pid is None: 

289 raise ValueError("Issue does not have a PID") 

290 merged_xissues[xissue.pid] = {"issues": [xissue]} 

291 first_issue = xissue 

292 year = xissue.year 

293 else: 

294 first_issue = xissues_with_same_volume[0] 

295 volume = xissues_with_same_volume[0].volume 

296 number = xissues_with_same_volume[0].number 

297 vseries = xissues_with_same_volume[0].vseries 

298 

299 # Compute the year based on all issues with the same volume/number 

300 begin = end = year = xissues_with_same_volume[0].year 

301 if not year: 

302 raise ValueError("year is not defined") 

303 

304 if "-" in year: 

305 parts = year.split("-") 

306 begin = parts[0] 

307 end = parts[1] 

308 

309 for xissue_with_same_volume in xissues_with_same_volume[1:]: 

310 new_begin = new_end = xissue_with_same_volume.year 

311 

312 if not xissue_with_same_volume.year: 

313 raise ValueError("xissue year is not defined") 

314 

315 if "-" in xissue_with_same_volume.year: 

316 parts = xissue_with_same_volume.year.split("-")  # split this issue's year, not the running one

317 new_begin = parts[0] 

318 new_end = parts[1] 

319 

320 if begin is None or end is None or new_begin is None or new_end is None: 

321 continue 

322 begin_int = int(begin) 

323 end_int = int(end) 

324 new_begin_int = int(new_begin) 

325 new_end_int = int(new_end) 

326 

327 if new_begin_int < begin_int: 

328 begin = new_begin 

329 if new_end_int > end_int: 

330 end = new_end 

331 

332 if begin != end: 

333 year = f"{begin}-{end}" 

334 else: 

335 year = begin 

336 

337 # We can now set the real pid 

338 pid = f"{self.collection_id}_{year}_{vseries}_{volume}_{number}" 

339 for issue in xissues_with_same_volume: 

340 issue.pid = pid 

341 

342 if pid not in merged_xissues: 

343 merged_xissues[pid] = { 

344 "issues": xissues_with_same_volume, 

345 } 

346 

347 # We can set the year only for the first xissue because it is the one used to collect 

348 # all the articles. 

349 # See crawl_issue with merged_xissue = self.crawl_one_issue_url(xissues_to_crawl[0]) 

350 # But we need to use a separate variable (merged_year) because parse_article_content may rely on the year 

351 first_issue.merged_year = year 

352 

353 return merged_xissues 
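
A worked example of the merge, assuming the collection page lists the same volume twice (data invented):

# Volume 6 appears on two pages of the collection:
#   IssueData(volume="6", number="", year="2000")
#   IssueData(volume="6", number="", year="1999")
# -> merge_xissues() keeps a single entry; both issues receive the same pid,
#    and the first one carries merged_year "1999-2000" (min year - max year).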

354 

355 def crawl_issue(self, merged_xissues: dict[str, list[IssueData]]): 

356 """ 

357 Wrapper around crawl_elibm_issue, to handle issues spread across multiple web pages.

358 """ 

359 

360 xissues_to_crawl = merged_xissues["issues"] 

361 

362 merged_xissue = xissues_to_crawl[0] 

363 self.crawl_elibm_issue(merged_xissue) 

364 

365 if len(xissues_to_crawl) > 1: 

366 for raw_xissue in xissues_to_crawl[1:]: 

367 self.crawl_elibm_issue(raw_xissue) 

368 

369 merged_xissue.articles = raw_xissue.articles + merged_xissue.articles 

370 

371 # Updates the article pid 

372 for article_index, xarticle in enumerate(merged_xissue.articles):

373 if raw_xissue.pid in xarticle.pid: 

374 xarticle.pid = f"{raw_xissue.pid}_a{str(article_index)}" 

375 

376 # Now that the issue pages have been downloaded/read, we can set the merged year

377 # The merged_year was set in self.merge_xissues 

378 # merged_xissue.pid 

379 merged_xissue.year = merged_xissue.merged_year 

380 

381 if not self.test_mode and len(merged_xissue.articles) > 0: 

382 self.add_xissue_into_database(merged_xissue) 

383 

384 def crawl_elibm_issue(self, xissue: IssueData): 

385 """ 

386 Crawl one web page of an issue.

387 - get the HTML content of the issue 

388 - parse the HTML content with BeautifulSoup to extract the list of articles and/or the issue metadata

389 - crawl each article 

390 """ 

391 

392 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.

393 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute

394 if hasattr(xissue, "url") and xissue.url: 

395 content = self.download_file(xissue.url) 

396 self.parse_issue_content(content, xissue) 

397 

398 xarticles = xissue.articles 

399 

400 parsed_xarticles = [] 

401 

402 for xarticle in xarticles: 

403 parsed_xarticle = self.crawl_article(xarticle, xissue) 

404 if parsed_xarticle is not None: 

405 parsed_xarticles.append(parsed_xarticle) 

406 

407 xissue.articles = parsed_xarticles