Coverage for src/crawler/by_source/eudml_crawler.py: 57%

276 statements  

coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

1import base64 

2import json 

3import os 

4import re 

5import subprocess 

6 

7import regex 

8import requests 

9from bs4 import BeautifulSoup 

10from django.conf import settings 

11from ptf.model_data import ( 

12 create_abstract, 

13 create_articledata, 

14 create_contributor, 

15 create_extlink, 

16 create_issuedata, 

17 create_subj, 

18) 

19from ptf.utils import execute_cmd 

20from requests_cache import CachedSession 

21 

22from crawler.base_crawler import BaseCollectionCrawler 

23from crawler.utils import add_pdf_link_to_xarticle 

24 

25 

26class EudmlCrawler(BaseCollectionCrawler): 

27 source_name = "European Digital Mathematics Library" 

28 source_domain = "EUDML" 

29 source_website = "https://eudml.org" 

30 

31 def parse_collection_content(self, content): 

32 """ 

33 Parse the HTML page of a EuDML journal and return a list of xissues. 

34 Each xissue has a list of articles with just a URL. 

35 

36 self.periode is set during parsing from the <meta name="citation_year"> of the HTML page. 

37 """ 

38 data = json.loads(content) 

39 soup = BeautifulSoup(base64.b64decode(data["page"]), "html.parser") 

40 xissues = [] 

41 volume_year_re = regex.compile(r".*\(<strong>(\d+).*<\/strong>\)") 
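# Editor's sketch (hypothetical values; field names inferred from the keys
# accessed in this method): the decoded JSON payload is assumed to look like
#
#   {
#     "page": "<base64-encoded HTML of the journal page>",
#     "volumes": [
#       {"name": "12", "desc": "Volume 12 (<strong>2003</strong>)",
#        "issues": [{"name": "4",
#                    "articles": [{"url": "https://eudml.org/doc/54683",
#                                  "sortKey": "0001"}]}]}
#     ]
#   }
#
# With such a "desc", volume_year_re.search(v["desc"]).group(1) returns "2003".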

42 # Extract the list of volumes 

43 volume_count = 0 

44 issue_count = 0 

45 for v in data["volumes"]: 

46 volume_count += 1 

47 volume_number = v["name"] 

48 

49 year_re_groups = volume_year_re.search(v["desc"]) 

50 if year_re_groups is None: 

51 print("skipping volume: no year found") 

52 continue 

53 year = year_re_groups.group(1) 

54 

55 if len(v["issues"]) > 0 and year != "": 

56 # Extract all the issues 

57 for i in v["issues"]: 

58 issue_count += 1 

59 xissue = self.create_eudml_xissue(i, year, i["name"], volume_number) 

60 xissues.append(xissue) 

61 else: 

62 # No issues: articles are directly in the volume 

63 xissue = self.create_eudml_xissue(v, year, None, volume_number) 

64 xissues.append(xissue) 

65 

66 # EuDML stores the total number of volumes, issues and articles in the <ul class="article-details unit unit-list"> 

67 # This info is used to check the number of articles/issues parsed in the page 
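# Editor's sketch (hypothetical markup; structure inferred from the parsing
# below):
#
#   <ul class="article-details unit unit-list">
#     <li><strong>Volume count:</strong> 38</li>
#     <li><strong>Issue count:</strong> 152</li>
#     <li><strong>Number of articles:</strong> 2450</li>
#   </ul>
#
# get_text() on such an <li> gives e.g. "Issue count: 152"; the slices [13:],
# [14:] and [20:] skip the label before the value is converted with int().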

68 volumes_to_find = 0 

69 issues_to_find = 0 

70 articles_to_find = 0 

71 article_details_nodes = soup.find_all("ul", {"class": "article-details unit unit-list"}) 

72 for article_detail_node in article_details_nodes: 

73 unit_nodes = article_detail_node.find_all("li") 

74 for unit_node in unit_nodes: 

75 strong_node = unit_node.find("strong") 

76 if strong_node is not None:  # 76 ↛ 74: condition always true

77 text = strong_node.get_text() 

78 if text == "Issue count:": 

79 value = unit_node.get_text()[13:] 

80 issues_to_find += int(value) 

81 elif text == "Volume count:": 

82 value = unit_node.get_text()[14:] 

83 volumes_to_find += int(value) 

84 elif text == "Number of articles:": 

85 value = unit_node.get_text()[20:] 

86 articles_to_find += int(value) 

87 

88 if volume_count != volumes_to_find:  # 88 ↛ 89: condition never true

89 txt = f"EuDML declares {volumes_to_find} volumes for {self.collection_id}. We parsed {volume_count}" 

90 print(txt) 

91 if settings.CRAWLER_LOG_FILE: 

92 with open(settings.CRAWLER_LOG_FILE, "a") as f_: 

93 f_.write(txt + "\n") 

94 

95 if issue_count != issues_to_find: 

96 txt = f"EuDML declares {issues_to_find} issues for {self.collection_id}. We parsed {issue_count}" 

97 print(txt) 

98 if settings.CRAWLER_LOG_FILE:  # 98 ↛ 99: condition never true

99 with open(settings.CRAWLER_LOG_FILE, "a") as f_: 

100 f_.write(txt + "\n") 

101 

102 article_count = sum([len(xissue.articles) for xissue in xissues]) 

103 if article_count != articles_to_find: 

104 txt = f"EuDML declares {articles_to_find} articles for {self.collection_id}. We parsed {article_count}" 

105 print(txt) 

106 if settings.CRAWLER_LOG_FILE:  # 106 ↛ 107: condition never true

107 with open(settings.CRAWLER_LOG_FILE, "a") as f_: 

108 f_.write(txt + "\n") 

109 

110 return xissues 

111 

112 def create_eudml_xissue( 

113 self, issue_data: dict, year_str, issue_number: str | None, volume_number 

114 ): 

115 """ 

116 EuDML does not have a separate HTML page for an issue. 

117 The list of issues/articles is directly found in the collection page. 

118 

119 This method creates an IssueData (see ptf/model_data.py) and sets its year/volume. 

120 The PID is temporary and will be updated with the issue number (if any). 

121 Articles are created directly, but with just a PID and a URL. 

122 """ 

123 xissue = create_issuedata() 

124 xissue.pid = self.collection_id + "_" + year_str + "__" + volume_number 

125 if issue_number: 

126 xissue.pid = xissue.pid + "_" + issue_number 

127 xissue.year = year_str 

128 xissue.volume = volume_number 

129 if issue_number: 

130 xissue.number = issue_number 

131 

132 issue_data["articles"].sort(key=lambda a: a["sortKey"]) 

133 for index_article, article_data in enumerate(issue_data["articles"]): 

134 xarticle = create_articledata() 

135 xarticle.pid = "a" + str(index_article) 

136 xarticle.url = article_data["url"] 

137 xissue.articles.append(xarticle) 

138 return xissue 

139 

140 def parse_article_content(self, content, xissue, xarticle, url, pid): 

141 """ 

142 Parse the content with BeautifulSoup and return an ArticleData. 

143 """ 

144 xarticle = create_articledata() 

145 xarticle.pid = pid 

146 soup = BeautifulSoup(content, "xml") 

147 

148 self.get_metadata_using_citation_meta( 

149 xarticle, 

150 xissue, 

151 soup, 

152 [ 

153 "lang", 

154 "title", 

155 "author", 

156 "pdf", 

157 "abstract", 

158 "page", 

159 "doi", 

160 "mr", 

161 "zbl", 

162 "publisher", 

163 "keywords", 

164 ], 

165 ) 
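# Editor's note (assumption, not verified against BaseCollectionCrawler): the
# keys above presumably select which standard <meta name="citation_*"> tags
# are read from the article page, for example
#
#   <meta name="citation_title" content="A hypothetical title"/>
#   <meta name="citation_author" content="Doe, Jane"/>
#   <meta name="citation_pdf_url" content="https://…/article.pdf"/>
#
# with "mr" and "zbl" presumably mapping to MathSciNet and zbMATH identifiers.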

166 

167 # LINK to SOURCE 

168 url_full_text_node = soup.find("a", text="Access to full text") 

169 if url_full_text_node is not None: 

170 url_full_text = url_full_text_node.get("href") 

171 if isinstance(url_full_text, str):  # 171 ↛ 176: condition always true

172 ext_link = create_extlink(rel="primary-source", location=url_full_text) 

173 xarticle.ext_links.append(ext_link) 

174 

175 # MSC KEYWORDS 

176 subj_part = soup.select_one("article#unit-subject-areas") 

177 if subj_part is not None:  # 177 ↛ 178: condition never true

178 reg_msc = re.compile("/subject/MSC/[a-zA-Z0-9.]+") 

179 subjs = [a for a in subj_part.select("a") if reg_msc.search(a.get("href"))] 

180 for subj in subjs: 

181 type_class = subj.get("href").split("/") 

182 subject = create_subj(type="msc", lang=xarticle.lang) 

183 subject["value"] = type_class[3] 

184 xarticle.kwds.append(subject) 
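# Editor's sketch (hypothetical href): for an anchor with
# href="/subject/MSC/35J60", href.split("/") gives
# ["", "subject", "MSC", "35J60"], so type_class[3] is the MSC code "35J60".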

185 

186 # FALLBACK 

187 if not xarticle.title_tex: 

188 try: 

189 title = soup.select_one("h1").get_text(strip=True).replace("\xa0", " ") 

190 txt = f"{url} Fallback for title" 

191 print(txt) 

192 if settings.CRAWLER_LOG_FILE:  # 192 ↛ 193: condition never true

193 with open(settings.CRAWLER_LOG_FILE, "a") as f_: 

194 f_.write(txt + "\n") 

195 xarticle.title_tex = title.replace("\xa0", " ").replace("\n", "") 

196 # FIXME 

197 except: # noqa: E722 

198 pass 

199 

200 if len(xarticle.contributors) == 0: 

201 # AUTHORS 

202 authors_bloc = soup.select_one("p.sub-title-1") 

203 if authors_bloc:  # 203 ↛ 221: condition always true

204 authors_node = authors_bloc.find_all("a") 

205 if len(authors_node) > 0:  # 205 ↛ 206: condition never true

206 txt = f"{url} Fallback for authors" 

207 print(txt) 

208 if settings.CRAWLER_LOG_FILE: 

209 with open(settings.CRAWLER_LOG_FILE, "a") as f_: 

210 f_.write(txt + "\n") 

211 for author_node in authors_node:  # 211 ↛ 212: loop never started

212 text_author = author_node.get_text() 

213 text_author = text_author.replace(",", "") 

214 

215 author = create_contributor() 

216 author["role"] = "author" 

217 author["string_name"] = text_author 

218 

219 xarticle.contributors.append(author) 

220 

221 if len(xarticle.streams) == 0:  # 221 ↛ 229: condition always true

222 # PDF 

223 pdf_node = soup.find("a", text="Full (PDF)") 

224 if pdf_node is not None: 

225 pdf_url = pdf_node.get("href") 

226 if pdf_url:  # 226 ↛ 229: condition always true

227 add_pdf_link_to_xarticle(xarticle, pdf_url) 

228 

229 if len(xarticle.streams) == 0: 

230 if not url_full_text_node: 

231 print(f"[{self.source_domain}] {self.collection_id} : Couldn't find pdf") 

232 else: 

233 add_pdf_link_to_xarticle(xarticle, url_full_text_node.get("href")) 

234 

235 if len(xarticle.abstracts) == 0:  # 235 ↛ 247: condition always true

236 # ABSTRACT 

237 abstract_node = soup.find("article", {"id": "unit-article-abstract"}) 

238 if abstract_node is not None:  # 238 ↛ 239: condition never true

239 abstract_section_node = abstract_node.find("section") 

240 if abstract_section_node: 

241 abstract = str(abstract_section_node) 

242 xabstract = create_abstract( 

243 tag="abstract", value_tex=abstract, lang=xarticle.lang 

244 ) 

245 xarticle.abstracts.append(xabstract) 

246 

247 if len(xarticle.contributors) == 0 or not xarticle.fpage: 

248 # LANG, PAGES, (AUTHORS) 

249 # EuDML has an export BibTex section with some information (lang, pages, authors) 

250 self.parse_bibtex(soup, xarticle, url) 

251 

252 if xarticle.doi is None:  # 252 ↛ 328: condition always true

253 # DOI 

254 doi_link = soup.find("article", {"id": "unit-other-ids"}) 

255 if doi_link is not None:  # 255 ↛ 258: condition never true

256 # Simplify? 

257 # See https://eudml.org/doc/54683 with http://dx.doi.org/10.1155/2007/10368%E2%80%89 

258 try: 

259 reg_doi = re.compile("doi.org") 

260 doi_array = [ 

261 d.get("href") 

262 for d in doi_link.find_all("a") 

263 if reg_doi.search(str(d.get("href"))) 

264 ] 

265 if doi_array: 

266 if len(doi_array) > 1: 

267 start_dois = len(doi_array) - 1 

268 doi = doi_array[start_dois:][0] 

269 else: 

270 doi = doi_array[0] 

271 

272 doi_array = doi.split("doi.org/") 

273 # strip unwanted chars present 

274 if len(doi_array) > 1: 

275 doi = doi_array[1].encode("ascii", "ignore") 

276 doi = str(doi.decode()) 

277 doi_array = doi.split("\\u") 

278 doi = str(doi_array[0]) 

279 

280 doi = re.sub("}", "", doi) 

281 doi = re.sub("\t", "", doi) 

282 doi = doi.encode("ascii", "ignore") 

283 doi = doi.decode() 

284 

285 doi = bytes(r"{}".format(r"" + doi + ""), "utf-8") 

286 doi = doi.decode() 

287 doi_array = doi.split("\\u") 

288 doi = str(doi_array[0]).strip() 

289 doi = doi.replace(" ", "") 

290 

291 xarticle.doi = doi 

292 except TypeError as e: 

293 print(e) 
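# Editor's sketch of the clean-up above, based on the example cited on
# line 257: an href such as "http://dx.doi.org/10.1155/2007/10368\u2009"
# (trailing thin space) is split on "doi.org/", the non-ASCII byte is dropped
# by encode("ascii", "ignore"), and the brace/tab/space removals leave
# xarticle.doi == "10.1155/2007/10368".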

294 

295 # You can't get the first link to zbmath.org: it could be in the list of references! 

296 

297 # has_zblid = len([extid for extid in xarticle.extids if extid[0] == "zbl-item-id"]) == 1 

298 # if not has_zblid: 

299 # # ZBL 

300 # zblid_link = soup.find( 

301 # "a", {"href": re.compile(r"http:\/\/www.zentralblatt-math.org\/zmath\/")} 

302 # ) 

303 # if zblid_link is not None: 

304 # zblid = zblid_link.get("href").split("?q=")[1] 

305 # if zblid: 

306 # print(f"{url} Fallback for zbl-id: {zblid}") 

307 # xarticle.extids.append(("zbl-item-id", zblid)) 

308 

309 # In Other Databases is not (always ?) the publisher 

310 # if not xissue.publisher: 

311 # # PUBLISHER 

312 # section_oai = soup.find("h3", text="In Other Databases") 

313 # if section_oai is not None: 

314 # section_oai_array = section_oai.parent.find_all("dd") 

315 # if section_oai is not None: 

316 # pub = [ 

317 # d.text 

318 # for d in section_oai_array 

319 # if d.text.strip() not in ["DOI", "ZBMath", "MathSciNet", "PUBLISHER"] 

320 # ] 

321 # if pub != "": 

322 # print(f"{url} Fallback for publisher") 

323 # xpub = create_publisherdata() 

324 # xpub.name = pub[0].strip() 

325 # xissue.publisher = xpub 

326 

327 # ARTICLE PID 

328 if xarticle.doi is not None:  # 328 ↛ 329: condition never true

329 xarticle.pid = xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

330 xarticle.pid = xarticle.pid.replace("pid", "").replace(":", "_") 

331 else: 

332 reg_article = regex.compile(r"\d+") 

333 if xarticle.pid is not None:  # 333 ↛ 339: condition always true

334 pid_array = reg_article.findall(url) 

335 if len(pid_array) > 0:  # 335 ↛ 339: condition always true

336 id_article = pid_array[0] 

337 xarticle.pid = xissue.pid + "_" + id_article 
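# Editor's sketch (URL taken from the example on line 257): for
# url == "https://eudml.org/doc/54683", reg_article.findall(url) returns
# ["54683"], so the article PID becomes e.g. "CM_2003__12_4" + "_54683"
# (collection/issue parts hypothetical).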

338 

339 return xarticle 

340 

341 def parse_bibtex(self, soup, xarticle, url): 

342 """ 

343 Parse the BibTeX section of a EuDML article page. 

344 Extract 

345 - the authors (if no author was already found in the page) 

346 - the article language 

347 - the article pages 

348 """ 

349 bib_div = [p for p in soup.find_all("p") if "@article" in p.text] 

350 

351 if len(bib_div) > 0:  # 351 ↛ exit: condition always true, never returned early from parse_bibtex

352 bib_tex = bib_div[0].get_text() 

353 text = bib_tex.split("\t") 

354 

355 for text_part in text: 

356 # AUTHORS (only if no authors were already found in the page) 

357 if len(xarticle.contributors) == 0: 

358 reg_author = re.compile("author =") 

359 if reg_author.search(text_part):  # 359 ↛ 360: condition never true

360 txt = f"{url} Fallback for authors with the bibtex" 

361 print(txt) 

362 if settings.CRAWLER_LOG_FILE: 

363 with open(settings.CRAWLER_LOG_FILE, "a") as f_: 

364 f_.write(txt + "\n") 

365 

366 authors_text = ( 

367 text_part.replace("{", "").replace("}", "").replace("author = ", "") 

368 ) 

369 authors_bib = authors_text.split(",") 

370 for index, name in enumerate(authors_bib): 

371 if index % 2 == 1: 

372 author_name = authors_bib[index - 1] + " " + authors_bib[index] 

373 author_name = self.latext_parser.latex_to_text(author_name) 

374 author_name = author_name.replace("\xa0", "") 

375 

376 author = create_contributor() 

377 author["role"] = "author" 

378 author["string_name"] = author_name 

379 xarticle.contributors.append(author) 

380 

381 # LANG 

382 reg_lang = re.compile("language = ") 

383 if reg_lang.search(text_part): 

384 xarticle.lang = ( 

385 text_part.replace("{", "") 

386 .replace("}", "") 

387 .replace("language = ", "") 

388 .replace(",", "") 

389 ) 

390 if len(xarticle.lang) >= 3:  # 390 ↛ 393: condition always true

391 xarticle.lang = xarticle.lang[:-1] 

392 

393 if len(xarticle.lang) > 0 and len(xarticle.abstracts) > 0:  # 393 ↛ 394: condition never true

394 xarticle.abstracts[0]["lang"] = xarticle.lang 

395 

396 if not xarticle.fpage: 

397 # PAGES 

398 reg_pages = re.compile("pages =") 

399 if reg_pages.search(text_part): 

400 pages = ( 

401 text_part.replace("{", "") 

402 .replace("}", "") 

403 .replace("(", "") 

404 .replace(")", "") 

405 .replace("[", "") 

406 .replace("]", "") 

407 .replace("pages = ", "") 

408 ) 

409 if len(pages) > 0 and pages != "null":  # 409 ↛ 355: condition always true

410 pages = pages.split(",") 

411 if re.compile(r"\d+-\d+").search(pages[0]):  # 411 ↛ 412: condition never true

412 txt = f"{url} Fallback for pages with the bibtex" 

413 print(txt) 

414 if settings.CRAWLER_LOG_FILE: 

415 with open(settings.CRAWLER_LOG_FILE, "a") as f_: 

416 f_.write(txt + "\n") 

417 

418 pages = pages[0].split("-") 

419 xarticle.fpage = pages[0] 

420 if len(pages) > 1: 

421 reg_digit = re.compile(r"\d+") 

422 if re.search(reg_digit, str(pages[1])): 

423 pages[1] = re.search(reg_digit, str(pages[1]))[0] 

424 xarticle.lpage = pages[1] 

425 # FIXME: wrong page_range format... Maybe this can be deleted? 

426 xarticle.page_range = pages[0] + "-" + pages[1] 

427 

428 # reg_title = re.compile("title") 

429 # if reg_title.search(text_part): 

430 # if ( 

431 # xarticle.title_html is None 

432 # or xarticle.title_html == "" 

433 # or xarticle.title_html == "Contents" 

434 # ): 

435 # xarticle.title_html = ( 

436 # text_part.replace("{", "") 

437 # .replace("}", "") 

438 # .replace("title = ", "") 

439 # .replace(",", "") 

440 # ) 

441 # xarticle.title_tex = xarticle.title_html 

442 # xarticle.title_xml = f"<title-group><article-title>{xarticle.title_html}</article-title></title-group>" 

443 

444 def download_file(self, url: str): 

445 if url.startswith("https://eudml.org/doc"): 

446 return super().download_file(url) 

447 

448 content = "" 

449 filename = "/tmp/crawler/puppeteer/" + str(base64.b64encode(url.encode("utf-8")), "utf-8") 
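# Editor's note (sketch; URL hypothetical, base64 is standard-library
# behaviour): the cache file name is simply the base64 of the URL, e.g.
#   base64.b64encode(b"https://example.org/x") == b"aHR0cHM6Ly9leGFtcGxlLm9yZy94"
# so the fetched page lands in a deterministic, filesystem-safe path under
# /tmp/crawler/puppeteer/.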

450 attempt = 0 

451 while not content and attempt < 3: 

452 attempt += 1 

453 try: 

454 cmd = f"{os.path.dirname(os.path.realpath(__file__))}/crawl_eudml_col.js -u {url} -o {filename}" 

455 print(cmd) 

456 execute_cmd(cmd) 

457 

458 if os.path.isfile(filename): 

459 with open(filename) as file_: 

460 content = file_.read() 

461 if not isinstance(self.session, CachedSession): 

462 continue 

463 # Mock an HTTP request to inject the data into the cache 

464 

465 except subprocess.CalledProcessError: 

466 pass 

467 

468 if not content: 

469 raise requests.exceptions.HTTPError(f"Unable to download {url}") 

470 

471 return content