Coverage for src/crawler/by_source/eudml_crawler.py: 40%

302 statements  

coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

  1  import base64
  2  import json
  3  import os
  4  import re
  5  import subprocess
  6
  7  import regex
  8  import requests
  9  from bs4 import BeautifulSoup
 10  from django.conf import settings
 11  from ptf.model_data import (
 12      create_articledata,
 13      create_contributor,
 14      create_extlink,
 15      create_issuedata,
 16      create_subj,
 17  )
 18  from ptf.utils import execute_cmd
 19  from requests_cache import CachedSession
 20
 21  from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle
 22
 23
 24  class EudmlCrawler(BaseCollectionCrawler):
 25      source_name = "European Digital Mathematics Library"
 26      source_domain = "EUDML"
 27      source_website = "https://eudml.org"
 28
 29      def __init__(self, *args, **kwargs):
 30          super().__init__(*args, **kwargs)
 31
 32          self.source = self.get_or_create_source()
 33
 34          self.has_dynamic_collection_pages = True
 35
 36      def parse_collection_content(self, content):
 37          """
 38          Parse the HTML page of a EuDML journal and return a list of xissues.
 39          Each xissue has a list of articles with just a URL.
 40
 41          self.periode is set during the parsing with the <meta name="citation_year"> of the HTML page.
 42          """
 43          data = json.loads(content)
 44          soup = BeautifulSoup(base64.b64decode(data["page"]), "html.parser")
 45          xissues = []
 46
 47          citation_year_node = soup.find("meta", {"name": "citation_year"})
 48          if citation_year_node:    [48 ↛ 59] line 48 didn't jump to line 59 because the condition was always true
 49              value = citation_year_node.get("content")
 50              values = value.split("-")
 51              try:
 52                  self.periode_begin = int(values[0])
 53                  if len(values) > 1:    [53 ↛ 57] line 53 didn't jump to line 57 because the condition was always true
 54                      self.periode_end = int(values[1])
 55              except ValueError:
 56                  pass
 57              self.periode = self.get_or_create_periode()
 58
 59          volume_year_re = regex.compile(r".*\(<strong>(\d+).*<\/strong>\)")
 60          # Extract the list of volumes
 61          volume_count = 0
 62          issue_count = 0
 63          for v in data["volumes"]:
 64              volume_count += 1
 65              volume_number = v["name"]
 66
 67              year_re_groups = volume_year_re.search(v["desc"])
 68              if year_re_groups is None:    [68 ↛ 69] line 68 didn't jump to line 69 because the condition was never true
 69                  print("skipping volume : no year found")
 70                  continue
 71              year = year_re_groups.group(1)
 72
 73              if len(v["issues"]) > 0 and year != "":
 74                  # Extract all the issues
 75                  for i in v["issues"]:
 76                      issue_count += 1
 77                      xissue = self.create_xissue(i, year, i["name"], volume_number)
 78                      xissues.append(xissue)
 79              else:
 80                  # No issues, articles are directly in the volume
 81                  xissue = self.create_xissue(v, year, None, volume_number)
 82                  xissues.append(xissue)
 83
 84          # EuDML stores the total of issues and articles in the <ul class="article-details unit unit-list">
 85          # This info is used to check the number of articles/issues parsed in the page
 86          volumes_to_find = 0
 87          issues_to_find = 0
 88          articles_to_find = 0
 89          article_details_nodes = soup.find_all("ul", {"class": "article-details unit unit-list"})
 90          for article_detail_node in article_details_nodes:
 91              unit_nodes = article_detail_node.find_all("li")
 92              for unit_node in unit_nodes:
 93                  strong_node = unit_node.find("strong")
 94                  if strong_node is not None:    [94 ↛ 92] line 94 didn't jump to line 92 because the condition was always true
 95                      text = strong_node.get_text()
 96                      if text == "Issue count:":
 97                          value = unit_node.get_text()[13:]
 98                          issues_to_find += int(value)
 99                      elif text == "Volume count:":
100                          value = unit_node.get_text()[14:]
101                          volumes_to_find += int(value)
102                      elif text == "Number of articles:":
103                          value = unit_node.get_text()[20:]
104                          articles_to_find += int(value)
105
106          if volume_count != volumes_to_find:    [106 ↛ 107] line 106 didn't jump to line 107 because the condition was never true
107              txt = f"EuDML declares {volumes_to_find} volumes for {self.collection_id}. We parsed {volume_count}"
108              print(txt)
109              if settings.CRAWLER_LOG_FILE:
110                  with open(settings.CRAWLER_LOG_FILE, "a") as f_:
111                      f_.write(txt + "\n")
112
113          if issue_count != issues_to_find:    [113 ↛ 114] line 113 didn't jump to line 114 because the condition was never true
114              txt = f"EuDML declares {issues_to_find} issues for {self.collection_id}. We parsed {issue_count}"
115              print(txt)
116              if settings.CRAWLER_LOG_FILE:
117                  with open(settings.CRAWLER_LOG_FILE, "a") as f_:
118                      f_.write(txt + "\n")
119
120          article_count = sum([len(xissue.articles) for xissue in xissues])
121          if article_count != articles_to_find:    [121 ↛ 122] line 121 didn't jump to line 122 because the condition was never true
122              txt = f"EuDML declares {articles_to_find} articles for {self.collection_id}. We parsed {article_count}"
123              print(txt)
124              if settings.CRAWLER_LOG_FILE:
125                  with open(settings.CRAWLER_LOG_FILE, "a") as f_:
126                      f_.write(txt + "\n")
127
128          return xissues
129
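
The citation_year handling above (lines 47-57) accepts both a single year and a year range. A minimal standalone sketch of that logic, assuming a journal page whose meta tag content is the hypothetical "1995-2010":

from bs4 import BeautifulSoup

html = '<meta name="citation_year" content="1995-2010">'  # hypothetical sample
soup = BeautifulSoup(html, "html.parser")

periode_begin = periode_end = None
node = soup.find("meta", {"name": "citation_year"})
if node:
    values = node.get("content").split("-")
    try:
        periode_begin = int(values[0])      # 1995
        if len(values) > 1:
            periode_end = int(values[1])    # 2010
    except ValueError:
        pass                                # non-numeric years are ignored, as in the crawler
print(periode_begin, periode_end)           # 1995 2010
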

130      def create_xissue(self, issue_data: dict, year_str, issue_number: str | None, volume_number):
131          """
132          EuDML does not have a separate HTML page for an issue.
133          The list of issues/articles is directly found in the collection page.
134
135          create_xissue creates an IssueData (see ptf/model_data.py) and sets its year/volume.
136          The PID is temporary and will be updated with the issue number (if any).
137          create_xissue directly creates articles, but with just a pid and a URL.
138          """
139          xissue = create_issuedata()
140          xissue.pid = self.collection_id + "_" + year_str + "__" + volume_number
141          if issue_number:
142              xissue.pid = xissue.pid + "_" + issue_number
143          xissue.year = year_str
144          xissue.volume = volume_number
145          if issue_number:
146              xissue.number = issue_number
147
148          issue_data["articles"].sort(key=lambda a: a["sortKey"])
149          for index_article, article_data in enumerate(issue_data["articles"]):
150              xarticle = create_articledata()
151              xarticle.pid = "a" + str(index_article)
152              xarticle.url = article_data["url"]
153              xissue.articles.append(xarticle)
154          return xissue
155
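
For reference, the PID built at lines 140-142 concatenates collection id, year, volume and (optionally) issue number. With hypothetical values it comes out as follows:

collection_id = "FOO"    # hypothetical collection id
year_str = "2001"
volume_number = "12"
issue_number = "3"

pid = collection_id + "_" + year_str + "__" + volume_number
if issue_number:
    pid = pid + "_" + issue_number
print(pid)               # FOO_2001__12_3
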

156      def parse_article_content(self, content, xissue, xarticle, url, pid):
157          """
158          Parse the content with BeautifulSoup and return an ArticleData
159          """
160          xarticle = create_articledata()
161          xarticle.pid = pid
162          soup = BeautifulSoup(content, "xml")
163
164          what = [
165              "lang",
166              "title",
167              "author",
168              "pdf",
169              "abstract",
170              "page",
171              "doi",
172              "mr",
173              "zbl",
174              "publisher",
175              "keywords",
176          ]
177          self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)
178
179          # LINK to SOURCE
180          url_full_text_node = soup.find("a", text="Access to full text")
181          if url_full_text_node is not None:    [181 ↛ 182] line 181 didn't jump to line 182 because the condition was never true
182              url_full_text = url_full_text_node.get("href")
183              ext_link = create_extlink()
184              ext_link["rel"] = "primary-source"
185              ext_link["location"] = url_full_text
186              xarticle.ext_links.append(ext_link)
187
188          # MSC KEYWORDS
189          subj_part = soup.find("article", {"id": "unit-subject-areas"})
190          if subj_part is not None:    [190 ↛ 191] line 190 didn't jump to line 191 because the condition was never true
191              reg_msc = re.compile("/subject/MSC/[a-zA-Z0-9.]+")
192              subjs = [a for a in subj_part.find_all("a") if reg_msc.search(a.get("href"))]
193              for subj in subjs:
194                  type_class = subj.get("href").split("/")
195                  subject = create_subj()
196                  subject["value"] = type_class[3]
197                  subject["type"] = "msc"
198                  subject["lang"] = "en"
199                  xarticle.kwds.append(subject)
200
201          # FALLBACK
202          if not xarticle.title_tex:    [202 ↛ 203] line 202 didn't jump to line 203 because the condition was never true
203              try:
204                  title = soup.find("h1").get_text(strip=True).replace("\xa0", " ")
205                  txt = f"{url} Fallback for title"
206                  print(txt)
207                  if settings.CRAWLER_LOG_FILE:
208                      with open(settings.CRAWLER_LOG_FILE, "a") as f_:
209                          f_.write(txt + "\n")
210                  xarticle.title_tex = title.replace("\xa0", " ").replace("\n", "")
211              # FIXME
212              except:  # noqa: E722
213                  pass
214
215          if len(xarticle.contributors) == 0:    [215 ↛ 217] line 215 didn't jump to line 217 because the condition was never true
216              # AUTHORS
217              authors_bloc = soup.find("p", {"class": "sub-title-1"})
218              if authors_bloc:
219                  authors_node = authors_bloc.find_all("a")
220                  if len(authors_node) > 0:
221                      txt = f"{url} Fallback for authors"
222                      print(txt)
223                      if settings.CRAWLER_LOG_FILE:
224                          with open(settings.CRAWLER_LOG_FILE, "a") as f_:
225                              f_.write(txt + "\n")
226                      for author_node in authors_node:
227                          text_author = author_node.get_text()
228                          text_author = text_author.replace(",", "")
229
230                          author = create_contributor()
231                          author["role"] = "author"
232                          author["string_name"] = text_author
233
234                          xarticle.contributors.append(author)
235
236          if len(xarticle.streams) == 0:    [236 ↛ 244] line 236 didn't jump to line 244 because the condition was always true
237              # PDF
238              pdf_node = soup.find("a", text="Full (PDF)")
239              if pdf_node is not None:    [239 ↛ 240] line 239 didn't jump to line 240 because the condition was never true
240                  pdf_url = pdf_node.get("href")
241                  if pdf_url:
242                      add_pdf_link_to_xarticle(xarticle, pdf_url)
243
244          if len(xarticle.abstracts) == 0:    [244 ↛ 260] line 244 didn't jump to line 260 because the condition was always true
245              # ABSTRACT
246              abstract_node = soup.find("article", {"id": "unit-article-abstract"})
247              if abstract_node is not None:    [247 ↛ 248] line 247 didn't jump to line 248 because the condition was never true
248                  abstract_section_node = abstract_node.find("section")
249                  if abstract_section_node:
250                      abstract = str(abstract_section_node)
251                      xabstract = {
252                          "tag": "abstract",
253                          "value_html": "",
254                          "value_tex": abstract,
255                          "value_xml": "",
256                          "lang": "en",
257                      }
258                      xarticle.abstracts.append(xabstract)
259
260          if len(xarticle.contributors) == 0 or not xarticle.fpage:    [260 ↛ 263] line 260 didn't jump to line 263 because the condition was never true
261              # LANG, PAGES, (AUTHORS)
262              # EuDML has an export BibTeX section with some information (lang, pages, authors)
263              self.parse_bibtex(soup, xarticle, url)
264
265          if xarticle.doi is None:    [265 ↛ 341] line 265 didn't jump to line 341 because the condition was always true
266              # DOI
267              doi_link = soup.find("article", {"id": "unit-other-ids"})
268              if doi_link is not None:    [268 ↛ 271] line 268 didn't jump to line 271 because the condition was never true
269                  # Simplify?
270                  # See https://eudml.org/doc/54683 with http://dx.doi.org/10.1155/2007/10368%E2%80%89
271                  try:
272                      reg_doi = re.compile("doi.org")
273                      doi_array = [
274                          d.get("href")
275                          for d in doi_link.find_all("a")
276                          if reg_doi.search(str(d.get("href")))
277                      ]
278                      if doi_array:
279                          if len(doi_array) > 1:
280                              start_dois = len(doi_array) - 1
281                              doi = doi_array[start_dois:][0]
282                          else:
283                              doi = doi_array[0]
284
285                          doi_array = doi.split("doi.org/")
286                          # strip unwanted chars present
287                          if len(doi_array) > 1:
288                              doi = doi_array[1].encode("ascii", "ignore")
289                              doi = str(doi.decode())
290                              doi_array = doi.split("\\u")
291                              doi = str(doi_array[0])
292
293                          doi = re.sub("}", "", doi)
294                          doi = re.sub("\t", "", doi)
295                          doi = doi.encode("ascii", "ignore")
296                          doi = doi.decode()
297
298                          doi = bytes(r"{}".format(r"" + doi + ""), "utf-8")
299                          doi = doi.decode()
300                          doi_array = doi.split("\\u")
301                          doi = str(doi_array[0]).strip()
302                          doi = doi.replace(" ", "")
303
304                          xarticle.doi = doi
305                  except TypeError as e:
306                      print(e)
307
308          # You can't get the first link to zbmath.org: it could be in the list of references!
309
310          # has_zblid = len([extid for extid in xarticle.extids if extid[0] == "zbl-item-id"]) == 1
311          # if not has_zblid:
312          #     # ZBL
313          #     zblid_link = soup.find(
314          #         "a", {"href": re.compile(r"http:\/\/www.zentralblatt-math.org\/zmath\/")}
315          #     )
316          #     if zblid_link is not None:
317          #         zblid = zblid_link.get("href").split("?q=")[1]
318          #         if zblid:
319          #             print(f"{url} Fallback for zbl-id: {zblid}")
320          #             xarticle.extids.append(("zbl-item-id", zblid))
321
322          # In Other Databases is not (always?) the publisher
323          # if not xissue.publisher:
324          #     # PUBLISHER
325          #     section_oai = soup.find("h3", text="In Other Databases")
326          #     if section_oai is not None:
327          #         section_oai_array = section_oai.parent.find_all("dd")
328          #         if section_oai is not None:
329          #             pub = [
330          #                 d.text
331          #                 for d in section_oai_array
332          #                 if d.text.strip() not in ["DOI", "ZBMath", "MathSciNet", "PUBLISHER"]
333          #             ]
334          #             if pub != "":
335          #                 print(f"{url} Fallback for publisher")
336          #                 xpub = create_publisherdata()
337          #                 xpub.name = pub[0].strip()
338          #                 xissue.publisher = xpub
339
340          # ARTICLE PID
341          if xarticle.doi is not None:    [341 ↛ 342] line 341 didn't jump to line 342 because the condition was never true
342              xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")
343              xarticle.pid = xarticle.pid.replace("pid", "").replace(":", "_")
344          else:
345              reg_article = regex.compile(r"\d+")
346              if not isinstance(xarticle.pid, type(None)):    [346 ↛ 352] line 346 didn't jump to line 352 because the condition was always true
347                  pid_array = reg_article.findall(url)
348                  if len(pid_array) > 0:    [348 ↛ 352] line 348 didn't jump to line 352 because the condition was always true
349                      id_article = pid_array[0]
350                      xarticle.pid = xissue.pid + "_" + id_article
351
352          return xarticle
353
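
The DOI clean-up at lines 271-304 essentially keeps the last doi.org link, takes the part after "doi.org/" and strips braces, tabs, spaces and non-ASCII debris (such as the trailing thin space in the example cited at line 270). A condensed sketch of that normalization, not the method itself, using that example href:

import re

href = "http://dx.doi.org/10.1155/2007/10368\u2009"    # example from https://eudml.org/doc/54683

doi = href.split("doi.org/")[-1]
doi = doi.encode("ascii", "ignore").decode()            # drops the trailing thin space (U+2009)
doi = re.sub(r"[{}\t ]", "", doi)                       # strips braces, tabs and spaces
print(doi)                                              # 10.1155/2007/10368
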

354      def parse_bibtex(self, soup, xarticle, url):
355          """
356          Parse the BibTeX section of a EuDML article page.
357          Extract:
358          - the authors (if no author was already found in the page)
359          - the article language
360          - the article pages
361          """
362          bib_div = [p for p in soup.find_all("p") if "@article" in p.text]
363
364          if len(bib_div) > 0:
365              bib_tex = bib_div[0].get_text()
366              text = bib_tex.split("\t")
367
368              for text_part in text:
369                  # AUTHORS (only if no authors were already found in the page)
370                  if len(xarticle.contributors) == 0:
371                      reg_author = re.compile("author =")
372                      if reg_author.search(text_part):
373                          txt = f"{url} Fallback for authors with the bibtex"
374                          print(txt)
375                          if settings.CRAWLER_LOG_FILE:
376                              with open(settings.CRAWLER_LOG_FILE, "a") as f_:
377                                  f_.write(txt + "\n")
378
379                          authors_text = (
380                              text_part.replace("{", "").replace("}", "").replace("author = ", "")
381                          )
382                          authors_bib = authors_text.split(",")
383                          for index, name in enumerate(authors_bib):
384                              if index % 2 == 1:
385                                  author_name = authors_bib[index - 1] + " " + authors_bib[index]
386                                  author_name = self.latext_parser.latex_to_text(author_name)
387                                  author_name = author_name.replace("\xa0", "")
388
389                                  author = create_contributor()
390                                  author["role"] = "author"
391                                  author["string_name"] = author_name
392                                  xarticle.contributors.append(author)
393
394                  # LANG
395                  reg_lang = re.compile("language = ")
396                  if reg_lang.search(text_part):
397                      xarticle.lang = (
398                          text_part.replace("{", "")
399                          .replace("}", "")
400                          .replace("language = ", "")
401                          .replace(",", "")
402                      )
403                      if len(xarticle.lang) >= 3:
404                          xarticle.lang = xarticle.lang[:-1]
405
406                      if len(xarticle.lang) > 0 and len(xarticle.abstracts) > 0:
407                          xarticle.abstracts[0]["lang"] = xarticle.lang
408
409                  if not xarticle.fpage:
410                      # PAGES
411                      reg_pages = re.compile("pages =")
412                      if reg_pages.search(text_part):
413                          pages = (
414                              text_part.replace("{", "")
415                              .replace("}", "")
416                              .replace("(", "")
417                              .replace(")", "")
418                              .replace("[", "")
419                              .replace("]", "")
420                              .replace("pages = ", "")
421                          )
422                          if len(pages) > 0 and pages != "null":
423                              pages = pages.split(",")
424                              if re.compile(r"\d+-\d+").search(pages[0]):
425                                  txt = f"{url} Fallback for pages with the bibtex"
426                                  print(txt)
427                                  if settings.CRAWLER_LOG_FILE:
428                                      with open(settings.CRAWLER_LOG_FILE, "a") as f_:
429                                          f_.write(txt + "\n")
430
431                                  pages = pages[0].split("-")
432                                  xarticle.fpage = pages[0]
433                                  if len(pages) > 1:
434                                      reg_digit = re.compile(r"\d+")
435                                      if re.search(reg_digit, str(pages[1])):
436                                          pages[1] = re.search(reg_digit, str(pages[1]))[0]
437                                      xarticle.lpage = pages[1]
438                                      xarticle.page_range = pages[0] + "-" + pages[1]
439
440          # reg_title = re.compile("title")
441          # if reg_title.search(text_part):
442          #     if (
443          #         xarticle.title_html is None
444          #         or xarticle.title_html == ""
445          #         or xarticle.title_html == "Contents"
446          #     ):
447          #         xarticle.title_html = (
448          #             text_part.replace("{", "")
449          #             .replace("}", "")
450          #             .replace("title = ", "")
451          #             .replace(",", "")
452          #         )
453          #         xarticle.title_tex = xarticle.title_html
454          #         xarticle.title_xml = f"<title-group><article-title>{xarticle.title_html}</article-title></title-group>"
455
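
The pages fallback (lines 409-438) expects a BibTeX field such as pages = {12-34}. A small sketch of the same extraction on a hypothetical fragment:

import re

text_part = "pages = {12-34},"                      # hypothetical BibTeX fragment

pages = re.sub(r"[{}()\[\]]", "", text_part).replace("pages = ", "")
if pages and pages != "null":
    first = pages.split(",")[0]                     # "12-34"
    if re.search(r"\d+-\d+", first):
        fpage, lpage = first.split("-")[:2]
        print(fpage, lpage, f"{fpage}-{lpage}")     # 12 34 12-34
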

456      def get_page_content(self, url: str, force_download=False):
457          if url.startswith("https://eudml.org/doc"):
458              return self.download_file(url)
459
460          content = ""
461
462          def set_progress_bar_title():
463              if not self.progress_bar:    [463 ↛ 465] line 463 didn't jump to line 465 because the condition was always true
464                  return
465              self.progress_bar.text(f"Download {url}")
466
467          set_progress_bar_title()
468          content = self.download_file_dynamic(
469              url,
470              filename="/tmp/crawler/puppeteer/"
471              + str(base64.b64encode(url.encode("utf-8")), "utf-8"),
472          )
473
474          return content
475
476      def download_file_dynamic(self, url: str, filename: str):
477          """
478          Runs a NodeJS subprocess to parse an EuDML collection
479          """
480
481          txt = f"Download {url}"
482          if settings.CRAWLER_LOG_FILE:
483              with open(settings.CRAWLER_LOG_FILE, "a") as f_:
484                  f_.write(txt + "\n")
485
486          content = ""
487          attempt = 0
488          while not content and attempt < 3:
489              attempt += 1
490              try:
491                  cmd = f"{os.path.dirname(os.path.realpath(__file__))}/crawl_eudml_col.js -u {url} -o {filename}"
492                  print(cmd)
493                  execute_cmd(cmd)
494
495                  if os.path.isfile(filename):
496                      with open(filename) as file_:
497                          content = file_.read()
498                  if not isinstance(self.session, CachedSession):
499                      continue
500                  # Mock an HTTP request to inject the data into the cache
501
502              except subprocess.CalledProcessError:
503                  pass
504
505          if not content:
506              raise requests.exceptions.HTTPError(f"Unable to download {url}")
507
508          return content
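
get_page_content() builds the cache filename from the base64-encoded URL (lines 470-471), and download_file_dynamic() shells out to crawl_eudml_col.js (line 491). For a hypothetical collection URL the command looks roughly like this:

import base64
import os

url = "https://eudml.org/journal/10077"    # hypothetical collection URL
# cache filename derived from the base64-encoded URL, as in get_page_content()
filename = "/tmp/crawler/puppeteer/" + str(base64.b64encode(url.encode("utf-8")), "utf-8")
# command built the same way as in download_file_dynamic()
cmd = f"{os.path.dirname(os.path.realpath(__file__))}/crawl_eudml_col.js -u {url} -o {filename}"
print(cmd)
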