Coverage for src/crawler/by_source/eudml_crawler.py: 59%

265 statements  

coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

1import base64 

2import json 

3import os 

4import re 

5import subprocess 

6import time 

7 

8import regex 

9import requests 

10from bs4 import BeautifulSoup 

11from ptf.model_data import ( 

12 ArticleData, 

13 create_abstract, 

14 create_articledata, 

15 create_contributor, 

16 create_extlink, 

17 create_issuedata, 

18 create_subj, 

19) 

20from ptf.utils import execute_cmd 

21from requests_cache import CachedSession 

22 

23from crawler.base_crawler import BaseCollectionCrawler 

24from crawler.models import ExtlinkChecked 

25from crawler.utils import add_pdf_link_to_xarticle 

26 

27 

28class EudmlCrawler(BaseCollectionCrawler): 

29 source_name = "European Digital Mathematics Library" 

30 source_domain = "EUDML" 

31 source_website = "https://eudml.org" 

32 

33 def parse_collection_content(self, content): 

34 """ 

35 Parse the collection content of a EuDML journal (JSON with a base64-encoded HTML page) and return a list of xissues.

36 Each xissue has a list of articles with just a URL.

37 """ 

38 data = json.loads(content) 

39 soup = BeautifulSoup(base64.b64decode(data["page"]), "html.parser") 

40 xissues = [] 

41 volume_year_re = regex.compile(r".*\(<strong>(\d+).*<\/strong>\)") 
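# The "desc" field is assumed to end with the year wrapped in <strong> tags,
# e.g. "Volume 12 (<strong>1999</strong>)" (illustrative value, not taken from a
# real EuDML page); group(1) then captures "1999".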

42 # Extract the list of volumes 

43 volume_count = 0 

44 issue_count = 0 

45 for v in data["volumes"]: 

46 volume_count += 1 

47 volume_number = v["name"] 

48 

49 year_re_groups = volume_year_re.search(v["desc"]) 

50 if year_re_groups is None: 

51 self.logger.debug("skipping volume: no year found") 

52 continue 

53 year = year_re_groups.group(1) 

54 if year == "": 54 ↛ 55: line 54 didn't jump to line 55 because the condition on line 54 was never true

55 self.logger.debug("volume year is an empty string... Skipping") 

56 continue 

57 if len(v["issues"]) > 0: 

58 # Extract all the issues 

59 for i in v["issues"]: 

60 issue_count += 1 

61 xissue = self.create_eudml_xissue(i, year, i["name"], volume_number) 

62 xissues.append(xissue) 

63 else: 

64 # No issues, articles are directly in the volume 

65 xissue = self.create_eudml_xissue(v, year, None, volume_number) 

66 xissues.append(xissue) 

67 

68 # EuDML stores the total of issues and articles in the <ul class="article-details unit unit-list"> 

69 # This info is used to check the number of articles/issues parsed in the page 
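# Each counter <li> is expected to read e.g. "Issue count: 12", "Volume count: 3"
# or "Number of articles: 145" (illustrative values); the slices [13:], [14:] and
# [20:] below strip those labels.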

70 volumes_to_find = 0 

71 issues_to_find = 0 

72 articles_to_find = 0 

73 article_details_nodes = soup.find_all("ul", {"class": "article-details unit unit-list"}) 

74 for article_detail_node in article_details_nodes: 

75 unit_nodes = article_detail_node.find_all("li") 

76 for unit_node in unit_nodes: 

77 strong_node = unit_node.find("strong") 

78 if strong_node is not None: 78 ↛ 76: line 78 didn't jump to line 76 because the condition on line 78 was always true

79 text = strong_node.get_text() 

80 if text == "Issue count:": 

81 value = unit_node.get_text()[13:] 

82 issues_to_find += int(value) 

83 elif text == "Volume count:": 

84 value = unit_node.get_text()[14:] 

85 volumes_to_find += int(value) 

86 elif text == "Number of articles:": 

87 value = unit_node.get_text()[20:] 

88 articles_to_find += int(value) 

89 

90 if volume_count != volumes_to_find: 90 ↛ 91: line 90 didn't jump to line 91 because the condition on line 90 was never true

91 txt = f"EuDML declares {volumes_to_find} volumes for {self.collection_id}. We parsed {volume_count}" 

92 self.logger.debug(txt) 

93 

94 if issue_count != issues_to_find: 

95 txt = f"EuDML declares {issues_to_find} issues for {self.collection_id}. We parsed {issue_count}" 

96 self.logger.debug(txt) 

97 

98 article_count = sum([len(xissue.articles) for xissue in xissues]) 

99 if article_count != articles_to_find: 

100 txt = f"EuDML declares {articles_to_find} articles for {self.collection_id}. We parsed {article_count}" 

101 self.logger.debug(txt) 

102 

103 return xissues 

104 

105 def create_eudml_xissue( 

106 self, issue_data: dict, year_str, issue_number: str | None, volume_number 

107 ): 

108 """ 

109 EuDML does not have a separate HTML page for an issue. 

110 The list of issues/articles is directly found in the collection page. 

111 

112 This method creates an IssueData (see ptf/model_data.py) and sets its year/volume.

113 The PID is temporary and will be updated with the issue number (if any).

114 The articles are created directly as well, but with just a pid and a URL.

115 """ 

116 xissue = create_issuedata() 

117 xissue.pid = self.collection_id + "_" + year_str + "__" + volume_number 

118 if issue_number: 

119 xissue.pid = xissue.pid + "_" + issue_number 

120 xissue.year = year_str 

121 xissue.volume = volume_number 

122 if issue_number: 

123 xissue.number = issue_number 

124 

125 issue_data["articles"].sort(key=lambda a: a["sortKey"]) 

126 for index_article, article_data in enumerate(issue_data["articles"]): 

127 xarticle = create_articledata() 

128 xarticle.pid = "a" + str(index_article) 

129 xarticle.url = article_data["url"] 

130 xissue.articles.append(xarticle) 

131 return xissue 

132 

133 def parse_article_content(self, content, xissue, xarticle, url): 

134 """ 

135 Parse the content with BeautifulSoup and return an ArticleData. 

136 """ 

137 soup = BeautifulSoup(content, "xml") 

138 

139 self.get_metadata_using_citation_meta( 

140 xarticle, 

141 xissue, 

142 soup, 

143 [ 

144 "lang", 

145 "title", 

146 "author", 

147 "pdf", 

148 "abstract", 

149 "page", 

150 "doi", 

151 "mr", 

152 "zbl", 

153 "publisher", 

154 "keywords", 

155 ], 

156 ) 
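# get_metadata_using_citation_meta (defined in BaseCollectionCrawler, not shown here)
# presumably fills the article from the <meta name="citation_*"> tags; everything
# below is a fallback for metadata missing from those tags.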

157 

158 # LINK to SOURCE 

159 url_full_text_node = soup.find("a", text="Access to full text") 

160 if url_full_text_node is not None: 

161 url_full_text = url_full_text_node.get("href") 

162 if isinstance(url_full_text, str): 162 ↛ 167: line 162 didn't jump to line 167 because the condition on line 162 was always true

163 ext_link = create_extlink(rel="primary-source", location=url_full_text) 

164 xarticle.ext_links.append(ext_link) 

165 

166 # MSC KEYWORDS 

167 subj_part = soup.select_one("article#unit-subject-areas") 

168 if subj_part is not None: 168 ↛ 169: line 168 didn't jump to line 169 because the condition on line 168 was never true

169 reg_msc = re.compile("/subject/MSC/[a-zA-Z0-9.]+") 

170 subjs = [a for a in subj_part.select("a") if reg_msc.search(a.get("href"))] 

171 for subj in subjs: 

172 type_class = subj.get("href").split("/") 

173 subject = create_subj(type="msc", lang=xarticle.lang) 

174 subject["value"] = type_class[3] 

175 xarticle.kwds.append(subject) 
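# Assuming a site-relative href such as "/subject/MSC/35J25" (illustrative),
# split("/") gives ["", "subject", "MSC", "35J25"], so type_class[3] is the MSC code.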

176 

177 # FALLBACK 

178 if not xarticle.title_tex: 

179 try: 

180 title = soup.select_one("h1").get_text(strip=True).replace("\xa0", " ") 

181 txt = f"{url} Fallback for title" 

182 self.logger.debug(txt, extra={"pid": xarticle.pid}) 

183 xarticle.title_tex = title.replace("\xa0", " ").replace("\n", "") 

184 # select_one("h1") can return None, in which case get_text() raises AttributeError

185 except AttributeError: 

186 pass 

187 

188 if len(xarticle.contributors) == 0: 

189 # AUTHORS 

190 authors_bloc = soup.select_one("p.sub-title-1") 

191 if authors_bloc: 191 ↛ 206: line 191 didn't jump to line 206 because the condition on line 191 was always true

192 authors_node = authors_bloc.find_all("a") 

193 if len(authors_node) > 0: 193 ↛ 194: line 193 didn't jump to line 194 because the condition on line 193 was never true

194 txt = f"{url} Fallback for authors" 

195 self.logger.debug(txt, extra={"pid": xarticle.pid}) 

196 for author_node in authors_node: 196 ↛ 197: line 196 didn't jump to line 197 because the loop on line 196 never started

197 text_author = author_node.get_text() 

198 text_author = text_author.replace(",", "") 

199 

200 author = create_contributor() 

201 author["role"] = "author" 

202 author["string_name"] = text_author 

203 

204 xarticle.contributors.append(author) 

205 

206 if len(xarticle.streams) == 0: 206 ↛ 214: line 206 didn't jump to line 214 because the condition on line 206 was always true

207 # PDF 

208 pdf_node = soup.find("a", text="Full (PDF)") 

209 if pdf_node is not None: 

210 pdf_url = pdf_node.get("href") 

211 if pdf_url: 211 ↛ 214: line 211 didn't jump to line 214 because the condition on line 211 was always true

212 add_pdf_link_to_xarticle(xarticle, pdf_url) 

213 

214 if len(xarticle.streams) == 0: 

215 if not url_full_text_node: 

216 self.logger.debug("Couldn't find pdf", extra={"pid": xarticle.pid}) 

217 else: 

218 add_pdf_link_to_xarticle(xarticle, url_full_text_node.get("href")) 
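# If no explicit "Full (PDF)" link was found, the "Access to full text" URL located
# above is reused as the article's PDF stream.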

219 

220 if len(xarticle.abstracts) == 0: 220 ↛ 232: line 220 didn't jump to line 232 because the condition on line 220 was always true

221 # ABSTRACT 

222 abstract_node = soup.find("article", {"id": "unit-article-abstract"}) 

223 if abstract_node is not None: 223 ↛ 224: line 223 didn't jump to line 224 because the condition on line 223 was never true

224 abstract_section_node = abstract_node.find("section") 

225 if abstract_section_node: 

226 abstract = str(abstract_section_node) 

227 

228 xarticle.abstracts.append( 

229 create_abstract(value_tex=abstract, lang=xarticle.lang) 

230 ) 

231 

232 if len(xarticle.contributors) == 0 or not xarticle.fpage: 

233 # LANG, PAGES, (AUTHORS) 

234 # EuDML has an export BibTeX section with some information (lang, pages, authors) 

235 self.parse_bibtex(soup, xarticle, url) 

236 

237 if xarticle.doi is None: 237 ↛ 280: line 237 didn't jump to line 280 because the condition on line 237 was always true

238 # DOI 

239 doi_link = soup.find("article", {"id": "unit-other-ids"}) 

240 if doi_link is not None: 240 ↛ 243line 240 didn't jump to line 243 because the condition on line 240 was never true

241 # Simplify ? 

242 # See https://eudml.org/doc/54683 with http://dx.doi.org/10.1155/2007/10368%E2%80%89 
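# The block below keeps only the part of the href after "doi.org/" and strips the
# junk seen in links like the one above: non-ASCII characters, anything after a
# literal "\u" escape, braces, tabs and spaces.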

243 try: 

244 reg_doi = re.compile("doi.org") 

245 doi_array = [ 

246 d.get("href") 

247 for d in doi_link.find_all("a") 

248 if reg_doi.search(str(d.get("href"))) 

249 ] 

250 if doi_array: 

251 if len(doi_array) > 1: 

252 start_dois = len(doi_array) - 1 

253 doi = doi_array[start_dois:][0] 

254 else: 

255 doi = doi_array[0] 

256 

257 doi_array = doi.split("doi.org/") 

258 # strip unwanted chars present 

259 if len(doi_array) > 1: 

260 doi = doi_array[1].encode("ascii", "ignore") 

261 doi = str(doi.decode()) 

262 doi_array = doi.split("\\u") 

263 doi = str(doi_array[0]) 

264 

265 doi = re.sub("}", "", doi) 

266 doi = re.sub("\t", "", doi) 

267 doi = doi.encode("ascii", "ignore") 

268 doi = doi.decode() 

269 

270 doi = bytes(r"{}".format(r"" + doi + ""), "utf-8") 

271 doi = doi.decode() 

272 doi_array = doi.split("\\u") 

273 doi = str(doi_array[0]).strip() 

274 doi = doi.replace(" ", "") 

275 

276 xarticle.doi = doi 

277 except TypeError as e: 

278 self.logger.debug(e, extra={"pid": xarticle.pid}) 

279 

280 has_zblid = len([extid for extid in xarticle.extids if extid[0] == "zbl-item-id"]) == 1 

281 if not has_zblid: 

282 zb_tag = soup.select_one("article#unit-other-ids a:-soup-contains('ZBMath')") 

283 if zb_tag: 283 ↛ 284: line 283 didn't jump to line 284 because the condition on line 283 was never true

284 zb_href = zb_tag.get("href") 

285 if not isinstance(zb_href, str): 

286 raise ValueError("Couldn't parse zbmath href") 

287 zblid = zb_href.removeprefix( 

288 "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:" 

289 ) 

290 xarticle.extids.append(("zbl-item-id", zblid)) 
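# Illustrative: an href ending in "?q=an:0123.45678" yields the zbl-item-id
# "0123.45678" once the zentralblatt-math.org prefix is stripped.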

291 

292 # In Other Databases is not (always ?) the publisher 

293 # if not xissue.publisher: 

294 # # PUBLISHER 

295 # section_oai = soup.find("h3", text="In Other Databases") 

296 # if section_oai is not None: 

297 # section_oai_array = section_oai.parent.find_all("dd") 

298 # if section_oai is not None: 

299 # pub = [ 

300 # d.text 

301 # for d in section_oai_array 

302 # if d.text.strip() not in ["DOI", "ZBMath", "MathSciNet", "PUBLISHER"] 

303 # ] 

304 # if pub != "": 

305 # print(f"{url} Fallback for publisher") 

306 # xpub = create_publisherdata() 

307 # xpub.name = pub[0].strip() 

308 # xissue.publisher = xpub 

309 return xarticle 

310 

311 def parse_bibtex(self, soup, xarticle: ArticleData, url): 

312 """ 

313 Parse the BibTeX section of a EuDML article page. 

314 Extract 

315 - the authors (if no author was already found in the page) 

316 - the article language 

317 - the article pages 

318 """ 

319 bib_div = [p for p in soup.find_all("p") if "@article" in p.text] 

320 

321 if len(bib_div) > 0: 321 ↛ exit: line 321 didn't return from function 'parse_bibtex' because the condition on line 321 was always true

322 bib_tex = bib_div[0].get_text() 

323 text = bib_tex.split("\t") 

324 

325 for text_part in text: 

326 # AUTHORS (only if no authors were already found in the page) 

327 if len(xarticle.contributors) == 0: 

328 reg_author = re.compile("author =") 

329 if reg_author.search(text_part): 329 ↛ 330: line 329 didn't jump to line 330 because the condition on line 329 was never true

330 txt = f"{url} Fallback for authors with the bibtex" 

331 self.logger.debug(txt, extra={"pid": xarticle.pid}) 

332 

333 authors_text = ( 

334 text_part.replace("{", "").replace("}", "").replace("author = ", "") 

335 ) 

336 authors_bib = authors_text.split(",") 

337 for index, name in enumerate(authors_bib): 

338 if index % 2 == 1: 

339 author_name = authors_bib[index - 1] + " " + authors_bib[index] 

340 author_name = self.latext_parser.latex_to_text(author_name) 

341 author_name = author_name.replace("\xa0", "") 

342 

343 author = create_contributor() 

344 author["role"] = "author" 

345 author["string_name"] = author_name 

346 xarticle.contributors.append(author) 

347 

348 # LANG 

349 reg_lang = re.compile("language = ") 

350 if reg_lang.search(text_part): 

351 xarticle.lang = ( 

352 text_part.replace("{", "") 

353 .replace("}", "") 

354 .replace("language = ", "") 

355 .replace(",", "") 

356 ) 

357 if len(xarticle.lang) >= 3: 357 ↛ 360: line 357 didn't jump to line 360 because the condition on line 357 was always true

358 xarticle.lang = xarticle.lang[:-1] 

359 

360 if len(xarticle.lang) > 0 and len(xarticle.abstracts) > 0: 360 ↛ 361: line 360 didn't jump to line 361 because the condition on line 360 was never true

361 xarticle.abstracts[0]["lang"] = xarticle.lang 

362 

363 if not xarticle.fpage: 

364 # PAGES 

365 reg_pages = re.compile("pages =") 

366 if reg_pages.search(text_part): 

367 pages = ( 

368 text_part.replace("{", "") 

369 .replace("}", "") 

370 .replace("(", "") 

371 .replace(")", "") 

372 .replace("[", "") 

373 .replace("]", "") 

374 .replace("pages = ", "") 

375 ) 

376 if len(pages) > 0 and pages != "null": 376 ↛ 325: line 376 didn't jump to line 325 because the condition on line 376 was always true

377 pages = pages.split(",") 

378 if re.compile(r"\d+-\d+").search(pages[0]): 378 ↛ 379: line 378 didn't jump to line 379 because the condition on line 378 was never true

379 txt = f"{url} Fallback for pages with the bibtex" 

380 self.logger.debug(txt, extra={"pid": xarticle.pid}) 

381 

382 pages = pages[0].split("-") 

383 xarticle.fpage = pages[0] 

384 if len(pages) > 1: 

385 reg_digit = re.compile(r"\d+") 

386 if re.search(reg_digit, str(pages[1])): 

387 pages[1] = re.search(reg_digit, str(pages[1]))[0] 

388 xarticle.lpage = pages[1] 

389 # FIXME : wrong page_range format... Maybe this can be deleted ? 

390 xarticle.page_range = pages[0] + "-" + pages[1] 

391 

392 # reg_title = re.compile("title") 

393 # if reg_title.search(text_part): 

394 # if ( 

395 # xarticle.title_html is None 

396 # or xarticle.title_html == "" 

397 # or xarticle.title_html == "Contents" 

398 # ): 

399 # xarticle.title_html = ( 

400 # text_part.replace("{", "") 

401 # .replace("}", "") 

402 # .replace("title = ", "") 

403 # .replace(",", "") 

404 # ) 

405 # xarticle.title_tex = xarticle.title_html 

406 # xarticle.title_xml = f"<title-group><article-title>{xarticle.title_html}</article-title></title-group>" 

407 

408 def download_file(self, url: str): 

409 if url.startswith("https://eudml.org/doc"): 

410 return super().download_file(url) 

411 

412 content = "" 

413 filename = "/tmp/crawler/puppeteer/" + str(base64.b64encode(url.encode("utf-8")), "utf-8") 
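# The puppeteer output is written to /tmp/crawler/puppeteer/<base64 of the URL>,
# so the same URL always maps to the same file name.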

414 attempt = 0 

415 while not content and attempt < 3: 

416 attempt += 1 

417 try: 

418 cmd = f"{os.path.dirname(os.path.realpath(__file__))}/crawl_eudml_col.js -u {url} -o {filename}" 

419 execute_cmd(cmd, force_execute=True) 

420 

421 if os.path.isfile(filename): 

422 with open(filename) as file_: 

423 content = file_.read() 

424 if not isinstance(self.session, CachedSession): 

425 continue 

426 # Mock an HTTP request to inject the data into the cache 

427 

428 except subprocess.CalledProcessError: 

429 pass 

430 

431 if not content: 

432 raise requests.exceptions.HTTPError(f"Unable to download {url}") 

433 

434 return content 

435 

436 @classmethod 

437 def check_extlink_validity(cls, extlink): 

438 """ 

439 Method used by rot_monitoring to check if links have expired 

440 """ 

441 defaults: dict = {"date": time.time(), "status": ExtlinkChecked.Status.OK} 

442 

443 if not extlink.location.startswith("http://gdz.sub.uni-goettingen.de"): 

444 return super().check_extlink_validity(extlink) 

445 

446 response = requests.get(extlink.location) 

447 

448 defaults["http_status"] = response.status_code 

449 

450 if response.status_code not in (200, 206): 

451 defaults["status"] = ExtlinkChecked.Status.ERROR 

452 

453 ExtlinkChecked.objects.update_or_create(extlink=extlink, defaults=defaults)