Coverage for src/crawler/by_source/eudml_crawler.py: 60%

254 statements  

coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

  1  import base64
  2  import json
  3  import os
  4  import re
  5  import subprocess
  6
  7  import regex
  8  import requests
  9  from bs4 import BeautifulSoup
 10  from ptf.model_data import (
 11      ArticleData,
 12      create_abstract,
 13      create_articledata,
 14      create_contributor,
 15      create_extlink,
 16      create_issuedata,
 17      create_subj,
 18  )
 19  from ptf.utils import execute_cmd
 20  from requests_cache import CachedSession
 21
 22  from crawler.base_crawler import BaseCollectionCrawler
 23  from crawler.utils import add_pdf_link_to_xarticle
 24
 25
 26  class EudmlCrawler(BaseCollectionCrawler):
 27      source_name = "European Digital Mathematics Library"
 28      source_domain = "EUDML"
 29      source_website = "https://eudml.org"
 30
 31      def parse_collection_content(self, content):
 32          """
 33          Parse the HTML page of a EuDML journal and return a list of xissues.
 34          Each xissue has a list of articles with just a URL.
 35          """

 36          data = json.loads(content)
 37          soup = BeautifulSoup(base64.b64decode(data["page"]), "html.parser")
 38          xissues = []
 39          volume_year_re = regex.compile(r".*\(<strong>(\d+).*<\/strong>\)")
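             # The volume description is expected to embed the year in a <strong> tag,
             # e.g. v["desc"] = "Vol. 12 (<strong>1998</strong>)" (hypothetical example).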

 40          # Extract the list of volumes
 41          volume_count = 0
 42          issue_count = 0
 43          for v in data["volumes"]:
 44              volume_count += 1
 45              volume_number = v["name"]
 46
 47              year_re_groups = volume_year_re.search(v["desc"])
 48              if year_re_groups is None:
 49                  self.logger.debug("skipping volume : no year found")
 50                  continue
 51              year = year_re_groups.group(1)
 52              if year == "":    52 ↛ 53  (line 52 didn't jump to line 53 because the condition on line 52 was never true)
 53                  self.logger.debug("volume year is an empty string... Skipping")
 54                  continue
 55              if len(v["issues"]) > 0:
 56                  # Extract all the issues
 57                  for i in v["issues"]:
 58                      issue_count += 1
 59                      xissue = self.create_eudml_xissue(i, year, i["name"], volume_number)
 60                      xissues.append(xissue)
 61              else:
 62                  # No issues, articles are directly in the volume
 63                  xissue = self.create_eudml_xissue(v, year, None, volume_number)
 64                  xissues.append(xissue)
 65
 66          # EuDML stores the total number of issues and articles in the <ul class="article-details unit unit-list">
 67          # This info is used to check the number of articles/issues parsed from the page
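             # Illustrative markup (hypothetical numbers): <li><strong>Issue count:</strong> 12</li>
             # The slices below ([13:], [14:], [20:]) skip the label text plus the following space.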

 68          volumes_to_find = 0
 69          issues_to_find = 0
 70          articles_to_find = 0
 71          article_details_nodes = soup.find_all("ul", {"class": "article-details unit unit-list"})
 72          for article_detail_node in article_details_nodes:
 73              unit_nodes = article_detail_node.find_all("li")
 74              for unit_node in unit_nodes:
 75                  strong_node = unit_node.find("strong")
 76                  if strong_node is not None:    76 ↛ 74  (line 76 didn't jump to line 74 because the condition on line 76 was always true)
 77                      text = strong_node.get_text()
 78                      if text == "Issue count:":
 79                          value = unit_node.get_text()[13:]
 80                          issues_to_find += int(value)
 81                      elif text == "Volume count:":
 82                          value = unit_node.get_text()[14:]
 83                          volumes_to_find += int(value)
 84                      elif text == "Number of articles:":
 85                          value = unit_node.get_text()[20:]
 86                          articles_to_find += int(value)
 87
 88          if volume_count != volumes_to_find:    88 ↛ 89  (line 88 didn't jump to line 89 because the condition on line 88 was never true)
 89              txt = f"EuDML declares {volumes_to_find} volumes for {self.collection_id}. We parsed {volume_count}"
 90              self.logger.debug(txt)
 91
 92          if issue_count != issues_to_find:
 93              txt = f"EuDML declares {issues_to_find} issues for {self.collection_id}. We parsed {issue_count}"
 94              self.logger.debug(txt)
 95
 96          article_count = sum([len(xissue.articles) for xissue in xissues])
 97          if article_count != articles_to_find:
 98              txt = f"EuDML declares {articles_to_find} articles for {self.collection_id}. We parsed {article_count}"
 99              self.logger.debug(txt)
100
101          return xissues
102
103      def create_eudml_xissue(
104          self, issue_data: dict, year_str, issue_number: str | None, volume_number
105      ):
106          """
107          EuDML does not have a separate HTML page for an issue.
108          The list of issues/articles is found directly in the collection page.
109
110          This method creates an IssueData (see ptf/model_data.py) and sets its year/volume.
111          The PID is temporary and is extended with the issue number (if any).
112          Articles are created directly, but with just a pid and a URL.
113          """

114          xissue = create_issuedata()
115          xissue.pid = self.collection_id + "_" + year_str + "__" + volume_number
116          if issue_number:
117              xissue.pid = xissue.pid + "_" + issue_number
118          xissue.year = year_str
119          xissue.volume = volume_number
120          if issue_number:
121              xissue.number = issue_number
122
123          issue_data["articles"].sort(key=lambda a: a["sortKey"])
124          for index_article, article_data in enumerate(issue_data["articles"]):
125              xarticle = create_articledata()
126              xarticle.pid = "a" + str(index_article)
127              xarticle.url = article_data["url"]
128              xissue.articles.append(xarticle)
129          return xissue
130
131      def parse_article_content(self, content, xissue, xarticle, url):
132          """
133          Parse the content with BeautifulSoup and return an ArticleData.
134          """
135          soup = BeautifulSoup(content, "xml")
136
137          self.get_metadata_using_citation_meta(
138              xarticle,
139              xissue,
140              soup,
141              [
142                  "lang",
143                  "title",
144                  "author",
145                  "pdf",
146                  "abstract",
147                  "page",
148                  "doi",
149                  "mr",
150                  "zbl",
151                  "publisher",
152                  "keywords",
153              ],
154          )
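             # The call above presumably harvests these fields from the citation_* <meta> tags
             # on the EuDML article page (assumption; the helper lives in BaseCollectionCrawler
             # and is not shown in this file).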

155
156          # LINK to SOURCE
157          url_full_text_node = soup.find("a", text="Access to full text")
158          if url_full_text_node is not None:
159              url_full_text = url_full_text_node.get("href")
160              if isinstance(url_full_text, str):    160 ↛ 165  (line 160 didn't jump to line 165 because the condition on line 160 was always true)
161                  ext_link = create_extlink(rel="primary-source", location=url_full_text)
162                  xarticle.ext_links.append(ext_link)
163
164          # MSC KEYWORDS
165          subj_part = soup.select_one("article#unit-subject-areas")
166          if subj_part is not None:    166 ↛ 167  (line 166 didn't jump to line 167 because the condition on line 166 was never true)
167              reg_msc = re.compile("/subject/MSC/[a-zA-Z0-9.]+")
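                 # Subject links are assumed to look like "/subject/MSC/35J60" (hypothetical
                 # code), so the MSC class is the fourth path segment (index 3) taken below.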

168              subjs = [a for a in subj_part.select("a") if reg_msc.search(a.get("href"))]
169              for subj in subjs:
170                  type_class = subj.get("href").split("/")
171                  subject = create_subj(type="msc", lang=xarticle.lang)
172                  subject["value"] = type_class[3]
173                  xarticle.kwds.append(subject)
174
175          # FALLBACK
176          if not xarticle.title_tex:
177              try:
178                  title = soup.select_one("h1").get_text(strip=True).replace("\xa0", " ")
179                  txt = f"{url} Fallback for title"
180                  self.logger.debug(txt, extra={"pid": xarticle.pid})
181                  xarticle.title_tex = title.replace("\xa0", " ").replace("\n", "")
182              # FIXME
183              except:  # noqa: E722
184                  pass
185
186          if len(xarticle.contributors) == 0:
187              # AUTHORS
188              authors_bloc = soup.select_one("p.sub-title-1")
189              if authors_bloc:    189 ↛ 204  (line 189 didn't jump to line 204 because the condition on line 189 was always true)
190                  authors_node = authors_bloc.find_all("a")
191                  if len(authors_node) > 0:    191 ↛ 192  (line 191 didn't jump to line 192 because the condition on line 191 was never true)
192                      txt = f"{url} Fallback for authors"
193                      self.logger.debug(txt, extra={"pid": xarticle.pid})
194                  for author_node in authors_node:    194 ↛ 195  (line 194 didn't jump to line 195 because the loop on line 194 never started)
195                      text_author = author_node.get_text()
196                      text_author = text_author.replace(",", "")
197
198                      author = create_contributor()
199                      author["role"] = "author"
200                      author["string_name"] = text_author
201
202                      xarticle.contributors.append(author)
203
204          if len(xarticle.streams) == 0:    204 ↛ 212  (line 204 didn't jump to line 212 because the condition on line 204 was always true)
205              # PDF
206              pdf_node = soup.find("a", text="Full (PDF)")
207              if pdf_node is not None:
208                  pdf_url = pdf_node.get("href")
209                  if pdf_url:    209 ↛ 212  (line 209 didn't jump to line 212 because the condition on line 209 was always true)
210                      add_pdf_link_to_xarticle(xarticle, pdf_url)
211
212          if len(xarticle.streams) == 0:
213              if not url_full_text_node:
214                  self.logger.debug("Couldn't find pdf", extra={"pid": xarticle.pid})
215              else:
216                  add_pdf_link_to_xarticle(xarticle, url_full_text_node.get("href"))
217
218          if len(xarticle.abstracts) == 0:    218 ↛ 230  (line 218 didn't jump to line 230 because the condition on line 218 was always true)
219              # ABSTRACT
220              abstract_node = soup.find("article", {"id": "unit-article-abstract"})
221              if abstract_node is not None:    221 ↛ 222  (line 221 didn't jump to line 222 because the condition on line 221 was never true)
222                  abstract_section_node = abstract_node.find("section")
223                  if abstract_section_node:
224                      abstract = str(abstract_section_node)
225                      xabstract = create_abstract(
226                          tag="abstract", value_tex=abstract, lang=xarticle.lang
227                      )
228                      xarticle.abstracts.append(xabstract)
229
230          if len(xarticle.contributors) == 0 or not xarticle.fpage:
231              # LANG, PAGES, (AUTHORS)
232              # EuDML has an export BibTeX section with some information (lang, pages, authors)
233              self.parse_bibtex(soup, xarticle, url)
234
235          if xarticle.doi is None:    235 ↛ 278  (line 235 didn't jump to line 278 because the condition on line 235 was always true)
236              # DOI
237              doi_link = soup.find("article", {"id": "unit-other-ids"})
238              if doi_link is not None:    238 ↛ 241  (line 238 didn't jump to line 241 because the condition on line 238 was never true)
239                  # Simplify ?
240                  # See https://eudml.org/doc/54683 with http://dx.doi.org/10.1155/2007/10368%E2%80%89
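                     # The cleanup below strips everything up to "doi.org/" and drops stray
                     # unicode/whitespace, so the URL above should presumably end up as
                     # "10.1155/2007/10368" (illustrative; not verified against a live page).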

241                  try:
242                      reg_doi = re.compile("doi.org")
243                      doi_array = [
244                          d.get("href")
245                          for d in doi_link.find_all("a")
246                          if reg_doi.search(str(d.get("href")))
247                      ]
248                      if doi_array:
249                          if len(doi_array) > 1:
250                              start_dois = len(doi_array) - 1
251                              doi = doi_array[start_dois:][0]
252                          else:
253                              doi = doi_array[0]
254
255                          doi_array = doi.split("doi.org/")
256                          # strip unwanted chars present
257                          if len(doi_array) > 1:
258                              doi = doi_array[1].encode("ascii", "ignore")
259                              doi = str(doi.decode())
260                              doi_array = doi.split("\\u")
261                              doi = str(doi_array[0])
262
263                          doi = re.sub("}", "", doi)
264                          doi = re.sub("\t", "", doi)
265                          doi = doi.encode("ascii", "ignore")
266                          doi = doi.decode()
267
268                          doi = bytes(r"{}".format(r"" + doi + ""), "utf-8")
269                          doi = doi.decode()
270                          doi_array = doi.split("\\u")
271                          doi = str(doi_array[0]).strip()
272                          doi = doi.replace(" ", "")
273
274                          xarticle.doi = doi
275                  except TypeError as e:
276                      self.logger.debug(e, extra={"pid": xarticle.pid})
277
278          has_zblid = len([extid for extid in xarticle.extids if extid[0] == "zbl-item-id"]) == 1
279          if not has_zblid:
280              zb_tag = soup.select_one("article#unit-other-ids a:-soup-contains('ZBMath')")
281              if zb_tag:    281 ↛ 282  (line 281 didn't jump to line 282 because the condition on line 281 was never true)
282                  zb_href = zb_tag.get("href")
283                  if not isinstance(zb_href, str):
284                      raise ValueError("Couldn't parse zbmath href")
285                  zblid = zb_href.removeprefix(
286                      "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"
287                  )
288                  xarticle.extids.append(("zbl-item-id", zblid))
289
290          # In Other Databases is not (always ?) the publisher
291          # if not xissue.publisher:
292          #     # PUBLISHER
293          #     section_oai = soup.find("h3", text="In Other Databases")
294          #     if section_oai is not None:
295          #         section_oai_array = section_oai.parent.find_all("dd")
296          #         if section_oai is not None:
297          #             pub = [
298          #                 d.text
299          #                 for d in section_oai_array
300          #                 if d.text.strip() not in ["DOI", "ZBMath", "MathSciNet", "PUBLISHER"]
301          #             ]
302          #             if pub != "":
303          #                 print(f"{url} Fallback for publisher")
304          #                 xpub = create_publisherdata()
305          #                 xpub.name = pub[0].strip()
306          #                 xissue.publisher = xpub
307          return xarticle
308
309      def parse_bibtex(self, soup, xarticle: ArticleData, url):
310          """
311          Parse the BibTeX section of a EuDML article page.
312          Extract:
313          - the authors (if no author was already found in the page)
314          - the article language
315          - the article pages
316          """

317          bib_div = [p for p in soup.find_all("p") if "@article" in p.text]
318
319          if len(bib_div) > 0:    319 ↛ exit  (line 319 didn't return from function 'parse_bibtex' because the condition on line 319 was always true)
320              bib_tex = bib_div[0].get_text()
321              text = bib_tex.split("\t")
322
323              for text_part in text:
324                  # AUTHORS (only if no authors were already found in the page)
325                  if len(xarticle.contributors) == 0:
326                      reg_author = re.compile("author =")
327                      if reg_author.search(text_part):    327 ↛ 328  (line 327 didn't jump to line 328 because the condition on line 327 was never true)
328                          txt = f"{url} Fallback for authors with the bibtex"
329                          self.logger.debug(txt, extra={"pid": xarticle.pid})
330
331                          authors_text = (
332                              text_part.replace("{", "").replace("}", "").replace("author = ", "")
333                          )
334                          authors_bib = authors_text.split(",")
335                          for index, name in enumerate(authors_bib):
336                              if index % 2 == 1:
337                                  author_name = authors_bib[index - 1] + " " + authors_bib[index]
338                                  author_name = self.latext_parser.latex_to_text(author_name)
339                                  author_name = author_name.replace("\xa0", "")
340
341                                  author = create_contributor()
342                                  author["role"] = "author"
343                                  author["string_name"] = author_name
344                                  xarticle.contributors.append(author)
345
346                  # LANG
347                  reg_lang = re.compile("language = ")
348                  if reg_lang.search(text_part):
349                      xarticle.lang = (
350                          text_part.replace("{", "")
351                          .replace("}", "")
352                          .replace("language = ", "")
353                          .replace(",", "")
354                      )
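                         # BibTeX gives a 3-letter code ("eng", "fre", ...); dropping the last
                         # character presumably approximates the 2-letter code used elsewhere
                         # ("eng" -> "en", "fre" -> "fr"); assumption, not verified.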

355                      if len(xarticle.lang) >= 3:    355 ↛ 358  (line 355 didn't jump to line 358 because the condition on line 355 was always true)
356                          xarticle.lang = xarticle.lang[:-1]
357
358                      if len(xarticle.lang) > 0 and len(xarticle.abstracts) > 0:    358 ↛ 359  (line 358 didn't jump to line 359 because the condition on line 358 was never true)
359                          xarticle.abstracts[0]["lang"] = xarticle.lang
360
361                  if not xarticle.fpage:
362                      # PAGES
363                      reg_pages = re.compile("pages =")
364                      if reg_pages.search(text_part):
365                          pages = (
366                              text_part.replace("{", "")
367                              .replace("}", "")
368                              .replace("(", "")
369                              .replace(")", "")
370                              .replace("[", "")
371                              .replace("]", "")
372                              .replace("pages = ", "")
373                          )
374                          if len(pages) > 0 and pages != "null":    374 ↛ 323  (line 374 didn't jump to line 323 because the condition on line 374 was always true)
375                              pages = pages.split(",")
376                              if re.compile(r"\d+-\d+").search(pages[0]):    376 ↛ 377  (line 376 didn't jump to line 377 because the condition on line 376 was never true)
377                                  txt = f"{url} Fallback for pages with the bibtex"
378                                  self.logger.debug(txt, extra={"pid": xarticle.pid})
379
380                              pages = pages[0].split("-")
381                              xarticle.fpage = pages[0]
382                              if len(pages) > 1:
383                                  reg_digit = re.compile(r"\d+")
384                                  if re.search(reg_digit, str(pages[1])):
385                                      pages[1] = re.search(reg_digit, str(pages[1]))[0]
386                                  xarticle.lpage = pages[1]
387                                  # FIXME : wrong page_range format... Maybe this can be deleted ?
388                                  xarticle.page_range = pages[0] + "-" + pages[1]
389
390                  # reg_title = re.compile("title")
391                  # if reg_title.search(text_part):
392                  #     if (
393                  #         xarticle.title_html is None
394                  #         or xarticle.title_html == ""
395                  #         or xarticle.title_html == "Contents"
396                  #     ):
397                  #         xarticle.title_html = (
398                  #             text_part.replace("{", "")
399                  #             .replace("}", "")
400                  #             .replace("title = ", "")
401                  #             .replace(",", "")
402                  #         )
403                  #         xarticle.title_tex = xarticle.title_html
404                  #         xarticle.title_xml = f"<title-group><article-title>{xarticle.title_html}</article-title></title-group>"
405
406      def download_file(self, url: str):
407          if url.startswith("https://eudml.org/doc"):
408              return super().download_file(url)
409
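             # Anything other than a plain /doc page is presumably rendered client-side, so it
             # is fetched through the Node/puppeteer helper script below rather than with a
             # direct HTTP request (assumption based on the script path and name).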

410          content = ""
411          filename = "/tmp/crawler/puppeteer/" + str(base64.b64encode(url.encode("utf-8")), "utf-8")
412          attempt = 0
413          while not content and attempt < 3:
414              attempt += 1
415              try:
416                  cmd = f"{os.path.dirname(os.path.realpath(__file__))}/crawl_eudml_col.js -u {url} -o {filename}"
417                  execute_cmd(cmd, force_execute=True)
418
419                  if os.path.isfile(filename):
420                      with open(filename) as file_:
421                          content = file_.read()
422                  if not isinstance(self.session, CachedSession):
423                      continue
424                  # Mock an HTTP request to inject the data into the cache
425
426              except subprocess.CalledProcessError:
427                  pass
428
429          if not content:
430              raise requests.exceptions.HTTPError(f"Unable to download {url}")
431
432          return content