Coverage for src/crawler/by_source/eudml_crawler.py: 60%

255 statements  

coverage.py v7.8.0, created at 2025-04-24 10:35 +0000

  1 import base64
  2 import json
  3 import os
  4 import re
  5 import subprocess
  6 
  7 import regex
  8 import requests
  9 from bs4 import BeautifulSoup
 10 from ptf.model_data import (
 11     create_abstract,
 12     create_articledata,
 13     create_contributor,
 14     create_extlink,
 15     create_issuedata,
 16     create_subj,
 17 )
 18 from ptf.utils import execute_cmd
 19 from requests_cache import CachedSession
 20 
 21 from crawler.base_crawler import BaseCollectionCrawler
 22 from crawler.utils import add_pdf_link_to_xarticle
 23 
 24 
 25 class EudmlCrawler(BaseCollectionCrawler):
 26     source_name = "European Digital Mathematics Library"
 27     source_domain = "EUDML"
 28     source_website = "https://eudml.org"

 29 
 30     def parse_collection_content(self, content):
 31         """
 32         Parse the HTML page of a EuDML journal and return a list of xissues.
 33         Each xissue has a list of articles, each carrying just a URL.
 34         """
 35         data = json.loads(content)
 36         soup = BeautifulSoup(base64.b64decode(data["page"]), "html.parser")
 37         xissues = []
 38         volume_year_re = regex.compile(r".*\(<strong>(\d+).*<\/strong>\)")
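            # Illustrative only (assumed markup): a volume "desc" is expected to look
            # something like "Tomus 12 (<strong>1998</strong>)", from which the regex
            # above captures the year, here "1998".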

 39         # Extract the list of volumes
 40         volume_count = 0
 41         issue_count = 0
 42         for v in data["volumes"]:
 43             volume_count += 1
 44             volume_number = v["name"]
 45 
 46             year_re_groups = volume_year_re.search(v["desc"])
 47             if year_re_groups is None:
 48                 print("skipping volume : no year found")
 49                 continue
 50             year = year_re_groups.group(1)
 51             if year == "":  # 51 ↛ 52: condition never true
 52                 print("volume year is an empty string... Skipping")
 53                 continue
 54             if len(v["issues"]) > 0:
 55                 # Extract all the issues
 56                 for i in v["issues"]:
 57                     issue_count += 1
 58                     xissue = self.create_eudml_xissue(i, year, i["name"], volume_number)
 59                     xissues.append(xissue)
 60             else:
 61                 # No issues: the articles are directly in the volume
 62                 xissue = self.create_eudml_xissue(v, year, None, volume_number)
 63                 xissues.append(xissue)
 64 
 65         # EuDML stores the total of issues and articles in the <ul class="article-details unit unit-list">
 66         # This info is used to check the number of articles/issues parsed in the page
 67         volumes_to_find = 0
 68         issues_to_find = 0
 69         articles_to_find = 0
 70         article_details_nodes = soup.find_all("ul", {"class": "article-details unit unit-list"})
 71         for article_detail_node in article_details_nodes:
 72             unit_nodes = article_detail_node.find_all("li")
 73             for unit_node in unit_nodes:
 74                 strong_node = unit_node.find("strong")
 75                 if strong_node is not None:  # 75 ↛ 73: condition always true
 76                     text = strong_node.get_text()
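                        # Assumption: the slice offsets below are len(label) + 1, e.g.
                        # "Issue count:" is 12 characters, so [13:] keeps the value that
                        # follows the label (and its separator) in the <li> text.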

 77                     if text == "Issue count:":
 78                         value = unit_node.get_text()[13:]
 79                         issues_to_find += int(value)
 80                     elif text == "Volume count:":
 81                         value = unit_node.get_text()[14:]
 82                         volumes_to_find += int(value)
 83                     elif text == "Number of articles:":
 84                         value = unit_node.get_text()[20:]
 85                         articles_to_find += int(value)
 86 
 87         if volume_count != volumes_to_find:  # 87 ↛ 88: condition never true
 88             txt = f"EuDML declares {volumes_to_find} volumes for {self.collection_id}. We parsed {volume_count}"
 89             print(txt)
 90 
 91         if issue_count != issues_to_find:
 92             txt = f"EuDML declares {issues_to_find} issues for {self.collection_id}. We parsed {issue_count}"
 93             print(txt)
 94 
 95         article_count = sum([len(xissue.articles) for xissue in xissues])
 96         if article_count != articles_to_find:
 97             txt = f"EuDML declares {articles_to_find} articles for {self.collection_id}. We parsed {article_count}"
 98             print(txt)
 99 
100         return xissues

101 
102     def create_eudml_xissue(
103         self, issue_data: dict, year_str, issue_number: str | None, volume_number
104     ):
105         """
106         EuDML does not have a separate HTML page for an issue.
107         The list of issues/articles is found directly in the collection page.
108 
109         This method creates an IssueData (see ptf/model_data.py) and sets its year/volume.
110         The PID is temporary and will be updated with the issue number (if any).
111         It also creates the articles directly, but with just a PID and a URL.
112         """
113         xissue = create_issuedata()
114         xissue.pid = self.collection_id + "_" + year_str + "__" + volume_number
115         if issue_number:
116             xissue.pid = xissue.pid + "_" + issue_number
117         xissue.year = year_str
118         xissue.volume = volume_number
119         if issue_number:
120             xissue.number = issue_number
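            # Resulting PID shape (hypothetical values): "MYCOLL_1998__12" for a volume
            # without issues, "MYCOLL_1998__12_3" when an issue number is present.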

121 
122         issue_data["articles"].sort(key=lambda a: a["sortKey"])
123         for index_article, article_data in enumerate(issue_data["articles"]):
124             xarticle = create_articledata()
125             xarticle.pid = "a" + str(index_article)
126             xarticle.url = article_data["url"]
127             xissue.articles.append(xarticle)
128         return xissue
129 
130     def parse_article_content(self, content, xissue, xarticle, url):
131         """
132         Parse the content with BeautifulSoup and return an ArticleData.
133         """
134         soup = BeautifulSoup(content, "xml")
135 
136         self.get_metadata_using_citation_meta(
137             xarticle,
138             xissue,
139             soup,
140             [
141                 "lang",
142                 "title",
143                 "author",
144                 "pdf",
145                 "abstract",
146                 "page",
147                 "doi",
148                 "mr",
149                 "zbl",
150                 "publisher",
151                 "keywords",
152             ],
153         )
154 
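            # Presumably the base-crawler helper above reads the standard "citation_*"
            # <meta> tags (citation_title, citation_author, citation_pdf_url, ...) for
            # the fields listed; the exact behaviour lives in BaseCollectionCrawler.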

155         # LINK to SOURCE
156         url_full_text_node = soup.find("a", text="Access to full text")
157         if url_full_text_node is not None:
158             url_full_text = url_full_text_node.get("href")
159             if isinstance(url_full_text, str):  # 159 ↛ 164: condition always true
160                 ext_link = create_extlink(rel="primary-source", location=url_full_text)
161                 xarticle.ext_links.append(ext_link)
162 
163         # MSC KEYWORDS
164         subj_part = soup.select_one("article#unit-subject-areas")
165         if subj_part is not None:  # 165 ↛ 166: condition never true
166             reg_msc = re.compile("/subject/MSC/[a-zA-Z0-9.]+")
167             subjs = [a for a in subj_part.select("a") if reg_msc.search(a.get("href"))]
168             for subj in subjs:
169                 type_class = subj.get("href").split("/")
170                 subject = create_subj(type="msc", lang=xarticle.lang)
171                 subject["value"] = type_class[3]
172                 xarticle.kwds.append(subject)
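                # Illustrative only: an MSC link href such as "/subject/MSC/53C25"
                # splits into ["", "subject", "MSC", "53C25"], so type_class[3] is the code.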

173 
174         # FALLBACK
175         if not xarticle.title_tex:
176             try:
177                 title = soup.select_one("h1").get_text(strip=True).replace("\xa0", " ")
178                 txt = f"{url} Fallback for title"
179                 print(txt)
180                 xarticle.title_tex = title.replace("\xa0", " ").replace("\n", "")
181             # FIXME
182             except:  # noqa: E722
183                 pass
184 
185         if len(xarticle.contributors) == 0:
186             # AUTHORS
187             authors_bloc = soup.select_one("p.sub-title-1")
188             if authors_bloc:  # 188 ↛ 203: condition always true
189                 authors_node = authors_bloc.find_all("a")
190                 if len(authors_node) > 0:  # 190 ↛ 191: condition never true
191                     txt = f"{url} Fallback for authors"
192                     print(txt)
193                 for author_node in authors_node:  # 193 ↛ 194: loop never started
194                     text_author = author_node.get_text()
195                     text_author = text_author.replace(",", "")
196 
197                     author = create_contributor()
198                     author["role"] = "author"
199                     author["string_name"] = text_author
200 
201                     xarticle.contributors.append(author)
202 
203         if len(xarticle.streams) == 0:  # 203 ↛ 211: condition always true
204             # PDF
205             pdf_node = soup.find("a", text="Full (PDF)")
206             if pdf_node is not None:
207                 pdf_url = pdf_node.get("href")
208                 if pdf_url:  # 208 ↛ 211: condition always true
209                     add_pdf_link_to_xarticle(xarticle, pdf_url)
210 
211         if len(xarticle.streams) == 0:
212             if not url_full_text_node:
213                 print(f"[{self.source_domain}] {self.collection_id} : Couldn't find pdf")
214             else:
215                 add_pdf_link_to_xarticle(xarticle, url_full_text_node.get("href"))
216 
217         if len(xarticle.abstracts) == 0:  # 217 ↛ 229: condition always true
218             # ABSTRACT
219             abstract_node = soup.find("article", {"id": "unit-article-abstract"})
220             if abstract_node is not None:  # 220 ↛ 221: condition never true
221                 abstract_section_node = abstract_node.find("section")
222                 if abstract_section_node:
223                     abstract = str(abstract_section_node)
224                     xabstract = create_abstract(
225                         tag="abstract", value_tex=abstract, lang=xarticle.lang
226                     )
227                     xarticle.abstracts.append(xabstract)
228 
229         if len(xarticle.contributors) == 0 or not xarticle.fpage:
230             # LANG, PAGES, (AUTHORS)
231             # EuDML has an export BibTeX section with some information (lang, pages, authors)
232             self.parse_bibtex(soup, xarticle, url)
233 
234         if xarticle.doi is None:  # 234 ↛ 277: condition always true
235             # DOI
236             doi_link = soup.find("article", {"id": "unit-other-ids"})
237             if doi_link is not None:  # 237 ↛ 240: condition never true
238                 # Simplify ?
239                 # See https://eudml.org/doc/54683 with http://dx.doi.org/10.1155/2007/10368%E2%80%89
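                    # Worked example from the URL above: the raw href ends with a thin
                    # space (%E2%80%89, U+2009); the clean-up below is expected to yield
                    # the bare DOI "10.1155/2007/10368".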

240                 try:
241                     reg_doi = re.compile("doi.org")
242                     doi_array = [
243                         d.get("href")
244                         for d in doi_link.find_all("a")
245                         if reg_doi.search(str(d.get("href")))
246                     ]
247                     if doi_array:
248                         if len(doi_array) > 1:
249                             start_dois = len(doi_array) - 1
250                             doi = doi_array[start_dois:][0]
251                         else:
252                             doi = doi_array[0]
253 
254                         doi_array = doi.split("doi.org/")
255                         # strip unwanted chars present
256                         if len(doi_array) > 1:
257                             doi = doi_array[1].encode("ascii", "ignore")
258                             doi = str(doi.decode())
259                             doi_array = doi.split("\\u")
260                             doi = str(doi_array[0])
261 
262                             doi = re.sub("}", "", doi)
263                             doi = re.sub("\t", "", doi)
264                             doi = doi.encode("ascii", "ignore")
265                             doi = doi.decode()
266 
267                             doi = bytes(r"{}".format(r"" + doi + ""), "utf-8")
268                             doi = doi.decode()
269                             doi_array = doi.split("\\u")
270                             doi = str(doi_array[0]).strip()
271                             doi = doi.replace(" ", "")
272 
273                             xarticle.doi = doi
274                 except TypeError as e:
275                     print(e)

276 
277         has_zblid = len([extid for extid in xarticle.extids if extid[0] == "zbl-item-id"]) == 1
278         if not has_zblid:
279             zb_tag = soup.select_one("article#unit-other-ids a:-soup-contains('ZBMath')")
280             if zb_tag:  # 280 ↛ 281: condition never true
281                 zb_href = zb_tag.get("href")
282                 if not isinstance(zb_href, str):
283                     raise ValueError("Couldn't parse zbmath href")
284                 zblid = zb_href.removeprefix(
285                     "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"
286                 )
287                 xarticle.extids.append(("zbl-item-id", zblid))
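                    # Illustrative only: a href of the form
                    # "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:0123.45678"
                    # yields the Zbl identifier "0123.45678" (hypothetical value).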

288 
289         # In Other Databases is not (always ?) the publisher
290         # if not xissue.publisher:
291         #     # PUBLISHER
292         #     section_oai = soup.find("h3", text="In Other Databases")
293         #     if section_oai is not None:
294         #         section_oai_array = section_oai.parent.find_all("dd")
295         #         if section_oai is not None:
296         #             pub = [
297         #                 d.text
298         #                 for d in section_oai_array
299         #                 if d.text.strip() not in ["DOI", "ZBMath", "MathSciNet", "PUBLISHER"]
300         #             ]
301         #             if pub != "":
302         #                 print(f"{url} Fallback for publisher")
303         #                 xpub = create_publisherdata()
304         #                 xpub.name = pub[0].strip()
305         #                 xissue.publisher = xpub
306         return xarticle

307 
308     def parse_bibtex(self, soup, xarticle, url):
309         """
310         Parse the BibTeX section of a EuDML article page.
311         Extract
312         - the authors (if no author was already found in the page)
313         - the article language
314         - the article pages
315         """
316         bib_div = [p for p in soup.find_all("p") if "@article" in p.text]
317 
318         if len(bib_div) > 0:  # 318 ↛ exit: condition always true
319             bib_tex = bib_div[0].get_text()
320             text = bib_tex.split("\t")
321 
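                # Assumed layout: the exported BibTeX is a single blob whose fields are
                # tab-separated, e.g. "@article{<TAB>author = {Doe, Jane},<TAB>language
                # = {eng},<TAB>pages = {(12-34)},<TAB>...}", so each text_part below is
                # one "key = {value}," chunk.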

322             for text_part in text:
323                 # AUTHORS (only if no authors were already found in the page)
324                 if len(xarticle.contributors) == 0:
325                     reg_author = re.compile("author =")
326                     if reg_author.search(text_part):  # 326 ↛ 327: condition never true
327                         txt = f"{url} Fallback for authors with the bibtex"
328                         print(txt)
329 
330                         authors_text = (
331                             text_part.replace("{", "").replace("}", "").replace("author = ", "")
332                         )
333                         authors_bib = authors_text.split(",")
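                            # The author field is assumed to list names as "Last, First,
                            # Last, First, ...": after the split on ",", the loop below
                            # re-joins consecutive pairs into "Last First" strings.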

334                         for index, name in enumerate(authors_bib):
335                             if index % 2 == 1:
336                                 author_name = authors_bib[index - 1] + " " + authors_bib[index]
337                                 author_name = self.latext_parser.latex_to_text(author_name)
338                                 author_name = author_name.replace("\xa0", "")
339 
340                                 author = create_contributor()
341                                 author["role"] = "author"
342                                 author["string_name"] = author_name
343                                 xarticle.contributors.append(author)
344 
345                 # LANG
346                 reg_lang = re.compile("language = ")
347                 if reg_lang.search(text_part):
348                     xarticle.lang = (
349                         text_part.replace("{", "")
350                         .replace("}", "")
351                         .replace("language = ", "")
352                         .replace(",", "")
353                     )
354                     if len(xarticle.lang) >= 3:  # 354 ↛ 357: condition always true
355                         xarticle.lang = xarticle.lang[:-1]
356 
357                     if len(xarticle.lang) > 0 and len(xarticle.abstracts) > 0:  # 357 ↛ 358: condition never true
358                         xarticle.abstracts[0]["lang"] = xarticle.lang
359 
360                 if not xarticle.fpage:
361                     # PAGES
362                     reg_pages = re.compile("pages =")
363                     if reg_pages.search(text_part):
364                         pages = (
365                             text_part.replace("{", "")
366                             .replace("}", "")
367                             .replace("(", "")
368                             .replace(")", "")
369                             .replace("[", "")
370                             .replace("]", "")
371                             .replace("pages = ", "")
372                         )
373                         if len(pages) > 0 and pages != "null":  # 373 ↛ 322: condition always true
374                             pages = pages.split(",")
375                             if re.compile(r"\d+-\d+").search(pages[0]):  # 375 ↛ 376: condition never true
376                                 txt = f"{url} Fallback for pages with the bibtex"
377                                 print(txt)
378 
379                                 pages = pages[0].split("-")
380                                 xarticle.fpage = pages[0]
381                                 if len(pages) > 1:
382                                     reg_digit = re.compile(r"\d+")
383                                     if re.search(reg_digit, str(pages[1])):
384                                         pages[1] = re.search(reg_digit, str(pages[1]))[0]
385                                     xarticle.lpage = pages[1]
386                                     # FIXME : wrong page_range format... Maybe this can be deleted ?
387                                     xarticle.page_range = pages[0] + "-" + pages[1]
388 
389                 # reg_title = re.compile("title")
390                 # if reg_title.search(text_part):
391                 #     if (
392                 #         xarticle.title_html is None
393                 #         or xarticle.title_html == ""
394                 #         or xarticle.title_html == "Contents"
395                 #     ):
396                 #         xarticle.title_html = (
397                 #             text_part.replace("{", "")
398                 #             .replace("}", "")
399                 #             .replace("title = ", "")
400                 #             .replace(",", "")
401                 #         )
402                 #         xarticle.title_tex = xarticle.title_html
403                 #         xarticle.title_xml = f"<title-group><article-title>{xarticle.title_html}</article-title></title-group>"
404 

405     def download_file(self, url: str):
406         if url.startswith("https://eudml.org/doc"):
407             return super().download_file(url)
408 
409         content = ""
410         filename = "/tmp/crawler/puppeteer/" + str(base64.b64encode(url.encode("utf-8")), "utf-8")
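            # The cache filename is simply the base64 of the URL, e.g. a hypothetical
            # "https://eudml.org/journal/10123" becomes
            # "/tmp/crawler/puppeteer/aHR0cHM6Ly9ldWRtbC5vcmcvam91cm5hbC8xMDEyMw==".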

411         attempt = 0
412         while not content and attempt < 3:
413             attempt += 1
414             try:
415                 cmd = f"{os.path.dirname(os.path.realpath(__file__))}/crawl_eudml_col.js -u {url} -o {filename}"
416                 print(cmd)
417                 execute_cmd(cmd, force_execute=True)
418 
419                 if os.path.isfile(filename):
420                     with open(filename) as file_:
421                         content = file_.read()
422                     if not isinstance(self.session, CachedSession):
423                         continue
424                     # Mock an HTTP request to inject the data into the cache
425 
426             except subprocess.CalledProcessError:
427                 pass
428 
429         if not content:
430             raise requests.exceptions.HTTPError(f"Unable to download {url}")
431 
432         return content