Coverage for src/crawler/by_source/eudml_crawler.py: 40%

302 statements  

coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

  1  import base64
  2  import json
  3  import os
  4  import re
  5  import subprocess
  6
  7  import regex
  8  import requests
  9  from bs4 import BeautifulSoup
 10  from django.conf import settings
 11  from ptf.model_data import (
 12      create_articledata,
 13      create_contributor,
 14      create_extlink,
 15      create_issuedata,
 16      create_subj,
 17  )
 18  from ptf.utils import execute_cmd
 19  from requests_cache import CachedSession
 20
 21  from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle
 22
 23
 24  class EudmlCrawler(BaseCollectionCrawler):
 25      source_name = "European Digital Mathematics Library"
 26      source_domain = "EUDML"
 27      source_website = "https://eudml.org"
 28
 29      def __init__(self, *args, **kwargs):
 30          super().__init__(*args, **kwargs)
 31
 32          self.source = self.get_or_create_source()
 33
 34          self.has_dynamic_collection_pages = True
 35
 36      def parse_collection_content(self, content):
 37          """
 38          Parse the HTML page of a EuDML journal and return a list of xissues.
 39          Each xissue has a list of articles with just a URL.
 40
 41          self.periode is set during the parsing with the <meta name="citation_year"> of the HTML page.
 42          """
 43          data = json.loads(content)
 44          soup = BeautifulSoup(base64.b64decode(data["page"]), "html.parser")
 45          xissues = []
 46
 47          citation_year_node = soup.find("meta", {"name": "citation_year"})
 48          if citation_year_node:    [48 ↛ 59] line 48 didn't jump to line 59 because the condition was always true
 49              value = citation_year_node.get("content")
 50              values = value.split("-")
 51              try:
 52                  self.periode_begin = int(values[0])
 53                  if len(values) > 1:    [53 ↛ 57] line 53 didn't jump to line 57 because the condition was always true
 54                      self.periode_end = int(values[1])
 55              except ValueError:
 56                  pass
 57              self.periode = self.get_or_create_periode()
 58
 59          volume_year_re = regex.compile(r".*\(<strong>(\d+).*<\/strong>\)")
 60          # Extract the list of volumes
 61          volume_count = 0
 62          issue_count = 0
 63          for v in data["volumes"]:
 64              volume_count += 1
 65              volume_number = v["name"]
 66
 67              year_re_groups = volume_year_re.search(v["desc"])
 68              if year_re_groups is None:    [68 ↛ 69] line 68 didn't jump to line 69 because the condition was never true
 69                  print("skipping volume : no year found")
 70                  continue
 71              year = year_re_groups.group(1)
 72
 73              if len(v["issues"]) > 0 and year != "":
 74                  # Extract all the issues
 75                  for i in v["issues"]:
 76                      issue_count += 1
 77                      xissue = self.create_xissue(i, year, i["name"], volume_number)
 78                      xissues.append(xissue)
 79              else:
 80                  # No issues, articles are directly in the volume
 81                  xissue = self.create_xissue(v, year, None, volume_number)
 82                  xissues.append(xissue)
 83
 84          # EuDML stores the total of issues and articles in the <ul class="article-details unit unit-list">
 85          # This info is used to check the number of articles/issues parsed in the page
 86          volumes_to_find = 0
 87          issues_to_find = 0
 88          articles_to_find = 0
 89          article_details_nodes = soup.find_all("ul", {"class": "article-details unit unit-list"})
 90          for article_detail_node in article_details_nodes:
 91              unit_nodes = article_detail_node.find_all("li")
 92              for unit_node in unit_nodes:
 93                  strong_node = unit_node.find("strong")
 94                  if strong_node is not None:    [94 ↛ 92] line 94 didn't jump to line 92 because the condition was always true
 95                      text = strong_node.get_text()
 96                      if text == "Issue count:":
 97                          value = unit_node.get_text()[13:]
 98                          issues_to_find += int(value)
 99                      elif text == "Volume count:":
100                          value = unit_node.get_text()[14:]
101                          volumes_to_find += int(value)
102                      elif text == "Number of articles:":
103                          value = unit_node.get_text()[20:]
104                          articles_to_find += int(value)
105
106          if volume_count != volumes_to_find:    [106 ↛ 107] line 106 didn't jump to line 107 because the condition was never true
107              txt = f"EuDML declares {volumes_to_find} volumes for {self.collection_id}. We parsed {volume_count}"
108              print(txt)
109              if settings.CRAWLER_LOG_FILE:
110                  with open(settings.CRAWLER_LOG_FILE, "a") as f_:
111                      f_.write(txt + "\n")
112
113          if issue_count != issues_to_find:    [113 ↛ 114] line 113 didn't jump to line 114 because the condition was never true
114              txt = f"EuDML declares {issues_to_find} issues for {self.collection_id}. We parsed {issue_count}"
115              print(txt)
116              if settings.CRAWLER_LOG_FILE:
117                  with open(settings.CRAWLER_LOG_FILE, "a") as f_:
118                      f_.write(txt + "\n")
119
120          article_count = sum([len(xissue.articles) for xissue in xissues])
121          if article_count != articles_to_find:    [121 ↛ 122] line 121 didn't jump to line 122 because the condition was never true
122              txt = f"EuDML declares {articles_to_find} articles for {self.collection_id}. We parsed {article_count}"
123              print(txt)
124              if settings.CRAWLER_LOG_FILE:
125                  with open(settings.CRAWLER_LOG_FILE, "a") as f_:
126                      f_.write(txt + "\n")
127
128          return xissues
129
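
The citation_year handling above (lines 47-57) accepts both a single year and a year range. A minimal standalone sketch of that logic, assuming a journal page whose meta tag content is the hypothetical "1995-2010":

from bs4 import BeautifulSoup

html = '<meta name="citation_year" content="1995-2010">'  # hypothetical sample
soup = BeautifulSoup(html, "html.parser")

periode_begin = periode_end = None
node = soup.find("meta", {"name": "citation_year"})
if node:
    values = node.get("content").split("-")
    try:
        periode_begin = int(values[0])      # 1995
        if len(values) > 1:
            periode_end = int(values[1])    # 2010
    except ValueError:
        pass                                # non-numeric years are ignored, as in the crawler
print(periode_begin, periode_end)           # 1995 2010
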

130      def create_xissue(self, issue_data: dict, year_str, issue_number: str | None, volume_number):
131          """
132          EuDML does not have a separate HTML page for an issue.
133          The list of issues/articles is directly found in the collection page.
134
135          create_xissue creates an IssueData (see ptf/model_data.py) and sets its year/volume.
136          The PID is temporary and will be updated with the issue number (if any).
137          create_xissue directly creates articles, but with just a pid and a URL.
138          """
139          xissue = create_issuedata()
140          xissue.pid = self.collection_id + "_" + year_str + "__" + volume_number
141          if issue_number:
142              xissue.pid = xissue.pid + "_" + issue_number
143          xissue.year = year_str
144          xissue.volume = volume_number
145          if issue_number:
146              xissue.number = issue_number
147
148          issue_data["articles"].sort(key=lambda a: a["sortKey"])
149          for index_article, article_data in enumerate(issue_data["articles"]):
150              xarticle = create_articledata()
151              xarticle.pid = "a" + str(index_article)
152              xarticle.url = article_data["url"]
153              xissue.articles.append(xarticle)
154          return xissue
155
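
For reference, the PID built at lines 140-142 concatenates collection id, year, volume and (optionally) issue number. With hypothetical values it comes out as follows:

collection_id = "FOO"    # hypothetical collection id
year_str = "2001"
volume_number = "12"
issue_number = "3"

pid = collection_id + "_" + year_str + "__" + volume_number
if issue_number:
    pid = pid + "_" + issue_number
print(pid)               # FOO_2001__12_3
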

156      def parse_article_content(self, content, xissue, xarticle, url, pid):
157          """
158          Parse the content with BeautifulSoup and return an ArticleData
159          """
160          xarticle = create_articledata()
161          xarticle.pid = pid
162          soup = BeautifulSoup(content, "xml")
163
164          what = [
165              "lang",
166              "title",
167              "author",
168              "pdf",
169              "abstract",
170              "page",
171              "doi",
172              "mr",
173              "zbl",
174              "publisher",
175              "keywords",
176          ]
177          self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)
178
179          # LINK to SOURCE
180          url_full_text_node = soup.find("a", text="Access to full text")
181          if url_full_text_node is not None:    [181 ↛ 182] line 181 didn't jump to line 182 because the condition was never true
182              url_full_text = url_full_text_node.get("href")
183              ext_link = create_extlink()
184              ext_link["rel"] = "primary-source"
185              ext_link["location"] = url_full_text
186              xarticle.ext_links.append(ext_link)
187
188          # MSC KEYWORDS
189          subj_part = soup.find("article", {"id": "unit-subject-areas"})
190          if subj_part is not None:    [190 ↛ 191] line 190 didn't jump to line 191 because the condition was never true
191              reg_msc = re.compile("/subject/MSC/[a-zA-Z0-9.]+")
192              subjs = [a for a in subj_part.find_all("a") if reg_msc.search(a.get("href"))]
193              for subj in subjs:
194                  type_class = subj.get("href").split("/")
195                  subject = create_subj()
196                  subject["value"] = type_class[3]
197                  subject["type"] = "msc"
198                  subject["lang"] = "en"
199                  xarticle.kwds.append(subject)
200
201          # FALLBACK
202          if not xarticle.title_tex:    [202 ↛ 203] line 202 didn't jump to line 203 because the condition was never true
203              try:
204                  title = soup.find("h1").get_text(strip=True).replace("\xa0", " ")
205                  txt = f"{url} Fallback for title"
206                  print(txt)
207                  if settings.CRAWLER_LOG_FILE:
208                      with open(settings.CRAWLER_LOG_FILE, "a") as f_:
209                          f_.write(txt + "\n")
210                  xarticle.title_tex = title.replace("\xa0", " ").replace("\n", "")
211              # FIXME
212              except:  # noqa: E722
213                  pass
214
215          if len(xarticle.contributors) == 0:    [215 ↛ 217] line 215 didn't jump to line 217 because the condition was never true
216              # AUTHORS
217              authors_bloc = soup.find("p", {"class": "sub-title-1"})
218              if authors_bloc:
219                  authors_node = authors_bloc.find_all("a")
220                  if len(authors_node) > 0:
221                      txt = f"{url} Fallback for authors"
222                      print(txt)
223                      if settings.CRAWLER_LOG_FILE:
224                          with open(settings.CRAWLER_LOG_FILE, "a") as f_:
225                              f_.write(txt + "\n")
226                      for author_node in authors_node:
227                          text_author = author_node.get_text()
228                          text_author = text_author.replace(",", "")
229
230                          author = create_contributor()
231                          author["role"] = "author"
232                          author["string_name"] = text_author
233
234                          xarticle.contributors.append(author)
235
236          if len(xarticle.streams) == 0:    [236 ↛ 244] line 236 didn't jump to line 244 because the condition was always true
237              # PDF
238              pdf_node = soup.find("a", text="Full (PDF)")
239              if pdf_node is not None:    [239 ↛ 240] line 239 didn't jump to line 240 because the condition was never true
240                  pdf_url = pdf_node.get("href")
241                  if pdf_url:
242                      add_pdf_link_to_xarticle(xarticle, pdf_url)
243
244          if len(xarticle.abstracts) == 0:    [244 ↛ 260] line 244 didn't jump to line 260 because the condition was always true
245              # ABSTRACT
246              abstract_node = soup.find("article", {"id": "unit-article-abstract"})
247              if abstract_node is not None:    [247 ↛ 248] line 247 didn't jump to line 248 because the condition was never true
248                  abstract_section_node = abstract_node.find("section")
249                  if abstract_section_node:
250                      abstract = str(abstract_section_node)
251                      xabstract = {
252                          "tag": "abstract",
253                          "value_html": "",
254                          "value_tex": abstract,
255                          "value_xml": "",
256                          "lang": "en",
257                      }
258                      xarticle.abstracts.append(xabstract)
259
260          if len(xarticle.contributors) == 0 or not xarticle.fpage:    [260 ↛ 263] line 260 didn't jump to line 263 because the condition was never true
261              # LANG, PAGES, (AUTHORS)
262              # EuDML has an export BibTeX section with some information (lang, pages, authors)
263              self.parse_bibtex(soup, xarticle, url)
264
265          if xarticle.doi is None:    [265 ↛ 341] line 265 didn't jump to line 341 because the condition was always true
266              # DOI
267              doi_link = soup.find("article", {"id": "unit-other-ids"})
268              if doi_link is not None:    [268 ↛ 271] line 268 didn't jump to line 271 because the condition was never true
269                  # Simplify?
270                  # See https://eudml.org/doc/54683 with http://dx.doi.org/10.1155/2007/10368%E2%80%89
271                  try:
272                      reg_doi = re.compile("doi.org")
273                      doi_array = [
274                          d.get("href")
275                          for d in doi_link.find_all("a")
276                          if reg_doi.search(str(d.get("href")))
277                      ]
278                      if doi_array:
279                          if len(doi_array) > 1:
280                              start_dois = len(doi_array) - 1
281                              doi = doi_array[start_dois:][0]
282                          else:
283                              doi = doi_array[0]
284
285                          doi_array = doi.split("doi.org/")
286                          # strip unwanted chars present
287                          if len(doi_array) > 1:
288                              doi = doi_array[1].encode("ascii", "ignore")
289                              doi = str(doi.decode())
290                              doi_array = doi.split("\\u")
291                              doi = str(doi_array[0])
292
293                          doi = re.sub("}", "", doi)
294                          doi = re.sub("\t", "", doi)
295                          doi = doi.encode("ascii", "ignore")
296                          doi = doi.decode()
297
298                          doi = bytes(r"{}".format(r"" + doi + ""), "utf-8")
299                          doi = doi.decode()
300                          doi_array = doi.split("\\u")
301                          doi = str(doi_array[0]).strip()
302                          doi = doi.replace(" ", "")
303
304                          xarticle.doi = doi
305                  except TypeError as e:
306                      print(e)
307
308          # You can't get the first link to zbmath.org: it could be in the list of references!
309
310          # has_zblid = len([extid for extid in xarticle.extids if extid[0] == "zbl-item-id"]) == 1
311          # if not has_zblid:
312          #     # ZBL
313          #     zblid_link = soup.find(
314          #         "a", {"href": re.compile(r"http:\/\/www.zentralblatt-math.org\/zmath\/")}
315          #     )
316          #     if zblid_link is not None:
317          #         zblid = zblid_link.get("href").split("?q=")[1]
318          #         if zblid:
319          #             print(f"{url} Fallback for zbl-id: {zblid}")
320          #             xarticle.extids.append(("zbl-item-id", zblid))
321
322          # In Other Databases is not (always?) the publisher
323          # if not xissue.publisher:
324          #     # PUBLISHER
325          #     section_oai = soup.find("h3", text="In Other Databases")
326          #     if section_oai is not None:
327          #         section_oai_array = section_oai.parent.find_all("dd")
328          #         if section_oai is not None:
329          #             pub = [
330          #                 d.text
331          #                 for d in section_oai_array
332          #                 if d.text.strip() not in ["DOI", "ZBMath", "MathSciNet", "PUBLISHER"]
333          #             ]
334          #             if pub != "":
335          #                 print(f"{url} Fallback for publisher")
336          #                 xpub = create_publisherdata()
337          #                 xpub.name = pub[0].strip()
338          #                 xissue.publisher = xpub
339
340          # ARTICLE PID
341          if xarticle.doi is not None:    [341 ↛ 342] line 341 didn't jump to line 342 because the condition was never true
342              xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")
343              xarticle.pid = xarticle.pid.replace("pid", "").replace(":", "_")
344          else:
345              reg_article = regex.compile(r"\d+")
346              if not isinstance(xarticle.pid, type(None)):    [346 ↛ 352] line 346 didn't jump to line 352 because the condition was always true
347                  pid_array = reg_article.findall(url)
348                  if len(pid_array) > 0:    [348 ↛ 352] line 348 didn't jump to line 352 because the condition was always true
349                      id_article = pid_array[0]
350                      xarticle.pid = xissue.pid + "_" + id_article
351
352          return xarticle
353
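
The DOI clean-up at lines 271-304 essentially keeps the last doi.org link, takes the part after "doi.org/" and strips braces, tabs, spaces and non-ASCII debris (such as the trailing thin space in the example cited at line 270). A condensed sketch of that normalization, not the method itself, using that example href:

import re

href = "http://dx.doi.org/10.1155/2007/10368\u2009"    # example from https://eudml.org/doc/54683

doi = href.split("doi.org/")[-1]
doi = doi.encode("ascii", "ignore").decode()            # drops the trailing thin space (U+2009)
doi = re.sub(r"[{}\t ]", "", doi)                       # strips braces, tabs and spaces
print(doi)                                              # 10.1155/2007/10368
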

354      def parse_bibtex(self, soup, xarticle, url):
355          """
356          Parse the BibTeX section of a EuDML article page.
357          Extract:
358          - the authors (if no author was already found in the page)
359          - the article language
360          - the article pages
361          """
362          bib_div = [p for p in soup.find_all("p") if "@article" in p.text]
363
364          if len(bib_div) > 0:
365              bib_tex = bib_div[0].get_text()
366              text = bib_tex.split("\t")
367
368              for text_part in text:
369                  # AUTHORS (only if no authors were already found in the page)
370                  if len(xarticle.contributors) == 0:
371                      reg_author = re.compile("author =")
372                      if reg_author.search(text_part):
373                          txt = f"{url} Fallback for authors with the bibtex"
374                          print(txt)
375                          if settings.CRAWLER_LOG_FILE:
376                              with open(settings.CRAWLER_LOG_FILE, "a") as f_:
377                                  f_.write(txt + "\n")
378
379                          authors_text = (
380                              text_part.replace("{", "").replace("}", "").replace("author = ", "")
381                          )
382                          authors_bib = authors_text.split(",")
383                          for index, name in enumerate(authors_bib):
384                              if index % 2 == 1:
385                                  author_name = authors_bib[index - 1] + " " + authors_bib[index]
386                                  author_name = self.latext_parser.latex_to_text(author_name)
387                                  author_name = author_name.replace("\xa0", "")
388
389                                  author = create_contributor()
390                                  author["role"] = "author"
391                                  author["string_name"] = author_name
392                                  xarticle.contributors.append(author)
393
394                  # LANG
395                  reg_lang = re.compile("language = ")
396                  if reg_lang.search(text_part):
397                      xarticle.lang = (
398                          text_part.replace("{", "")
399                          .replace("}", "")
400                          .replace("language = ", "")
401                          .replace(",", "")
402                      )
403                      if len(xarticle.lang) >= 3:
404                          xarticle.lang = xarticle.lang[:-1]
405
406                      if len(xarticle.lang) > 0 and len(xarticle.abstracts) > 0:
407                          xarticle.abstracts[0]["lang"] = xarticle.lang
408
409                  if not xarticle.fpage:
410                      # PAGES
411                      reg_pages = re.compile("pages =")
412                      if reg_pages.search(text_part):
413                          pages = (
414                              text_part.replace("{", "")
415                              .replace("}", "")
416                              .replace("(", "")
417                              .replace(")", "")
418                              .replace("[", "")
419                              .replace("]", "")
420                              .replace("pages = ", "")
421                          )
422                          if len(pages) > 0 and pages != "null":
423                              pages = pages.split(",")
424                              if re.compile(r"\d+-\d+").search(pages[0]):
425                                  txt = f"{url} Fallback for pages with the bibtex"
426                                  print(txt)
427                                  if settings.CRAWLER_LOG_FILE:
428                                      with open(settings.CRAWLER_LOG_FILE, "a") as f_:
429                                          f_.write(txt + "\n")
430
431                                  pages = pages[0].split("-")
432                                  xarticle.fpage = pages[0]
433                                  if len(pages) > 1:
434                                      reg_digit = re.compile(r"\d+")
435                                      if re.search(reg_digit, str(pages[1])):
436                                          pages[1] = re.search(reg_digit, str(pages[1]))[0]
437                                      xarticle.lpage = pages[1]
438                                      xarticle.page_range = pages[0] + "-" + pages[1]
439
440          # reg_title = re.compile("title")
441          # if reg_title.search(text_part):
442          #     if (
443          #         xarticle.title_html is None
444          #         or xarticle.title_html == ""
445          #         or xarticle.title_html == "Contents"
446          #     ):
447          #         xarticle.title_html = (
448          #             text_part.replace("{", "")
449          #             .replace("}", "")
450          #             .replace("title = ", "")
451          #             .replace(",", "")
452          #         )
453          #         xarticle.title_tex = xarticle.title_html
454          #         xarticle.title_xml = f"<title-group><article-title>{xarticle.title_html}</article-title></title-group>"
455
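
The pages fallback (lines 409-438) expects a BibTeX field such as pages = {12-34}. A small sketch of the same extraction on a hypothetical fragment:

import re

text_part = "pages = {12-34},"                      # hypothetical BibTeX fragment

pages = re.sub(r"[{}()\[\]]", "", text_part).replace("pages = ", "")
if pages and pages != "null":
    first = pages.split(",")[0]                     # "12-34"
    if re.search(r"\d+-\d+", first):
        fpage, lpage = first.split("-")[:2]
        print(fpage, lpage, f"{fpage}-{lpage}")     # 12 34 12-34
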

456      def get_page_content(self, url: str, force_download=False):
457          if url.startswith("https://eudml.org/doc"):
458              return self.download_file(url)
459
460          content = ""
461
462          def set_progress_bar_title():
463              if not self.progress_bar:    [463 ↛ 465] line 463 didn't jump to line 465 because the condition was always true
464                  return
465              self.progress_bar.text(f"Download {url}")
466
467          set_progress_bar_title()
468          content = self.download_file_dynamic(
469              url,
470              filename="/tmp/crawler/puppeteer/"
471              + str(base64.b64encode(url.encode("utf-8")), "utf-8"),
472          )
473
474          return content
475
476      def download_file_dynamic(self, url: str, filename: str):
477          """
478          Runs a NodeJS subprocess to parse an EuDML collection
479          """
480
481          txt = f"Download {url}"
482          if settings.CRAWLER_LOG_FILE:
483              with open(settings.CRAWLER_LOG_FILE, "a") as f_:
484                  f_.write(txt + "\n")
485
486          content = ""
487          attempt = 0
488          while not content and attempt < 3:
489              attempt += 1
490              try:
491                  cmd = f"{os.path.dirname(os.path.realpath(__file__))}/crawl_eudml_col.js -u {url} -o {filename}"
492                  print(cmd)
493                  execute_cmd(cmd)
494
495                  if os.path.isfile(filename):
496                      with open(filename) as file_:
497                          content = file_.read()
498                  if not isinstance(self.session, CachedSession):
499                      continue
500                  # Mock an HTTP request to inject the data into the cache
501
502              except subprocess.CalledProcessError:
503                  pass
504
505          if not content:
506              raise requests.exceptions.HTTPError(f"Unable to download {url}")
507
508          return content
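
get_page_content() builds the cache filename from the base64-encoded URL (lines 470-471), and download_file_dynamic() shells out to crawl_eudml_col.js (line 491). For a hypothetical collection URL the command looks roughly like this:

import base64
import os

url = "https://eudml.org/journal/10077"    # hypothetical collection URL
# cache filename derived from the base64-encoded URL, as in get_page_content()
filename = "/tmp/crawler/puppeteer/" + str(base64.b64encode(url.encode("utf-8")), "utf-8")
# command built the same way as in download_file_dynamic()
cmd = f"{os.path.dirname(os.path.realpath(__file__))}/crawl_eudml_col.js -u {url} -o {filename}"
print(cmd)
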