Coverage for src/crawler/by_source/eudml_crawler.py: 57%
276 statements
coverage.py v7.6.4, created at 2025-01-15 14:09 +0000
import base64
import json
import os
import re
import subprocess

import regex
import requests
from bs4 import BeautifulSoup
from django.conf import settings
from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_extlink,
    create_issuedata,
    create_subj,
)
from ptf.utils import execute_cmd
from requests_cache import CachedSession

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle

class EudmlCrawler(BaseCollectionCrawler):
    source_name = "European Digital Mathematics Library"
    source_domain = "EUDML"
    source_website = "https://eudml.org"

    def parse_collection_content(self, content):
        """
        Parse the HTML page of a EuDML journal and return a list of xissues.
        Each xissue has a list of articles with just a URL.

        self.periode is set during the parsing with the <meta name="citation_year"> of the HTML page.
        """
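        # `content` is expected to be a JSON document produced by download_file(),
        # roughly of this shape (illustrative sketch; field values invented):
        #   {
        #       "page": "<base64-encoded HTML of the journal page>",
        #       "volumes": [
        #           {
        #               "name": "12",
        #               "desc": "... (<strong>1995</strong>)",
        #               "issues": [{"name": "3", "articles": [...]}],
        #               "articles": [{"url": "https://eudml.org/doc/...", "sortKey": "..."}]
        #           }
        #       ]
        #   }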
        data = json.loads(content)
        soup = BeautifulSoup(base64.b64decode(data["page"]), "html.parser")
        xissues = []
        volume_year_re = regex.compile(r".*\(<strong>(\d+).*<\/strong>\)")

        # Extract the list of volumes
        volume_count = 0
        issue_count = 0
        for v in data["volumes"]:
            volume_count += 1
            volume_number = v["name"]

            year_re_groups = volume_year_re.search(v["desc"])
            if year_re_groups is None:
                print("skipping volume: no year found")
                continue
            year = year_re_groups.group(1)

            if len(v["issues"]) > 0 and year != "":
                # Extract all the issues
                for i in v["issues"]:
                    issue_count += 1
                    xissue = self.create_eudml_xissue(i, year, i["name"], volume_number)
                    xissues.append(xissue)
            else:
                # No issues: articles are directly in the volume
                xissue = self.create_eudml_xissue(v, year, None, volume_number)
                xissues.append(xissue)

        # EuDML stores the total of issues and articles in the <ul class="article-details unit unit-list">
        # This info is used to check the number of articles/issues parsed in the page
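        # The counters typically appear as list items such as (illustrative example,
        # numbers invented):
        #   <ul class="article-details unit unit-list">
        #       <li><strong>Volume count:</strong> 30</li>
        #       <li><strong>Issue count:</strong> 120</li>
        #       <li><strong>Number of articles:</strong> 1500</li>
        #   </ul>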
        volumes_to_find = 0
        issues_to_find = 0
        articles_to_find = 0
        article_details_nodes = soup.find_all("ul", {"class": "article-details unit unit-list"})
        for article_detail_node in article_details_nodes:
            unit_nodes = article_detail_node.find_all("li")
            for unit_node in unit_nodes:
                strong_node = unit_node.find("strong")
                if strong_node is not None:  # coverage: condition always true
                    text = strong_node.get_text()
                    if text == "Issue count:":
                        value = unit_node.get_text()[13:]
                        issues_to_find += int(value)
                    elif text == "Volume count:":
                        value = unit_node.get_text()[14:]
                        volumes_to_find += int(value)
                    elif text == "Number of articles:":
                        value = unit_node.get_text()[20:]
                        articles_to_find += int(value)

        if volume_count != volumes_to_find:  # coverage: condition never true
            txt = f"EuDML declares {volumes_to_find} volumes for {self.collection_id}. We parsed {volume_count}"
            print(txt)
            if settings.CRAWLER_LOG_FILE:
                with open(settings.CRAWLER_LOG_FILE, "a") as f_:
                    f_.write(txt + "\n")

        if issue_count != issues_to_find:
            txt = f"EuDML declares {issues_to_find} issues for {self.collection_id}. We parsed {issue_count}"
            print(txt)
            if settings.CRAWLER_LOG_FILE:  # coverage: condition never true
                with open(settings.CRAWLER_LOG_FILE, "a") as f_:
                    f_.write(txt + "\n")

        article_count = sum([len(xissue.articles) for xissue in xissues])
        if article_count != articles_to_find:
            txt = f"EuDML declares {articles_to_find} articles for {self.collection_id}. We parsed {article_count}"
            print(txt)
            if settings.CRAWLER_LOG_FILE:  # coverage: condition never true
                with open(settings.CRAWLER_LOG_FILE, "a") as f_:
                    f_.write(txt + "\n")

        return xissues

    def create_eudml_xissue(
        self, issue_data: dict, year_str, issue_number: str | None, volume_number
    ):
        """
        EuDML does not have a separate HTML page for an issue.
        The list of issues/articles is found directly in the collection page.

        create_xissue creates an IssueData (see ptf/model_data.py) and sets its year/volume.
        The PID is temporary and will be updated with the issue number (if any).
        create_xissue directly creates articles, but with just a pid and a URL.
        """
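        # Example of the PID construction below (illustrative collection id "FOO"):
        # year "1995", volume "12", issue "3" yields "FOO_1995__12_3";
        # without an issue number it stays "FOO_1995__12".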
        xissue = create_issuedata()
        xissue.pid = self.collection_id + "_" + year_str + "__" + volume_number
        if issue_number:
            xissue.pid = xissue.pid + "_" + issue_number
        xissue.year = year_str
        xissue.volume = volume_number
        if issue_number:
            xissue.number = issue_number

        issue_data["articles"].sort(key=lambda a: a["sortKey"])
        for index_article, article_data in enumerate(issue_data["articles"]):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index_article)
            xarticle.url = article_data["url"]
            xissue.articles.append(xarticle)
        return xissue

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
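        # The EuDML article page exposes standard citation meta tags that
        # get_metadata_using_citation_meta() reads, e.g. (illustrative):
        #   <meta name="citation_title" content="...">
        #   <meta name="citation_author" content="...">
        #   <meta name="citation_pdf_url" content="...">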
        xarticle = create_articledata()
        xarticle.pid = pid
        soup = BeautifulSoup(content, "xml")

        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "lang",
                "title",
                "author",
                "pdf",
                "abstract",
                "page",
                "doi",
                "mr",
                "zbl",
                "publisher",
                "keywords",
            ],
        )

        # LINK to SOURCE
        url_full_text_node = soup.find("a", text="Access to full text")
        if url_full_text_node is not None:
            url_full_text = url_full_text_node.get("href")
            if isinstance(url_full_text, str):  # coverage: condition always true
                ext_link = create_extlink(rel="primary-source", location=url_full_text)
                xarticle.ext_links.append(ext_link)

        # MSC KEYWORDS
        subj_part = soup.select_one("article#unit-subject-areas")
        if subj_part is not None:  # coverage: condition never true
            reg_msc = re.compile("/subject/MSC/[a-zA-Z0-9.]+")
            subjs = [a for a in subj_part.select("a") if reg_msc.search(a.get("href"))]
            for subj in subjs:
                type_class = subj.get("href").split("/")
                subject = create_subj(type="msc", lang=xarticle.lang)
                subject["value"] = type_class[3]
                xarticle.kwds.append(subject)

        # FALLBACK
        if not xarticle.title_tex:
            try:
                title = soup.select_one("h1").get_text(strip=True).replace("\xa0", " ")
                txt = f"{url} Fallback for title"
                print(txt)
                if settings.CRAWLER_LOG_FILE:  # coverage: condition never true
                    with open(settings.CRAWLER_LOG_FILE, "a") as f_:
                        f_.write(txt + "\n")
                xarticle.title_tex = title.replace("\xa0", " ").replace("\n", "")
            # FIXME
            except:  # noqa: E722
                pass

        if len(xarticle.contributors) == 0:
            # AUTHORS
            authors_bloc = soup.select_one("p.sub-title-1")
            if authors_bloc:  # coverage: condition always true
                authors_node = authors_bloc.find_all("a")
                if len(authors_node) > 0:  # coverage: condition never true
                    txt = f"{url} Fallback for authors"
                    print(txt)
                    if settings.CRAWLER_LOG_FILE:
                        with open(settings.CRAWLER_LOG_FILE, "a") as f_:
                            f_.write(txt + "\n")
                for author_node in authors_node:  # coverage: loop never started
                    text_author = author_node.get_text()
                    text_author = text_author.replace(",", "")

                    author = create_contributor()
                    author["role"] = "author"
                    author["string_name"] = text_author

                    xarticle.contributors.append(author)

        if len(xarticle.streams) == 0:  # coverage: condition always true
            # PDF
            pdf_node = soup.find("a", text="Full (PDF)")
            if pdf_node is not None:
                pdf_url = pdf_node.get("href")
                if pdf_url:  # coverage: condition always true
                    add_pdf_link_to_xarticle(xarticle, pdf_url)

        if len(xarticle.streams) == 0:
            if not url_full_text_node:
                print(f"[{self.source_domain}] {self.collection_id} : Couldn't find pdf")
            else:
                add_pdf_link_to_xarticle(xarticle, url_full_text_node.get("href"))

        if len(xarticle.abstracts) == 0:  # coverage: condition always true
            # ABSTRACT
            abstract_node = soup.find("article", {"id": "unit-article-abstract"})
            if abstract_node is not None:  # coverage: condition never true
                abstract_section_node = abstract_node.find("section")
                if abstract_section_node:
                    abstract = str(abstract_section_node)
                    xabstract = create_abstract(
                        tag="abstract", value_tex=abstract, lang=xarticle.lang
                    )
                    xarticle.abstracts.append(xabstract)

        if len(xarticle.contributors) == 0 or not xarticle.fpage:
            # LANG, PAGES, (AUTHORS)
            # EuDML has an "Export BibTeX" section with some information (lang, pages, authors)
            self.parse_bibtex(soup, xarticle, url)

        if xarticle.doi is None:  # coverage: condition always true
            # DOI
            doi_link = soup.find("article", {"id": "unit-other-ids"})
            if doi_link is not None:  # coverage: condition never true
                # Simplify ?
                # See https://eudml.org/doc/54683 with http://dx.doi.org/10.1155/2007/10368%E2%80%89
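                # %E2%80%89 is a URL-encoded thin space (U+2009); the cleanup below strips
                # such stray characters so only the bare DOI remains, e.g.
                # "10.1155/2007/10368" for the example above (illustrative).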
                try:
                    reg_doi = re.compile("doi.org")
                    doi_array = [
                        d.get("href")
                        for d in doi_link.find_all("a")
                        if reg_doi.search(str(d.get("href")))
                    ]
                    if doi_array:
                        if len(doi_array) > 1:
                            start_dois = len(doi_array) - 1
                            doi = doi_array[start_dois:][0]
                        else:
                            doi = doi_array[0]

                        doi_array = doi.split("doi.org/")
                        # strip unwanted chars present
                        if len(doi_array) > 1:
                            doi = doi_array[1].encode("ascii", "ignore")
                            doi = str(doi.decode())
                            doi_array = doi.split("\\u")
                            doi = str(doi_array[0])

                            doi = re.sub("}", "", doi)
                            doi = re.sub("\t", "", doi)
                            doi = doi.encode("ascii", "ignore")
                            doi = doi.decode()

                            doi = bytes(r"{}".format(r"" + doi + ""), "utf-8")
                            doi = doi.decode()
                            doi_array = doi.split("\\u")
                            doi = str(doi_array[0]).strip()
                            doi = doi.replace(" ", "")

                            xarticle.doi = doi
                except TypeError as e:
                    print(e)

        # You can't get the first link to zbmath.org: it could be in the list of references !
        # has_zblid = len([extid for extid in xarticle.extids if extid[0] == "zbl-item-id"]) == 1
        # if not has_zblid:
        #     # ZBL
        #     zblid_link = soup.find(
        #         "a", {"href": re.compile(r"http:\/\/www.zentralblatt-math.org\/zmath\/")}
        #     )
        #     if zblid_link is not None:
        #         zblid = zblid_link.get("href").split("?q=")[1]
        #         if zblid:
        #             print(f"{url} Fallback for zbl-id: {zblid}")
        #             xarticle.extids.append(("zbl-item-id", zblid))

        # In Other Databases is not (always ?) the publisher
        # if not xissue.publisher:
        #     # PUBLISHER
        #     section_oai = soup.find("h3", text="In Other Databases")
        #     if section_oai is not None:
        #         section_oai_array = section_oai.parent.find_all("dd")
        #         if section_oai is not None:
        #             pub = [
        #                 d.text
        #                 for d in section_oai_array
        #                 if d.text.strip() not in ["DOI", "ZBMath", "MathSciNet", "PUBLISHER"]
        #             ]
        #             if pub != "":
        #                 print(f"{url} Fallback for publisher")
        #                 xpub = create_publisherdata()
        #                 xpub.name = pub[0].strip()
        #                 xissue.publisher = xpub

        # ARTICLE PID
        if xarticle.doi is not None:  # coverage: condition never true
            xarticle.pid = xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
            xarticle.pid = xarticle.pid.replace("pid", "").replace(":", "_")
        else:
            reg_article = regex.compile(r"\d+")
            if xarticle.pid is not None:  # coverage: condition always true
                pid_array = reg_article.findall(url)
                if len(pid_array) > 0:  # coverage: condition always true
                    id_article = pid_array[0]
                    xarticle.pid = xissue.pid + "_" + id_article

        return xarticle

    def parse_bibtex(self, soup, xarticle, url):
        """
        Parse the BibTeX section of a EuDML article page.
        Extract:
        - the authors (if no author was already found in the page)
        - the article language
        - the article pages
        """
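        # The page embeds a BibTeX export inside a <p> tag, roughly of the form
        # (illustrative values):
        #   @article{Doe1995,
        #       author = {Doe, Jane},
        #       language = {eng},
        #       pages = {(12-34)},
        #       ...
        #   }
        # The entry is split on tabs and each "key = value" chunk is inspected below.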
        bib_div = [p for p in soup.find_all("p") if "@article" in p.text]

        if len(bib_div) > 0:  # coverage: condition always true
            bib_tex = bib_div[0].get_text()
            text = bib_tex.split("\t")

            for text_part in text:
                # AUTHORS (only if no authors were already found in the page)
                if len(xarticle.contributors) == 0:
                    reg_author = re.compile("author =")
                    if reg_author.search(text_part):  # coverage: condition never true
                        txt = f"{url} Fallback for authors with the bibtex"
                        print(txt)
                        if settings.CRAWLER_LOG_FILE:
                            with open(settings.CRAWLER_LOG_FILE, "a") as f_:
                                f_.write(txt + "\n")

                        authors_text = (
                            text_part.replace("{", "").replace("}", "").replace("author = ", "")
                        )
                        authors_bib = authors_text.split(",")
                        for index, name in enumerate(authors_bib):
                            if index % 2 == 1:
                                author_name = authors_bib[index - 1] + " " + authors_bib[index]
                                author_name = self.latext_parser.latex_to_text(author_name)
                                author_name = author_name.replace("\xa0", "")

                                author = create_contributor()
                                author["role"] = "author"
                                author["string_name"] = author_name
                                xarticle.contributors.append(author)

                # LANG
                reg_lang = re.compile("language = ")
                if reg_lang.search(text_part):
                    xarticle.lang = (
                        text_part.replace("{", "")
                        .replace("}", "")
                        .replace("language = ", "")
                        .replace(",", "")
                    )
                    if len(xarticle.lang) >= 3:  # coverage: condition always true
                        xarticle.lang = xarticle.lang[:-1]

                    if len(xarticle.lang) > 0 and len(xarticle.abstracts) > 0:  # coverage: condition never true
                        xarticle.abstracts[0]["lang"] = xarticle.lang

                if not xarticle.fpage:
                    # PAGES
                    reg_pages = re.compile("pages =")
                    if reg_pages.search(text_part):
                        pages = (
                            text_part.replace("{", "")
                            .replace("}", "")
                            .replace("(", "")
                            .replace(")", "")
                            .replace("[", "")
                            .replace("]", "")
                            .replace("pages = ", "")
                        )
                        if len(pages) > 0 and pages != "null":  # coverage: condition always true
                            pages = pages.split(",")
                            if re.compile(r"\d+-\d+").search(pages[0]):  # coverage: condition never true
                                txt = f"{url} Fallback for pages with the bibtex"
                                print(txt)
                                if settings.CRAWLER_LOG_FILE:
                                    with open(settings.CRAWLER_LOG_FILE, "a") as f_:
                                        f_.write(txt + "\n")

                                pages = pages[0].split("-")
                                xarticle.fpage = pages[0]
                                if len(pages) > 1:
                                    reg_digit = re.compile(r"\d+")
                                    if re.search(reg_digit, str(pages[1])):
                                        pages[1] = re.search(reg_digit, str(pages[1]))[0]
                                    xarticle.lpage = pages[1]
                                    # FIXME : wrong page_range format... Maybe this can be deleted ?
                                    xarticle.page_range = pages[0] + "-" + pages[1]

                # reg_title = re.compile("title")
                # if reg_title.search(text_part):
                #     if (
                #         xarticle.title_html is None
                #         or xarticle.title_html == ""
                #         or xarticle.title_html == "Contents"
                #     ):
                #         xarticle.title_html = (
                #             text_part.replace("{", "")
                #             .replace("}", "")
                #             .replace("title = ", "")
                #             .replace(",", "")
                #         )
                #         xarticle.title_tex = xarticle.title_html
                #         xarticle.title_xml = f"<title-group><article-title>{xarticle.title_html}</article-title></title-group>"

    def download_file(self, url: str):
        if url.startswith("https://eudml.org/doc"):
            return super().download_file(url)

        content = ""
        filename = "/tmp/crawler/puppeteer/" + str(base64.b64encode(url.encode("utf-8")), "utf-8")
        attempt = 0
        while not content and attempt < 3:
            attempt += 1
            try:
                cmd = f"{os.path.dirname(os.path.realpath(__file__))}/crawl_eudml_col.js -u {url} -o {filename}"
                print(cmd)
                execute_cmd(cmd)

                if os.path.isfile(filename):
                    with open(filename) as file_:
                        content = file_.read()
                if not isinstance(self.session, CachedSession):
                    continue
                # Mock an HTTP request to inject the data into the cache
            except subprocess.CalledProcessError:
                pass

        if not content:
            raise requests.exceptions.HTTPError(f"Unable to download {url}")

        return content
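
# Rough usage sketch (illustrative only; the constructor arguments depend on
# BaseCollectionCrawler and are assumed here, as is the collection_url attribute):
#
#   crawler = EudmlCrawler(collection_id="...", collection_url="https://eudml.org/journal/...")
#   xissues = crawler.parse_collection_content(crawler.download_file(crawler.collection_url))
#   for xissue in xissues:
#       for xarticle in xissue.articles:
#           content = crawler.download_file(xarticle.url)
#           crawler.parse_article_content(content, xissue, xarticle, xarticle.url, xarticle.pid)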