Coverage for src/crawler/by_source/eudml_crawler.py: 40% (302 statements)

import base64
import json
import os
import re
import subprocess

import regex
import requests
from bs4 import BeautifulSoup
from django.conf import settings
from ptf.model_data import (
    create_articledata,
    create_contributor,
    create_extlink,
    create_issuedata,
    create_subj,
)
from ptf.utils import execute_cmd
from requests_cache import CachedSession

from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle


class EudmlCrawler(BaseCollectionCrawler):
    source_name = "European Digital Mathematics Library"
    source_domain = "EUDML"
    source_website = "https://eudml.org"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.source = self.get_or_create_source()

        self.has_dynamic_collection_pages = True

    def parse_collection_content(self, content):
        """
        Parse the HTML page of a EuDML journal and return a list of xissues.
        Each xissue has a list of articles with just a URL.

        self.periode is set during the parsing with the <meta name="citation_year"> of the HTML page.
        """
        data = json.loads(content)
        soup = BeautifulSoup(base64.b64decode(data["page"]), "html.parser")
        xissues = []

        citation_year_node = soup.find("meta", {"name": "citation_year"})
        if citation_year_node:
            value = citation_year_node.get("content")
            values = value.split("-")
            try:
                self.periode_begin = int(values[0])
                if len(values) > 1:
                    self.periode_end = int(values[1])
            except ValueError:
                pass
            self.periode = self.get_or_create_periode()

        volume_year_re = regex.compile(r".*\(<strong>(\d+).*<\/strong>\)")
        # Extract the list of volumes
        volume_count = 0
        issue_count = 0
        for v in data["volumes"]:
            volume_count += 1
            volume_number = v["name"]

            year_re_groups = volume_year_re.search(v["desc"])
            if year_re_groups is None:
                print("skipping volume: no year found")
                continue
            year = year_re_groups.group(1)

            if len(v["issues"]) > 0 and year != "":
                # Extract all the issues
                for i in v["issues"]:
                    issue_count += 1
                    xissue = self.create_xissue(i, year, i["name"], volume_number)
                    xissues.append(xissue)
            else:
                # No issues, articles are directly in the volume
                xissue = self.create_xissue(v, year, None, volume_number)
                xissues.append(xissue)

        # EuDML stores the total of issues and articles in the <ul class="article-details unit unit-list">
        # This info is used to check the number of articles/issues parsed in the page
        volumes_to_find = 0
        issues_to_find = 0
        articles_to_find = 0
        article_details_nodes = soup.find_all("ul", {"class": "article-details unit unit-list"})
        for article_detail_node in article_details_nodes:
            unit_nodes = article_detail_node.find_all("li")
            for unit_node in unit_nodes:
                strong_node = unit_node.find("strong")
                if strong_node is not None:
                    text = strong_node.get_text()
                    if text == "Issue count:":
                        value = unit_node.get_text()[13:]
                        issues_to_find += int(value)
                    elif text == "Volume count:":
                        value = unit_node.get_text()[14:]
                        volumes_to_find += int(value)
                    elif text == "Number of articles:":
                        value = unit_node.get_text()[20:]
                        articles_to_find += int(value)

        if volume_count != volumes_to_find:
            txt = f"EuDML declares {volumes_to_find} volumes for {self.collection_id}. We parsed {volume_count}"
            print(txt)
            if settings.CRAWLER_LOG_FILE:
                with open(settings.CRAWLER_LOG_FILE, "a") as f_:
                    f_.write(txt + "\n")

        if issue_count != issues_to_find:
            txt = f"EuDML declares {issues_to_find} issues for {self.collection_id}. We parsed {issue_count}"
            print(txt)
            if settings.CRAWLER_LOG_FILE:
                with open(settings.CRAWLER_LOG_FILE, "a") as f_:
                    f_.write(txt + "\n")

        article_count = sum([len(xissue.articles) for xissue in xissues])
        if article_count != articles_to_find:
            txt = f"EuDML declares {articles_to_find} articles for {self.collection_id}. We parsed {article_count}"
            print(txt)
            if settings.CRAWLER_LOG_FILE:
                with open(settings.CRAWLER_LOG_FILE, "a") as f_:
                    f_.write(txt + "\n")

        return xissues

    def create_xissue(self, issue_data: dict, year_str, issue_number: str | None, volume_number):
        """
        EuDML does not have a separate HTML page for an issue.
        The list of issues/articles is directly found in the collection page.

        create_xissue creates an IssueData (see ptf/model_data.py) and sets its year/volume.
        The PID is temporary and will be updated with the issue number (if any).
        create_xissue directly creates articles, but with just a pid and a URL.
        """
        xissue = create_issuedata()
        xissue.pid = self.collection_id + "_" + year_str + "__" + volume_number
        if issue_number:
            xissue.pid = xissue.pid + "_" + issue_number
        xissue.year = year_str
        xissue.volume = volume_number
        if issue_number:
            xissue.number = issue_number

        issue_data["articles"].sort(key=lambda a: a["sortKey"])
        for index_article, article_data in enumerate(issue_data["articles"]):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index_article)
            xarticle.url = article_data["url"]
            xissue.articles.append(xarticle)
        return xissue

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle = create_articledata()
        xarticle.pid = pid
        soup = BeautifulSoup(content, "xml")

        what = [
            "lang",
            "title",
            "author",
            "pdf",
            "abstract",
            "page",
            "doi",
            "mr",
            "zbl",
            "publisher",
            "keywords",
        ]
        self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)
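        # Assumption: get_metadata_using_citation_meta (inherited from BaseCollectionCrawler) fills the
        # fields listed in `what` from the page's standard <meta name="citation_*"> tags
        # (e.g. citation_title, citation_author, citation_pdf_url); the exact mapping lives in the base class.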

        # LINK to SOURCE
        url_full_text_node = soup.find("a", text="Access to full text")
        if url_full_text_node is not None:
            url_full_text = url_full_text_node.get("href")
            ext_link = create_extlink()
            ext_link["rel"] = "primary-source"
            ext_link["location"] = url_full_text
            xarticle.ext_links.append(ext_link)

        # MSC KEYWORDS
        subj_part = soup.find("article", {"id": "unit-subject-areas"})
        if subj_part is not None:
            reg_msc = re.compile("/subject/MSC/[a-zA-Z0-9.]+")
            subjs = [a for a in subj_part.find_all("a") if reg_msc.search(a.get("href"))]
            for subj in subjs:
                type_class = subj.get("href").split("/")
                subject = create_subj()
                subject["value"] = type_class[3]
                subject["type"] = "msc"
                subject["lang"] = "en"
                xarticle.kwds.append(subject)

        # FALLBACK
        if not xarticle.title_tex:
            try:
                title = soup.find("h1").get_text(strip=True).replace("\xa0", " ")
                txt = f"{url} Fallback for title"
                print(txt)
                if settings.CRAWLER_LOG_FILE:
                    with open(settings.CRAWLER_LOG_FILE, "a") as f_:
                        f_.write(txt + "\n")
                xarticle.title_tex = title.replace("\xa0", " ").replace("\n", "")
            # FIXME
            except:  # noqa: E722
                pass

        if len(xarticle.contributors) == 0:
            # AUTHORS
            authors_bloc = soup.find("p", {"class": "sub-title-1"})
            if authors_bloc:
                authors_node = authors_bloc.find_all("a")
                if len(authors_node) > 0:
                    txt = f"{url} Fallback for authors"
                    print(txt)
                    if settings.CRAWLER_LOG_FILE:
                        with open(settings.CRAWLER_LOG_FILE, "a") as f_:
                            f_.write(txt + "\n")
                    for author_node in authors_node:
                        text_author = author_node.get_text()
                        text_author = text_author.replace(",", "")

                        author = create_contributor()
                        author["role"] = "author"
                        author["string_name"] = text_author

                        xarticle.contributors.append(author)

        if len(xarticle.streams) == 0:
            # PDF
            pdf_node = soup.find("a", text="Full (PDF)")
            if pdf_node is not None:
                pdf_url = pdf_node.get("href")
                if pdf_url:
                    add_pdf_link_to_xarticle(xarticle, pdf_url)

        if len(xarticle.abstracts) == 0:
            # ABSTRACT
            abstract_node = soup.find("article", {"id": "unit-article-abstract"})
            if abstract_node is not None:
                abstract_section_node = abstract_node.find("section")
                if abstract_section_node:
                    abstract = str(abstract_section_node)
                    xabstract = {
                        "tag": "abstract",
                        "value_html": "",
                        "value_tex": abstract,
                        "value_xml": "",
                        "lang": "en",
                    }
                    xarticle.abstracts.append(xabstract)

        if len(xarticle.contributors) == 0 or not xarticle.fpage:
            # LANG, PAGES, (AUTHORS)
            # EuDML has an export BibTeX section with some information (lang, pages, authors)
            self.parse_bibtex(soup, xarticle, url)

        if xarticle.doi is None:
            # DOI
            doi_link = soup.find("article", {"id": "unit-other-ids"})
            if doi_link is not None:
                # Simplify ?
                # See https://eudml.org/doc/54683 with http://dx.doi.org/10.1155/2007/10368%E2%80%89
                try:
                    reg_doi = re.compile("doi.org")
                    doi_array = [
                        d.get("href")
                        for d in doi_link.find_all("a")
                        if reg_doi.search(str(d.get("href")))
                    ]
                    if doi_array:
                        # If several doi.org links are present, keep the last one
                        if len(doi_array) > 1:
                            start_dois = len(doi_array) - 1
                            doi = doi_array[start_dois:][0]
                        else:
                            doi = doi_array[0]

                        doi_array = doi.split("doi.org/")
                        # Strip unwanted chars sometimes present in the link
                        # (non-ASCII bytes, escaped \u sequences, braces, tabs, spaces)
                        if len(doi_array) > 1:
                            doi = doi_array[1].encode("ascii", "ignore")
                            doi = str(doi.decode())
                            doi_array = doi.split("\\u")
                            doi = str(doi_array[0])

                        doi = re.sub("}", "", doi)
                        doi = re.sub("\t", "", doi)
                        doi = doi.encode("ascii", "ignore")
                        doi = doi.decode()

                        doi = bytes(r"{}".format(r"" + doi + ""), "utf-8")
                        doi = doi.decode()
                        doi_array = doi.split("\\u")
                        doi = str(doi_array[0]).strip()
                        doi = doi.replace(" ", "")

                        xarticle.doi = doi
                except TypeError as e:
                    print(e)

        # You can't get the first link to zbmath.org: it could be in the list of references !
        # has_zblid = len([extid for extid in xarticle.extids if extid[0] == "zbl-item-id"]) == 1
        # if not has_zblid:
        #     # ZBL
        #     zblid_link = soup.find(
        #         "a", {"href": re.compile(r"http:\/\/www.zentralblatt-math.org\/zmath\/")}
        #     )
        #     if zblid_link is not None:
        #         zblid = zblid_link.get("href").split("?q=")[1]
        #         if zblid:
        #             print(f"{url} Fallback for zbl-id: {zblid}")
        #             xarticle.extids.append(("zbl-item-id", zblid))

        # In Other Databases is not (always ?) the publisher
        # if not xissue.publisher:
        #     # PUBLISHER
        #     section_oai = soup.find("h3", text="In Other Databases")
        #     if section_oai is not None:
        #         section_oai_array = section_oai.parent.find_all("dd")
        #         if section_oai is not None:
        #             pub = [
        #                 d.text
        #                 for d in section_oai_array
        #                 if d.text.strip() not in ["DOI", "ZBMath", "MathSciNet", "PUBLISHER"]
        #             ]
        #             if pub != "":
        #                 print(f"{url} Fallback for publisher")
        #                 xpub = create_publisherdata()
        #                 xpub.name = pub[0].strip()
        #                 xissue.publisher = xpub

        # ARTICLE PID
        if xarticle.doi is not None:
            xarticle.pid = xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
            xarticle.pid = xarticle.pid.replace("pid", "").replace(":", "_")
        else:
            reg_article = regex.compile(r"\d+")
            if xarticle.pid is not None:
                pid_array = reg_article.findall(url)
                if len(pid_array) > 0:
                    id_article = pid_array[0]
                    xarticle.pid = xissue.pid + "_" + id_article

        return xarticle

    def parse_bibtex(self, soup, xarticle, url):
        """
        Parse the BibTeX section of a EuDML article page.
        Extract:
        - the authors (if no author was already found in the page)
        - the article language
        - the article pages
        """
        bib_div = [p for p in soup.find_all("p") if "@article" in p.text]

        if len(bib_div) > 0:
            bib_tex = bib_div[0].get_text()
            text = bib_tex.split("\t")

            for text_part in text:
                # AUTHORS (only if no authors were already found in the page)
                if len(xarticle.contributors) == 0:
                    reg_author = re.compile("author =")
                    if reg_author.search(text_part):
                        txt = f"{url} Fallback for authors with the bibtex"
                        print(txt)
                        if settings.CRAWLER_LOG_FILE:
                            with open(settings.CRAWLER_LOG_FILE, "a") as f_:
                                f_.write(txt + "\n")

                        authors_text = (
                            text_part.replace("{", "").replace("}", "").replace("author = ", "")
                        )
                        authors_bib = authors_text.split(",")
                        for index, name in enumerate(authors_bib):
                            if index % 2 == 1:
                                author_name = authors_bib[index - 1] + " " + authors_bib[index]
                                author_name = self.latext_parser.latex_to_text(author_name)
                                author_name = author_name.replace("\xa0", "")

                                author = create_contributor()
                                author["role"] = "author"
                                author["string_name"] = author_name
                                xarticle.contributors.append(author)

                # LANG
                reg_lang = re.compile("language = ")
                if reg_lang.search(text_part):
                    xarticle.lang = (
                        text_part.replace("{", "")
                        .replace("}", "")
                        .replace("language = ", "")
                        .replace(",", "")
                    )
                    if len(xarticle.lang) >= 3:
                        xarticle.lang = xarticle.lang[:-1]

                    if len(xarticle.lang) > 0 and len(xarticle.abstracts) > 0:
                        xarticle.abstracts[0]["lang"] = xarticle.lang

                if not xarticle.fpage:
                    # PAGES
                    reg_pages = re.compile("pages =")
                    if reg_pages.search(text_part):
                        pages = (
                            text_part.replace("{", "")
                            .replace("}", "")
                            .replace("(", "")
                            .replace(")", "")
                            .replace("[", "")
                            .replace("]", "")
                            .replace("pages = ", "")
                        )
                        if len(pages) > 0 and pages != "null":
                            pages = pages.split(",")
                            if re.compile(r"\d+-\d+").search(pages[0]):
                                txt = f"{url} Fallback for pages with the bibtex"
                                print(txt)
                                if settings.CRAWLER_LOG_FILE:
                                    with open(settings.CRAWLER_LOG_FILE, "a") as f_:
                                        f_.write(txt + "\n")

                                pages = pages[0].split("-")
                                xarticle.fpage = pages[0]
                                if len(pages) > 1:
                                    reg_digit = re.compile(r"\d+")
                                    if re.search(reg_digit, str(pages[1])):
                                        pages[1] = re.search(reg_digit, str(pages[1]))[0]
                                    xarticle.lpage = pages[1]
                                    xarticle.page_range = pages[0] + "-" + pages[1]

                # reg_title = re.compile("title")
                # if reg_title.search(text_part):
                #     if (
                #         xarticle.title_html is None
                #         or xarticle.title_html == ""
                #         or xarticle.title_html == "Contents"
                #     ):
                #         xarticle.title_html = (
                #             text_part.replace("{", "")
                #             .replace("}", "")
                #             .replace("title = ", "")
                #             .replace(",", "")
                #         )
                #         xarticle.title_tex = xarticle.title_html
                #         xarticle.title_xml = f"<title-group><article-title>{xarticle.title_html}</article-title></title-group>"

    def get_page_content(self, url: str, force_download=False):
        if url.startswith("https://eudml.org/doc"):
            return self.download_file(url)

        content = ""

        def set_progress_bar_title():
            if not self.progress_bar:
                return
            self.progress_bar.text(f"Download {url}")

        set_progress_bar_title()
        content = self.download_file_dynamic(
            url,
            filename="/tmp/crawler/puppeteer/"
            + str(base64.b64encode(url.encode("utf-8")), "utf-8"),
        )

        return content
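
    # Note: get_page_content stores dynamically rendered pages under /tmp/crawler/puppeteer/, using the
    # base64-encoded URL as the filename handed to the NodeJS helper below.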

    def download_file_dynamic(self, url: str, filename: str):
        """
        Runs a NodeJS subprocess to parse an EUDML Collection.
        """

        txt = f"Download {url}"
        if settings.CRAWLER_LOG_FILE:
            with open(settings.CRAWLER_LOG_FILE, "a") as f_:
                f_.write(txt + "\n")

        content = ""
        attempt = 0
        while not content and attempt < 3:
            attempt += 1
            try:
                cmd = f"{os.path.dirname(os.path.realpath(__file__))}/crawl_eudml_col.js -u {url} -o {filename}"
                print(cmd)
                execute_cmd(cmd)

                if os.path.isfile(filename):
                    with open(filename) as file_:
                        content = file_.read()
                if not isinstance(self.session, CachedSession):
                    continue
                # Mock an HTTP request to inject the data into the cache
            except subprocess.CalledProcessError:
                pass

        if not content:
            raise requests.exceptions.HTTPError(f"Unable to download {url}")

        return content