Coverage for src/crawler/by_source/eudml_crawler.py: 56%
280 statements
coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
1import base64
2import json
3import os
4import re
5import subprocess
7import regex
8import requests
9from bs4 import BeautifulSoup
10from django.conf import settings
11from ptf.model_data import (
12 create_abstract,
13 create_articledata,
14 create_contributor,
15 create_extlink,
16 create_issuedata,
17 create_subj,
18)
19from ptf.utils import execute_cmd
20from requests_cache import CachedSession
22from crawler.base_crawler import BaseCollectionCrawler
23from crawler.utils import add_pdf_link_to_xarticle
26class EudmlCrawler(BaseCollectionCrawler):
27 source_name = "European Digital Mathematics Library"
28 source_domain = "EUDML"
29 source_website = "https://eudml.org"
31 def parse_collection_content(self, content):
32 """
33 Parse the HTML page of a EuDML journal and return a list of xissues.
34 Each xissue has a list of articles with just a URL.
35 """
36 data = json.loads(content)
37 soup = BeautifulSoup(base64.b64decode(data["page"]), "html.parser")
38 xissues = []
39 volume_year_re = regex.compile(r".*\(<strong>(\d+).*<\/strong>\)")
40 # Extract the list of volumes
41 volume_count = 0
42 issue_count = 0
43 for v in data["volumes"]:
44 volume_count += 1
45 volume_number = v["name"]
47 year_re_groups = volume_year_re.search(v["desc"])
48 if year_re_groups is None:
49 print("skipping volume : no year found")
50 continue
51 year = year_re_groups.group(1)
52 if year == "":  # 52 ↛ 53: condition never true
53 print("volume year is an empty string... Skipping")
54 continue
55 if len(v["issues"]) > 0:
56 # Extract all the issues
57 for i in v["issues"]:
58 issue_count += 1
59 xissue = self.create_eudml_xissue(i, year, i["name"], volume_number)
60 xissues.append(xissue)
61 else:
62 # No issues; articles are directly in the volume
63 xissue = self.create_eudml_xissue(v, year, None, volume_number)
64 xissues.append(xissue)
66 # EuDML stores the totals of volumes, issues and articles in the <ul class="article-details unit unit-list">
67 # This info is used to check the number of articles/issues parsed in the page
68 volumes_to_find = 0
69 issues_to_find = 0
70 articles_to_find = 0
71 article_details_nodes = soup.find_all("ul", {"class": "article-details unit unit-list"})
72 for article_detail_node in article_details_nodes:
73 unit_nodes = article_detail_node.find_all("li")
74 for unit_node in unit_nodes:
75 strong_node = unit_node.find("strong")
76 if strong_node is not None:  # 76 ↛ 74: condition always true
77 text = strong_node.get_text()
78 if text == "Issue count:":
79 value = unit_node.get_text()[13:]
80 issues_to_find += int(value)
81 elif text == "Volume count:":
82 value = unit_node.get_text()[14:]
83 volumes_to_find += int(value)
84 elif text == "Number of articles:":
85 value = unit_node.get_text()[20:]
86 articles_to_find += int(value)
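# Illustrative example (hypothetical HTML): a list item such as
#   <li><strong>Number of articles:</strong> 245</li>
# gives text "Number of articles:" and, via the slice above, value "245".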
88 if volume_count != volumes_to_find:  # 88 ↛ 89: condition never true
89 txt = f"EuDML declares {volumes_to_find} volumes for {self.collection_id}. We parsed {volume_count}"
90 print(txt)
91 if settings.CRAWLER_LOG_FILE:
92 os.makedirs(os.path.dirname(settings.CRAWLER_LOG_FILE), exist_ok=True)
93 with open(settings.CRAWLER_LOG_FILE, "a") as f_:
94 f_.write(txt + "\n")
96 if issue_count != issues_to_find:
97 txt = f"EuDML declares {issues_to_find} issues for {self.collection_id}. We parsed {issue_count}"
98 print(txt)
99 if settings.CRAWLER_LOG_FILE:  # 99 ↛ 100: condition never true
100 os.makedirs(os.path.dirname(settings.CRAWLER_LOG_FILE), exist_ok=True)
101 with open(settings.CRAWLER_LOG_FILE, "a") as f_:
102 f_.write(txt + "\n")
104 article_count = sum([len(xissue.articles) for xissue in xissues])
105 if article_count != articles_to_find:
106 txt = f"EuDML declares {articles_to_find} articles for {self.collection_id}. We parsed {article_count}"
107 print(txt)
108 if settings.CRAWLER_LOG_FILE:  # 108 ↛ 109: condition never true
109 os.makedirs(os.path.dirname(settings.CRAWLER_LOG_FILE), exist_ok=True)
110 with open(settings.CRAWLER_LOG_FILE, "a") as f_:
111 f_.write(txt + "\n")
113 return xissues
115 def create_eudml_xissue(
116 self, issue_data: dict, year_str, issue_number: str | None, volume_number
117 ):
118 """
119 EuDML does not have a separate HTML page for an issue.
120 The list of issues/articles is directly found in the collection page.
122 create_eudml_xissue creates an IssueData (see ptf/model_data.py) and sets its year/volume.
123 The PID is temporary and is extended with the issue number (if any).
124 Articles are created directly, but with just a pid and a URL.
125 """
126 xissue = create_issuedata()
127 xissue.pid = self.collection_id + "_" + year_str + "__" + volume_number
128 if issue_number:
129 xissue.pid = xissue.pid + "_" + issue_number
130 xissue.year = year_str
131 xissue.volume = volume_number
132 if issue_number:
133 xissue.number = issue_number
135 issue_data["articles"].sort(key=lambda a: a["sortKey"])
136 for index_article, article_data in enumerate(issue_data["articles"]):
137 xarticle = create_articledata()
138 xarticle.pid = "a" + str(index_article)
139 xarticle.url = article_data["url"]
140 xissue.articles.append(xarticle)
141 return xissue
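# Illustrative example (hypothetical values, not from the source): for collection_id "FOO",
# year "1999", volume "12" and issue number "3", the pid built above is "FOO_1999__12_3";
# without an issue number it is "FOO_1999__12".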
143 def parse_article_content(self, content, xissue, xarticle, url):
144 """
145 Parse the content with BeautifulSoup and return an ArticleData.
146 """
147 soup = BeautifulSoup(content, "xml")
149 self.get_metadata_using_citation_meta(
150 xarticle,
151 xissue,
152 soup,
153 [
154 "lang",
155 "title",
156 "author",
157 "pdf",
158 "abstract",
159 "page",
160 "doi",
161 "mr",
162 "zbl",
163 "publisher",
164 "keywords",
165 ],
166 )
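# For reference: these fields are read from the standard citation meta tags in the page
# <head>, e.g. <meta name="citation_title" ...>, <meta name="citation_author" ...>,
# <meta name="citation_pdf_url" ...> (illustrative; the exact mapping lives in
# get_metadata_using_citation_meta, presumably on the base crawler).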
168 # LINK to SOURCE
169 url_full_text_node = soup.find("a", text="Access to full text")
170 if url_full_text_node is not None:
171 url_full_text = url_full_text_node.get("href")
172 if isinstance(url_full_text, str):  # 172 ↛ 177: condition always true
173 ext_link = create_extlink(rel="primary-source", location=url_full_text)
174 xarticle.ext_links.append(ext_link)
176 # MSC KEYWORDS
177 subj_part = soup.select_one("article#unit-subject-areas")
178 if subj_part is not None:  # 178 ↛ 179: condition never true
179 reg_msc = re.compile("/subject/MSC/[a-zA-Z0-9.]+")
180 subjs = [a for a in subj_part.select("a") if reg_msc.search(a.get("href"))]
181 for subj in subjs:
182 type_class = subj.get("href").split("/")
183 subject = create_subj(type="msc", lang=xarticle.lang)
184 subject["value"] = type_class[3]
185 xarticle.kwds.append(subject)
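# Illustrative example (hypothetical MSC code): a link with href="/subject/MSC/35J25"
# is split on "/" and type_class[3] == "35J25" becomes the value of an "msc" subject.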
187 # FALLBACK
188 if not xarticle.title_tex:
189 try:
190 title = soup.select_one("h1").get_text(strip=True).replace("\xa0", " ")
191 txt = f"{url} Fallback for title"
192 print(txt)
193 if settings.CRAWLER_LOG_FILE:  # 193 ↛ 194: condition never true
194 with open(settings.CRAWLER_LOG_FILE, "a") as f_:
195 f_.write(txt + "\n")
196 xarticle.title_tex = title.replace("\xa0", " ").replace("\n", "")
197 # FIXME: only a missing <h1> (AttributeError) is expected; the bare except below should be narrowed
198 except: # noqa: E722
199 pass
201 if len(xarticle.contributors) == 0:
202 # AUTHORS
203 authors_bloc = soup.select_one("p.sub-title-1")
204 if authors_bloc:  # 204 ↛ 222: condition always true
205 authors_node = authors_bloc.find_all("a")
206 if len(authors_node) > 0:  # 206 ↛ 207: condition never true
207 txt = f"{url} Fallback for authors"
208 print(txt)
209 if settings.CRAWLER_LOG_FILE:
210 with open(settings.CRAWLER_LOG_FILE, "a") as f_:
211 f_.write(txt + "\n")
212 for author_node in authors_node:  # 212 ↛ 213: loop never started
213 text_author = author_node.get_text()
214 text_author = text_author.replace(",", "")
216 author = create_contributor()
217 author["role"] = "author"
218 author["string_name"] = text_author
220 xarticle.contributors.append(author)
222 if len(xarticle.streams) == 0:  # 222 ↛ 230: condition always true
223 # PDF
224 pdf_node = soup.find("a", text="Full (PDF)")
225 if pdf_node is not None:
226 pdf_url = pdf_node.get("href")
227 if pdf_url:  # 227 ↛ 230: condition always true
228 add_pdf_link_to_xarticle(xarticle, pdf_url)
230 if len(xarticle.streams) == 0:
231 if not url_full_text_node:
232 print(f"[{self.source_domain}] {self.collection_id} : Couldn't find pdf")
233 else:
234 add_pdf_link_to_xarticle(xarticle, url_full_text_node.get("href"))
236 if len(xarticle.abstracts) == 0:  # 236 ↛ 248: condition always true
237 # ABSTRACT
238 abstract_node = soup.find("article", {"id": "unit-article-abstract"})
239 if abstract_node is not None:  # 239 ↛ 240: condition never true
240 abstract_section_node = abstract_node.find("section")
241 if abstract_section_node:
242 abstract = str(abstract_section_node)
243 xabstract = create_abstract(
244 tag="abstract", value_tex=abstract, lang=xarticle.lang
245 )
246 xarticle.abstracts.append(xabstract)
248 if len(xarticle.contributors) == 0 or not xarticle.fpage:
249 # LANG, PAGES, (AUTHORS)
250 # EuDML has a BibTeX export section with some information (lang, pages, authors)
251 self.parse_bibtex(soup, xarticle, url)
253 if xarticle.doi is None:  # 253 ↛ 296: condition always true
254 # DOI
255 doi_link = soup.find("article", {"id": "unit-other-ids"})
256 if doi_link is not None:  # 256 ↛ 259: condition never true
257 # Simplify? The DOI href can contain stray unicode/whitespace that must be stripped,
258 # e.g. https://eudml.org/doc/54683 links to http://dx.doi.org/10.1155/2007/10368%E2%80%89 (trailing thin space)
259 try:
260 reg_doi = re.compile("doi.org")
261 doi_array = [
262 d.get("href")
263 for d in doi_link.find_all("a")
264 if reg_doi.search(str(d.get("href")))
265 ]
266 if doi_array:
267 if len(doi_array) > 1:
268 start_dois = len(doi_array) - 1
269 doi = doi_array[start_dois:][0]
270 else:
271 doi = doi_array[0]
273 doi_array = doi.split("doi.org/")
274 # strip unwanted characters from the DOI
275 if len(doi_array) > 1:
276 doi = doi_array[1].encode("ascii", "ignore")
277 doi = str(doi.decode())
278 doi_array = doi.split("\\u")
279 doi = str(doi_array[0])
281 doi = re.sub("}", "", doi)
282 doi = re.sub("\t", "", doi)
283 doi = doi.encode("ascii", "ignore")
284 doi = doi.decode()
286 doi = bytes(r"{}".format(r"" + doi + ""), "utf-8")
287 doi = doi.decode()
288 doi_array = doi.split("\\u")
289 doi = str(doi_array[0]).strip()
290 doi = doi.replace(" ", "")
292 xarticle.doi = doi
293 except TypeError as e:
294 print(e)
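# Illustrative trace (href taken from the comment above): for
# "http://dx.doi.org/10.1155/2007/10368\u2009" the split on "doi.org/" keeps
# "10.1155/2007/10368\u2009", the ascii encode/decode round trips drop the trailing
# thin space, and xarticle.doi ends up as "10.1155/2007/10368".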
296 has_zblid = len([extid for extid in xarticle.extids if extid[0] == "zbl-item-id"]) == 1
297 if not has_zblid:
298 zb_tag = soup.select_one("article#unit-other-ids a:-soup-contains('ZBMath')")
299 if zb_tag:  # 299 ↛ 300: condition never true
300 zb_href = zb_tag.get("href")
301 if not isinstance(zb_href, str):
302 raise ValueError("Couldn't parse zbmath href")
303 zblid = zb_href.removeprefix(
304 "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"
305 )
306 xarticle.extids.append(("zbl-item-id", zblid))
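# Illustrative example (hypothetical Zbl id): an href of
# "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:1234.56789"
# yields zblid "1234.56789" after the prefix is removed.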
308 # The "In Other Databases" section is not (always?) the publisher, hence this fallback is disabled:
309 # if not xissue.publisher:
310 # # PUBLISHER
311 # section_oai = soup.find("h3", text="In Other Databases")
312 # if section_oai is not None:
313 # section_oai_array = section_oai.parent.find_all("dd")
314 # if section_oai is not None:
315 # pub = [
316 # d.text
317 # for d in section_oai_array
318 # if d.text.strip() not in ["DOI", "ZBMath", "MathSciNet", "PUBLISHER"]
319 # ]
320 # if pub != "":
321 # print(f"{url} Fallback for publisher")
322 # xpub = create_publisherdata()
323 # xpub.name = pub[0].strip()
324 # xissue.publisher = xpub
325 return xarticle
327 def parse_bibtex(self, soup, xarticle, url):
328 """
329 Parse the BibTeX section of a EuDML article page.
330 Extract:
331 - the authors (if no author was already found in the page)
332 - the article language
333 - the article pages
334 """
335 bib_div = [p for p in soup.find_all("p") if "@article" in p.text]
337 if len(bib_div) > 0:  # 337 ↛ exit: condition always true
338 bib_tex = bib_div[0].get_text()
339 text = bib_tex.split("\t")
341 for text_part in text:
342 # AUTHORS (only if no authors were already found in the page)
343 if len(xarticle.contributors) == 0:
344 reg_author = re.compile("author =")
345 if reg_author.search(text_part):  # 345 ↛ 346: condition never true
346 txt = f"{url} Fallback for authors with the bibtex"
347 print(txt)
348 if settings.CRAWLER_LOG_FILE:
349 with open(settings.CRAWLER_LOG_FILE, "a") as f_:
350 f_.write(txt + "\n")
352 authors_text = (
353 text_part.replace("{", "").replace("}", "").replace("author = ", "")
354 )
355 authors_bib = authors_text.split(",")
356 for index, name in enumerate(authors_bib):
357 if index % 2 == 1:
358 author_name = authors_bib[index - 1] + " " + authors_bib[index]
359 author_name = self.latext_parser.latex_to_text(author_name)
360 author_name = author_name.replace("\xa0", "")
362 author = create_contributor()
363 author["role"] = "author"
364 author["string_name"] = author_name
365 xarticle.contributors.append(author)
367 # LANG
368 reg_lang = re.compile("language = ")
369 if reg_lang.search(text_part):
370 xarticle.lang = (
371 text_part.replace("{", "")
372 .replace("}", "")
373 .replace("language = ", "")
374 .replace(",", "")
375 )
376 if len(xarticle.lang) >= 3:  # 376 ↛ 379: condition always true
377 xarticle.lang = xarticle.lang[:-1]
379 if len(xarticle.lang) > 0 and len(xarticle.abstracts) > 0:  # 379 ↛ 380: condition never true
380 xarticle.abstracts[0]["lang"] = xarticle.lang
382 if not xarticle.fpage:
383 # PAGES
384 reg_pages = re.compile("pages =")
385 if reg_pages.search(text_part):
386 pages = (
387 text_part.replace("{", "")
388 .replace("}", "")
389 .replace("(", "")
390 .replace(")", "")
391 .replace("[", "")
392 .replace("]", "")
393 .replace("pages = ", "")
394 )
395 if len(pages) > 0 and pages != "null":  # 395 ↛ 341: condition always true
396 pages = pages.split(",")
397 if re.compile(r"\d+-\d+").search(pages[0]):  # 397 ↛ 398: condition never true
398 txt = f"{url} Fallback for pages with the bibtex"
399 print(txt)
400 if settings.CRAWLER_LOG_FILE:
401 with open(settings.CRAWLER_LOG_FILE, "a") as f_:
402 f_.write(txt + "\n")
404 pages = pages[0].split("-")
405 xarticle.fpage = pages[0]
406 if len(pages) > 1:
407 reg_digit = re.compile(r"\d+")
408 if re.search(reg_digit, str(pages[1])):
409 pages[1] = re.search(reg_digit, str(pages[1]))[0]
410 xarticle.lpage = pages[1]
411 # FIXME: page_range is built in the wrong format here; this can probably be deleted
412 xarticle.page_range = pages[0] + "-" + pages[1]
414 # reg_title = re.compile("title")
415 # if reg_title.search(text_part):
416 # if (
417 # xarticle.title_html is None
418 # or xarticle.title_html == ""
419 # or xarticle.title_html == "Contents"
420 # ):
421 # xarticle.title_html = (
422 # text_part.replace("{", "")
423 # .replace("}", "")
424 # .replace("title = ", "")
425 # .replace(",", "")
426 # )
427 # xarticle.title_tex = xarticle.title_html
428 # xarticle.title_xml = f"<title-group><article-title>{xarticle.title_html}</article-title></title-group>"
430 def download_file(self, url: str):
431 if url.startswith("https://eudml.org/doc"):
432 return super().download_file(url)
434 content = ""
435 filename = "/tmp/crawler/puppeteer/" + str(base64.b64encode(url.encode("utf-8")), "utf-8")
436 attempt = 0
437 while not content and attempt < 3:
438 attempt += 1
439 try:
440 cmd = f"{os.path.dirname(os.path.realpath(__file__))}/crawl_eudml_col.js -u {url} -o {filename}"
441 print(cmd)
442 execute_cmd(cmd)
444 if os.path.isfile(filename):
445 with open(filename) as file_:
446 content = file_.read()
447 if not isinstance(self.session, CachedSession):
448 continue
449 # Mock an HTTP request to inject the data into the cache
451 except subprocess.CalledProcessError:
452 pass
454 if not content:
455 raise requests.exceptions.HTTPError(f"Unable to download {url}")
457 return content
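# Illustrative usage (hypothetical URL): calling download_file("https://eudml.org/journal/10098")
# bypasses the plain HTTP path above, shells out to crawl_eudml_col.js, tries up to 3 times,
# and reads the rendered page back from /tmp/crawler/puppeteer/<base64 of the URL>.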