Coverage for src/crawler/by_source/eudml_crawler.py: 59%
265 statements
coverage.py v7.12.0, created at 2025-12-11 14:57 +0000
1import base64
2import json
3import os
4import re
5import subprocess
6import time
8import regex
9import requests
10from bs4 import BeautifulSoup
11from ptf.model_data import (
12 ArticleData,
13 create_abstract,
14 create_articledata,
15 create_contributor,
16 create_extlink,
17 create_issuedata,
18 create_subj,
19)
20from ptf.utils import execute_cmd
21from requests_cache import CachedSession
23from crawler.base_crawler import BaseCollectionCrawler
24from crawler.models import ExtlinkChecked
25from crawler.utils import add_pdf_link_to_xarticle
28class EudmlCrawler(BaseCollectionCrawler):
29 source_name = "European Digital Mathematics Library"
30 source_domain = "EUDML"
31 source_website = "https://eudml.org"
33 def parse_collection_content(self, content):
34 """
35 Parse the HTML page of a EuDML journal and return a list of xissues.
36 Each xissue has a list of articles with just a URL.
37 """
38 data = json.loads(content)
39 soup = BeautifulSoup(base64.b64decode(data["page"]), "html.parser")
40 xissues = []
41 volume_year_re = regex.compile(r".*\(<strong>(\d+).*<\/strong>\)")
42 # Extract the list of volumes
43 volume_count = 0
44 issue_count = 0
45 for v in data["volumes"]:
46 volume_count += 1
47 volume_number = v["name"]
49 year_re_groups = volume_year_re.search(v["desc"])
50 if year_re_groups is None:
51 self.logger.debug("skipping volume : no year found")
52 continue
53 year = year_re_groups.group(1)
54 if year == "":  # coverage: 54 ↛ 55 (condition never true)
55 self.logger.debug("volume year is an empty string... Skipping")
56 continue
57 if len(v["issues"]) > 0:
58 # Extract all the issues
59 for i in v["issues"]:
60 issue_count += 1
61 xissue = self.create_eudml_xissue(i, year, i["name"], volume_number)
62 xissues.append(xissue)
63 else:
64 # No issues, articles are directly in the volume
65 xissue = self.create_eudml_xissue(v, year, None, volume_number)
66 xissues.append(xissue)
68 # EuDML stores the total number of volumes, issues and articles in the <ul class="article-details unit unit-list">
69 # This info is used to check the numbers of volumes/issues/articles parsed from the page
70 volumes_to_find = 0
71 issues_to_find = 0
72 articles_to_find = 0
73 article_details_nodes = soup.find_all("ul", {"class": "article-details unit unit-list"})
74 for article_detail_node in article_details_nodes:
75 unit_nodes = article_detail_node.find_all("li")
76 for unit_node in unit_nodes:
77 strong_node = unit_node.find("strong")
78 if strong_node is not None:  # coverage: 78 ↛ 76 (condition always true)
79 text = strong_node.get_text()
80 if text == "Issue count:":
81 value = unit_node.get_text()[13:]
82 issues_to_find += int(value)
83 elif text == "Volume count:":
84 value = unit_node.get_text()[14:]
85 volumes_to_find += int(value)
86 elif text == "Number of articles:":
87 value = unit_node.get_text()[20:]
88 articles_to_find += int(value)
90 if volume_count != volumes_to_find:  # coverage: 90 ↛ 91 (condition never true)
91 txt = f"EuDML declares {volumes_to_find} volumes for {self.collection_id}. We parsed {volume_count}"
92 self.logger.debug(txt)
94 if issue_count != issues_to_find:
95 txt = f"EuDML declares {issues_to_find} issues for {self.collection_id}. We parsed {issue_count}"
96 self.logger.debug(txt)
98 article_count = sum([len(xissue.articles) for xissue in xissues])
99 if article_count != articles_to_find:
100 txt = f"EuDML declares {articles_to_find} articles for {self.collection_id}. We parsed {article_count}"
101 self.logger.debug(txt)
103 return xissues
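# Illustrative sketch, not part of the crawler source: the JSON consumed by
# parse_collection_content is produced by crawl_eudml_col.js (not shown here) and,
# judging from the accesses above, presumably has roughly this shape:
# {
#     "page": "<base64-encoded collection HTML>",
#     "volumes": [
#         {"name": "12", "desc": "Vol. 12 (<strong>1998</strong>)",
#          "issues": [{"name": "3", "articles": [{"url": "...", "sortKey": "..."}]}]},
#         {"name": "13", "desc": "Vol. 13 (<strong>1999</strong>)",
#          "issues": [], "articles": [{"url": "...", "sortKey": "..."}]}
#     ]
# }
# The volume year comes from the first capture group of the desc regex, e.g.:
# >>> regex.search(r".*\(<strong>(\d+).*<\/strong>\)", "Vol. 12 (<strong>1998</strong>)").group(1)
# '1998'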
105 def create_eudml_xissue(
106 self, issue_data: dict, year_str, issue_number: str | None, volume_number
107 ):
108 """
109 EuDML does not have a separate HTML page for an issue.
110 The list of issues/articles is directly found in the collection page.
112 create_eudml_xissue creates an IssueData (see ptf/model_data.py) and sets its year/volume.
113 The PID is temporary and will be updated with the issue number (if any).
114 It also creates the articles directly, but with just a PID and a URL.
115 """
116 xissue = create_issuedata()
117 xissue.pid = self.collection_id + "_" + year_str + "__" + volume_number
118 if issue_number:
119 xissue.pid = xissue.pid + "_" + issue_number
120 xissue.year = year_str
121 xissue.volume = volume_number
122 if issue_number:
123 xissue.number = issue_number
125 issue_data["articles"].sort(key=lambda a: a["sortKey"])
126 for index_article, article_data in enumerate(issue_data["articles"]):
127 xarticle = create_articledata()
128 xarticle.pid = "a" + str(index_article)
129 xarticle.url = article_data["url"]
130 xissue.articles.append(xarticle)
131 return xissue
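# Illustrative example of the PID scheme above (the values are hypothetical):
# with collection_id="GDML", year="1998", volume="12" and issue "3" the pid becomes
# "GDML_1998__12_3"; without an issue number it stays "GDML_1998__12".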
133 def parse_article_content(self, content, xissue, xarticle, url):
134 """
135 Parse the content with BeautifulSoup and return an ArticleData.
136 """
137 soup = BeautifulSoup(content, "xml")
139 self.get_metadata_using_citation_meta(
140 xarticle,
141 xissue,
142 soup,
143 [
144 "lang",
145 "title",
146 "author",
147 "pdf",
148 "abstract",
149 "page",
150 "doi",
151 "mr",
152 "zbl",
153 "publisher",
154 "keywords",
155 ],
156 )
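# get_metadata_using_citation_meta is defined in BaseCollectionCrawler (not shown here);
# it presumably reads the standard Highwire/Google Scholar <meta> tags of the article
# page, for instance (illustrative):
# <meta name="citation_title" content="...">, <meta name="citation_author" content="...">,
# <meta name="citation_pdf_url" content="...">, <meta name="citation_doi" content="...">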
158 # LINK to SOURCE
159 url_full_text_node = soup.find("a", text="Access to full text")
160 if url_full_text_node is not None:
161 url_full_text = url_full_text_node.get("href")
162 if isinstance(url_full_text, str):  # coverage: 162 ↛ 167 (condition always true)
163 ext_link = create_extlink(rel="primary-source", location=url_full_text)
164 xarticle.ext_links.append(ext_link)
166 # MSC KEYWORDS
167 subj_part = soup.select_one("article#unit-subject-areas")
168 if subj_part is not None:  # coverage: 168 ↛ 169 (condition never true)
169 reg_msc = re.compile("/subject/MSC/[a-zA-Z0-9.]+")
170 subjs = [a for a in subj_part.select("a") if reg_msc.search(a.get("href"))]
171 for subj in subjs:
172 type_class = subj.get("href").split("/")
173 subject = create_subj(type="msc", lang=xarticle.lang)
174 subject["value"] = type_class[3]
175 xarticle.kwds.append(subject)
177 # FALLBACK
178 if not xarticle.title_tex:
179 try:
180 title = soup.select_one("h1").get_text(strip=True).replace("\xa0", " ")
181 txt = f"{url} Fallback for title"
182 self.logger.debug(txt, extra={"pid": xarticle.pid})
183 xarticle.title_tex = title.replace("\xa0", " ").replace("\n", "")
184 # FIXME
185 except: # noqa: E722
186 pass
188 if len(xarticle.contributors) == 0:
189 # AUTHORS
190 authors_bloc = soup.select_one("p.sub-title-1")
191 if authors_bloc:  # coverage: 191 ↛ 206 (condition always true)
192 authors_node = authors_bloc.find_all("a")
193 if len(authors_node) > 0:  # coverage: 193 ↛ 194 (condition never true)
194 txt = f"{url} Fallback for authors"
195 self.logger.debug(txt, extra={"pid": xarticle.pid})
196 for author_node in authors_node:  # coverage: 196 ↛ 197 (loop never started)
197 text_author = author_node.get_text()
198 text_author = text_author.replace(",", "")
200 author = create_contributor()
201 author["role"] = "author"
202 author["string_name"] = text_author
204 xarticle.contributors.append(author)
206 if len(xarticle.streams) == 0:  # coverage: 206 ↛ 214 (condition always true)
207 # PDF
208 pdf_node = soup.find("a", text="Full (PDF)")
209 if pdf_node is not None:
210 pdf_url = pdf_node.get("href")
211 if pdf_url:  # coverage: 211 ↛ 214 (condition always true)
212 add_pdf_link_to_xarticle(xarticle, pdf_url)
214 if len(xarticle.streams) == 0:
215 if not url_full_text_node:
216 self.logger.debug("Couldn't find pdf", extra={"pid": xarticle.pid})
217 else:
218 add_pdf_link_to_xarticle(xarticle, url_full_text_node.get("href"))
220 if len(xarticle.abstracts) == 0:  # coverage: 220 ↛ 232 (condition always true)
221 # ABSTRACT
222 abstract_node = soup.find("article", {"id": "unit-article-abstract"})
223 if abstract_node is not None:  # coverage: 223 ↛ 224 (condition never true)
224 abstract_section_node = abstract_node.find("section")
225 if abstract_section_node:
226 abstract = str(abstract_section_node)
228 xarticle.abstracts.append(
229 create_abstract(value_tex=abstract, lang=xarticle.lang)
230 )
232 if len(xarticle.contributors) == 0 or not xarticle.fpage:
233 # LANG, PAGES, (AUTHORS)
234 # EuDML has an export BibTex section with some information (lang, pages, authors)
235 self.parse_bibtex(soup, xarticle, url)
237 if xarticle.doi is None:  # coverage: 237 ↛ 280 (condition always true)
238 # DOI
239 doi_link = soup.find("article", {"id": "unit-other-ids"})
240 if doi_link is not None:  # coverage: 240 ↛ 243 (condition never true)
241 # Simplify ?
242 # See https://eudml.org/doc/54683 with http://dx.doi.org/10.1155/2007/10368%E2%80%89
243 try:
244 reg_doi = re.compile("doi.org")
245 doi_array = [
246 d.get("href")
247 for d in doi_link.find_all("a")
248 if reg_doi.search(str(d.get("href")))
249 ]
250 if doi_array:
251 if len(doi_array) > 1:
252 start_dois = len(doi_array) - 1
253 doi = doi_array[start_dois:][0]
254 else:
255 doi = doi_array[0]
257 doi_array = doi.split("doi.org/")
258 # strip unwanted chars present
259 if len(doi_array) > 1:
260 doi = doi_array[1].encode("ascii", "ignore")
261 doi = str(doi.decode())
262 doi_array = doi.split("\\u")
263 doi = str(doi_array[0])
265 doi = re.sub("}", "", doi)
266 doi = re.sub("\t", "", doi)
267 doi = doi.encode("ascii", "ignore")
268 doi = doi.decode()
270 doi = bytes(r"{}".format(r"" + doi + ""), "utf-8")
271 doi = doi.decode()
272 doi_array = doi.split("\\u")
273 doi = str(doi_array[0]).strip()
274 doi = doi.replace(" ", "")
276 xarticle.doi = doi
277 except TypeError as e:
278 self.logger.debug(e, extra={"pid": xarticle.pid})
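# Illustrative walk-through of the cleanup above for the case cited in the comment,
# assuming the stray trailing character is a raw thin space (U+2009) rather than a
# percent-encoded sequence:
# >>> raw = "http://dx.doi.org/10.1155/2007/10368\u2009"
# >>> raw.split("doi.org/")[1].encode("ascii", "ignore").decode()
# '10.1155/2007/10368'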
280 has_zblid = len([extid for extid in xarticle.extids if extid[0] == "zbl-item-id"]) == 1
281 if not has_zblid:
282 zb_tag = soup.select_one("article#unit-other-ids a:-soup-contains('ZBMath')")
283 if zb_tag:  # coverage: 283 ↛ 284 (condition never true)
284 zb_href = zb_tag.get("href")
285 if not isinstance(zb_href, str):
286 raise ValueError("Couldn't parse zbmath href")
287 zblid = zb_href.removeprefix(
288 "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"
289 )
290 xarticle.extids.append(("zbl-item-id", zblid))
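# Example of the Zbl id extraction above (the id is hypothetical):
# >>> "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:0999.14001".removeprefix(
# ...     "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:")
# '0999.14001'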
292 # The "In Other Databases" section is not (always?) the publisher
293 # if not xissue.publisher:
294 # # PUBLISHER
295 # section_oai = soup.find("h3", text="In Other Databases")
296 # if section_oai is not None:
297 # section_oai_array = section_oai.parent.find_all("dd")
298 # if section_oai is not None:
299 # pub = [
300 # d.text
301 # for d in section_oai_array
302 # if d.text.strip() not in ["DOI", "ZBMath", "MathSciNet", "PUBLISHER"]
303 # ]
304 # if pub != "":
305 # print(f"{url} Fallback for publisher")
306 # xpub = create_publisherdata()
307 # xpub.name = pub[0].strip()
308 # xissue.publisher = xpub
309 return xarticle
311 def parse_bibtex(self, soup, xarticle: ArticleData, url):
312 """
313 Parse the BibTeX section of a EuDML article page.
314 Extract
315 - the authors (if no author was already found in the page)
316 - the article language
317 - the article pages
318 """
319 bib_div = [p for p in soup.find_all("p") if "@article" in p.text]
321 if len(bib_div) > 0:  # coverage: 321 ↛ exit (condition always true)
322 bib_tex = bib_div[0].get_text()
323 text = bib_tex.split("\t")
325 for text_part in text:
326 # AUTHORS (only if no authors were already found in the page)
327 if len(xarticle.contributors) == 0:
328 reg_author = re.compile("author =")
329 if reg_author.search(text_part):  # coverage: 329 ↛ 330 (condition never true)
330 txt = f"{url} Fallback for authors with the bibtex"
331 self.logger.debug(txt, extra={"pid": xarticle.pid})
333 authors_text = (
334 text_part.replace("{", "").replace("}", "").replace("author = ", "")
335 )
336 authors_bib = authors_text.split(",")
337 for index, name in enumerate(authors_bib):
338 if index % 2 == 1:
339 author_name = authors_bib[index - 1] + " " + authors_bib[index]
340 author_name = self.latext_parser.latex_to_text(author_name)
341 author_name = author_name.replace("\xa0", "")
343 author = create_contributor()
344 author["role"] = "author"
345 author["string_name"] = author_name
346 xarticle.contributors.append(author)
348 # LANG
349 reg_lang = re.compile("language = ")
350 if reg_lang.search(text_part):
351 xarticle.lang = (
352 text_part.replace("{", "")
353 .replace("}", "")
354 .replace("language = ", "")
355 .replace(",", "")
356 )
357 if len(xarticle.lang) >= 3:  # coverage: 357 ↛ 360 (condition always true)
358 xarticle.lang = xarticle.lang[:-1]
360 if len(xarticle.lang) > 0 and len(xarticle.abstracts) > 0:  # coverage: 360 ↛ 361 (condition never true)
361 xarticle.abstracts[0]["lang"] = xarticle.lang
363 if not xarticle.fpage:
364 # PAGES
365 reg_pages = re.compile("pages =")
366 if reg_pages.search(text_part):
367 pages = (
368 text_part.replace("{", "")
369 .replace("}", "")
370 .replace("(", "")
371 .replace(")", "")
372 .replace("[", "")
373 .replace("]", "")
374 .replace("pages = ", "")
375 )
376 if len(pages) > 0 and pages != "null":  # coverage: 376 ↛ 325 (condition always true)
377 pages = pages.split(",")
378 if re.compile(r"\d+-\d+").search(pages[0]):  # coverage: 378 ↛ 379 (condition never true)
379 txt = f"{url} Fallback for pages with the bibtex"
380 self.logger.debug(txt, extra={"pid": xarticle.pid})
382 pages = pages[0].split("-")
383 xarticle.fpage = pages[0]
384 if len(pages) > 1:
385 reg_digit = re.compile(r"\d+")
386 if re.search(reg_digit, str(pages[1])):
387 pages[1] = re.search(reg_digit, str(pages[1]))[0]
388 xarticle.lpage = pages[1]
389 # FIXME: wrong page_range format... Maybe this can be deleted?
390 xarticle.page_range = pages[0] + "-" + pages[1]
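# Illustrative example of the pages handling above, assuming a BibTeX field such as
# "pages = {117-128},": after the replacements it becomes "117-128,", the part before the
# comma is kept, and splitting on "-" yields fpage="117", lpage="128", page_range="117-128".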
392 # reg_title = re.compile("title")
393 # if reg_title.search(text_part):
394 # if (
395 # xarticle.title_html is None
396 # or xarticle.title_html == ""
397 # or xarticle.title_html == "Contents"
398 # ):
399 # xarticle.title_html = (
400 # text_part.replace("{", "")
401 # .replace("}", "")
402 # .replace("title = ", "")
403 # .replace(",", "")
404 # )
405 # xarticle.title_tex = xarticle.title_html
406 # xarticle.title_xml = f"<title-group><article-title>{xarticle.title_html}</article-title></title-group>"
408 def download_file(self, url: str):
409 if url.startswith("https://eudml.org/doc"):
410 return super().download_file(url)
412 content = ""
413 filename = "/tmp/crawler/puppeteer/" + str(base64.b64encode(url.encode("utf-8")), "utf-8")
414 attempt = 0
415 while not content and attempt < 3:
416 attempt += 1
417 try:
418 cmd = f"{os.path.dirname(os.path.realpath(__file__))}/crawl_eudml_col.js -u {url} -o {filename}"
419 execute_cmd(cmd, force_execute=True)
421 if os.path.isfile(filename):
422 with open(filename) as file_:
423 content = file_.read()
424 if not isinstance(self.session, CachedSession):
425 continue
426 # Mock an HTTP request to inject the data into the cache
428 except subprocess.CalledProcessError:
429 pass
431 if not content:
432 raise requests.exceptions.HTTPError(f"Unable to download {url}")
434 return content
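# Example of the cache-file naming above (the URL is hypothetical):
# >>> str(base64.b64encode("https://eudml.org/journal/10099".encode("utf-8")), "utf-8")
# 'aHR0cHM6Ly9ldWRtbC5vcmcvam91cm5hbC8xMDA5OQ=='
# i.e. the page fetched by crawl_eudml_col.js is written to
# /tmp/crawler/puppeteer/aHR0cHM6Ly9ldWRtbC5vcmcvam91cm5hbC8xMDA5OQ==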
436 @classmethod
437 def check_extlink_validity(cls, extlink):
438 """
439 Method used by rot_monitoring to check if links have expired
440 """
441 defaults: dict = {"date": time.time(), "status": ExtlinkChecked.Status.OK}
443 if not extlink.location.startswith("http://gdz.sub.uni-goettingen.de"):
444 return super().check_extlink_validity(extlink)
446 response = requests.get(extlink.location)
448 defaults["http_status"] = response.status_code
450 if response.status_code not in (200, 206):
451 defaults["status"] = ExtlinkChecked.Status.ERROR
453 ExtlinkChecked.objects.update_or_create(extlink=extlink, defaults=defaults)