Coverage for src/crawler/by_source/eudml_crawler.py: 60%
254 statements
coverage.py v7.9.0, created at 2025-07-30 09:47 +0000
import base64
import json
import os
import re
import subprocess

import regex
import requests
from bs4 import BeautifulSoup
from ptf.model_data import (
    ArticleData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_extlink,
    create_issuedata,
    create_subj,
)
from ptf.utils import execute_cmd
from requests_cache import CachedSession

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle

class EudmlCrawler(BaseCollectionCrawler):
    source_name = "European Digital Mathematics Library"
    source_domain = "EUDML"
    source_website = "https://eudml.org"
    def parse_collection_content(self, content):
        """
        Parses the HTML page of a EuDML journal and returns a list of xissues.
        Each xissue has a list of articles with just a url.
        """
        data = json.loads(content)
        soup = BeautifulSoup(base64.b64decode(data["page"]), "html.parser")
        xissues = []
        volume_year_re = regex.compile(r".*\(<strong>(\d+).*<\/strong>\)")
        # Extract the list of volumes
        volume_count = 0
        issue_count = 0
        for v in data["volumes"]:
            volume_count += 1
            volume_number = v["name"]

            year_re_groups = volume_year_re.search(v["desc"])
            if year_re_groups is None:
                self.logger.debug("skipping volume : no year found")
                continue
            year = year_re_groups.group(1)
            if year == "":  # coverage: condition was never true
                self.logger.debug("volume year is an empty string... Skipping")
                continue
            if len(v["issues"]) > 0:
                # Extract all the issues
                for i in v["issues"]:
                    issue_count += 1
                    xissue = self.create_eudml_xissue(i, year, i["name"], volume_number)
                    xissues.append(xissue)
            else:
                # No issues, articles are directly in the volume
                xissue = self.create_eudml_xissue(v, year, None, volume_number)
                xissues.append(xissue)

        # EuDML stores the total of issues and articles in the <ul class="article-details unit unit-list">
        # This info is used to check the number of articles/issues parsed in the page
        volumes_to_find = 0
        issues_to_find = 0
        articles_to_find = 0
        article_details_nodes = soup.find_all("ul", {"class": "article-details unit unit-list"})
        for article_detail_node in article_details_nodes:
            unit_nodes = article_detail_node.find_all("li")
            for unit_node in unit_nodes:
                strong_node = unit_node.find("strong")
                if strong_node is not None:  # coverage: condition was always true
                    text = strong_node.get_text()
                    if text == "Issue count:":
                        value = unit_node.get_text()[13:]
                        issues_to_find += int(value)
                    elif text == "Volume count:":
                        value = unit_node.get_text()[14:]
                        volumes_to_find += int(value)
                    elif text == "Number of articles:":
                        value = unit_node.get_text()[20:]
                        articles_to_find += int(value)

        if volume_count != volumes_to_find:  # coverage: condition was never true
            txt = f"EuDML declares {volumes_to_find} volumes for {self.collection_id}. We parsed {volume_count}"
            self.logger.debug(txt)

        if issue_count != issues_to_find:
            txt = f"EuDML declares {issues_to_find} issues for {self.collection_id}. We parsed {issue_count}"
            self.logger.debug(txt)

        article_count = sum([len(xissue.articles) for xissue in xissues])
        if article_count != articles_to_find:
            txt = f"EuDML declares {articles_to_find} articles for {self.collection_id}. We parsed {article_count}"
            self.logger.debug(txt)

        return xissues
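    # Sketch of the year extraction above, on a hypothetical EuDML volume
    # description (real "desc" strings may differ):
    #   volume_year_re.search("Volume 12 (<strong>2004</strong>)").group(1)  # -> "2004"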

    def create_eudml_xissue(
        self, issue_data: dict, year_str, issue_number: str | None, volume_number
    ):
        """
        EuDML does not have a separate HTML page for an issue.
        The list of issues/articles is directly found in the collection page.

        create_xissue creates an IssueData (see ptf/model_data.py) and sets its year/volume.
        The PID is temporary and will be updated with the issue number (if any).
        create_xissue directly creates articles, but with just a pid and a url.
        """
        xissue = create_issuedata()
        xissue.pid = self.collection_id + "_" + year_str + "__" + volume_number
        if issue_number:
            xissue.pid = xissue.pid + "_" + issue_number
        xissue.year = year_str
        xissue.volume = volume_number
        if issue_number:
            xissue.number = issue_number

        issue_data["articles"].sort(key=lambda a: a["sortKey"])
        for index_article, article_data in enumerate(issue_data["articles"]):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index_article)
            xarticle.url = article_data["url"]
            xissue.articles.append(xarticle)
        return xissue
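    # Sketch of the temporary PID scheme built above, with hypothetical values
    # (collection "ABC", year "2001", volume "5", issue "2"):
    #   "ABC" + "_" + "2001" + "__" + "5"              -> "ABC_2001__5"   (no issue)
    #   "ABC" + "_" + "2001" + "__" + "5" + "_" + "2"  -> "ABC_2001__5_2"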

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parses the content with BeautifulSoup and returns an ArticleData.
        """
        soup = BeautifulSoup(content, "xml")

        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "lang",
                "title",
                "author",
                "pdf",
                "abstract",
                "page",
                "doi",
                "mr",
                "zbl",
                "publisher",
                "keywords",
            ],
        )

        # LINK to SOURCE
        url_full_text_node = soup.find("a", text="Access to full text")
        if url_full_text_node is not None:
            url_full_text = url_full_text_node.get("href")
            if isinstance(url_full_text, str):  # coverage: condition was always true
                ext_link = create_extlink(rel="primary-source", location=url_full_text)
                xarticle.ext_links.append(ext_link)

        # MSC KEYWORDS
        subj_part = soup.select_one("article#unit-subject-areas")
        if subj_part is not None:  # coverage: condition was never true
            reg_msc = re.compile("/subject/MSC/[a-zA-Z0-9.]+")
            subjs = [a for a in subj_part.select("a") if reg_msc.search(a.get("href"))]
            for subj in subjs:
                type_class = subj.get("href").split("/")
                subject = create_subj(type="msc", lang=xarticle.lang)
                subject["value"] = type_class[3]
                xarticle.kwds.append(subject)

        # FALLBACK
        if not xarticle.title_tex:
            try:
                title = soup.select_one("h1").get_text(strip=True).replace("\xa0", " ")
                txt = f"{url} Fallback for title"
                self.logger.debug(txt, extra={"pid": xarticle.pid})
                xarticle.title_tex = title.replace("\xa0", " ").replace("\n", "")
            # FIXME
            except:  # noqa: E722
                pass

        if len(xarticle.contributors) == 0:
            # AUTHORS
            authors_bloc = soup.select_one("p.sub-title-1")
            if authors_bloc:  # coverage: condition was always true
                authors_node = authors_bloc.find_all("a")
                if len(authors_node) > 0:  # coverage: condition was never true
                    txt = f"{url} Fallback for authors"
                    self.logger.debug(txt, extra={"pid": xarticle.pid})
                for author_node in authors_node:  # coverage: loop body never ran
                    text_author = author_node.get_text()
                    text_author = text_author.replace(",", "")

                    author = create_contributor()
                    author["role"] = "author"
                    author["string_name"] = text_author

                    xarticle.contributors.append(author)

        if len(xarticle.streams) == 0:  # coverage: condition was always true
            # PDF
            pdf_node = soup.find("a", text="Full (PDF)")
            if pdf_node is not None:
                pdf_url = pdf_node.get("href")
                if pdf_url:  # coverage: condition was always true
                    add_pdf_link_to_xarticle(xarticle, pdf_url)

        if len(xarticle.streams) == 0:
            if not url_full_text_node:
                self.logger.debug("Couldn't find pdf", extra={"pid": xarticle.pid})
            else:
                add_pdf_link_to_xarticle(xarticle, url_full_text_node.get("href"))

        if len(xarticle.abstracts) == 0:  # coverage: condition was always true
            # ABSTRACT
            abstract_node = soup.find("article", {"id": "unit-article-abstract"})
            if abstract_node is not None:  # coverage: condition was never true
                abstract_section_node = abstract_node.find("section")
                if abstract_section_node:
                    abstract = str(abstract_section_node)
                    xabstract = create_abstract(
                        tag="abstract", value_tex=abstract, lang=xarticle.lang
                    )
                    xarticle.abstracts.append(xabstract)

        if len(xarticle.contributors) == 0 or not xarticle.fpage:
            # LANG, PAGES, (AUTHORS)
            # EuDML has an export BibTex section with some information (lang, pages, authors)
            self.parse_bibtex(soup, xarticle, url)
        if xarticle.doi is None:  # coverage: condition was always true
            # DOI
            doi_link = soup.find("article", {"id": "unit-other-ids"})
            if doi_link is not None:  # coverage: condition was never true
                # Simplify ?
                # See https://eudml.org/doc/54683 with http://dx.doi.org/10.1155/2007/10368%E2%80%89
                try:
                    reg_doi = re.compile("doi.org")
                    doi_array = [
                        d.get("href")
                        for d in doi_link.find_all("a")
                        if reg_doi.search(str(d.get("href")))
                    ]
                    if doi_array:
                        if len(doi_array) > 1:
                            start_dois = len(doi_array) - 1
                            doi = doi_array[start_dois:][0]
                        else:
                            doi = doi_array[0]

                        doi_array = doi.split("doi.org/")
                        # strip unwanted chars present
                        if len(doi_array) > 1:
                            doi = doi_array[1].encode("ascii", "ignore")
                            doi = str(doi.decode())
                            doi_array = doi.split("\\u")
                            doi = str(doi_array[0])

                        doi = re.sub("}", "", doi)
                        doi = re.sub("\t", "", doi)
                        doi = doi.encode("ascii", "ignore")
                        doi = doi.decode()

                        doi = bytes(r"{}".format(r"" + doi + ""), "utf-8")
                        doi = doi.decode()
                        doi_array = doi.split("\\u")
                        doi = str(doi_array[0]).strip()
                        doi = doi.replace(" ", "")

                        xarticle.doi = doi
                except TypeError as e:
                    self.logger.debug(e, {"pid": xarticle.pid})

        has_zblid = len([extid for extid in xarticle.extids if extid[0] == "zbl-item-id"]) == 1
        if not has_zblid:
            zb_tag = soup.select_one("article#unit-other-ids a:-soup-contains('ZBMath')")
            if zb_tag:  # coverage: condition was never true
                zb_href = zb_tag.get("href")
                if not isinstance(zb_href, str):
                    raise ValueError("Couldn't parse zbmath href")
                zblid = zb_href.removeprefix(
                    "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"
                )
                xarticle.extids.append(("zbl-item-id", zblid))

        # In Other Databases is not (always ?) the publisher
        # if not xissue.publisher:
        #     # PUBLISHER
        #     section_oai = soup.find("h3", text="In Other Databases")
        #     if section_oai is not None:
        #         section_oai_array = section_oai.parent.find_all("dd")
        #         if section_oai is not None:
        #             pub = [
        #                 d.text
        #                 for d in section_oai_array
        #                 if d.text.strip() not in ["DOI", "ZBMath", "MathSciNet", "PUBLISHER"]
        #             ]
        #             if pub != "":
        #                 print(f"{url} Fallback for publisher")
        #                 xpub = create_publisherdata()
        #                 xpub.name = pub[0].strip()
        #                 xissue.publisher = xpub
        return xarticle
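    # The DOI clean-up above strips the "doi.org/" prefix and unwanted characters.
    # A rough sketch of the effect, on the hypothetical href mentioned in the
    # comment above (behaviour on other edge cases may differ):
    #   href = "http://dx.doi.org/10.1155/2007/10368\u2009"  # trailing thin space
    #   doi = href.split("doi.org/")[1]
    #   doi.encode("ascii", "ignore").decode().strip().replace(" ", "")  # -> "10.1155/2007/10368"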

    def parse_bibtex(self, soup, xarticle: ArticleData, url):
        """
        Parses the BibTeX section of a EuDML article page.
        Extracts
        - the authors (if no author was already found in the page)
        - the article language
        - the article pages
        """
        bib_div = [p for p in soup.find_all("p") if "@article" in p.text]

        if len(bib_div) > 0:  # coverage: condition was always true
            bib_tex = bib_div[0].get_text()
            text = bib_tex.split("\t")

            for text_part in text:
                # AUTHORS (only if no authors were already found in the page)
                if len(xarticle.contributors) == 0:
                    reg_author = re.compile("author =")
                    if reg_author.search(text_part):  # coverage: condition was never true
                        txt = f"{url} Fallback for authors with the bibtex"
                        self.logger.debug(txt, extra={"pid": xarticle.pid})

                        authors_text = (
                            text_part.replace("{", "").replace("}", "").replace("author = ", "")
                        )
                        authors_bib = authors_text.split(",")
                        for index, name in enumerate(authors_bib):
                            if index % 2 == 1:
                                author_name = authors_bib[index - 1] + " " + authors_bib[index]
                                author_name = self.latext_parser.latex_to_text(author_name)
                                author_name = author_name.replace("\xa0", "")

                                author = create_contributor()
                                author["role"] = "author"
                                author["string_name"] = author_name
                                xarticle.contributors.append(author)

                # LANG
                reg_lang = re.compile("language = ")
                if reg_lang.search(text_part):
                    xarticle.lang = (
                        text_part.replace("{", "")
                        .replace("}", "")
                        .replace("language = ", "")
                        .replace(",", "")
                    )
                    if len(xarticle.lang) >= 3:  # coverage: condition was always true
                        xarticle.lang = xarticle.lang[:-1]

                    if len(xarticle.lang) > 0 and len(xarticle.abstracts) > 0:  # coverage: condition was never true
                        xarticle.abstracts[0]["lang"] = xarticle.lang

                if not xarticle.fpage:
                    # PAGES
                    reg_pages = re.compile("pages =")
                    if reg_pages.search(text_part):
                        pages = (
                            text_part.replace("{", "")
                            .replace("}", "")
                            .replace("(", "")
                            .replace(")", "")
                            .replace("[", "")
                            .replace("]", "")
                            .replace("pages = ", "")
                        )
                        if len(pages) > 0 and pages != "null":  # coverage: condition was always true
                            pages = pages.split(",")
                            if re.compile(r"\d+-\d+").search(pages[0]):  # coverage: condition was never true
                                txt = f"{url} Fallback for pages with the bibtex"
                                self.logger.debug(txt, extra={"pid": xarticle.pid})

                                pages = pages[0].split("-")
                                xarticle.fpage = pages[0]
                                if len(pages) > 1:
                                    reg_digit = re.compile(r"\d+")
                                    if re.search(reg_digit, str(pages[1])):
                                        pages[1] = re.search(reg_digit, str(pages[1]))[0]
                                    xarticle.lpage = pages[1]
                                    # FIXME : wrong page_range format... Maybe this can be deleted ?
                                    xarticle.page_range = pages[0] + "-" + pages[1]

                # reg_title = re.compile("title")
                # if reg_title.search(text_part):
                #     if (
                #         xarticle.title_html is None
                #         or xarticle.title_html == ""
                #         or xarticle.title_html == "Contents"
                #     ):
                #         xarticle.title_html = (
                #             text_part.replace("{", "")
                #             .replace("}", "")
                #             .replace("title = ", "")
                #             .replace(",", "")
                #         )
                #         xarticle.title_tex = xarticle.title_html
                #         xarticle.title_xml = f"<title-group><article-title>{xarticle.title_html}</article-title></title-group>"
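    # Sketch of the tab-separated BibTeX fields handled above, for a hypothetical
    # record (real EuDML exports may differ):
    #   "@article{...,\tauthor = {Doe, Jane},\tlanguage = {eng},\tpages = {12-34},\t...}"
    # "author = {Doe, Jane}," pairs "Doe" and "Jane" into one contributor,
    # "language = {eng}," is truncated to the two-letter code "en", and
    # "pages = {12-34}," yields fpage "12" and lpage "34".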

    def download_file(self, url: str):
        if url.startswith("https://eudml.org/doc"):
            return super().download_file(url)

        content = ""
        filename = "/tmp/crawler/puppeteer/" + str(base64.b64encode(url.encode("utf-8")), "utf-8")
        attempt = 0
        while not content and attempt < 3:
            attempt += 1
            try:
                cmd = f"{os.path.dirname(os.path.realpath(__file__))}/crawl_eudml_col.js -u {url} -o {filename}"
                execute_cmd(cmd, force_execute=True)

                if os.path.isfile(filename):
                    with open(filename) as file_:
                        content = file_.read()
                if not isinstance(self.session, CachedSession):
                    continue
                # Mock an HTTP request to inject the data into the cache

            except subprocess.CalledProcessError:
                pass

        if not content:
            raise requests.exceptions.HTTPError(f"Unable to download {url}")

        return content
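# Sketch of the puppeteer cache-file name derived in download_file above, for a
# hypothetical URL:
#   url = "https://eudml.org/journal/10232"
#   "/tmp/crawler/puppeteer/" + str(base64.b64encode(url.encode("utf-8")), "utf-8")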