Coverage for src/crawler/by_source/eudml_crawler.py: 60%
255 statements
coverage.py v7.8.0, created at 2025-04-24 10:35 +0000
import base64
import json
import os
import re
import subprocess

import regex
import requests
from bs4 import BeautifulSoup
from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_extlink,
    create_issuedata,
    create_subj,
)
from ptf.utils import execute_cmd
from requests_cache import CachedSession

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class EudmlCrawler(BaseCollectionCrawler):
    source_name = "European Digital Mathematics Library"
    source_domain = "EUDML"
    source_website = "https://eudml.org"

    def parse_collection_content(self, content):
        """
        Parse the HTML page of a EuDML journal and return a list of xissues.
        Each xissue has a list of articles with just a URL.
        """
        data = json.loads(content)
        soup = BeautifulSoup(base64.b64decode(data["page"]), "html.parser")
        xissues = []
        volume_year_re = regex.compile(r".*\(<strong>(\d+).*<\/strong>\)")
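        # The volume description is expected to embed the year as, e.g.,
        # "... (<strong>1998</strong>) ..." (illustrative format); group 1 captures the year.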
        # Extract the list of volumes
        volume_count = 0
        issue_count = 0
        for v in data["volumes"]:
            volume_count += 1
            volume_number = v["name"]

            year_re_groups = volume_year_re.search(v["desc"])
            if year_re_groups is None:
                print("skipping volume : no year found")
                continue
            year = year_re_groups.group(1)
            if year == "":  # coverage: branch never taken
                print("volume year is an empty string... Skipping")
                continue
            if len(v["issues"]) > 0:
                # Extract all the issues
                for i in v["issues"]:
                    issue_count += 1
                    xissue = self.create_eudml_xissue(i, year, i["name"], volume_number)
                    xissues.append(xissue)
            else:
                # No issues, articles are directly in the volume
                xissue = self.create_eudml_xissue(v, year, None, volume_number)
                xissues.append(xissue)

        # EuDML stores the total of issues and articles in the <ul class="article-details unit unit-list">
        # This info is used to check the number of articles/issues parsed in the page
        volumes_to_find = 0
        issues_to_find = 0
        articles_to_find = 0
        article_details_nodes = soup.find_all("ul", {"class": "article-details unit unit-list"})
        for article_detail_node in article_details_nodes:
            unit_nodes = article_detail_node.find_all("li")
            for unit_node in unit_nodes:
                strong_node = unit_node.find("strong")
                if strong_node is not None:  # coverage: always true
                    text = strong_node.get_text()
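                    # The <li> text is "<label> <count>"; the slice offsets below skip the
                    # label plus the following space (e.g. len("Issue count: ") == 13).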
77 if text == "Issue count:":
78 value = unit_node.get_text()[13:]
79 issues_to_find += int(value)
80 elif text == "Volume count:":
81 value = unit_node.get_text()[14:]
82 volumes_to_find += int(value)
83 elif text == "Number of articles:":
84 value = unit_node.get_text()[20:]
85 articles_to_find += int(value)
87 if volume_count != volumes_to_find: 87 ↛ 88line 87 didn't jump to line 88 because the condition on line 87 was never true
88 txt = f"EuDML declares {volumes_to_find} volumes for {self.collection_id}. We parsed {volume_count}"
89 print(txt)
91 if issue_count != issues_to_find:
92 txt = f"EuDML declares {issues_to_find} issues for {self.collection_id}. We parsed {issue_count}"
93 print(txt)
95 article_count = sum([len(xissue.articles) for xissue in xissues])
96 if article_count != articles_to_find:
97 txt = f"EuDML declares {articles_to_find} articles for {self.collection_id}. We parsed {article_count}"
98 print(txt)
100 return xissues

    def create_eudml_xissue(
        self, issue_data: dict, year_str, issue_number: str | None, volume_number
    ):
        """
        EuDML does not have a separate HTML page for an issue.
        The list of issues/articles is directly found in the collection page.

        This method creates an IssueData (see ptf/model_data.py) and sets its year/volume.
        The PID is temporary and will be updated with the issue number (if any).
        It also creates the articles directly, but with just a pid and a URL.
        """
        xissue = create_issuedata()
        xissue.pid = self.collection_id + "_" + year_str + "__" + volume_number
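        # Example of the resulting pid (with a hypothetical collection_id "ABC"):
        # "ABC_1998__5", extended below to "ABC_1998__5_2" when an issue number exists.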
        if issue_number:
            xissue.pid = xissue.pid + "_" + issue_number
        xissue.year = year_str
        xissue.volume = volume_number
        if issue_number:
            xissue.number = issue_number

        issue_data["articles"].sort(key=lambda a: a["sortKey"])
        for index_article, article_data in enumerate(issue_data["articles"]):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index_article)
            xarticle.url = article_data["url"]
            xissue.articles.append(xarticle)
        return xissue

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        soup = BeautifulSoup(content, "xml")

        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "lang",
                "title",
                "author",
                "pdf",
                "abstract",
                "page",
                "doi",
                "mr",
                "zbl",
                "publisher",
                "keywords",
            ],
        )
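        # Everything below is a fallback: it only fills in fields that the citation_* meta
        # tags above did not provide (title, authors, pdf, abstract, doi, external ids...).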

        # LINK to SOURCE
        url_full_text_node = soup.find("a", text="Access to full text")
        if url_full_text_node is not None:
            url_full_text = url_full_text_node.get("href")
            if isinstance(url_full_text, str):  # coverage: always true
                ext_link = create_extlink(rel="primary-source", location=url_full_text)
                xarticle.ext_links.append(ext_link)

        # MSC KEYWORDS
        subj_part = soup.select_one("article#unit-subject-areas")
        if subj_part is not None:  # coverage: branch never taken
            reg_msc = re.compile("/subject/MSC/[a-zA-Z0-9.]+")
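            # Each subject link looks like "/subject/MSC/<code>" (e.g. "/subject/MSC/35J25",
            # illustrative value); the MSC code is the 4th path segment.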
            subjs = [a for a in subj_part.select("a") if reg_msc.search(a.get("href"))]
            for subj in subjs:
                type_class = subj.get("href").split("/")
                subject = create_subj(type="msc", lang=xarticle.lang)
                subject["value"] = type_class[3]
                xarticle.kwds.append(subject)

        # FALLBACK
        if not xarticle.title_tex:
            try:
                title = soup.select_one("h1").get_text(strip=True).replace("\xa0", " ")
                txt = f"{url} Fallback for title"
                print(txt)
                xarticle.title_tex = title.replace("\xa0", " ").replace("\n", "")
            # FIXME
            except:  # noqa: E722
                pass

        if len(xarticle.contributors) == 0:
            # AUTHORS
            authors_bloc = soup.select_one("p.sub-title-1")
            if authors_bloc:  # coverage: always true
                authors_node = authors_bloc.find_all("a")
                if len(authors_node) > 0:  # coverage: branch never taken
                    txt = f"{url} Fallback for authors"
                    print(txt)
                for author_node in authors_node:  # coverage: loop body never entered
                    text_author = author_node.get_text()
                    text_author = text_author.replace(",", "")

                    author = create_contributor()
                    author["role"] = "author"
                    author["string_name"] = text_author

                    xarticle.contributors.append(author)

        if len(xarticle.streams) == 0:  # coverage: always true
            # PDF
            pdf_node = soup.find("a", text="Full (PDF)")
            if pdf_node is not None:
                pdf_url = pdf_node.get("href")
                if pdf_url:  # coverage: always true
                    add_pdf_link_to_xarticle(xarticle, pdf_url)

        if len(xarticle.streams) == 0:
            if not url_full_text_node:
                print(f"[{self.source_domain}] {self.collection_id} : Couldn't find pdf")
            else:
                add_pdf_link_to_xarticle(xarticle, url_full_text_node.get("href"))

        if len(xarticle.abstracts) == 0:  # coverage: always true
            # ABSTRACT
            abstract_node = soup.find("article", {"id": "unit-article-abstract"})
            if abstract_node is not None:  # coverage: branch never taken
                abstract_section_node = abstract_node.find("section")
                if abstract_section_node:
                    abstract = str(abstract_section_node)
                    xabstract = create_abstract(
                        tag="abstract", value_tex=abstract, lang=xarticle.lang
                    )
                    xarticle.abstracts.append(xabstract)

        if len(xarticle.contributors) == 0 or not xarticle.fpage:
            # LANG, PAGES, (AUTHORS)
            # EuDML has an export BibTex section with some information (lang, pages, authors)
            self.parse_bibtex(soup, xarticle, url)

        if xarticle.doi is None:  # coverage: always true
            # DOI
            doi_link = soup.find("article", {"id": "unit-other-ids"})
            if doi_link is not None:  # coverage: branch never taken
                # Simplify ?
                # See https://eudml.org/doc/54683 with http://dx.doi.org/10.1155/2007/10368%E2%80%89
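                # The ASCII encode/decode round-trips below drop non-ASCII bytes (such as the
                # trailing thin space in the example above), and the splits strip stray "\u"
                # escapes, "}" and tab characters leaking from the page markup.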
                try:
                    reg_doi = re.compile("doi.org")
                    doi_array = [
                        d.get("href")
                        for d in doi_link.find_all("a")
                        if reg_doi.search(str(d.get("href")))
                    ]
                    if doi_array:
                        if len(doi_array) > 1:
                            start_dois = len(doi_array) - 1
                            doi = doi_array[start_dois:][0]
                        else:
                            doi = doi_array[0]

                        doi_array = doi.split("doi.org/")
                        # strip unwanted chars present
                        if len(doi_array) > 1:
                            doi = doi_array[1].encode("ascii", "ignore")
                            doi = str(doi.decode())
                            doi_array = doi.split("\\u")
                            doi = str(doi_array[0])

                        doi = re.sub("}", "", doi)
                        doi = re.sub("\t", "", doi)
                        doi = doi.encode("ascii", "ignore")
                        doi = doi.decode()

                        doi = bytes(r"{}".format(r"" + doi + ""), "utf-8")
                        doi = doi.decode()
                        doi_array = doi.split("\\u")
                        doi = str(doi_array[0]).strip()
                        doi = doi.replace(" ", "")

                        xarticle.doi = doi
                except TypeError as e:
                    print(e)

        has_zblid = len([extid for extid in xarticle.extids if extid[0] == "zbl-item-id"]) == 1
        if not has_zblid:
            zb_tag = soup.select_one("article#unit-other-ids a:-soup-contains('ZBMath')")
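            # Expected href shape: ".../zmath/en/advanced/?q=an:<zbl id>", with an id such as
            # "1234.56789" (illustrative); removeprefix() keeps only the id.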
            if zb_tag:  # coverage: branch never taken
                zb_href = zb_tag.get("href")
                if not isinstance(zb_href, str):
                    raise ValueError("Couldn't parse zbmath href")
                zblid = zb_href.removeprefix(
                    "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"
                )
                xarticle.extids.append(("zbl-item-id", zblid))

        # In Other Databases is not (always ?) the publisher
        # if not xissue.publisher:
        #     # PUBLISHER
        #     section_oai = soup.find("h3", text="In Other Databases")
        #     if section_oai is not None:
        #         section_oai_array = section_oai.parent.find_all("dd")
        #         if section_oai is not None:
        #             pub = [
        #                 d.text
        #                 for d in section_oai_array
        #                 if d.text.strip() not in ["DOI", "ZBMath", "MathSciNet", "PUBLISHER"]
        #             ]
        #             if pub != "":
        #                 print(f"{url} Fallback for publisher")
        #                 xpub = create_publisherdata()
        #                 xpub.name = pub[0].strip()
        #                 xissue.publisher = xpub
        return xarticle

    def parse_bibtex(self, soup, xarticle, url):
        """
        Parse the BibTeX section of a EuDML article page.
        Extract
        - the authors (if no author was already found in the page)
        - the article language
        - the article pages
        """
        bib_div = [p for p in soup.find_all("p") if "@article" in p.text]

        if len(bib_div) > 0:  # coverage: always true
            bib_tex = bib_div[0].get_text()
            text = bib_tex.split("\t")
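            # The rendered BibTeX appears to separate its fields with tabs, so each element of
            # `text` is one field such as "author = {...}", "language = {...}" or "pages = {...}".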

            for text_part in text:
                # AUTHORS (only if no authors were already found in the page)
                if len(xarticle.contributors) == 0:
                    reg_author = re.compile("author =")
                    if reg_author.search(text_part):  # coverage: branch never taken
                        txt = f"{url} Fallback for authors with the bibtex"
                        print(txt)

                        authors_text = (
                            text_part.replace("{", "").replace("}", "").replace("author = ", "")
                        )
                        authors_bib = authors_text.split(",")
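                        # The author string is split on commas and consecutive items are
                        # paired (0+1, 2+3, ...) into a single "Last First" name each.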
                        for index, name in enumerate(authors_bib):
                            if index % 2 == 1:
                                author_name = authors_bib[index - 1] + " " + authors_bib[index]
                                author_name = self.latext_parser.latex_to_text(author_name)
                                author_name = author_name.replace("\xa0", "")

                                author = create_contributor()
                                author["role"] = "author"
                                author["string_name"] = author_name
                                xarticle.contributors.append(author)

                # LANG
                reg_lang = re.compile("language = ")
                if reg_lang.search(text_part):
                    xarticle.lang = (
                        text_part.replace("{", "")
                        .replace("}", "")
                        .replace("language = ", "")
                        .replace(",", "")
                    )
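                    # BibTeX language codes are 3 letters (e.g. "eng"); dropping the last
                    # character crudely shortens them, presumably towards a 2-letter code.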
                    if len(xarticle.lang) >= 3:  # coverage: always true
                        xarticle.lang = xarticle.lang[:-1]

                    if len(xarticle.lang) > 0 and len(xarticle.abstracts) > 0:  # coverage: branch never taken
                        xarticle.abstracts[0]["lang"] = xarticle.lang

                if not xarticle.fpage:
                    # PAGES
                    reg_pages = re.compile("pages =")
                    if reg_pages.search(text_part):
                        pages = (
                            text_part.replace("{", "")
                            .replace("}", "")
                            .replace("(", "")
                            .replace(")", "")
                            .replace("[", "")
                            .replace("]", "")
                            .replace("pages = ", "")
                        )
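                        # Typical field: "pages = {123-145}" (illustrative); the first number
                        # becomes fpage and, when present, the second becomes lpage.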
                        if len(pages) > 0 and pages != "null":  # coverage: always true
                            pages = pages.split(",")
                            if re.compile(r"\d+-\d+").search(pages[0]):  # coverage: branch never taken
                                txt = f"{url} Fallback for pages with the bibtex"
                                print(txt)

                            pages = pages[0].split("-")
                            xarticle.fpage = pages[0]
                            if len(pages) > 1:
                                reg_digit = re.compile(r"\d+")
                                if re.search(reg_digit, str(pages[1])):
                                    pages[1] = re.search(reg_digit, str(pages[1]))[0]
                                xarticle.lpage = pages[1]
                                # FIXME : wrong page_range format... Maybe this can be deleted ?
                                xarticle.page_range = pages[0] + "-" + pages[1]

                # reg_title = re.compile("title")
                # if reg_title.search(text_part):
                #     if (
                #         xarticle.title_html is None
                #         or xarticle.title_html == ""
                #         or xarticle.title_html == "Contents"
                #     ):
                #         xarticle.title_html = (
                #             text_part.replace("{", "")
                #             .replace("}", "")
                #             .replace("title = ", "")
                #             .replace(",", "")
                #         )
                #         xarticle.title_tex = xarticle.title_html
                #         xarticle.title_xml = f"<title-group><article-title>{xarticle.title_html}</article-title></title-group>"

    def download_file(self, url: str):
        if url.startswith("https://eudml.org/doc"):
            return super().download_file(url)
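        # Article pages (https://eudml.org/doc/...) can be fetched with a plain HTTP request;
        # other pages (notably the collection page) seem to need the headless-browser script
        # below to obtain their content.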

        content = ""
        filename = "/tmp/crawler/puppeteer/" + str(base64.b64encode(url.encode("utf-8")), "utf-8")
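        # The temp file name is the base64-encoded URL, so each URL maps to a stable path
        # under /tmp/crawler/puppeteer/.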
        attempt = 0
        while not content and attempt < 3:
            attempt += 1
            try:
                cmd = f"{os.path.dirname(os.path.realpath(__file__))}/crawl_eudml_col.js -u {url} -o {filename}"
                print(cmd)
                execute_cmd(cmd, force_execute=True)

                if os.path.isfile(filename):
                    with open(filename) as file_:
                        content = file_.read()
                    if not isinstance(self.session, CachedSession):
                        continue
                    # Mock an HTTP request to inject the data into the cache

            except subprocess.CalledProcessError:
                pass

        if not content:
            raise requests.exceptions.HTTPError(f"Unable to download {url}")

        return content