Coverage for src / crawler / by_source / elibm_crawler.py: 53%
234 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-11 14:57 +0000
1from collections import OrderedDict
3from bs4 import BeautifulSoup
4from ptf.model_data import (
5 IssueData,
6 create_abstract,
7 create_articledata,
8 create_contributor,
9 create_issuedata,
10 create_subj,
11)
13from crawler.base_crawler import BaseCollectionCrawler
14from crawler.crawler_utils import article_has_pdf
15from crawler.utils import add_pdf_link_to_xarticle
class ElibmCrawler(BaseCollectionCrawler):
    """Crawler for collections hosted on eLibM (Electronic Library of Mathematics)."""

    source_name = "Electronic Library of Mathematics"
    source_domain = "ELIBM"
    source_website = "https://www.elibm.org"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The DOCMA collection uses LaTeX-style math delimiters
        # instead of the crawler defaults.
        if self.collection_id == "DOCMA":
            self.delimiter_inline_formula = "\\("
            self.delimiter_disp_formula = "\\["
29 def parse_collection_content(self, content):
30 """
31 Parse the HTML page of Annals of Math and returns a list of xissue.
32 Each xissue has its pid/volume/number/year metadata + its url
33 """
34 soup = BeautifulSoup(content, "html.parser")
35 xissues = []
37 # Extract the list of issues
38 link_nodes = soup.find_all("a")
40 # eLibM puts special issue titles as volume number
41 # to create a issue pid, we use S1, S2...
42 last_special_issue_number = 0
44 for link_node in link_nodes:
45 url = link_node.get("href")
46 text = link_node.get_text()
47 if url.startswith("/issue"):
48 xissue, last_special_issue_number = self.create_elibm_xissue(
49 url, text, last_special_issue_number
50 )
52 if xissue: 52 ↛ 44line 52 didn't jump to line 44 because the condition on line 52 was always true
53 xissues.append(xissue)
55 return xissues
57 def get_first_year(self, year):
58 if "/" in year:
59 year = year.split("/")[0]
61 return year
63 def create_elibm_xissue(self, url, text, last_special_issue_number):
64 if "(" not in text or ")" not in text: 64 ↛ 65line 64 didn't jump to line 65 because the condition on line 64 was never true
65 return None, None
67 parts = text.split("(")
69 year = parts[1].split(")")[0]
70 year = year.replace("/", "-")
72 # volume might not be an integer. eLibM puts special issue titles as volume number.
73 volume = parts[0].strip()
75 number = ""
76 if "No. " in volume:
77 parts = volume.split("No. ")
78 volume = parts[0].strip()
79 number = parts[1].strip()
81 try:
82 volume_for_pid = int(volume)
83 except ValueError:
84 last_special_issue_number += 1
85 volume_for_pid = f"S{last_special_issue_number}"
87 xissue = create_issuedata()
88 xissue.pid = f"{self.collection_id}_{year}__{volume_for_pid}_{number}"
89 xissue.year = year
90 xissue.volume = volume
91 xissue.number = number
92 xissue.url = self.source_website + url
94 return xissue, last_special_issue_number
96 def parse_issue_content(self, content, xissue):
97 soup = BeautifulSoup(content, "html.parser")
98 article_nodes = soup.find_all("div", {"class": "title"})
100 for index_article, article_node in enumerate(article_nodes):
101 article_link_node = article_node.find("a")
102 if article_link_node: 102 ↛ 100line 102 didn't jump to line 100 because the condition on line 102 was always true
103 url = article_link_node.get("href")
104 xarticle = create_articledata()
105 xarticle.pid = "a" + str(index_article)
106 xarticle.url = self.source_website + url
108 # eLibM lists the articles in the reverse order, except for one special issue
109 if xissue.volume == "Mahler Selecta": 109 ↛ 110line 109 didn't jump to line 110 because the condition on line 109 was never true
110 xissue.articles.append(xarticle)
111 else:
112 xissue.articles.insert(0, xarticle)
114 # if the issue has only 1 article, eLibM skip the issue page and directly display the article page
115 if len(xissue.articles) == 0:
116 title_node = soup.find("h2", {"class": "document_title"})
117 if title_node is not None: 117 ↛ exitline 117 didn't return from function 'parse_issue_content' because the condition on line 117 was always true
118 xarticle = create_articledata()
119 xarticle.pid = "a0"
120 xarticle.url = xissue.url
122 xissue.articles.append(xarticle)
124 def parse_article_content(self, content, xissue, xarticle, url):
125 """
126 Parse the content with Beautifulsoup and returns an ArticleData
127 """
128 xarticle.lang = "en"
130 soup = BeautifulSoup(content, "html.parser")
132 # TITLE
133 title_node = soup.find("h2", {"class": "document_title"})
134 if title_node: 134 ↛ 138line 134 didn't jump to line 138 because the condition on line 134 was always true
135 xarticle.title_tex = title_node.get_text()
137 # AUTHORS
138 citation_author_node = soup.find("h3", {"class": "document_author"})
139 if citation_author_node: 139 ↛ 158line 139 didn't jump to line 158 because the condition on line 139 was always true
140 text = citation_author_node.get_text()
141 if text: 141 ↛ 158line 141 didn't jump to line 158 because the condition on line 141 was always true
142 parts = text.split(";")
143 for part in parts:
144 text_author = part.strip()
146 role = "author"
147 if "(ed.)" in text_author:
148 role = "editor"
149 text_author = text_author.split("(ed.)")[0].strip()
151 author = create_contributor()
152 author["role"] = role
153 author["string_name"] = text_author
155 xarticle.contributors.append(author)
157 # PDF
158 link_nodes = soup.find_all("a")
159 for link_node in link_nodes:
160 url = link_node.get("href")
161 if url.startswith("/ft/"):
162 pdf_url = self.source_website + url
163 add_pdf_link_to_xarticle(xarticle, pdf_url)
165 panel_nodes = soup.find_all("h3", {"class": "panel-title"})
166 for panel_node in panel_nodes:
167 text = panel_node.get_text()
168 content_node = panel_node.parent.parent.find("div", {"class": "panel-body"})
170 if text == "Summary":
171 # ABSTRACT
172 abstract = content_node.get_text()
173 xarticle.abstracts.append(create_abstract(value_tex=abstract, lang=xarticle.lang))
175 elif text == "Mathematics Subject Classification":
176 # MSC
177 subjs = content_node.get_text().split(", ")
178 for subj in subjs:
179 subject = create_subj()
180 subject["value"] = subj
181 subject["type"] = "msc"
182 subject["lang"] = "en"
183 xarticle.kwds.append(subject)
185 elif text == "Keywords/Phrases":
186 # Keywords
187 subjs = content_node.get_text().split(", ")
188 for subj in subjs:
189 subject = create_subj()
190 subject["value"] = subj
191 subject["lang"] = "en"
192 xarticle.kwds.append(subject)
194 # PAGES
195 citation_node = soup.find("h5", {"class": "document_source"})
196 if citation_node: 196 ↛ 215line 196 didn't jump to line 215 because the condition on line 196 was always true
197 text = citation_node.get_text()
198 year = f"({xissue.year})"
199 if year in text: 199 ↛ 215line 199 didn't jump to line 215 because the condition on line 199 was always true
200 text = text.split(year)[0]
202 if "p." in text:
203 text = text.split("p.")[0].split(",")[-1].strip()
204 xarticle.size = text
206 elif "-" in text:
207 parts = text.split("-")
208 first_page = parts[-2].split(" ")[-1]
209 last_page = parts[-1].split(",")[0].split(" ")[0]
211 xarticle.fpage = first_page
212 xarticle.lpage = last_page
214 # DOI
215 doi_node = citation_node.next_sibling
216 if doi_node.name == "div": 216 ↛ 217line 216 didn't jump to line 217 because the condition on line 216 was never true
217 text = doi_node.get_text()
218 if text.startswith("DOI: "):
219 doi = text[5:]
221 xarticle.doi = doi
222 xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")
224 return xarticle
226 def crawl_collection(self):
227 if self.source is None:
228 raise RuntimeError("ERROR: the source is not set")
230 content = self.download_file(self.collection_url)
231 xissues = self.parse_collection_content(content)
233 """
234 Some collections split the same volumes in different pages
235 Ex: Volume 6 (2000) and Volume 6 (1999)
236 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
237 """
238 xissues_dict = self.merge_xissues(xissues)
240 filtered_xissues = xissues_dict
241 # Filter the issues to crawl if start_pid was set in the constructor
242 # if self.start_pid is not None:
243 # filtered_xissues = {}
244 # start = False
245 # for pid in xissues_dict:
246 # if pid == self.start_pid:
247 # start = True
248 # if start:
249 # filtered_xissues[pid] = xissues_dict[pid]
251 return filtered_xissues
253 def merge_xissues(self, xissues: list[IssueData]):
254 """
255 Some collections split the same volumes in different pages
256 Ex: Volume 6 (2000) and Volume 6 (1999)
257 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
258 """
260 merged_xissues = OrderedDict()
262 for xissue in xissues:
263 xissues_with_same_volume = [
264 item
265 for item in xissues
266 if xissue.volume == item.volume
267 and xissue.number == item.number
268 and xissue.vseries == item.vseries
269 and (item.volume or item.number)
270 ]
272 if len(xissues_with_same_volume) < 2:
273 if xissue.pid is None:
274 raise ValueError("Issue does not have a PID")
275 merged_xissues[xissue.pid] = {"issues": [xissue]}
276 first_issue = xissue
277 year = xissue.year
278 else:
279 first_issue = xissues_with_same_volume[0]
280 volume = xissues_with_same_volume[0].volume
281 number = xissues_with_same_volume[0].number
282 vseries = xissues_with_same_volume[0].vseries
284 # Compute the year based on all issues with the same volume/number
285 begin = end = year = xissues_with_same_volume[0].year
286 if not year:
287 raise ValueError("year is not defined")
289 if "-" in year:
290 parts = year.split("-")
291 begin = parts[0]
292 end = parts[1]
294 for xissue_with_same_volume in xissues_with_same_volume[1:]:
295 new_begin = new_end = xissue_with_same_volume.year
297 if not xissue_with_same_volume.year:
298 raise ValueError("xissue year is not defined")
300 if "-" in xissue_with_same_volume.year:
301 parts = year.split("-")
302 new_begin = parts[0]
303 new_end = parts[1]
305 if begin is None or end is None or new_begin is None or new_end is None:
306 continue
307 begin_int = int(begin)
308 end_int = int(end)
309 new_begin_int = int(new_begin)
310 new_end_int = int(new_end)
312 if new_begin_int < begin_int:
313 begin = new_begin
314 if new_end_int > end_int:
315 end = new_end
317 if begin != end:
318 year = f"{begin}-{end}"
319 else:
320 year = begin
322 # We can now set the real pid
323 pid = f"{self.collection_id}_{year}_{vseries}_{volume}_{number}"
324 for issue in xissues_with_same_volume:
325 issue.pid = pid
327 if pid not in merged_xissues:
328 merged_xissues[pid] = {
329 "issues": xissues_with_same_volume,
330 }
332 # We can set the year only for the first xissue because it is the one used to collect
333 # all the articles.
334 # See crawl_issue with merged_xissue = self.crawl_one_issue_url(xissues_to_crawl[0])
335 # But we need to use a separate variable (merged_year) because parse_article_content may rely on the year
336 first_issue.merged_year = year
338 return merged_xissues
340 def crawl_issue(self, merged_xissues: dict[str, list[IssueData]]):
341 """
342 Wrapper around crawl_elibm_issue, to handle issues declared in multiple web pages.
343 """
345 xissues_to_crawl = merged_xissues["issues"]
347 merged_xissue = xissues_to_crawl[0]
348 self.crawl_elibm_issue(merged_xissue)
350 if len(xissues_to_crawl) > 1:
351 for raw_xissue in xissues_to_crawl[1:]:
352 self.crawl_elibm_issue(raw_xissue)
354 merged_xissue.articles = raw_xissue.articles + merged_xissue.articles
356 # Updates the article pid
357 for article_index, xarticle in enumerate(merged_xissue):
358 if raw_xissue.pid in xarticle.pid:
359 xarticle.pid = f"{raw_xissue.pid}_a{str(article_index)}"
361 # Now that the issue pages have been downloaded/read, we can set the merged pid
362 # The merged_year was set in self.merge_xissues
363 # merged_xissue.pid
364 merged_xissue.year = merged_xissue.merged_year
366 if self.ignore_missing_pdf:
367 merged_xissue.articles = [a for a in merged_xissue.articles if article_has_pdf(a)]
369 if not self.dry and len(merged_xissue.articles) > 0:
370 self.process_resource_metadata(merged_xissue, resource_type="issue")
371 self.add_xissue_into_database(merged_xissue)
373 def crawl_elibm_issue(self, xissue: IssueData):
374 """
375 Crawl 1 wag page of an issue.
376 - get the HTML content of the issue
377 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata
378 - crawl each article
379 """
381 # Some source, like EuDML do not have a separate HTML pages for an issue's table of content.
382 # The list of articles directly come from the collection HTML page: the xissue has no url attribute
383 if hasattr(xissue, "url") and xissue.url:
384 content = self.download_file(xissue.url)
385 self.parse_issue_content(content, xissue)
387 xarticles = xissue.articles
389 parsed_xarticles = []
391 for xarticle in xarticles:
392 parsed_xarticle = self.crawl_article(xarticle, xissue)
393 if parsed_xarticle is not None:
394 parsed_xarticles.append(parsed_xarticle)
396 xissue.articles = parsed_xarticles