Coverage for src/crawler/by_source/elibm_crawler.py: 53%
233 statements
« prev ^ index » next — coverage.py v7.9.0, created at 2025-08-29 13:43 +0000
1from collections import OrderedDict
3from bs4 import BeautifulSoup
4from ptf.model_data import (
5 IssueData,
6 create_abstract,
7 create_articledata,
8 create_contributor,
9 create_issuedata,
10 create_subj,
11)
13from crawler.base_crawler import BaseCollectionCrawler
14from crawler.utils import add_pdf_link_to_xarticle
class ElibmCrawler(BaseCollectionCrawler):
    """Crawler for the Electronic Library of Mathematics (eLibM)."""

    source_name = "Electronic Library of Mathematics"
    source_domain = "ELIBM"
    source_website = "https://www.elibm.org"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The DOCMA collection embeds formulas with LaTeX-style \( \) and \[ \]
        # delimiters instead of the crawler defaults.
        if self.collection_id == "DOCMA":
            self.delimiter_inline_formula = "\\("
            self.delimiter_disp_formula = "\\["
28 def parse_collection_content(self, content):
29 """
30 Parse the HTML page of Annals of Math and returns a list of xissue.
31 Each xissue has its pid/volume/number/year metadata + its url
32 """
33 soup = BeautifulSoup(content, "html.parser")
34 xissues = []
36 # Extract the list of issues
37 link_nodes = soup.find_all("a")
39 # eLibM puts special issue titles as volume number
40 # to create a issue pid, we use S1, S2...
41 last_special_issue_number = 0
43 for link_node in link_nodes:
44 url = link_node.get("href")
45 text = link_node.get_text()
46 if url.startswith("/issue"):
47 xissue, last_special_issue_number = self.create_elibm_xissue(
48 url, text, last_special_issue_number
49 )
51 if xissue: 51 ↛ 43line 51 didn't jump to line 43 because the condition on line 51 was always true
52 xissues.append(xissue)
54 return xissues
56 def get_first_year(self, year):
57 if "/" in year:
58 year = year.split("/")[0]
60 return year
62 def create_elibm_xissue(self, url, text, last_special_issue_number):
63 if "(" not in text or ")" not in text: 63 ↛ 64line 63 didn't jump to line 64 because the condition on line 63 was never true
64 return None, None
66 parts = text.split("(")
68 year = parts[1].split(")")[0]
69 year = year.replace("/", "-")
71 # volume might not be an integer. eLibM puts special issue titles as volume number.
72 volume = parts[0].strip()
74 number = ""
75 if "No. " in volume:
76 parts = volume.split("No. ")
77 volume = parts[0].strip()
78 number = parts[1].strip()
80 try:
81 volume_for_pid = int(volume)
82 except ValueError:
83 last_special_issue_number += 1
84 volume_for_pid = f"S{last_special_issue_number}"
86 xissue = create_issuedata()
87 xissue.pid = f"{self.collection_id}_{year}__{volume_for_pid}_{number}"
88 xissue.year = year
89 xissue.volume = volume
90 xissue.number = number
91 xissue.url = self.source_website + url
93 return xissue, last_special_issue_number
95 def parse_issue_content(self, content, xissue):
96 soup = BeautifulSoup(content, "html.parser")
97 article_nodes = soup.find_all("div", {"class": "title"})
99 for index_article, article_node in enumerate(article_nodes):
100 article_link_node = article_node.find("a")
101 if article_link_node: 101 ↛ 99line 101 didn't jump to line 99 because the condition on line 101 was always true
102 url = article_link_node.get("href")
103 xarticle = create_articledata()
104 xarticle.pid = "a" + str(index_article)
105 xarticle.url = self.source_website + url
107 # eLibM lists the articles in the reverse order, except for one special issue
108 if xissue.volume == "Mahler Selecta": 108 ↛ 109line 108 didn't jump to line 109 because the condition on line 108 was never true
109 xissue.articles.append(xarticle)
110 else:
111 xissue.articles.insert(0, xarticle)
113 # if the issue has only 1 article, eLibM skip the issue page and directly display the article page
114 if len(xissue.articles) == 0:
115 title_node = soup.find("h2", {"class": "document_title"})
116 if title_node is not None: 116 ↛ exitline 116 didn't return from function 'parse_issue_content' because the condition on line 116 was always true
117 xarticle = create_articledata()
118 xarticle.pid = "a0"
119 xarticle.url = xissue.url
121 xissue.articles.append(xarticle)
123 def parse_article_content(self, content, xissue, xarticle, url):
124 """
125 Parse the content with Beautifulsoup and returns an ArticleData
126 """
127 xarticle.lang = "en"
129 soup = BeautifulSoup(content, "html.parser")
131 # TITLE
132 title_node = soup.find("h2", {"class": "document_title"})
133 if title_node: 133 ↛ 137line 133 didn't jump to line 137 because the condition on line 133 was always true
134 xarticle.title_tex = title_node.get_text()
136 # AUTHORS
137 citation_author_node = soup.find("h3", {"class": "document_author"})
138 if citation_author_node: 138 ↛ 157line 138 didn't jump to line 157 because the condition on line 138 was always true
139 text = citation_author_node.get_text()
140 if text: 140 ↛ 157line 140 didn't jump to line 157 because the condition on line 140 was always true
141 parts = text.split(";")
142 for part in parts:
143 text_author = part.strip()
145 role = "author"
146 if "(ed.)" in text_author:
147 role = "editor"
148 text_author = text_author.split("(ed.)")[0].strip()
150 author = create_contributor()
151 author["role"] = role
152 author["string_name"] = text_author
154 xarticle.contributors.append(author)
156 # PDF
157 link_nodes = soup.find_all("a")
158 for link_node in link_nodes:
159 url = link_node.get("href")
160 if url.startswith("/ft/"):
161 pdf_url = self.source_website + url
162 add_pdf_link_to_xarticle(xarticle, pdf_url)
164 panel_nodes = soup.find_all("h3", {"class": "panel-title"})
165 for panel_node in panel_nodes:
166 text = panel_node.get_text()
167 content_node = panel_node.parent.parent.find("div", {"class": "panel-body"})
169 if text == "Summary":
170 # ABSTRACT
171 abstract = content_node.get_text()
172 xarticle.abstracts.append(create_abstract(value_tex=abstract, lang=xarticle.lang))
174 elif text == "Mathematics Subject Classification":
175 # MSC
176 subjs = content_node.get_text().split(", ")
177 for subj in subjs:
178 subject = create_subj()
179 subject["value"] = subj
180 subject["type"] = "msc"
181 subject["lang"] = "en"
182 xarticle.kwds.append(subject)
184 elif text == "Keywords/Phrases":
185 # Keywords
186 subjs = content_node.get_text().split(", ")
187 for subj in subjs:
188 subject = create_subj()
189 subject["value"] = subj
190 subject["lang"] = "en"
191 xarticle.kwds.append(subject)
193 # PAGES
194 citation_node = soup.find("h5", {"class": "document_source"})
195 if citation_node: 195 ↛ 214line 195 didn't jump to line 214 because the condition on line 195 was always true
196 text = citation_node.get_text()
197 year = f"({xissue.year})"
198 if year in text: 198 ↛ 214line 198 didn't jump to line 214 because the condition on line 198 was always true
199 text = text.split(year)[0]
201 if "p." in text:
202 text = text.split("p.")[0].split(",")[-1].strip()
203 xarticle.size = text
205 elif "-" in text:
206 parts = text.split("-")
207 first_page = parts[-2].split(" ")[-1]
208 last_page = parts[-1].split(",")[0].split(" ")[0]
210 xarticle.fpage = first_page
211 xarticle.lpage = last_page
213 # DOI
214 doi_node = citation_node.next_sibling
215 if doi_node.name == "div": 215 ↛ 216line 215 didn't jump to line 216 because the condition on line 215 was never true
216 text = doi_node.get_text()
217 if text.startswith("DOI: "):
218 doi = text[5:]
220 xarticle.doi = doi
221 xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")
223 return xarticle
225 def crawl_collection(self):
226 if self.source is None:
227 raise RuntimeError("ERROR: the source is not set")
229 content = self.download_file(self.collection_url)
230 xissues = self.parse_collection_content(content)
232 """
233 Some collections split the same volumes in different pages
234 Ex: Volume 6 (2000) and Volume 6 (1999)
235 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
236 """
237 xissues_dict = self.merge_xissues(xissues)
239 filtered_xissues = xissues_dict
240 # Filter the issues to crawl if start_pid was set in the constructor
241 # if self.start_pid is not None:
242 # filtered_xissues = {}
243 # start = False
244 # for pid in xissues_dict:
245 # if pid == self.start_pid:
246 # start = True
247 # if start:
248 # filtered_xissues[pid] = xissues_dict[pid]
250 return filtered_xissues
252 def merge_xissues(self, xissues: list[IssueData]):
253 """
254 Some collections split the same volumes in different pages
255 Ex: Volume 6 (2000) and Volume 6 (1999)
256 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
257 """
259 merged_xissues = OrderedDict()
261 for xissue in xissues:
262 xissues_with_same_volume = [
263 item
264 for item in xissues
265 if xissue.volume == item.volume
266 and xissue.number == item.number
267 and xissue.vseries == item.vseries
268 and (item.volume or item.number)
269 ]
271 if len(xissues_with_same_volume) < 2:
272 if xissue.pid is None:
273 raise ValueError("Issue does not have a PID")
274 merged_xissues[xissue.pid] = {"issues": [xissue]}
275 first_issue = xissue
276 year = xissue.year
277 else:
278 first_issue = xissues_with_same_volume[0]
279 volume = xissues_with_same_volume[0].volume
280 number = xissues_with_same_volume[0].number
281 vseries = xissues_with_same_volume[0].vseries
283 # Compute the year based on all issues with the same volume/number
284 begin = end = year = xissues_with_same_volume[0].year
285 if not year:
286 raise ValueError("year is not defined")
288 if "-" in year:
289 parts = year.split("-")
290 begin = parts[0]
291 end = parts[1]
293 for xissue_with_same_volume in xissues_with_same_volume[1:]:
294 new_begin = new_end = xissue_with_same_volume.year
296 if not xissue_with_same_volume.year:
297 raise ValueError("xissue year is not defined")
299 if "-" in xissue_with_same_volume.year:
300 parts = year.split("-")
301 new_begin = parts[0]
302 new_end = parts[1]
304 if begin is None or end is None or new_begin is None or new_end is None:
305 continue
306 begin_int = int(begin)
307 end_int = int(end)
308 new_begin_int = int(new_begin)
309 new_end_int = int(new_end)
311 if new_begin_int < begin_int:
312 begin = new_begin
313 if new_end_int > end_int:
314 end = new_end
316 if begin != end:
317 year = f"{begin}-{end}"
318 else:
319 year = begin
321 # We can now set the real pid
322 pid = f"{self.collection_id}_{year}_{vseries}_{volume}_{number}"
323 for issue in xissues_with_same_volume:
324 issue.pid = pid
326 if pid not in merged_xissues:
327 merged_xissues[pid] = {
328 "issues": xissues_with_same_volume,
329 }
331 # We can set the year only for the first xissue because it is the one used to collect
332 # all the articles.
333 # See crawl_issue with merged_xissue = self.crawl_one_issue_url(xissues_to_crawl[0])
334 # But we need to use a separate variable (merged_year) because parse_article_content may rely on the year
335 first_issue.merged_year = year
337 return merged_xissues
339 def crawl_issue(self, merged_xissues: dict[str, list[IssueData]]):
340 """
341 Wrapper around crawl_elibm_issue, to handle issues declared in multiple web pages.
342 """
344 xissues_to_crawl = merged_xissues["issues"]
346 merged_xissue = xissues_to_crawl[0]
347 self.crawl_elibm_issue(merged_xissue)
349 if len(xissues_to_crawl) > 1:
350 for raw_xissue in xissues_to_crawl[1:]:
351 self.crawl_elibm_issue(raw_xissue)
353 merged_xissue.articles = raw_xissue.articles + merged_xissue.articles
355 # Updates the article pid
356 for article_index, xarticle in enumerate(merged_xissue):
357 if raw_xissue.pid in xarticle.pid:
358 xarticle.pid = f"{raw_xissue.pid}_a{str(article_index)}"
360 # Now that the issue pages have been downloaded/read, we can set the merged pid
361 # The merged_year was set in self.merge_xissues
362 # merged_xissue.pid
363 merged_xissue.year = merged_xissue.merged_year
365 if self.ignore_missing_pdf:
366 merged_xissue.articles = [a for a in merged_xissue.articles if self.article_has_pdf(a)]
368 if not self.test_mode and len(merged_xissue.articles) > 0:
369 self.process_resource_metadata(merged_xissue, resource_type="issue")
370 self.add_xissue_into_database(merged_xissue)
372 def crawl_elibm_issue(self, xissue: IssueData):
373 """
374 Crawl 1 wag page of an issue.
375 - get the HTML content of the issue
376 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata
377 - crawl each article
378 """
380 # Some source, like EuDML do not have a separate HTML pages for an issue's table of content.
381 # The list of articles directly come from the collection HTML page: the xissue has no url attribute
382 if hasattr(xissue, "url") and xissue.url:
383 content = self.download_file(xissue.url)
384 self.parse_issue_content(content, xissue)
386 xarticles = xissue.articles
388 parsed_xarticles = []
390 for xarticle in xarticles:
391 parsed_xarticle = self.crawl_article(xarticle, xissue)
392 if parsed_xarticle is not None:
393 parsed_xarticles.append(parsed_xarticle)
395 xissue.articles = parsed_xarticles