Coverage for src/crawler/by_source/elibm_crawler.py: 53%
246 statements
coverage.py v7.6.4, created at 2025-02-14 14:36 +0000
from collections import OrderedDict

from bs4 import BeautifulSoup
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle

class ElibmCrawler(BaseCollectionCrawler):
    source_name = "Electronic Library of Mathematics"
    source_domain = "ELIBM"
    source_website = "https://www.elibm.org"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.collection_id == "DOCMA":
            self.delimiter_inline_formula = "\\("
            self.delimiter_disp_formula = "\\["

    def parse_collection_content(self, content):
        """
        Parse the HTML page of the eLibM collection and return a list of xissue.
        Each xissue has its pid/volume/number/year metadata + its url.

        self.periode is set at the end based on the xissue years of the HTML page.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        # Extract the list of issues
        link_nodes = soup.find_all("a")

        # eLibM puts special issue titles as the volume number.
        # To create an issue pid, we use S1, S2...
        last_special_issue_number = 0

        for link_node in link_nodes:
            url = link_node.get("href")
            text = link_node.get_text()
            if url.startswith("/issue"):
                xissue, last_special_issue_number = self.create_elibm_xissue(
                    url, text, last_special_issue_number
                )

                # eLibM lists the special issues at the end.
                # Set the periode_begin if we find a special issue.
                if last_special_issue_number == 1:
                    self.periode_begin = self.get_first_year(xissues[-1].year)

                if xissue:
                    xissues.append(xissue)

        self.periode_end = self.get_first_year(xissues[0].year)

        if last_special_issue_number == 0:
            self.periode_begin = self.get_first_year(xissues[-1].year)

        self.periode = self.get_or_create_periode()

        return xissues

    def get_first_year(self, year):
        if "/" in year:
            year = year.split("/")[0]

        return year

    def create_elibm_xissue(self, url, text, last_special_issue_number):
        if "(" not in text or ")" not in text:
            return None, None

        parts = text.split("(")

        year = parts[1].split(")")[0]
        year = year.replace("/", "-")

        # volume might not be an integer. eLibM puts special issue titles as volume number.
        volume = parts[0].strip()

        number = ""
        if "No. " in volume:
            parts = volume.split("No. ")
            volume = parts[0].strip()
            number = parts[1].strip()

        try:
            volume_for_pid = int(volume)
        except ValueError:
            last_special_issue_number += 1
            volume_for_pid = f"S{last_special_issue_number}"

        xissue = create_issuedata()
        xissue.pid = f"{self.collection_id}_{year}__{volume_for_pid}_{number}"
        xissue.year = year
        xissue.volume = volume
        xissue.number = number
        xissue.url = self.source_website + url

        return xissue, last_special_issue_number
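    # Worked example (hypothetical link text, for illustration only): given
    # text = "7 No. 2 (1999/2000)", the code above would produce year "1999-2000",
    # volume "7", number "2" and a pid like "<collection_id>_1999-2000__7_2".
    # A non-integer volume such as a special issue title would fail int() and be
    # assigned a synthetic pid volume "S1", "S2", ... via last_special_issue_number.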

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("div", {"class": "title"})

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = self.source_website + url

                # eLibM lists the articles in reverse order, except for one special issue
                if xissue.volume == "Mahler Selecta":
                    xissue.articles.append(xarticle)
                else:
                    xissue.articles.insert(0, xarticle)

        # If the issue has only 1 article, eLibM skips the issue page and directly displays the article page
        if len(xissue.articles) == 0:
            title_node = soup.find("h2", {"class": "document_title"})
            if title_node is not None:
                xarticle = create_articledata()
                xarticle.pid = "a0"
                xarticle.url = xissue.url

                xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle.pid = pid
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")

        # TITLE
        title_node = soup.find("h2", {"class": "document_title"})
        if title_node:
            xarticle.title_tex = title_node.get_text()

        # AUTHORS
        citation_author_node = soup.find("h3", {"class": "document_author"})
        if citation_author_node:
            text = citation_author_node.get_text()
            if text:
                parts = text.split(";")
                for part in parts:
                    text_author = part.strip()

                    role = "author"
                    if "(ed.)" in text_author:
                        role = "editor"
                        text_author = text_author.split("(ed.)")[0].strip()

                    author = create_contributor()
                    author["role"] = role
                    author["string_name"] = text_author

                    xarticle.contributors.append(author)
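        # Illustrative example (hypothetical author heading, for illustration only): a heading
        # such as "Doe, Jane; Smith, John (ed.)" is split on ";" and yields two contributors:
        # {"role": "author", "string_name": "Doe, Jane"} and
        # {"role": "editor", "string_name": "Smith, John"}.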
        # PDF
        link_nodes = soup.find_all("a")
        for link_node in link_nodes:
            url = link_node.get("href")
            if url.startswith("/ft/"):
                pdf_url = self.source_website + url
                add_pdf_link_to_xarticle(xarticle, pdf_url)

        panel_nodes = soup.find_all("h3", {"class": "panel-title"})
        for panel_node in panel_nodes:
            text = panel_node.get_text()
            content_node = panel_node.parent.parent.find("div", {"class": "panel-body"})

            if text == "Summary":
                # ABSTRACT
                abstract = content_node.get_text()
                xabstract = create_abstract(tag="abstract", value_tex=abstract, lang=xarticle.lang)
                xarticle.abstracts.append(xabstract)

            elif text == "Mathematics Subject Classification":
                # MSC
                subjs = content_node.get_text().split(", ")
                for subj in subjs:
                    subject = create_subj()
                    subject["value"] = subj
                    subject["type"] = "msc"
                    subject["lang"] = "en"
                    xarticle.kwds.append(subject)

            elif text == "Keywords/Phrases":
                # Keywords
                subjs = content_node.get_text().split(", ")
                for subj in subjs:
                    subject = create_subj()
                    subject["value"] = subj
                    subject["lang"] = "en"
                    xarticle.kwds.append(subject)
        # PAGES
        citation_node = soup.find("h5", {"class": "document_source"})
        if citation_node:
            text = citation_node.get_text()
            year = f"({xissue.year})"
            if year in text:
                text = text.split(year)[0]

                if "p." in text:
                    text = text.split("p.")[0].split(",")[-1].strip()
                    xarticle.size = text
                elif "-" in text:
                    parts = text.split("-")
                    first_page = parts[-2].split(" ")[-1]
                    last_page = parts[-1].split(",")[0].split(" ")[0]

                    xarticle.fpage = first_page
                    xarticle.lpage = last_page
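        # Worked examples (hypothetical citation strings, for illustration only), assuming
        # xissue.year == "1999":
        #   "Some Journal 4, 109-141 (1999)"  -> xarticle.fpage = "109", xarticle.lpage = "141"
        #   "Some Journal 4, 23 p. (1999)"    -> xarticle.size = "23"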

        # DOI
        doi_node = citation_node.next_sibling
        if doi_node.name == "div":
            text = doi_node.get_text()
            if text.startswith("DOI: "):
                doi = text[5:]

                xarticle.doi = doi
                xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")
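        # Illustrative example (hypothetical DOI, for illustration only): a sibling div with
        # text "DOI: 10.1234/abc-d.5" would set xarticle.doi = "10.1234/abc-d.5" and
        # xarticle.pid = "10_1234_abc_d_5".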

        return xarticle

    def crawl_collection(self):
        if self.source is None:
            raise RuntimeError("ERROR: the source is not set")

        content = self.download_file(self.collection_url)
        xissues = self.parse_collection_content(content)

        """
        Some collections split the same volume across different pages.
        Ex: Volume 6 (2000) and Volume 6 (1999)
        We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
        """
        xissues_dict = self.merge_xissues(xissues)

        filtered_xissues = xissues_dict
        # Filter the issues to crawl if start_pid was set in the constructor
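        # Illustrative note (hypothetical pid, for illustration only): with
        # start_pid = "MYCOLL_2001__7_", every issue that appears before that pid in
        # xissues_dict is skipped; that issue and all later ones are kept.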
        if self.start_pid is not None:
            filtered_xissues = {}
            start = False
            for pid in xissues_dict:
                if pid == self.start_pid:
                    start = True
                if start:
                    filtered_xissues[pid] = xissues_dict[pid]

        return filtered_xissues

    def merge_xissues(self, xissues: list[IssueData]):
        """
        Some collections split the same volume across different pages.
        Ex: Volume 6 (2000) and Volume 6 (1999)
        We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
        """
        merged_xissues = OrderedDict()

        for xissue in xissues:
            xissues_with_same_volume = [
                item
                for item in xissues
                if xissue.volume == item.volume
                and xissue.number == item.number
                and xissue.vseries == item.vseries
                and (item.volume or item.number)
            ]

            if len(xissues_with_same_volume) < 2:
                if xissue.pid is None:
                    raise ValueError("Issue does not have a PID")
                merged_xissues[xissue.pid] = {"issues": [xissue]}
                first_issue = xissue
                year = xissue.year
            else:
                first_issue = xissues_with_same_volume[0]
                volume = xissues_with_same_volume[0].volume
                number = xissues_with_same_volume[0].number
                vseries = xissues_with_same_volume[0].vseries

                # Compute the year based on all issues with the same volume/number
                begin = end = year = xissues_with_same_volume[0].year
                if not year:
                    raise ValueError("year is not defined")

                if "-" in year:
                    parts = year.split("-")
                    begin = parts[0]
                    end = parts[1]

                for xissue_with_same_volume in xissues_with_same_volume[1:]:
                    new_begin = new_end = xissue_with_same_volume.year

                    if not xissue_with_same_volume.year:
                        raise ValueError("xissue year is not defined")

                    if "-" in xissue_with_same_volume.year:
                        parts = xissue_with_same_volume.year.split("-")
                        new_begin = parts[0]
                        new_end = parts[1]

                    if begin is None or end is None or new_begin is None or new_end is None:
                        continue
                    begin_int = int(begin)
                    end_int = int(end)
                    new_begin_int = int(new_begin)
                    new_end_int = int(new_end)

                    if new_begin_int < begin_int:
                        begin = new_begin
                    if new_end_int > end_int:
                        end = new_end

                if begin != end:
                    year = f"{begin}-{end}"
                else:
                    year = begin

                # We can now set the real pid
                pid = f"{self.collection_id}_{year}_{vseries}_{volume}_{number}"
                for issue in xissues_with_same_volume:
                    issue.pid = pid

                if pid not in merged_xissues:
                    merged_xissues[pid] = {
                        "issues": xissues_with_same_volume,
                    }

            # We can set the year only for the first xissue because it is the one used to collect
            # all the articles.
            # See crawl_issue with merged_xissue = self.crawl_one_issue_url(xissues_to_crawl[0])
            # But we need a separate variable (merged_year) because parse_article_content may rely on the year
            first_issue.merged_year = year

        return merged_xissues

    def crawl_issue(self, merged_xissues: dict[str, list[IssueData]]):
        """
        Wrapper around crawl_elibm_issue, to handle issues declared in multiple web pages.
        """
        xissues_to_crawl = merged_xissues["issues"]

        merged_xissue = xissues_to_crawl[0]
        self.crawl_elibm_issue(merged_xissue)

        if len(xissues_to_crawl) > 1:
            for raw_xissue in xissues_to_crawl[1:]:
                self.crawl_elibm_issue(raw_xissue)

                merged_xissue.articles = raw_xissue.articles + merged_xissue.articles

                # Update the article pids
                for article_index, xarticle in enumerate(merged_xissue.articles):
                    if raw_xissue.pid in xarticle.pid:
                        xarticle.pid = f"{raw_xissue.pid}_a{str(article_index)}"
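                # Illustrative example (hypothetical pids, for illustration only): with
                # raw_xissue.pid == "MYCOLL_1999-2000__6_" and article_index == 3, the article
                # pid becomes "MYCOLL_1999-2000__6__a3".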

        # Now that the issue pages have been downloaded/read, we can set the merged pid
        # The merged_year was set in self.merge_xissues
        # merged_xissue.pid
        merged_xissue.year = merged_xissue.merged_year

        if not self.test_mode and len(merged_xissue.articles) > 0:
            self.add_xissue_into_database(merged_xissue)

    def crawl_elibm_issue(self, xissue: IssueData):
        """
        Crawl one web page of an issue.
        - get the HTML content of the issue
        - parse the HTML content with BeautifulSoup to extract the list of articles and/or the issue metadata
        - crawl each article
        """

        # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.
        # The list of articles comes directly from the collection HTML page: the xissue has no url attribute
        if hasattr(xissue, "url") and xissue.url:
            content = self.download_file(xissue.url)
            self.parse_issue_content(content, xissue)

        xarticles = xissue.articles

        parsed_xarticles = []

        for xarticle in xarticles:
            parsed_xarticle = self.crawl_article(xarticle, xissue)
            if parsed_xarticle is not None:
                parsed_xarticles.append(parsed_xarticle)

        xissue.articles = parsed_xarticles