Coverage for src/crawler/by_source/elibm_crawler.py: 76%
159 statements
coverage.py v7.6.4, created at 2024-11-20 09:03 +0000
from bs4 import BeautifulSoup
from crawler.base_crawler import BaseCollectionCrawler
from crawler.base_crawler import add_pdf_link_to_xarticle

from ptf.model_data import create_articledata
from ptf.model_data import create_contributor
from ptf.model_data import create_issuedata
from ptf.model_data import create_subj
class ElibmCrawler(BaseCollectionCrawler):
    source_name = "Electronic Library of Mathematics"
    source_domain = "ELIBM"
    source_website = "https://www.elibm.org"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: create a cols.csv that supersedes cols_eudml.csv with the entire collection catalogue.
        self.source = self.get_or_create_source()

        if self.collection_id == "DOCMA":
            self.delimiter_inline_formula = "\\("
            self.delimiter_disp_formula = "\\["
    def parse_collection_content(self, content):
        """
        Parse the HTML page of eLibM and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.

        self.periode is set at the end based on the xissue years of the HTML page.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        # Extract the list of issues
        link_nodes = soup.find_all("a")

        # eLibM puts special issue titles as volume number.
        # To create an issue pid, we use S1, S2...
        last_special_issue_number = 0

        for link_node in link_nodes:
            url = link_node.get("href")
            text = link_node.get_text()
            if url.startswith("/issue"):
                xissue, last_special_issue_number = self.create_xissue(
                    url, text, last_special_issue_number
                )

                # eLibM lists the special issues at the end.
                # Set periode_begin if we find a special issue.
                if last_special_issue_number == 1:  # coverage: 54 ↛ 55 (condition never true)
                    self.periode_begin = self.get_first_year(xissues[-1].year)

                if xissue:  # coverage: 57 ↛ 44 (condition always true)
                    xissues.append(xissue)

        self.periode_end = self.get_first_year(xissues[0].year)

        if last_special_issue_number == 0:  # coverage: 62 ↛ 65 (condition always true)
            self.periode_begin = self.get_first_year(xissues[-1].year)

        self.periode = self.get_or_create_periode()

        return xissues
    def get_first_year(self, year):
        if "/" in year:  # coverage: 70 ↛ 71 (condition never true)
            year = year.split("/")[0]

        return year
    def create_xissue(self, url, text, last_special_issue_number):
        if "(" not in text or ")" not in text:  # coverage: 76 ↛ 77 (condition never true)
            # Return a tuple so the caller can always unpack the result.
            return None, last_special_issue_number

        parts = text.split("(")

        year = parts[1].split(")")[0]
        year = year.replace("/", "-")

        # The volume might not be an integer: eLibM puts special issue titles as volume number.
        volume = parts[0].strip()

        number = ""
        if "No. " in volume:  # coverage: 88 ↛ 89 (condition never true)
            parts = volume.split("No. ")
            volume = parts[0].strip()
            number = parts[1].strip()

        try:
            volume_for_pid = int(volume)
        except ValueError:
            last_special_issue_number += 1
            volume_for_pid = f"S{last_special_issue_number}"

        xissue = create_issuedata()
        xissue.pid = f"{self.collection_id}_{year}__{volume_for_pid}_{number}"
        xissue.year = year
        xissue.volume = volume
        xissue.number = number
        xissue.url = self.source_website + url

        return xissue, last_special_issue_number
    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("div", {"class": "title"})

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:  # coverage: 114 ↛ 112 (condition always true)
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = self.source_website + url

                # eLibM lists the articles in reverse order, except for one special issue.
                if xissue.volume == "Mahler Selecta":  # coverage: 121 ↛ 122 (condition never true)
                    xissue.articles.append(xarticle)
                else:
                    xissue.articles.insert(0, xarticle)

        # If the issue has only 1 article, eLibM skips the issue page and directly displays the article page.
        if len(xissue.articles) == 0:  # coverage: 127 ↛ 128 (condition never true)
            title_node = soup.find("h2", {"class": "document_title"})
            if title_node is not None:
                xarticle = create_articledata()
                xarticle.pid = "a0"
                xarticle.url = xissue.url

                xissue.articles.append(xarticle)
    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle = create_articledata()
        xarticle.pid = pid
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")
        # TITLE
        title_node = soup.find("h2", {"class": "document_title"})
        if title_node:  # coverage: 148 ↛ 152 (condition always true)
            xarticle.title_tex = title_node.get_text()

        # AUTHORS
        citation_author_node = soup.find("h3", {"class": "document_author"})
        if citation_author_node:  # coverage: 153 ↛ 172 (condition always true)
            text = citation_author_node.get_text()
            if text:  # coverage: 155 ↛ 172 (condition always true)
                parts = text.split(";")
                for part in parts:
                    text_author = part.strip()

                    role = "author"
                    if "(ed.)" in text_author:  # coverage: 161 ↛ 162 (condition never true)
                        role = "editor"
                        text_author = text_author.split("(ed.)")[0].strip()

                    author = create_contributor()
                    author["role"] = role
                    author["string_name"] = text_author

                    xarticle.contributors.append(author)

        # PDF
        link_nodes = soup.find_all("a")
        for link_node in link_nodes:
            url = link_node.get("href")
            if url.startswith("/ft/"):
                pdf_url = self.source_website + url
                add_pdf_link_to_xarticle(xarticle, pdf_url)
        panel_nodes = soup.find_all("h3", {"class": "panel-title"})
        for panel_node in panel_nodes:
            text = panel_node.get_text()
            content_node = panel_node.parent.parent.find("div", {"class": "panel-body"})

            if text == "Summary":
                # ABSTRACT
                abstract = content_node.get_text()
                xabstract = {
                    "tag": "abstract",
                    "value_html": "",
                    "value_tex": abstract,
                    "value_xml": "",
                    "lang": "en",
                }
                xarticle.abstracts.append(xabstract)

            elif text == "Mathematics Subject Classification":
                # MSC
                subjs = content_node.get_text().split(", ")
                for subj in subjs:
                    subject = create_subj()
                    subject["value"] = subj
                    subject["type"] = "msc"
                    subject["lang"] = "en"
                    xarticle.kwds.append(subject)

            elif text == "Keywords/Phrases":
                # Keywords
                subjs = content_node.get_text().split(", ")
                for subj in subjs:
                    subject = create_subj()
                    subject["value"] = subj
                    subject["lang"] = "en"
                    xarticle.kwds.append(subject)
        # PAGES
        citation_node = soup.find("h5", {"class": "document_source"})
        if citation_node:  # coverage: 217 ↛ 236 (condition always true)
            text = citation_node.get_text()
            year = f"({xissue.year})"
            if year in text:  # coverage: 220 ↛ 236 (condition always true)
                text = text.split(year)[0]

                if "p." in text:  # coverage: 223 ↛ 224 (condition never true)
                    text = text.split("p.")[0].split(",")[-1].strip()
                    xarticle.size = text

                elif "-" in text:  # coverage: 227 ↛ 236 (condition always true)
                    parts = text.split("-")
                    first_page = parts[-2].split(" ")[-1]
                    last_page = parts[-1].split(",")[0].split(" ")[0]

                    xarticle.fpage = first_page
                    xarticle.lpage = last_page

        # DOI
        doi_node = citation_node.next_sibling
        if doi_node.name == "div":  # coverage: 237 ↛ 238 (condition never true)
            text = doi_node.get_text()
            if text.startswith("DOI: "):
                doi = text[5:]

                xarticle.doi = doi
                xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")

        return xarticle
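
A minimal standalone sketch of the issue pid scheme built in create_xissue may help reading the listing above: numeric volumes go straight into the pid, while special issues (whose "volume" is a title) fall back to S1, S2, ... The helper below is hypothetical and not part of the crawler; the collection id "DOCMA" and the sample link texts are made-up inputs chosen only to show the resulting pid strings.

# Hypothetical, self-contained sketch mirroring create_xissue's pid construction.
def build_issue_pid(collection_id, text, last_special_issue_number=0):
    parts = text.split("(")
    year = parts[1].split(")")[0].replace("/", "-")
    volume = parts[0].strip()
    number = ""
    if "No. " in volume:
        volume, number = (part.strip() for part in volume.split("No. "))
    try:
        volume_for_pid = int(volume)
    except ValueError:
        # Special issue: the "volume" is a title, so use S1, S2, ...
        last_special_issue_number += 1
        volume_for_pid = f"S{last_special_issue_number}"
    return f"{collection_id}_{year}__{volume_for_pid}_{number}", last_special_issue_number


pid, n = build_issue_pid("DOCMA", "15 (2010)")
print(pid)  # DOCMA_2010__15_
pid, n = build_issue_pid("DOCMA", "Mahler Selecta (2016/2017)", n)
print(pid)  # DOCMA_2016-2017__S1_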