Coverage for src / crawler / by_source / bdim_crawler.py: 9%
230 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-04-08 09:35 +0000
1import re
3import lingua
4import regex
5from bs4 import BeautifulSoup, Tag
6from lingua import LanguageDetectorBuilder
7from ptf.cmds.xml.jats.builder.references import (
8 # ContribAuthor,
9 get_all_authors_xml,
10 get_article_title_xml,
11 get_ext_link_xml,
12 get_publisher_xml,
13 get_source_xml,
14 get_volume_xml,
15 get_year_xml,
16)
17from ptf.cmds.xml.jats.jats_parser import JatsBase
18from ptf.cmds.xml.xml_utils import escape
19from ptf.model_data import (
20 ArticleData,
21 create_abstract,
22 create_articledata,
23 create_contributor,
24 create_issuedata,
25)
27from crawler.abstract_crawlers.threaded_crawler import ThreadedCrawler
28from crawler.utils import add_pdf_link_to_xarticle
class BdimCrawler(ThreadedCrawler):
    """Crawler for the BDIM (Biblioteca Digitale Italiana di Matematica) website."""

    source_name = "Biblioteca Digitale Italiana di Matematica"
    source_domain = "BDIM"
    source_website = "http://www.bdim.eu"
    # Cookies request the English UI ("lingua=en") and TeX-encoded math.
    headers = {"accept_encoding": "utf-8", "cookie": "lingua=en; matematica=tex"}

    # Manual overrides for article titles that are broken at the source,
    # keyed by article pid (applied at the end of parse_article_content).
    title_corrections = {
        "RLINA_1965_8_39_5_a17": "Eventi fasici nel midollo spinale quali prove di inibizione presinaptica durante il sonno desincronizzato",
        "RLINA_1973_8_55_6_a0": "Complementarity between nilpotent selfmappings and periodic autohomeomorphisms.",
        "RLINA_1973_8_55_6_a2": "Sur une extension du lemme de Green.",
        "RLINA_1979_8_67_1-2_a6": "On the existence o f an unbounded connected set of solutions for nonlinear equations in Banach spaces.",
        "RLINA_1972_8_52_2_a5": "Sul carattere proiettivo del rapporto plurisezionale.",
        "RLINA_1980_8_69_1-2_a6": "A note on a variational formulation of the Einstein equations for thermo-elastic materials.",
    }

    # Issue links on the collection page look like "?id=<collection>_<...>".
    issue_href = r"\?id=(?P<col>\w+)(?P<issue>_\d{1,4})"

    _language_detector_builder = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.FRENCH, lingua.Language.ITALIAN
    )

    # NOTE: the original defined an __init__ that only called
    # super().__init__(*args, **kwargs); it was redundant and has been removed.
55 def parse_collection_content(self, content):
56 """
57 Parse the HTML page of Annals of Math and returns a list of xissue.
58 Each xissue has its pid/volume/number/year metadata + its url
59 """
60 soup = BeautifulSoup(content, "html.parser")
61 xissues = []
63 reg_issue = regex.compile(self.issue_href)
65 issue_nodes = []
66 for issue in soup.select("div.listafascicoli a"):
67 href = issue.get("href")
68 if isinstance(href, str) and reg_issue.search(href):
69 issue_nodes.append(issue)
71 for issue_node in issue_nodes:
72 # issue_text = issue_node.get_text()
74 part_issue = issue_node.get("href").split("_")
75 volume = part_issue[-2]
76 number = part_issue[-1]
77 year = part_issue[1]
78 serie = part_issue[2]
79 link = "/item" + issue_node.get("href")
80 xissue = self.create_bdim_xissue(link, serie, volume, number, year)
81 if xissue:
82 xissues.append(xissue)
84 return xissues
86 def get_year(self, year):
87 if "/" in year:
88 year = year.split("/")[0]
90 return year
92 def create_bdim_xissue(self, url, serie, volume, number, dates):
93 year = dates.replace("/", "-")
95 xissue = create_issuedata()
96 xissue.pid = f"{self.collection_id}_{year}_{serie}_{volume}_{number}"
97 xissue.year = year
98 xissue.volume = volume
99 xissue.number = number
100 xissue.vseries = serie
101 xissue.url = self.source_website + url
103 return xissue
105 def parse_issue_content(self, content, xissue):
106 soup = BeautifulSoup(content, "html.parser")
107 article_nodes = soup.find_all("div", {"class": "referenza"})
109 for index_article, article_node in enumerate(article_nodes):
110 article_link_node = article_node.find("a", text="full entry")
111 if article_link_node:
112 url = article_link_node.get("href")
113 xarticle = create_articledata()
114 xarticle.pid = "a" + str(index_article)
115 xarticle.url = self.source_website + url
117 xissue.articles.append(xarticle)
119 def parse_article_content(self, content, xissue, xarticle, url):
120 """
121 Parse the content with Beautifulsoup and returns an ArticleData
122 """
123 soup = BeautifulSoup(content, "html.parser")
124 # TITLE
125 title_node = soup.select_one("span.titolo")
126 if not title_node:
127 raise ValueError("Couldn't find article title")
128 xarticle.title_tex = title_node.get_text()
130 # Lang
131 if "(Italian)" in title_node.parent.text:
132 xarticle.lang = "it"
133 elif "(English)" in title_node.parent.text:
134 xarticle.lang = "en"
135 # Authors
136 reg_author_link = regex.compile(r"\?testo=\w+")
137 text_author_bloc = soup.select_one("div.referenza p")
138 if text_author_bloc:
139 for link in text_author_bloc.select("a"):
140 href = link.get("href")
141 if isinstance(href, str) and reg_author_link.search(href):
142 contrib_node = link.select_one("span.autore")
143 if contrib_node is not None:
144 surname_node = link.select_one("span.cognome")
145 firstname_node = link.select_one("span.nome")
146 author = create_contributor(role="author")
148 if surname_node is not None:
149 surname = surname_node.get_text()
150 author["last_name"] = surname
152 if firstname_node is not None:
153 firstname = firstname_node.get_text()
154 author["first_name"] = firstname
156 if not firstname_node or not surname_node:
157 string_name = contrib_node.get_text()
158 author["string_name"] = string_name
160 xarticle.contributors.append(author)
162 # ABSTRACT
163 abstract_section_node = soup.select_one("div.sunto")
164 if abstract_section_node:
165 abstract = str(abstract_section_node.get_text())
167 xarticle.abstracts.append(
168 create_abstract(value_tex=abstract, lang=self.detect_language(abstract))
169 )
171 # PDF
172 pdf_url = soup.find_all("a", text="pdf")
173 if len(pdf_url) > 0:
174 pdf_url = self.source_website + pdf_url[0].get("href")
175 add_pdf_link_to_xarticle(xarticle, pdf_url)
177 # PAGES
178 pages = soup.select_one("span.pagine")
179 if pages:
180 pages_to = re.compile(r"(\(?\d+\)?)?-?(\(?\d+\)?)").search(pages.get_text())
181 if pages_to:
182 parts = pages_to[0].split("-")
183 first_page = parts[0].replace("(", "").replace(")", "")
184 if len(parts) > 1:
185 last_page = parts[1].replace("(", "").replace(")", "")
186 xarticle.lpage = last_page
187 xarticle.fpage = first_page
189 # Biblio
190 bibitems_tags = soup.select("div.biblio div.bibitem")
191 xarticle.bibitems = [self.parse_ref(item) for item in bibitems_tags]
193 # metadata
194 bdim_mr_url = "http://www.ams.org/mathscinet-getitem?mr="
195 bdim_zbl_url = "https://zbmath.org/?q=an:"
196 medata_bloc = soup.select_one("div.referenza > p")
197 if not medata_bloc:
198 raise ValueError("metadata_bloc cannot be found")
200 zbl_id = medata_bloc.select_one(f"a[href^='{bdim_zbl_url}']")
202 if zbl_id:
203 extid_href = zbl_id.get("href")
204 if isinstance(extid_href, str):
205 xarticle.extids.append(("zbl-item-id", extid_href.removeprefix(bdim_zbl_url)))
207 mr_id = medata_bloc.select_one(f"a[href^='{bdim_mr_url}']")
208 if mr_id:
209 extid_href = mr_id.get("href")
210 if isinstance(extid_href, str):
211 xarticle.extids.append(
212 (
213 "mr-item-id",
214 extid_href.removeprefix(bdim_mr_url),
215 )
216 )
218 if xarticle.pid in self.title_corrections:
219 xarticle.title_tex = self.title_corrections[xarticle.pid]
221 content = self.download_file(url, headers={"cookie": "lingua=it; matematica=tex"})
222 xarticle = self.parse_article_content_bdim_it(content, xissue, xarticle, url)
223 return xarticle
225 def parse_article_content_bdim_it(self, content, xissue, xarticle: ArticleData, url):
226 soup = BeautifulSoup(content, "html.parser")
228 # trans abstract
229 abstract_section_node = soup.select_one("div.sunto")
230 if abstract_section_node:
231 abstract = str(abstract_section_node.get_text())
232 if xarticle.abstracts[0]["value_tex"] != abstract:
233 xarticle.abstracts.append(
234 create_abstract(value_tex=abstract, lang=self.detect_language(abstract))
235 )
237 # Trans_title
239 trans_title_node = soup.select_one("span.titolo_trad")
240 if trans_title_node:
241 trans_title = self.create_trans_title(
242 resource_type="article",
243 xresource_lang=xarticle.lang,
244 lang="en" if xarticle.lang == "it" else "it",
245 title_str=trans_title_node.get_text(),
246 )
247 xarticle.titles.append(trans_title)
248 return xarticle
250 def parse_ref(self, item: Tag):
251 value_xml = ""
252 # First pass : we create an semi-complete XML Jats string, except for the authors
253 # that we store inside authors_list to be serialized at the end
254 authors_list = []
255 for c in item.children:
256 c_text = escape(c.text)
257 if isinstance(c, str):
258 value_xml += c_text
259 continue
261 if not isinstance(c, Tag):
262 raise NotImplementedError("bibitem_tag is not a Tag or a string")
264 if c.name == "a":
265 a_xml, is_badge = self.parse_a_tag(c)
266 if is_badge:
267 value_xml = regex.sub(r" \| $", "", value_xml)
268 value_xml += a_xml
269 continue
271 child_class = c.get("class")
272 if not child_class:
273 value_xml += c_text
274 elif "bautore" in child_class:
275 # TODO : parse firstname and lastname
276 author_data, author_xml = self.parse_biblio_author_tag(c, len(authors_list))
277 authors_list.append(author_data)
278 value_xml += author_xml
280 elif "titolo" in child_class:
281 value_xml += get_article_title_xml(c_text)
282 elif "rivista" in child_class:
283 value_xml += get_source_xml(c_text)
284 elif "anno" in child_class:
285 value_xml += get_year_xml(c_text)
286 elif "volume" in child_class:
287 value_xml += get_volume_xml(c_text)
288 elif "publisher" in child_class:
289 value_xml += get_publisher_xml(c_text)
290 else:
291 # booktitle
292 value_xml += c_text
294 # In order to have a valid Jats xml, we have to group all authors into the person-group xml tag.
295 authors_occurence = regex.compile(r"{author_\d}").findall(value_xml)
296 if len(authors_occurence) > 0:
297 first_author = value_xml.index(authors_occurence[0])
298 last_author = value_xml.index(authors_occurence[-1]) + len(authors_occurence[-1])
299 value_xml = (
300 value_xml[:first_author]
301 + get_all_authors_xml(value_xml[first_author:last_author], authors_list)
302 + value_xml[last_author:]
303 )
305 return JatsBase.bake_ref(value_xml)
306 # return self.create_crawled_bibitem([*bib_elements, *bib_link_elements])
308 def parse_a_tag(self, a_tag: Tag):
309 a_text = escape(a_tag.text)
310 href = a_tag.get("href")
311 if not href:
312 return a_text, False
313 elif isinstance(href, list):
314 raise ValueError("a tag has multiple href values !")
315 else:
316 a_type = "uri"
317 if a_text.startswith("MR "):
318 a_type = "mr-item-id"
319 a_text = a_text.removeprefix("MR ")
320 elif a_text.startswith("Zbl "):
321 a_type = "zbl-item-id"
322 a_text = a_text.removeprefix("Zbl ")
323 elif a_text == "fulltext (doi)":
324 a_type = "doi"
325 a_text = a_text.removeprefix("http://dx.doi.org/")
326 return get_ext_link_xml(escape(href), a_text, a_type), a_type != "uri"
328 def parse_biblio_author_tag(self, author_tag: Tag, index: int = 0):
329 value_xml = ""
330 author_data = {"template_str": ""}
331 for c in author_tag.children:
332 c_text = escape(c.text)
333 if isinstance(c, str):
334 author_data["template_str"] += c_text
335 continue
337 if not isinstance(c, Tag):
338 raise NotImplementedError("author_tag is not a Tag or a string")
339 # given name = cognome = prénom
340 # surname = nome = nom de famille
341 child_class = c.get("class")
342 if not child_class:
343 value_xml += c_text
344 elif "cognome" in child_class:
345 c.replace_with("{given_names}")
346 author_data["given_names"] = c_text
347 author_data["template_str"] += "{given_names}"
348 elif "nome" in child_class:
349 c.replace_with("{surname}")
350 author_data["surname"] = c_text
351 author_data["template_str"] += "{surname}"
352 value_xml += "{author_" + str(index) + "}"
354 return author_data, value_xml