Coverage for src/crawler/by_source/bdim_crawler.py: 9%
242 statements
import re

import lingua
import regex
from bs4 import BeautifulSoup, Tag
from lingua import LanguageDetectorBuilder
from ptf.cmds.xml.jats.builder.citation import (
    ContribAuthor,
    get_all_authors_xml,
    get_ext_link_xml,
    get_publisher_xml,
    get_source_xml,
    get_volume_xml,
    get_year_xml,
)
from ptf.cmds.xml.jats.builder.issue import get_title_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    ArticleData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class BdimCrawler(BaseCollectionCrawler):
    source_name = "Biblioteca Digitale Italiana di Matematica"
    source_domain = "BDIM"
    source_website = "http://www.bdim.eu"
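    # The "lingua" cookie selects the page language (parse_article_content
    # re-fetches the same page with lingua=it); "matematica=tex" presumably
    # asks the site for TeX-encoded formulas.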
    headers = {"accept_encoding": "utf-8", "cookie": "lingua=en; matematica=tex"}

    match_headers = ["cookie"]
    title_corrections = {
        "RLINA_1965_8_39_5_a17": "Eventi fasici nel midollo spinale quali prove di inibizione presinaptica durante il sonno desincronizzato",
        "RLINA_1973_8_55_6_a0": "Complementarity between nilpotent selfmappings and periodic autohomeomorphisms.",
        "RLINA_1973_8_55_6_a2": "Sur une extension du lemme de Green.",
42 "RLINA_1979_8_67_1-2_a6": "On the existence o f an unbounded connected set of solutions for nonlinear equations in Banach spaces.",
43 "RLINA_1972_8_52_2_a5": "Sul carattere proiettivo del rapporto plurisezionale.",
44 "RLINA_1980_8_69_1-2_a6": "A note on a variational formulation of the Einstein equations for thermo-elastic materials.",
45 }
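
    # Issue hrefs look like "?id=RLINA_1965_8_39_5": a collection id followed
    # by underscore-separated digit groups (cf. the title_corrections keys).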
    issue_href = r"\?id=(?P<col>\w+)(?P<issue>_\d{1,4})"

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.FRENCH, lingua.Language.ITALIAN
    ).build()

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def parse_collection_content(self, content):
        """
        Parse the HTML page of the BDIM collection and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        reg_issue = regex.compile(self.issue_href)

        issue_nodes = []
        for issue in soup.select("div.listafascicoli a"):
            href = issue.get("href")
            if isinstance(href, str) and reg_issue.search(href):
                issue_nodes.append(issue)
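
        # Splitting an href such as "?id=RLINA_1965_8_39_5" on "_" yields
        # ["?id=RLINA", "1965", "8", "39", "5"]:
        # [1] year, [2] serie, [-2] volume, [-1] number.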
        for issue_node in issue_nodes:
            part_issue = issue_node.get("href").split("_")
            volume = part_issue[-2]
            number = part_issue[-1]
            year = part_issue[1]
            serie = part_issue[2]
            link = "/item" + issue_node.get("href")
            xissue = self.create_bdim_xissue(link, serie, volume, number, year)
            if xissue:
                xissues.append(xissue)

        return xissues

    def get_year(self, year):
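        # year ranges like "1972/1973" are reduced to the first year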
        if "/" in year:
            year = year.split("/")[0]

        return year

    def create_bdim_xissue(self, url, serie, volume, number, dates):
        year = dates.replace("/", "-")

        xissue = create_issuedata()
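        # pid format: <collection>_<year>_<serie>_<volume>_<number>,
        # e.g. "RLINA_1965_8_39_5" (cf. the title_corrections keys)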
        xissue.pid = f"{self.collection_id}_{year}_{serie}_{volume}_{number}"
        xissue.year = year
        xissue.volume = volume
        xissue.number = number
        xissue.vseries = serie
        xissue.url = self.source_website + url

        return xissue

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("div", {"class": "referenza"})

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a", text="full entry")
            if article_link_node:
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
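                # article pids are "a0", "a1", ...; prefixed with the issue pid
                # they form keys like "RLINA_1965_8_39_5_a17" (cf. title_corrections)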
                xarticle.url = self.source_website + url

                xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        soup = BeautifulSoup(content, "html.parser")
        # TITLE
        title_node = soup.select_one("span.titolo")
        if not title_node:
            raise ValueError("Couldn't find article title")
        xarticle.title_tex = title_node.get_text()
        if xarticle.title_tex == "":
            xarticle.title_tex = " "

        # Lang
        if "(Italian)" in title_node.parent.text:
            xarticle.lang = "it"
        elif "(English)" in title_node.parent.text:
            xarticle.lang = "en"
        # Authors
        reg_author_link = regex.compile(r"\?testo=\w+")
        text_author_bloc = soup.select_one("div.referenza p")
        if text_author_bloc:
            for link in text_author_bloc.select("a"):
                href = link.get("href")
                if isinstance(href, str) and reg_author_link.search(href):
                    contrib_node = link.select_one("span.autore")
                    if contrib_node is not None:
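                        # span.cognome = family name, span.nome = given name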
                        surname_node = link.select_one("span.cognome")
                        firstname_node = link.select_one("span.nome")
                        author = create_contributor(role="author")

                        if surname_node is not None:
                            surname = surname_node.get_text()
                            author["last_name"] = surname

                        if firstname_node is not None:
                            firstname = firstname_node.get_text()
                            author["first_name"] = firstname

                        if not firstname_node or not surname_node:
                            string_name = contrib_node.get_text()
                            author["string_name"] = string_name

                        xarticle.contributors.append(author)

        # ABSTRACT
        abstract_section_node = soup.select_one("div.sunto")
        if abstract_section_node:
            abstract = str(abstract_section_node.get_text())
            xabstract = create_abstract(
                tag="abstract", value_tex=abstract, lang=self.detect_language(abstract)
            )
            xarticle.abstracts.append(xabstract)

        # PDF
        pdf_url = soup.find_all("a", text="pdf")
        if len(pdf_url) > 0:
            pdf_url = self.source_website + pdf_url[0].get("href")
            add_pdf_link_to_xarticle(xarticle, pdf_url)

        # PAGES
        pages = soup.select_one("span.pagine")
        if pages:
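            # page ranges look like "123-456", occasionally with parentheses
            # around the numbers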
            pages_to = re.compile(r"(\(?\d+\)?)?-?(\(?\d+\)?)").search(pages.get_text())
            if pages_to:
                parts = pages_to[0].split("-")
                first_page = parts[0].replace("(", "").replace(")", "")
                if len(parts) > 1:
                    last_page = parts[1].replace("(", "").replace(")", "")
                    xarticle.lpage = last_page
                xarticle.fpage = first_page

        # Biblio
        bibitems_tags = soup.select("div.biblio div.bibitem")
        bibitems = [self.parse_ref(item) for item in bibitems_tags]
        if len(bibitems) > 0:
            xarticle.abstracts.append(JatsBase.compile_refs(bibitems))

        # metadata
        reg_zbl_id = re.compile(r"Zbl \w+")
        reg_mr_id = re.compile(r"MR \d+")

        metadata_bloc = soup.select_one("div.referenza")
        if not metadata_bloc:
            raise ValueError("metadata_bloc cannot be found")
        mr_id = [link for link in metadata_bloc.find_all("a") if reg_mr_id.search(link.get_text())]
        zbl_id = [link for link in metadata_bloc.find_all("a") if reg_zbl_id.search(link.get_text())]

        if len(zbl_id) > 0:
            zblid = zbl_id[0].get("href")
            pos = zblid.find("?q=an:")
            if pos > 0:
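                # skip past "?q=an:" (6 characters) to keep the bare Zbl id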
                zblid = zblid[pos + 6 :]
            xarticle.extids.append(("zbl-item-id", zblid))
        if len(mr_id) > 0:
            mr_id = mr_id[0].get_text()
            mr_id = mr_id.split("MR ")
            mr_id = mr_id[1]
            xarticle.extids.append(("mr-item-id", mr_id))

        if f"{xissue.pid}_{xarticle.pid}" in self.title_corrections:
            xarticle.title_tex = self.title_corrections[f"{xissue.pid}_{xarticle.pid}"]
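
        # Re-fetch the page in Italian (lingua=it) to pick up the translated
        # title and abstract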
        content = self.download_file(url, headers={"cookie": "lingua=it; matematica=tex"})
        xarticle = self.parse_article_content_bdim_it(content, xissue, xarticle, url)
        return xarticle

    def parse_article_content_bdim_it(self, content, xissue, xarticle: ArticleData, url):
        soup = BeautifulSoup(content, "html.parser")

        # Trans_title
        trans_title_node = soup.select_one("span.titolo_trad")
        if trans_title_node:
            xarticle.trans_title_tex = trans_title_node.get_text()

        # trans abstract
        abstract_section_node = soup.select_one("div.sunto")
        if abstract_section_node:
            abstract = str(abstract_section_node.get_text())
            # guard against an empty abstracts list (no English abstract found)
            if not xarticle.abstracts or xarticle.abstracts[0]["value_tex"] != abstract:
                xabstract = create_abstract(
                    tag="abstract", value_tex=abstract, lang=self.detect_language(abstract)
                )
                xarticle.abstracts.append(xabstract)

        if xarticle.trans_title_tex or len(xarticle.abstracts) > 1:
            xarticle.trans_lang = "en" if xarticle.lang == "it" else "it"
        return xarticle

    def parse_ref(self, item: Tag):
        value_xml = ""
        # First pass: we create a semi-complete JATS XML string, except for the
        # authors, which are stored in authors_list to be serialized at the end
        authors_list: list[ContribAuthor] = []
        for c in item.children:
            c_text = escape(c.text)
            if isinstance(c, str):
                value_xml += c_text
                continue

            if not isinstance(c, Tag):
                raise NotImplementedError("bibitem_tag is not a Tag or a string")

            if c.name == "a":
                a_xml, is_badge = self.parse_a_tag(c)
                if is_badge:
                    value_xml = regex.sub(r" \| $", "", value_xml)
                value_xml += a_xml
                continue

            child_class = c.get("class")
            if not child_class:
                value_xml += c_text
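            # authors are emitted as "{author_N}" placeholders (see
            # parse_biblio_author_tag) and expanded into <person-group> below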
            elif "bautore" in child_class:
                # TODO: parse firstname and lastname
                author_data, author_xml = self.parse_biblio_author_tag(c, len(authors_list))
                authors_list.append(author_data)
                value_xml += author_xml

            elif "titolo" in child_class:
                value_xml += get_title_xml(c_text)
            elif "rivista" in child_class:
                value_xml += get_source_xml(c_text)
            elif "anno" in child_class:
                value_xml += get_year_xml(c_text)
            elif "volume" in child_class:
                value_xml += get_volume_xml(c_text)
            elif "publisher" in child_class:
                value_xml += get_publisher_xml(c_text)
            else:
                # booktitle
                value_xml += c_text

        # To produce valid JATS XML, all authors have to be grouped inside a
        # single person-group tag.
        authors_occurrence = regex.compile(r"{author_\d}").findall(value_xml)
        if len(authors_occurrence) > 0:
            first_author = value_xml.index(authors_occurrence[0])
            last_author = value_xml.index(authors_occurrence[-1]) + len(authors_occurrence[-1])
            value_xml = (
                value_xml[:first_author]
                + get_all_authors_xml(value_xml[first_author:last_author], authors_list)
                + value_xml[last_author:]
            )

        return JatsBase.bake_ref(value_xml)

    def parse_a_tag(self, a_tag: Tag):
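        # Returns (xml_or_text, is_badge): "badges" are the MR/Zbl/DOI links,
        # whose " | " separator the caller removes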
        a_text = escape(a_tag.text)
        href = a_tag.get("href")
        if not href:
            return a_text, False
        elif isinstance(href, list):
            raise ValueError("a tag has multiple href values !")
        else:
            a_type = "uri"
            if a_text.startswith("MR "):
                a_type = "mr-item-id"
                a_text = a_text.removeprefix("MR ")
            elif a_text.startswith("Zbl "):
                a_type = "zbl-item-id"
                a_text = a_text.removeprefix("Zbl ")
            elif a_text == "fulltext (doi)":
                a_type = "doi"
                # the DOI is carried by the href, not by the link text
                a_text = escape(href).removeprefix("http://dx.doi.org/")
            return get_ext_link_xml(escape(href), a_text, a_type), a_type != "uri"

    def parse_biblio_author_tag(self, author_tag: Tag, index: int = 0):
        value_xml = ""
        author_data: ContribAuthor = {"template_str": ""}
        for c in author_tag.children:
            c_text = escape(c.text)
            if isinstance(c, str):
                author_data["template_str"] += c_text
                continue

            if not isinstance(c, Tag):
                raise NotImplementedError("author_tag is not a Tag or a string")
            # "cognome" is the Italian family name, "nome" the given name,
            # consistent with the cognome/nome handling in parse_article_content
            child_class = c.get("class")
            if not child_class:
                value_xml += c_text
            elif "cognome" in child_class:
                c.replace_with("{surname}")
                author_data["surname"] = c_text
                author_data["template_str"] += "{surname}"
            elif "nome" in child_class:
                c.replace_with("{given_names}")
                author_data["given_names"] = c_text
                author_data["template_str"] += "{given_names}"
        value_xml += "{author_" + str(index) + "}"

        return author_data, value_xml