Coverage for src/crawler/by_source/bdim_crawler.py: 89%
216 statements
import re

import regex
from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.jats.builder.citation import (
    ContribAuthor,
    get_all_authors_xml,
    get_ext_link_xml,
    get_publisher_xml,
    get_source_xml,
    get_volume_xml,
    get_year_xml,
)
from ptf.cmds.xml.jats.builder.issue import get_title_xml
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class BdimCrawler(BaseCollectionCrawler):
    source_name = "Biblioteca Digitale Italiana di Matematica"
    source_domain = "BDIM"
    source_website = "http://www.bdim.eu"

    title_corrections = {
        "RLINA_1965_8_39_5_a17": "Eventi fasici nel midollo spinale quali prove di inibizione presinaptica durante il sonno desincronizzato",
        "RLINA_1973_8_55_6_a0": "Complementarity between nilpotent selfmappings and periodic autohomeomorphisms.",
        "RLINA_1973_8_55_6_a2": "Sur une extension du lemme de Green.",
        "RLINA_1979_8_67_1-2_a6": "On the existence o f an unbounded connected set of solutions for nonlinear equations in Banach spaces.",
        "RLINA_1972_8_52_2_a5": "Sul carattere proiettivo del rapporto plurisezionale.",
        "RLINA_1980_8_69_1-2_a6": "A note on a variational formulation of the Einstein equations for thermo-elastic materials.",
    }

    issue_href = r"\?id=(?P<col>\w+)(?P<issue>_\d{1,4})"
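    # A hypothetical matching href: "?id=RLINA_1965_8_39_5" (col="RLINA",
    # issue="_1965"); the full id continues with serie/volume/number, as seen
    # in the title_corrections keys above.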

    def parse_collection_content(self, content):
        """
        Parse the HTML page of a BDIM collection and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.

        self.periode is set at the end based on the xissue years of the HTML page.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        reg_issue = regex.compile(self.issue_href)

        issue_nodes = []
        for issue in soup.select("div.listafascicoli a"):
            href = issue.get("href")
            if isinstance(href, str) and reg_issue.search(href):
                issue_nodes.append(issue)

        for issue_node in issue_nodes:
            # issue_text = issue_node.get_text()
            part_issue = issue_node.get("href").split("_")
            volume = part_issue[-2]
            number = part_issue[-1]
            year = part_issue[1]
            serie = part_issue[2]
            link = "/item" + issue_node.get("href")
            xissue = self.create_bdim_xissue(link, serie, volume, number, year)
            if xissue:
                xissues.append(xissue)

        self.periode_begin = self.get_year(xissues[0].year)
        self.periode_end = self.get_year(xissues[-1].year)

        self.periode = self.get_or_create_periode()

        return xissues

    def get_year(self, year):
        if "/" in year:
            year = year.split("/")[0]

        return year

    def create_bdim_xissue(self, url, serie, volume, number, dates):
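        # Hypothetical example: ("/item?id=RLINA_1965_8_39_5", "8", "39", "5", "1965")
        # yields pid "RLINA_1965_8_39_5", assuming collection_id is "RLINA".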
        year = dates.replace("/", "-")

        xissue = create_issuedata()
        xissue.pid = f"{self.collection_id}_{year}_{serie}_{volume}_{number}"
        xissue.year = year
        xissue.volume = volume
        xissue.number = number
        xissue.vseries = serie
        xissue.url = self.source_website + url

        return xissue

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("div", {"class": "referenza"})

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a", text="referenza completa")
            if article_link_node:
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = self.source_website + url

                xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle = create_articledata()
        xarticle.pid = pid
        # TODO : is this correct ?
        xarticle.lang = "it"
        soup = BeautifulSoup(content, "html.parser")

        # TITLE
        title_node = soup.select_one("span.titolo")
        if title_node:
            xarticle.title_tex = title_node.get_text()
            if xarticle.title_tex == "":
                xarticle.title_tex = " "

        # Authors
        reg_author_link = regex.compile(r"\?testo=\w+")
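        # Author links point at the person's text page; a hypothetical matching
        # href would look like "?testo=autore123" (any \w+ token after "?testo=").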
        text_author_bloc = soup.select_one("div.referenza p")
        if text_author_bloc:
            for link in text_author_bloc.select("a"):
                href = link.get("href")
                if isinstance(href, str) and reg_author_link.search(href):
                    contrib_node = link.select_one("span.autore")
                    if contrib_node is not None:
                        surname_node = link.select_one("span.cognome")
                        firstname_node = link.select_one("span.nome")
                        author = create_contributor(role="author")

                        if surname_node is not None:
                            surname = surname_node.get_text()
                            author["last_name"] = surname

                        if firstname_node is not None:
                            firstname = firstname_node.get_text()
                            author["first_name"] = firstname

                        if not firstname_node or not surname_node:
                            string_name = contrib_node.get_text()
                            author["string_name"] = string_name

                        xarticle.contributors.append(author)

        # ABSTRACT
        abstract_section_node = soup.select_one("div.sunto")
        if abstract_section_node:
            abstract = str(abstract_section_node.get_text())
            xabstract = create_abstract(
                tag="abstract", value_tex=abstract, lang=self.detect_language(abstract)
            )
            xarticle.abstracts.append(xabstract)

        # PDF
        pdf_url = soup.find_all("a", text="pdf")
        if len(pdf_url) > 0:
            pdf_url = self.source_website + pdf_url[0].get("href")
            add_pdf_link_to_xarticle(xarticle, pdf_url)

        # PAGES
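        # The page range appears as plain text, possibly with parentheses,
        # e.g. "123-145" or "(123)-(145)" (hypothetical values); parentheses
        # are stripped below.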
        pages = soup.select_one("span.pagine")
        if pages:
            pages_to = re.compile(r"(\(?\d+\)?)?-?(\(?\d+\)?)").search(pages.get_text())
            if pages_to:
                parts = pages_to[0].split("-")
                first_page = parts[0].replace("(", "").replace(")", "")
                if len(parts) > 1:
                    last_page = parts[1].replace("(", "").replace(")", "")
                    xarticle.lpage = last_page
                xarticle.fpage = first_page

        # Biblio
        bibitems_tags = soup.select("div.biblio div.bibitem")
        bibitems = [self.parse_ref(item) for item in bibitems_tags]
        if len(bibitems) > 0:
            xarticle.abstracts.append(self.create_bibliography(bibitems))

        # metadata
        reg_zbl_id = re.compile(r"Zbl \w+")
        reg_mr_id = re.compile(r"MR \d+")
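        # These match badge texts such as "MR 123456" or "Zbl 0123" (hypothetical
        # ids); for Zbl, \w+ only needs to match the start of the identifier,
        # since we use search() below.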

        metadata_bloc = soup.select_one("div.referenza")
        if not metadata_bloc:
            raise ValueError("metadata_bloc cannot be found")
        mr_id = [link for link in metadata_bloc.find_all("a") if reg_mr_id.search(link.get_text())]
        zbl_id = [link for link in metadata_bloc.find_all("a") if reg_zbl_id.search(link.get_text())]

        if len(zbl_id) > 0:
            zblid = zbl_id[0].get("href")
            pos = zblid.find("?q=an:")
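            # Keep only the identifier after "?q=an:"; a hypothetical href
            # "https://zbmath.org/?q=an:0301.35033" yields "0301.35033".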
            if pos > 0:
                zblid = zblid[pos + 6 :]
            xarticle.extids.append(("zbl-item-id", zblid))
        if len(mr_id) > 0:
            mr_id = mr_id[0].get_text()
            mr_id = mr_id.split("MR ")
            mr_id = mr_id[1]
            xarticle.extids.append(("mr-item-id", mr_id))

        if xarticle.pid in self.title_corrections:
            xarticle.title_tex = self.title_corrections[xarticle.pid]

        return xarticle

    def parse_ref(self, item: Tag):
        value_xml = ""
        # First pass: we create a semi-complete JATS XML string, except for the
        # authors, which we store inside authors_list to be serialized at the end.
        authors_list: list[ContribAuthor] = []
        for c in item.children:
            c_text = escape(c.text)
            if isinstance(c, str):
                value_xml += c_text
                continue

            if not isinstance(c, Tag):
                raise NotImplementedError("bibitem_tag is not a Tag or a string")

            if c.name == "a":
                a_xml, is_badge = self.parse_a_tag(c)
                if is_badge:
                    value_xml = regex.sub(r" \| $", "", value_xml)
                value_xml += a_xml
                continue

            child_class = c.get("class")
            if not child_class:
                value_xml += c_text
            elif "bautore" in child_class:
                # TODO : parse firstname and lastname
                author_data, author_xml = self.parse_biblio_author_tag(c, len(authors_list))
                authors_list.append(author_data)
                value_xml += author_xml
            elif "titolo" in child_class:
                value_xml += get_title_xml(c_text)
            elif "rivista" in child_class:
                value_xml += get_source_xml(c_text)
            elif "anno" in child_class:
                value_xml += get_year_xml(c_text)
            elif "volume" in child_class:
                value_xml += get_volume_xml(c_text)
            elif "publisher" in child_class:
                value_xml += get_publisher_xml(c_text)
            else:
                # booktitle
                value_xml += c_text

        # In order to produce valid JATS XML, we have to group all authors inside
        # a single person-group tag.
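        # At this point value_xml may contain placeholders such as
        # "{author_0}, {author_1}" (hypothetical); the span between the first and
        # last placeholder is re-serialized as one person-group below.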
        authors_occurence = regex.compile(r"{author_\d}").findall(value_xml)
        if len(authors_occurence) > 0:
            first_author = value_xml.index(authors_occurence[0])
            last_author = value_xml.index(authors_occurence[-1]) + len(authors_occurence[-1])
            value_xml = (
                value_xml[:first_author]
                + get_all_authors_xml(value_xml[first_author:last_author], authors_list)
                + value_xml[last_author:]
            )

        return self.create_crawled_bibitem(value_xml)
        # return self.create_crawled_bibitem([*bib_elements, *bib_link_elements])

    def parse_a_tag(self, a_tag: Tag):
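        # Returns (xml_fragment, is_badge): MR/Zbl badge links (text starting
        # with "MR " or "Zbl ") become typed ext-links, anything else a plain
        # uri link.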
        a_text = escape(a_tag.text)
        href = a_tag.get("href")
        if not href:
            return a_text, False
        elif isinstance(href, list):
            raise ValueError("a tag has multiple href values !")
        else:
            a_type = "uri"
            if a_text.startswith("MR "):
                a_type = "mr-item-id"
                a_text = a_text.removeprefix("MR ")
            elif a_text.startswith("Zbl "):
                a_type = "zbl-item-id"
                a_text = a_text.removeprefix("Zbl ")
            return get_ext_link_xml(escape(href), a_text, a_type), a_type != "uri"

    def parse_biblio_author_tag(self, author_tag: Tag, index: int = 0):
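        # Builds a "{author_<index>}" placeholder for value_xml plus a
        # ContribAuthor whose template_str might look like
        # "{given_names} {surname}" (hypothetical shape, depending on markup order).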
        value_xml = ""
        author_data: ContribAuthor = {"template_str": ""}
        for c in author_tag.children:
            c_text = escape(c.text)
            if isinstance(c, str):
                author_data["template_str"] += c_text
                continue

            if not isinstance(c, Tag):
                raise NotImplementedError("author_tag is not a Tag or a string")
            # In these entries, "cognome" is mapped to the given name and
            # "nome" to the surname.
            child_class = c.get("class")
            if not child_class:
                value_xml += c_text
            elif "cognome" in child_class:
                c.replace_with("{given_names}")
                author_data["given_names"] = c_text
                author_data["template_str"] += "{given_names}"
            elif "nome" in child_class:
                c.replace_with("{surname}")
                author_data["surname"] = c_text
                author_data["template_str"] += "{surname}"
        value_xml += "{author_" + str(index) + "}"

        return author_data, value_xml