Coverage for src/crawler/by_source/bdim_crawler.py: 8%

241 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-24 10:35 +0000

1import re 

2 

3import lingua 

4import regex 

5from bs4 import BeautifulSoup, Tag 

6from lingua import LanguageDetectorBuilder 

7from ptf.cmds.xml.jats.builder.citation import ( 

8 ContribAuthor, 

9 get_all_authors_xml, 

10 get_ext_link_xml, 

11 get_publisher_xml, 

12 get_source_xml, 

13 get_volume_xml, 

14 get_year_xml, 

15) 

16from ptf.cmds.xml.jats.builder.issue import get_title_xml 

17from ptf.cmds.xml.xml_utils import escape 

18from ptf.model_data import ( 

19 ArticleData, 

20 create_abstract, 

21 create_articledata, 

22 create_contributor, 

23 create_issuedata, 

24) 

25 

26from crawler.base_crawler import BaseCollectionCrawler 

27from crawler.utils import add_pdf_link_to_xarticle 

28 

29 

class BdimCrawler(BaseCollectionCrawler):
    """Crawler for the Biblioteca Digitale Italiana di Matematica (bdim.eu).

    The collection page lists the issues, each issue page lists its articles,
    and every article page is fetched twice: once with the English cookie
    (main metadata) and once with the Italian cookie (translated title and
    abstract).

    ``collection_id``, ``download_file``, ``detect_language``,
    ``create_bibliography`` and ``create_crawled_bibitem`` are not defined in
    this class; they are presumably inherited from ``BaseCollectionCrawler``.
    """

    source_name = "Biblioteca Digitale Italiana di Matematica"
    source_domain = "BDIM"
    source_website = "http://www.bdim.eu"
    # The cookie selects the English interface with TeX-formatted mathematics.
    # NOTE(review): "accept_encoding" is not a standard HTTP header name
    # ("Accept-Encoding" is) -- confirm whether this key has any effect.
    headers = {"accept_encoding": "utf-8", "cookie": "lingua=en; matematica=tex"}

    # NOTE(review): presumably the headers that take part in request/cache
    # matching -- verify against BaseCollectionCrawler.
    match_headers = ["cookie"]

    # Hand-made title fixes, keyed by "<issue pid>_<article pid>".
    title_corrections = {
        "RLINA_1965_8_39_5_a17": "Eventi fasici nel midollo spinale quali prove di inibizione presinaptica durante il sonno desincronizzato",
        "RLINA_1973_8_55_6_a0": "Complementarity between nilpotent selfmappings and periodic autohomeomorphisms.",
        "RLINA_1973_8_55_6_a2": "Sur une extension du lemme de Green.",
        "RLINA_1979_8_67_1-2_a6": "On the existence of an unbounded connected set of solutions for nonlinear equations in Banach spaces.",
        "RLINA_1972_8_52_2_a5": "Sul carattere proiettivo del rapporto plurisezionale.",
        "RLINA_1980_8_69_1-2_a6": "A note on a variational formulation of the Einstein equations for thermo-elastic materials.",
    }

    # Pattern an issue link href must contain to be treated as an issue.
    issue_href = r"\?id=(?P<col>\w+)(?P<issue>_\d{1,4})"

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.FRENCH, lingua.Language.ITALIAN
    ).build()

    # Patterns compiled once instead of on every parsed page / reference.
    _author_link_re = regex.compile(r"\?testo=\w+")
    _pages_re = re.compile(r"(\(?\d+\)?)?-?(\(?\d+\)?)")
    _zbl_id_re = re.compile(r"Zbl \w+")
    _mr_id_re = re.compile(r"MR \d+")
    # ``\d+`` (not ``\d``) so that references with ten or more authors work.
    _author_placeholder_re = regex.compile(r"\{author_\d+\}")
    _paren_table = str.maketrans("", "", "()")
    _doi_url_prefixes = (
        "http://dx.doi.org/",
        "https://dx.doi.org/",
        "http://doi.org/",
        "https://doi.org/",
    )

    def parse_collection_content(self, content):
        """Parse the BDIM collection page and return the list of its issues.

        Every ``div.listafascicoli a`` link whose href matches ``issue_href``
        becomes one issue. The href is split on ``_``: the second and third
        parts are read as year and series, the last two as volume and number.

        :param content: HTML of the collection page.
        :return: list of issue objects built by ``create_bdim_xissue``.
        """
        soup = BeautifulSoup(content, "html.parser")
        reg_issue = regex.compile(self.issue_href)

        xissues = []
        for issue_node in soup.select("div.listafascicoli a"):
            href = issue_node.get("href")
            if not isinstance(href, str) or not reg_issue.search(href):
                continue

            part_issue = href.split("_")
            if len(part_issue) < 3:
                # Year and series cannot be read from such an href: skip the
                # link instead of failing the whole collection on IndexError.
                continue

            year = part_issue[1]
            serie = part_issue[2]
            volume = part_issue[-2]
            number = part_issue[-1]
            xissue = self.create_bdim_xissue("/item" + href, serie, volume, number, year)
            if xissue:
                xissues.append(xissue)

        return xissues

    def get_year(self, year):
        """Return the part of ``year`` before the first ``/`` (``year`` itself if none)."""
        return year.split("/")[0]

    def create_bdim_xissue(self, url, serie, volume, number, dates):
        """Build the issue object for one BDIM issue.

        :param url: path of the issue page, appended to ``source_website``.
        :param serie: series identifier, stored in ``vseries``.
        :param volume: volume identifier.
        :param number: issue number.
        :param dates: year or ``YYYY/YYYY`` range; ``/`` is turned into ``-``.
        :return: the populated issue object.
        """
        year = dates.replace("/", "-")

        xissue = create_issuedata()
        xissue.pid = f"{self.collection_id}_{year}_{serie}_{volume}_{number}"
        xissue.year = year
        xissue.volume = volume
        xissue.number = number
        xissue.vseries = serie
        xissue.url = self.source_website + url

        return xissue

    def parse_issue_content(self, content, xissue):
        """Append one article stub (pid + url) to ``xissue`` per linked entry.

        The pid is ``"a<index>"`` where the index is the position of the
        ``div.referenza`` node in the page, whether or not earlier nodes had a
        "full entry" link. ``title_corrections`` keys rely on this numbering.

        :param content: HTML of the issue page.
        :param xissue: issue object whose ``articles`` list is extended.
        """
        soup = BeautifulSoup(content, "html.parser")

        for index_article, article_node in enumerate(soup.find_all("div", {"class": "referenza"})):
            # ``string=`` is the current name of bs4's deprecated ``text=``.
            article_link_node = article_node.find("a", string="full entry")
            if not article_link_node:
                continue

            url = article_link_node.get("href")
            if not isinstance(url, str):
                # A link without a usable href cannot be crawled.
                continue

            xarticle = create_articledata()
            xarticle.pid = f"a{index_article}"
            xarticle.url = self.source_website + url
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Fill ``xarticle`` from the English article page, then the Italian one.

        :param content: HTML of the article page (English interface).
        :param xissue: issue the article belongs to (used for title corrections).
        :param xarticle: article object to populate.
        :param url: article URL, downloaded again with the Italian cookie.
        :return: the populated article object.
        :raises ValueError: if the title or the ``div.referenza`` block is missing.
        """
        soup = BeautifulSoup(content, "html.parser")

        # Title
        title_node = soup.select_one("span.titolo")
        if not title_node:
            raise ValueError("Couldn't find article title")
        # An empty title is replaced by a single space.
        xarticle.title_tex = title_node.get_text() or " "

        # Language, as announced next to the title
        title_context = title_node.parent.text
        if "(Italian)" in title_context:
            xarticle.lang = "it"
        elif "(English)" in title_context:
            xarticle.lang = "en"

        self._parse_authors(soup, xarticle)

        # Abstract
        abstract_section_node = soup.select_one("div.sunto")
        if abstract_section_node:
            abstract = str(abstract_section_node.get_text())
            xarticle.abstracts.append(
                create_abstract(
                    tag="abstract", value_tex=abstract, lang=self.detect_language(abstract)
                )
            )

        # PDF
        pdf_node = soup.find("a", string="pdf")
        if pdf_node is not None:
            pdf_href = pdf_node.get("href")
            if isinstance(pdf_href, str):
                add_pdf_link_to_xarticle(xarticle, self.source_website + pdf_href)

        self._parse_pages(soup, xarticle)

        # Bibliography (stored alongside the abstracts)
        bibitems = [self.parse_ref(item) for item in soup.select("div.biblio div.bibitem")]
        if bibitems:
            xarticle.abstracts.append(self.create_bibliography(bibitems))

        self._parse_external_ids(soup, xarticle)

        corrected_title = self.title_corrections.get(f"{xissue.pid}_{xarticle.pid}")
        if corrected_title is not None:
            xarticle.title_tex = corrected_title

        # Second pass on the Italian version of the same page.
        content = self.download_file(url, headers={"cookie": "lingua=it; matematica=tex"})
        return self.parse_article_content_bdim_it(content, xissue, xarticle, url)

    def _parse_authors(self, soup, xarticle):
        """Append the authors linked from the first ``div.referenza p`` block."""
        author_bloc = soup.select_one("div.referenza p")
        if not author_bloc:
            return

        for link in author_bloc.select("a"):
            href = link.get("href")
            if not isinstance(href, str) or not self._author_link_re.search(href):
                continue

            contrib_node = link.select_one("span.autore")
            if contrib_node is None:
                continue

            # Italian: "cognome" is the family name, "nome" the given name.
            surname_node = link.select_one("span.cognome")
            firstname_node = link.select_one("span.nome")
            author = create_contributor(role="author")

            if surname_node is not None:
                author["last_name"] = surname_node.get_text()
            if firstname_node is not None:
                author["first_name"] = firstname_node.get_text()
            if firstname_node is None or surname_node is None:
                # Keep the raw name when it could not be fully split.
                author["string_name"] = contrib_node.get_text()

            xarticle.contributors.append(author)

    def _parse_pages(self, soup, xarticle):
        """Set ``fpage`` (and ``lpage`` for a range) from ``span.pagine``."""
        pages_node = soup.select_one("span.pagine")
        if not pages_node:
            return
        pages_match = self._pages_re.search(pages_node.get_text())
        if not pages_match:
            return

        # The pattern allows at most one "-" between the two page numbers.
        first_page, dash, last_page = pages_match[0].partition("-")
        if dash:
            xarticle.lpage = last_page.translate(self._paren_table)
        xarticle.fpage = first_page.translate(self._paren_table)

    def _parse_external_ids(self, soup, xarticle):
        """Append the Zbl and MR identifiers linked from ``div.referenza``.

        :raises ValueError: if the page has no ``div.referenza`` block.
        """
        metadata_bloc = soup.select_one("div.referenza")
        if not metadata_bloc:
            raise ValueError("metadata_bloc cannot be found")

        links = metadata_bloc.find_all("a")
        zbl_link = next((a for a in links if self._zbl_id_re.search(a.get_text())), None)
        mr_link = next((a for a in links if self._mr_id_re.search(a.get_text())), None)

        if zbl_link is not None:
            zblid = zbl_link.get("href")
            if isinstance(zblid, str):
                # Keep only what follows "?q=an:" when the href has that query.
                pos = zblid.find("?q=an:")
                if pos > 0:
                    zblid = zblid[pos + 6 :]
                xarticle.extids.append(("zbl-item-id", zblid))

        if mr_link is not None:
            # The link text matched "MR <digits>": keep what follows "MR ".
            xarticle.extids.append(("mr-item-id", mr_link.get_text().split("MR ")[1]))

    def parse_article_content_bdim_it(self, content, xissue, xarticle: ArticleData, url):
        """Add the translated title and abstract found on the Italian page.

        :param content: HTML of the article page (Italian interface).
        :param xissue: unused here.
        :param xarticle: article already populated from the English page.
        :param url: unused here.
        :return: ``xarticle``.
        """
        soup = BeautifulSoup(content, "html.parser")

        # Translated title
        trans_title_node = soup.select_one("span.titolo_trad")
        if trans_title_node:
            xarticle.trans_title_tex = trans_title_node.get_text()

        # Translated abstract: only added when it differs from the first
        # stored entry. The emptiness check avoids an IndexError when the
        # English page produced neither an abstract nor a bibliography.
        abstract_section_node = soup.select_one("div.sunto")
        if abstract_section_node:
            abstract = str(abstract_section_node.get_text())
            if not xarticle.abstracts or xarticle.abstracts[0]["value_tex"] != abstract:
                xarticle.abstracts.append(
                    create_abstract(
                        tag="abstract", value_tex=abstract, lang=self.detect_language(abstract)
                    )
                )

        # NOTE(review): ``abstracts`` also receives the bibliography entry, so
        # this length test can be true without any translated abstract -- confirm.
        if xarticle.trans_title_tex or len(xarticle.abstracts) > 1:
            xarticle.trans_lang = "en" if xarticle.lang == "it" else "it"
        return xarticle

    def parse_ref(self, item: Tag):
        """Convert one ``div.bibitem`` node into a crawled bibliography item.

        First pass: build an almost complete XML string in which each author
        is a ``{author_N}`` placeholder, keeping the author data aside.
        Second pass: replace the span running from the first to the last
        placeholder with the grouped authors XML.

        :param item: the bibliography entry tag.
        :return: the result of ``create_crawled_bibitem`` on the XML string.
        :raises NotImplementedError: if a child is neither a string nor a Tag.
        """
        value_xml = ""
        authors_list: list[ContribAuthor] = []
        for c in item.children:
            c_text = escape(c.text)
            if isinstance(c, str):
                value_xml += c_text
                continue

            if not isinstance(c, Tag):
                raise NotImplementedError("bibitem_tag is not a Tag or a string")

            if c.name == "a":
                a_xml, is_badge = self.parse_a_tag(c)
                if is_badge:
                    # Drop the " | " separator that precedes an id badge.
                    value_xml = regex.sub(r" \| $", "", value_xml)
                value_xml += a_xml
                continue

            child_class = c.get("class")
            if not child_class:
                value_xml += c_text
            elif "bautore" in child_class:
                author_data, author_xml = self.parse_biblio_author_tag(c, len(authors_list))
                authors_list.append(author_data)
                value_xml += author_xml
            elif "titolo" in child_class:
                value_xml += get_title_xml(c_text)
            elif "rivista" in child_class:
                value_xml += get_source_xml(c_text)
            elif "anno" in child_class:
                value_xml += get_year_xml(c_text)
            elif "volume" in child_class:
                value_xml += get_volume_xml(c_text)
            elif "publisher" in child_class:
                value_xml += get_publisher_xml(c_text)
            else:
                # Any other class (e.g. booktitle) is kept as plain text.
                value_xml += c_text

        # All authors are grouped in one element, so everything from the first
        # placeholder to the last one is handed over to get_all_authors_xml.
        placeholders = list(self._author_placeholder_re.finditer(value_xml))
        if placeholders:
            first_author = placeholders[0].start()
            last_author = placeholders[-1].end()
            value_xml = (
                value_xml[:first_author]
                + get_all_authors_xml(value_xml[first_author:last_author], authors_list)
                + value_xml[last_author:]
            )

        return self.create_crawled_bibitem(value_xml)

    def parse_a_tag(self, a_tag: Tag) -> tuple[str, bool]:
        """Convert a link of a bibliography entry to XML.

        :param a_tag: the ``<a>`` tag.
        :return: ``(xml, is_badge)``. Without href, ``xml`` is the escaped link
            text and ``is_badge`` is False. Otherwise ``xml`` is an ext-link and
            ``is_badge`` is True for MR, Zbl and DOI links.
        :raises ValueError: if the href attribute holds several values.
        """
        a_text = escape(a_tag.text)
        href = a_tag.get("href")
        if not href:
            return a_text, False
        if isinstance(href, list):
            raise ValueError("a tag has multiple href values !")

        a_type = "uri"
        if a_text.startswith("MR "):
            a_type = "mr-item-id"
            a_text = a_text.removeprefix("MR ")
        elif a_text.startswith("Zbl "):
            a_type = "zbl-item-id"
            a_text = a_text.removeprefix("Zbl ")
        elif a_text == "fulltext (doi)":
            a_type = "doi"
            # The DOI is carried by the href, not by the link text: strip the
            # resolver prefix from the href to obtain the identifier.
            doi = href
            for prefix in self._doi_url_prefixes:
                if doi.startswith(prefix):
                    doi = doi.removeprefix(prefix)
                    break
            a_text = escape(doi)
        return get_ext_link_xml(escape(href), a_text, a_type), a_type != "uri"

    def parse_biblio_author_tag(
        self, author_tag: Tag, index: int = 0
    ) -> tuple[ContribAuthor, str]:
        """Extract one author of a bibliography entry.

        :param author_tag: the ``bautore`` tag.
        :param index: rank of the author in the entry, used in the placeholder.
        :return: ``(author_data, value_xml)`` where ``author_data`` holds the
            name parts plus a ``template_str`` made of the surrounding text and
            ``{surname}`` / ``{given_names}`` markers, and ``value_xml`` ends
            with the ``{author_<index>}`` placeholder.
        :raises NotImplementedError: if a child is neither a string nor a Tag.
        """
        value_xml = ""
        author_data: ContribAuthor = {"template_str": ""}
        # Iterate over a snapshot: nodes are replaced in the tree below.
        for c in list(author_tag.children):
            c_text = escape(c.text)
            if isinstance(c, str):
                author_data["template_str"] += c_text
                continue

            if not isinstance(c, Tag):
                raise NotImplementedError("author_tag is not a Tag or a string")

            # Italian: "cognome" is the family name (surname) and "nome" the
            # given name, as in the article-level author parsing of this class.
            child_class = c.get("class")
            if not child_class:
                value_xml += c_text
            elif "cognome" in child_class:
                c.replace_with("{surname}")
                author_data["surname"] = c_text
                author_data["template_str"] += "{surname}"
            elif "nome" in child_class:
                c.replace_with("{given_names}")
                author_data["given_names"] = c_text
                author_data["template_str"] += "{given_names}"
        value_xml += "{author_" + str(index) + "}"

        return author_data, value_xml