Coverage for src/crawler/by_source/bdim_crawler.py: 88%

216 statements

import re

import lingua
import regex
from bs4 import BeautifulSoup, Tag
from lingua import LanguageDetectorBuilder
from ptf.cmds.xml.jats.builder.citation import (
    ContribAuthor,
    get_all_authors_xml,
    get_ext_link_xml,
    get_publisher_xml,
    get_source_xml,
    get_volume_xml,
    get_year_xml,
)
from ptf.cmds.xml.jats.builder.issue import get_title_xml
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class BdimCrawler(BaseCollectionCrawler):
    source_name = "Biblioteca Digitale Italiana di Matematica"
    source_domain = "BDIM"
    source_website = "http://www.bdim.eu"

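    # Hand-maintained title fixes, keyed by "{issue_pid}_{article_pid}"
    # (applied at the end of parse_article_content).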

    title_corrections = {
        "RLINA_1965_8_39_5_a17": "Eventi fasici nel midollo spinale quali prove di inibizione presinaptica durante il sonno desincronizzato",
        "RLINA_1973_8_55_6_a0": "Complementarity between nilpotent selfmappings and periodic autohomeomorphisms.",
        "RLINA_1973_8_55_6_a2": "Sur une extension du lemme de Green.",
        "RLINA_1979_8_67_1-2_a6": "On the existence of an unbounded connected set of solutions for nonlinear equations in Banach spaces.",
        "RLINA_1972_8_52_2_a5": "Sul carattere proiettivo del rapporto plurisezionale.",
        "RLINA_1980_8_69_1-2_a6": "A note on a variational formulation of the Einstein equations for thermo-elastic materials.",
    }

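    # Issue hrefs look like "?id=RLINA_1965_8_39_5" (example inferred from the
    # title_corrections keys above and the href.split("_") logic below).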

    issue_href = r"\?id=(?P<col>\w+)(?P<issue>_\d{1,4})"

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.FRENCH, lingua.Language.ITALIAN
    ).build()

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def parse_collection_content(self, content):
        """
        Parse the HTML page of the BDIM collection and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        reg_issue = regex.compile(self.issue_href)

        issue_nodes = []
        for issue in soup.select("div.listafascicoli a"):
            href = issue.get("href")
            if isinstance(href, str) and reg_issue.search(href):
                issue_nodes.append(issue)

        for issue_node in issue_nodes:
            part_issue = issue_node.get("href").split("_")
            volume = part_issue[-2]
            number = part_issue[-1]
            year = part_issue[1]
            serie = part_issue[2]
            link = "/item" + issue_node.get("href")
            xissue = self.create_bdim_xissue(link, serie, volume, number, year)
            if xissue:
                xissues.append(xissue)

        return xissues

    def get_year(self, year):
        if "/" in year:
            year = year.split("/")[0]

        return year

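    # Builds the issue pid as "{collection_id}_{year}_{serie}_{volume}_{number}",
    # e.g. "RLINA_1965_8_39_5" (assuming collection_id "RLINA", matching the
    # title_corrections keys above).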

    def create_bdim_xissue(self, url, serie, volume, number, dates):
        year = dates.replace("/", "-")

        xissue = create_issuedata()
        xissue.pid = f"{self.collection_id}_{year}_{serie}_{volume}_{number}"
        xissue.year = year
        xissue.volume = volume
        xissue.number = number
        xissue.vseries = serie
        xissue.url = self.source_website + url

        return xissue

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("div", {"class": "referenza"})

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a", string="referenza completa")
            if article_link_node:
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = self.source_website + url

                xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        # TODO: is this correct?
        xarticle.lang = "it"
        soup = BeautifulSoup(content, "html.parser")
        # TITLE
        title_node = soup.select_one("span.titolo")
        if title_node:
            xarticle.title_tex = title_node.get_text()
            if xarticle.title_tex == "":
                xarticle.title_tex = " "

        # Authors
        reg_author_link = regex.compile(r"\?testo=\w+")
        text_author_bloc = soup.select_one("div.referenza p")
        if text_author_bloc:
            for link in text_author_bloc.select("a"):
                href = link.get("href")
                if isinstance(href, str) and reg_author_link.search(href):
                    contrib_node = link.select_one("span.autore")
                    if contrib_node is not None:
                        surname_node = link.select_one("span.cognome")
                        firstname_node = link.select_one("span.nome")
                        author = create_contributor(role="author")

                        if surname_node is not None:
                            surname = surname_node.get_text()
                            author["last_name"] = surname

                        if firstname_node is not None:
                            firstname = firstname_node.get_text()
                            author["first_name"] = firstname

                        if not firstname_node or not surname_node:
                            string_name = contrib_node.get_text()
                            author["string_name"] = string_name

                        xarticle.contributors.append(author)

        # ABSTRACT
        abstract_section_node = soup.select_one("div.sunto")
        if abstract_section_node:
            abstract = str(abstract_section_node.get_text())
            xabstract = create_abstract(
                tag="abstract", value_tex=abstract, lang=self.detect_language(abstract)
            )
            xarticle.abstracts.append(xabstract)

        # PDF
        pdf_url = soup.find_all("a", string="pdf")
        if len(pdf_url) > 0:
            pdf_url = self.source_website + pdf_url[0].get("href")
            add_pdf_link_to_xarticle(xarticle, pdf_url)

        # PAGES
        pages = soup.select_one("span.pagine")
        if pages:
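            # Page ranges appear as e.g. "123-145" or "(123)-(145)" (formats
            # assumed from the regex below); parentheses are stripped after matching.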

            pages_to = re.compile(r"(\(?\d+\)?)?-?(\(?\d+\)?)").search(pages.get_text())
            if pages_to:
                parts = pages_to[0].split("-")
                first_page = parts[0].replace("(", "").replace(")", "")
                if len(parts) > 1:
                    last_page = parts[1].replace("(", "").replace(")", "")
                    xarticle.lpage = last_page
                xarticle.fpage = first_page

        # Biblio
        bibitems_tags = soup.select("div.biblio div.bibitem")
        bibitems = [self.parse_ref(item) for item in bibitems_tags]
        if len(bibitems) > 0:
            xarticle.abstracts.append(self.create_bibliography(bibitems))

        # metadata
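        # MR and Zbl identifiers appear as link texts such as "MR 123456" and
        # "Zbl 0123.45678" (hypothetical examples matching the regexes below).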

        reg_zbl_id = re.compile(r"Zbl \w+")
        reg_mr_id = re.compile(r"MR \d+")

        metadata_bloc = soup.select_one("div.referenza")
        if not metadata_bloc:
            raise ValueError("metadata_bloc cannot be found")
        mr_id = [link for link in metadata_bloc.find_all("a") if reg_mr_id.search(link.get_text())]
        zbl_id = [link for link in metadata_bloc.find_all("a") if reg_zbl_id.search(link.get_text())]

        if len(zbl_id) > 0:
            zblid = zbl_id[0].get("href")
            pos = zblid.find("?q=an:")
            if pos > 0:
                zblid = zblid[pos + 6 :]
            xarticle.extids.append(("zbl-item-id", zblid))
        if len(mr_id) > 0:
            mr_id = mr_id[0].get_text()
            mr_id = mr_id.split("MR ")
            mr_id = mr_id[1]
            xarticle.extids.append(("mr-item-id", mr_id))

        if f"{xissue.pid}_{xarticle.pid}" in self.title_corrections:
            xarticle.title_tex = self.title_corrections[f"{xissue.pid}_{xarticle.pid}"]

        return xarticle


    def parse_ref(self, item: Tag):
        value_xml = ""
        # First pass: build a semi-complete JATS XML string, except for the authors,
        # which are stored inside authors_list to be serialized at the end
        authors_list: list[ContribAuthor] = []
        for c in item.children:
            c_text = escape(c.text)
            if isinstance(c, str):
                value_xml += c_text
                continue

            if not isinstance(c, Tag):
                raise NotImplementedError("bibitem_tag is not a Tag or a string")

            if c.name == "a":
                a_xml, is_badge = self.parse_a_tag(c)
                if is_badge:
                    value_xml = regex.sub(r" \| $", "", value_xml)
                value_xml += a_xml
                continue

            child_class = c.get("class")
            if not child_class:
                value_xml += c_text
            elif "bautore" in child_class:
                # TODO: parse firstname and lastname
                author_data, author_xml = self.parse_biblio_author_tag(c, len(authors_list))
                authors_list.append(author_data)
                value_xml += author_xml

            elif "titolo" in child_class:
                value_xml += get_title_xml(c_text)
            elif "rivista" in child_class:
                value_xml += get_source_xml(c_text)
            elif "anno" in child_class:
                value_xml += get_year_xml(c_text)
            elif "volume" in child_class:
                value_xml += get_volume_xml(c_text)
            elif "publisher" in child_class:
                value_xml += get_publisher_xml(c_text)
            else:
                # booktitle
                value_xml += c_text

        # In order to have a valid JATS XML, we have to group all authors into the person-group tag.
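        # e.g. the span "{author_0}, {author_1}" inside value_xml is replaced by a
        # single <person-group> block (illustrative placeholder example; the exact
        # serialization is handled by get_all_authors_xml).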

        authors_occurence = regex.compile(r"{author_\d}").findall(value_xml)
        if len(authors_occurence) > 0:
            first_author = value_xml.index(authors_occurence[0])
            last_author = value_xml.index(authors_occurence[-1]) + len(authors_occurence[-1])
            value_xml = (
                value_xml[:first_author]
                + get_all_authors_xml(value_xml[first_author:last_author], authors_list)
                + value_xml[last_author:]
            )

        return self.create_crawled_bibitem(value_xml)

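    # Converts an <a> tag into an ext-link XML string. MR/Zbl links are flagged
    # as badges so that parse_ref can strip the " | " separator preceding them.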

    def parse_a_tag(self, a_tag: Tag):
        a_text = escape(a_tag.text)
        href = a_tag.get("href")
        if not href:
            return a_text, False
        elif isinstance(href, list):
            raise ValueError("a tag has multiple href values!")
        else:
            a_type = "uri"
            if a_text.startswith("MR "):
                a_type = "mr-item-id"
                a_text = a_text.removeprefix("MR ")
            elif a_text.startswith("Zbl "):
                a_type = "zbl-item-id"
                a_text = a_text.removeprefix("Zbl ")
            return get_ext_link_xml(escape(href), a_text, a_type), a_type != "uri"


    def parse_biblio_author_tag(self, author_tag: Tag, index: int = 0):
        value_xml = ""
        author_data: ContribAuthor = {"template_str": ""}
        for c in author_tag.children:
            c_text = escape(c.text)
            if isinstance(c, str):
                author_data["template_str"] += c_text
                continue

            if not isinstance(c, Tag):
                raise NotImplementedError("author_tag is not a Tag or a string")
            # In Italian, "nome" is the given name and "cognome" the surname,
            # matching the span.nome/span.cognome handling in parse_article_content
            child_class = c.get("class")
            if not child_class:
                value_xml += c_text
            elif "cognome" in child_class:
                c.replace_with("{surname}")
                author_data["surname"] = c_text
                author_data["template_str"] += "{surname}"
            elif "nome" in child_class:
                c.replace_with("{given_names}")
                author_data["given_names"] = c_text
                author_data["template_str"] += "{given_names}"
        value_xml += "{author_" + str(index) + "}"

        return author_data, value_xml