# Coverage for src/crawler/by_source/bdim_crawler.py: 89% (216 statements),
# coverage.py v7.6.4, created at 2025-01-15 14:09 +0000.
# Partially covered branches are noted inline with "# coverage:" comments.

import re

import regex
from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.jats.builder.citation import (
    ContribAuthor,
    get_all_authors_xml,
    get_ext_link_xml,
    get_publisher_xml,
    get_source_xml,
    get_volume_xml,
    get_year_xml,
)
from ptf.cmds.xml.jats.builder.issue import get_title_xml
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class BdimCrawler(BaseCollectionCrawler):
    source_name = "Biblioteca Digitale Italiana di Matematica"
    source_domain = "BDIM"
    source_website = "http://www.bdim.eu"

    # Hand-maintained overrides for titles that are garbled on the source pages.
    title_corrections = {
        "RLINA_1965_8_39_5_a17": "Eventi fasici nel midollo spinale quali prove di inibizione presinaptica durante il sonno desincronizzato",
        "RLINA_1973_8_55_6_a0": "Complementarity between nilpotent selfmappings and periodic autohomeomorphisms.",
        "RLINA_1973_8_55_6_a2": "Sur une extension du lemme de Green.",
        "RLINA_1979_8_67_1-2_a6": "On the existence of an unbounded connected set of solutions for nonlinear equations in Banach spaces.",
        "RLINA_1972_8_52_2_a5": "Sul carattere proiettivo del rapporto plurisezionale.",
        "RLINA_1980_8_69_1-2_a6": "A note on a variational formulation of the Einstein equations for thermo-elastic materials.",
    }

    issue_href = r"\?id=(?P<col>\w+)(?P<issue>_\d{1,4})"
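    # A sketch of what issue_href matches, assuming issue links of the form
    # "?id=RLINA_1965_8_39_5" (<collection>_<year>_<serie>_<volume>_<number>):
    #
    #   m = regex.search(issue_href, "?id=RLINA_1965_8_39_5")
    #   m["col"]    # "RLINA_1965_8_39" (the greedy \w+ swallows inner underscores)
    #   m["issue"]  # "_5"
    #
    # The pattern is only used as a filter; the metadata itself is recovered by
    # splitting the href on "_" in parse_collection_content below.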

    def parse_collection_content(self, content):
        """
        Parse the HTML page listing the issues of a BDIM collection and return
        a list of xissues. Each xissue has its pid/volume/number/year metadata
        + its url.

        self.periode is set at the end based on the xissue years of the HTML page.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        reg_issue = regex.compile(self.issue_href)

        issue_nodes = []
        for issue in soup.select("div.listafascicoli a"):
            href = issue.get("href")
            if isinstance(href, str) and reg_issue.search(href):  # coverage: condition always true
                issue_nodes.append(issue)

        for issue_node in issue_nodes:
            part_issue = issue_node.get("href").split("_")
            volume = part_issue[-2]
            number = part_issue[-1]
            year = part_issue[1]
            serie = part_issue[2]
            link = "/item" + issue_node.get("href")
            xissue = self.create_bdim_xissue(link, serie, volume, number, year)
            if xissue:  # coverage: condition always true
                xissues.append(xissue)

        self.periode_begin = self.get_year(xissues[0].year)
        self.periode_end = self.get_year(xissues[-1].year)

        self.periode = self.get_or_create_periode()

        return xissues
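    # For instance (a sketch): an href "?id=RLINA_1965_8_39_5" splits on "_" into
    # ["?id=RLINA", "1965", "8", "39", "5"], giving year "1965", serie "8",
    # volume "39" and number "5".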

    def get_year(self, year):
        # Issues spanning two years are dated "1999/2000"; keep the first year.
        if "/" in year:  # coverage: condition never true
            year = year.split("/")[0]

        return year
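    # For example (a sketch): get_year("1999/2000") -> "1999"; a plain "1965" is
    # returned unchanged.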

    def create_bdim_xissue(self, url, serie, volume, number, dates):
        year = dates.replace("/", "-")

        xissue = create_issuedata()
        xissue.pid = f"{self.collection_id}_{year}_{serie}_{volume}_{number}"
        xissue.year = year
        xissue.volume = volume
        xissue.number = number
        xissue.vseries = serie
        xissue.url = self.source_website + url

        return xissue
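    # A sketch of the resulting pid, assuming collection_id "RLINA" (as in
    # title_corrections above): serie "8", volume "39", number "5" and dates
    # "1965" yield pid "RLINA_1965_8_39_5", the prefix of article pids such as
    # "RLINA_1965_8_39_5_a17".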

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("div", {"class": "referenza"})

        for index_article, article_node in enumerate(article_nodes):
            # Each article block links to its full record via an anchor labelled
            # "referenza completa" ("complete reference").
            article_link_node = article_node.find("a", text="referenza completa")
            if article_link_node:  # coverage: condition always true
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = self.source_website + url

                xissue.articles.append(xarticle)
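    # A sketch of the markup this method assumes (reconstructed from the selectors
    # above, not taken from a live BDIM page):
    #
    #   <div class="referenza">
    #     ... <a href="/item?id=RLINA_1965_8_39_5_a17">referenza completa</a>
    #   </div>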

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle = create_articledata()
        xarticle.pid = pid
        # TODO: is this correct?
        xarticle.lang = "it"
        soup = BeautifulSoup(content, "html.parser")
        # TITLE
        title_node = soup.select_one("span.titolo")
        if title_node:  # coverage: condition always true
            xarticle.title_tex = title_node.get_text()
            if xarticle.title_tex == "":
                # Keep a non-empty placeholder when the title is missing.
                xarticle.title_tex = " "

        # Authors
        reg_author_link = regex.compile(r"\?testo=\w+")
        text_author_bloc = soup.select_one("div.referenza p")
        if text_author_bloc:  # coverage: condition always true
            for link in text_author_bloc.select("a"):
                href = link.get("href")
                if isinstance(href, str) and reg_author_link.search(href):
                    contrib_node = link.select_one("span.autore")
                    if contrib_node is not None:  # coverage: condition always true
                        surname_node = link.select_one("span.cognome")
                        firstname_node = link.select_one("span.nome")
                        author = create_contributor(role="author")

                        if surname_node is not None:  # coverage: condition always true
                            surname = surname_node.get_text()
                            author["last_name"] = surname

                        if firstname_node is not None:  # coverage: condition always true
                            firstname = firstname_node.get_text()
                            author["first_name"] = firstname

                        # Fall back to the full string when the name is not split
                        # into cognome/nome spans.
                        if not firstname_node or not surname_node:  # coverage: condition never true
                            string_name = contrib_node.get_text()
                            author["string_name"] = string_name

                        xarticle.contributors.append(author)
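        # A sketch of the author markup assumed above (reconstructed from the
        # selectors; "cognome" is Italian for family name, "nome" for given name):
        #
        #   <a href="?testo=rossi"><span class="autore">
        #     <span class="cognome">Rossi</span> <span class="nome">Mario</span>
        #   </span></a>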

        # ABSTRACT ("sunto" is Italian for summary)
        abstract_section_node = soup.select_one("div.sunto")
        if abstract_section_node:  # coverage: condition always true
            abstract = str(abstract_section_node.get_text())
            xabstract = create_abstract(
                tag="abstract", value_tex=abstract, lang=self.detect_language(abstract)
            )
            xarticle.abstracts.append(xabstract)

        # PDF
        pdf_url = soup.find_all("a", text="pdf")
        if len(pdf_url) > 0:  # coverage: condition always true
            pdf_url = self.source_website + pdf_url[0].get("href")
            add_pdf_link_to_xarticle(xarticle, pdf_url)

        # PAGES
        pages = soup.select_one("span.pagine")
        if pages:  # coverage: condition always true
            pages_to = re.compile(r"(\(?\d+\)?)?-?(\(?\d+\)?)").search(pages.get_text())
            if pages_to:  # coverage: condition always true
                parts = pages_to[0].split("-")
                first_page = parts[0].replace("(", "").replace(")", "")
                if len(parts) > 1:  # coverage: condition always true
                    last_page = parts[1].replace("(", "").replace(")", "")
                    xarticle.lpage = last_page
                xarticle.fpage = first_page
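        # A sketch of the page-range parsing above, assuming span.pagine holds
        # ranges such as "123-145" or "(123)-(145)":
        #
        #   re.search(r"(\(?\d+\)?)?-?(\(?\d+\)?)", "pp. (123)-(145)")[0]
        #   # -> "(123)-(145)", which is split on "-" and stripped of parentheses,
        #   # so fpage = "123" and lpage = "145".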

        # Biblio
        bibitems_tags = soup.select("div.biblio div.bibitem")
        bibitems = [self.parse_ref(item) for item in bibitems_tags]
        if len(bibitems) > 0:
            xarticle.abstracts.append(self.create_bibliography(bibitems))

        # Metadata (MR and Zbl identifiers)
        reg_zbl_id = re.compile(r"Zbl \w+")
        reg_mr_id = re.compile(r"MR \d+")

        metadata_bloc = soup.select_one("div.referenza")
        if not metadata_bloc:  # coverage: condition never true
            raise ValueError("metadata_bloc cannot be found")
        mr_id = [link for link in metadata_bloc.find_all("a") if reg_mr_id.search(link.get_text())]
        zbl_id = [link for link in metadata_bloc.find_all("a") if reg_zbl_id.search(link.get_text())]

        if len(zbl_id) > 0:
            zblid = zbl_id[0].get("href")
            pos = zblid.find("?q=an:")
            if pos > 0:  # coverage: condition always true
                # Keep only the identifier that follows "?q=an:" in the zbMATH link.
                zblid = zblid[pos + 6 :]
            xarticle.extids.append(("zbl-item-id", zblid))
        if len(mr_id) > 0:
            mr_id = mr_id[0].get_text()
            mr_id = mr_id.split("MR ")
            mr_id = mr_id[1]
            xarticle.extids.append(("mr-item-id", mr_id))

        if xarticle.pid in self.title_corrections:
            xarticle.title_tex = self.title_corrections[xarticle.pid]

        return xarticle
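    # A sketch of the identifier extraction above, assuming a zbMATH badge link of
    # the form "https://zbmath.org/?q=an:0123.45678" (hypothetical URL shape):
    #
    #   zblid = "https://zbmath.org/?q=an:0123.45678"
    #   zblid[zblid.find("?q=an:") + 6 :]  # -> "0123.45678"
    #
    # MR badges are labelled "MR 123456", so split("MR ")[1] keeps "123456".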

    def parse_ref(self, item: Tag):
        value_xml = ""
        # First pass: we create a semi-complete JATS XML string, except for the
        # authors, which we store inside authors_list to be serialized at the end.
        authors_list: list[ContribAuthor] = []
        for c in item.children:
            c_text = escape(c.text)
            if isinstance(c, str):
                value_xml += c_text
                continue

            if not isinstance(c, Tag):  # coverage: condition never true
                raise NotImplementedError("bibitem_tag is not a Tag or a string")

            if c.name == "a":
                a_xml, is_badge = self.parse_a_tag(c)
                if is_badge:
                    # Drop the trailing " | " separator before appending an MR/Zbl badge.
                    value_xml = regex.sub(r" \| $", "", value_xml)
                value_xml += a_xml
                continue

            child_class = c.get("class")
            if not child_class:  # coverage: condition never true
                value_xml += c_text
            elif "bautore" in child_class:
                # TODO: parse firstname and lastname
                author_data, author_xml = self.parse_biblio_author_tag(c, len(authors_list))
                authors_list.append(author_data)
                value_xml += author_xml

            elif "titolo" in child_class:
                value_xml += get_title_xml(c_text)
            elif "rivista" in child_class:
                value_xml += get_source_xml(c_text)
            elif "anno" in child_class:
                value_xml += get_year_xml(c_text)
            elif "volume" in child_class:
                value_xml += get_volume_xml(c_text)
            elif "publisher" in child_class:
                value_xml += get_publisher_xml(c_text)
            else:
                # booktitle
                value_xml += c_text

        # In order to have valid JATS XML, we have to group all authors into the
        # person-group XML tag.
        authors_occurrences = regex.compile(r"{author_\d+}").findall(value_xml)
        if len(authors_occurrences) > 0:
            first_author = value_xml.index(authors_occurrences[0])
            last_author = value_xml.index(authors_occurrences[-1]) + len(authors_occurrences[-1])
            value_xml = (
                value_xml[:first_author]
                + get_all_authors_xml(value_xml[first_author:last_author], authors_list)
                + value_xml[last_author:]
            )

        return self.create_crawled_bibitem(value_xml)
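    # A sketch of the grouping step above, for a hypothetical reference: after the
    # first pass value_xml may look like
    #
    #   "{author_0}, {author_1}, <article-title>...</article-title>, ..."
    #
    # The slice spanning the "{author_N}" placeholders is handed, together with
    # authors_list, to get_all_authors_xml, which fills in each placeholder and
    # wraps the run of authors in a single person-group element.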

    def parse_a_tag(self, a_tag: Tag):
        """Serialize an <a> tag into JATS ext-link XML.

        Returns an (xml, is_badge) pair, where is_badge is True for MR/Zbl links;
        anchors without an href are returned as escaped text.
        """
        a_text = escape(a_tag.text)
        href = a_tag.get("href")
        if not href:  # coverage: condition never true
            return a_text, False
        elif isinstance(href, list):  # coverage: condition never true
            raise ValueError("a tag has multiple href values!")
        else:
            a_type = "uri"
            if a_text.startswith("MR "):
                a_type = "mr-item-id"
                a_text = a_text.removeprefix("MR ")
            elif a_text.startswith("Zbl "):
                a_type = "zbl-item-id"
                a_text = a_text.removeprefix("Zbl ")
            return get_ext_link_xml(escape(href), a_text, a_type), a_type != "uri"
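    # For example (a sketch): an anchor whose text is "MR 123456" yields
    # a_type "mr-item-id", a_text "123456" and is_badge True; any other link
    # keeps a_type "uri" and is_badge False.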

    def parse_biblio_author_tag(self, author_tag: Tag, index: int = 0):
        value_xml = ""
        author_data: ContribAuthor = {"template_str": ""}
        for c in author_tag.children:
            c_text = escape(c.text)
            if isinstance(c, str):
                author_data["template_str"] += c_text
                continue

            if not isinstance(c, Tag):  # coverage: condition never true
                raise NotImplementedError("author_tag is not a Tag or a string")

            # In the BDIM markup, span.cognome holds the family name and span.nome
            # the given name, matching the mapping used in parse_article_content.
            child_class = c.get("class")
            if not child_class:  # coverage: condition never true
                value_xml += c_text
            elif "cognome" in child_class:
                c.replace_with("{surname}")
                author_data["surname"] = c_text
                author_data["template_str"] += "{surname}"
            elif "nome" in child_class:  # coverage: condition always true
                c.replace_with("{given_names}")
                author_data["given_names"] = c_text
                author_data["template_str"] += "{given_names}"
        value_xml += "{author_" + str(index) + "}"

        return author_data, value_xml
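    # A sketch of the template mechanics above, for a hypothetical bibliography
    # author rendered as
    #
    #   <span class="bautore"><span class="cognome">Rossi</span>, <span class="nome">M.</span></span>
    #
    # The method returns
    #
    #   ({"template_str": "{surname}, {given_names}",
    #     "surname": "Rossi", "given_names": "M."},
    #    "{author_0}")
    #
    # and parse_ref later replaces "{author_0}" with the serialized author.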