Coverage for src/crawler/by_source/bdim_crawler.py: 89%

215 statements  


import re

import regex
from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.jats.builder.citation import (
    ContribAuthor,
    get_all_authors_xml,
    get_ext_link_xml,
    get_publisher_xml,
    get_source_xml,
    get_volume_xml,
    get_year_xml,
)
from ptf.cmds.xml.jats.builder.issue import get_title_xml
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class BdimCrawler(BaseCollectionCrawler):
    source_name = "Biblioteca Digitale Italiana di Matematica"
    source_domain = "BDIM"
    source_website = "http://www.bdim.eu"

    title_corrections = {
        "RLINA_1965_8_39_5_a17": "Eventi fasici nel midollo spinale quali prove di inibizione presinaptica durante il sonno desincronizzato",
        "RLINA_1973_8_55_6_a0": "Complementarity between nilpotent selfmappings and periodic autohomeomorphisms.",
        "RLINA_1973_8_55_6_a2": "Sur une extension du lemme de Green.",
        "RLINA_1979_8_67_1-2_a6": "On the existence of an unbounded connected set of solutions for nonlinear equations in Banach spaces.",
        "RLINA_1972_8_52_2_a5": "Sul carattere proiettivo del rapporto plurisezionale.",
        "RLINA_1980_8_69_1-2_a6": "A note on a variational formulation of the Einstein equations for thermo-elastic materials.",
    }

    issue_href = r"\?id=(?P<col>\w+)(?P<issue>_\d{1,4})"
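    # Assumption, inferred from the pid layout below and the title_corrections
    # keys: issue hrefs look like "?id=<collection>_<year>_<series>_<volume>_<number>"
    # (e.g. "?id=RLINA_1965_8_39_5"). The regex is only used as a filter.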

    def parse_collection_content(self, content):
        """
        Parse the HTML page of the BDIM collection and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.

        self.periode is set at the end, based on the xissue years found in the page.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        reg_issue = regex.compile(self.issue_href)

        issue_nodes = []
        for issue in soup.select("div.listafascicoli a"):
            href = issue.get("href")
            if isinstance(href, str) and reg_issue.search(href):
                issue_nodes.append(issue)

        for issue_node in issue_nodes:
            part_issue = issue_node.get("href").split("_")
            volume = part_issue[-2]
            number = part_issue[-1]
            year = part_issue[1]
            serie = part_issue[2]
            link = "/item" + issue_node.get("href")
            xissue = self.create_bdim_xissue(link, serie, volume, number, year)
            if xissue:
                xissues.append(xissue)

        self.periode_begin = self.get_year(xissues[0].year)
        self.periode_end = self.get_year(xissues[-1].year)

        self.periode = self.get_or_create_periode()

        return xissues

    def get_year(self, year):
        # Year ranges such as "1971/1972" are reduced to their first year.
        if "/" in year:
            year = year.split("/")[0]

        return year

    def create_bdim_xissue(self, url, serie, volume, number, dates):
        year = dates.replace("/", "-")

        xissue = create_issuedata()
        xissue.pid = f"{self.collection_id}_{year}_{serie}_{volume}_{number}"
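        # e.g. "RLINA_1965_8_39_5" (collection, year, series, volume, number);
        # the title_corrections keys above are these pids with an article
        # suffix such as "_a17" appended.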

        xissue.year = year
        xissue.volume = volume
        xissue.number = number
        xissue.vseries = serie
        xissue.url = self.source_website + url

        return xissue

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("div", {"class": "referenza"})
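        # Each article is rendered as a div.referenza block; the link labelled
        # "referenza completa" is assumed to point at the article page that is
        # presumably fetched later and handed to parse_article_content.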

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a", text="referenza completa")
            if article_link_node:
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = self.source_website + url

                xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle.pid = pid
        # TODO: is this correct?
        xarticle.lang = "it"
        soup = BeautifulSoup(content, "html.parser")
        # TITLE
        title_node = soup.select_one("span.titolo")
        if title_node:
            xarticle.title_tex = title_node.get_text()
            if xarticle.title_tex == "":
                xarticle.title_tex = " "

        # AUTHORS
        reg_author_link = regex.compile(r"\?testo=\w+")
        text_author_bloc = soup.select_one("div.referenza p")
        if text_author_bloc:
            for link in text_author_bloc.select("a"):
                href = link.get("href")
                if isinstance(href, str) and reg_author_link.search(href):
                    contrib_node = link.select_one("span.autore")
                    if contrib_node is not None:
                        surname_node = link.select_one("span.cognome")
                        firstname_node = link.select_one("span.nome")
                        author = create_contributor(role="author")

                        if surname_node is not None:
                            surname = surname_node.get_text()
                            author["last_name"] = surname

                        if firstname_node is not None:
                            firstname = firstname_node.get_text()
                            author["first_name"] = firstname

                        if not firstname_node or not surname_node:
                            string_name = contrib_node.get_text()
                            author["string_name"] = string_name

                        xarticle.contributors.append(author)

        # ABSTRACT
        abstract_section_node = soup.select_one("div.sunto")
        if abstract_section_node:
            abstract = str(abstract_section_node.get_text())
            xabstract = create_abstract(
                tag="abstract", value_tex=abstract, lang=self.detect_language(abstract)
            )
            xarticle.abstracts.append(xabstract)

        # PDF
        pdf_url = soup.find_all("a", text="pdf")
        if len(pdf_url) > 0:
            pdf_url = self.source_website + pdf_url[0].get("href")
            add_pdf_link_to_xarticle(xarticle, pdf_url)

        # PAGES
        pages = soup.select_one("span.pagine")
        if pages:
            pages_to = re.compile(r"(\(?\d+\)?)?-?(\(?\d+\)?)").search(pages.get_text())
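            # A sketch of what this pattern accepts (an assumption): page
            # ranges like "123-145" or "(3)-(17)", and single pages like "45";
            # the parentheses are stripped below.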

            if pages_to:
                parts = pages_to[0].split("-")
                first_page = parts[0].replace("(", "").replace(")", "")
                if len(parts) > 1:
                    last_page = parts[1].replace("(", "").replace(")", "")
                    xarticle.lpage = last_page
                xarticle.fpage = first_page

        # BIBLIO
        bibitems_tags = soup.select("div.biblio div.bibitem")
        bibitems = [self.parse_ref(item) for item in bibitems_tags]
        if len(bibitems) > 0:
            xarticle.abstracts.append(self.create_bibliography(bibitems))

        # METADATA
        reg_zbl_id = re.compile(r"Zbl \w+")
        reg_mr_id = re.compile(r"MR \d+")
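        # The badges are assumed to read "MR <number>" and "Zbl <id>"; the Zbl
        # id itself is taken from the zbMATH link ("...?q=an:<id>") below
        # rather than from the badge text.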

        metadata_bloc = soup.select_one("div.referenza")
        if not metadata_bloc:
            raise ValueError("metadata_bloc cannot be found")
        mr_id = [link for link in metadata_bloc.find_all("a") if reg_mr_id.search(link.get_text())]
        zbl_id = [link for link in metadata_bloc.find_all("a") if reg_zbl_id.search(link.get_text())]

        if len(zbl_id) > 0:
            zblid = zbl_id[0].get("href")
            pos = zblid.find("?q=an:")
            if pos > 0:
                zblid = zblid[pos + 6 :]
            xarticle.extids.append(("zbl-item-id", zblid))
        if len(mr_id) > 0:
            mr_id = mr_id[0].get_text()
            mr_id = mr_id.split("MR ")
            mr_id = mr_id[1]
            xarticle.extids.append(("mr-item-id", mr_id))

        if xarticle.pid in self.title_corrections:
            xarticle.title_tex = self.title_corrections[xarticle.pid]

        return xarticle

    def parse_ref(self, item: Tag):
        value_xml = ""
        # First pass: we create a semi-complete JATS XML string, except for the
        # authors, which are stored inside authors_list to be serialized at the end.
        authors_list: list[ContribAuthor] = []
        for c in item.children:
            c_text = escape(c.text)
            if isinstance(c, str):
                value_xml += c_text
                continue

            if not isinstance(c, Tag):
                raise NotImplementedError("bibitem_tag is not a Tag or a string")

            if c.name == "a":
                a_xml, is_badge = self.parse_a_tag(c)
                if is_badge:
                    value_xml = regex.sub(r" \| $", "", value_xml)
                value_xml += a_xml
                continue

            child_class = c.get("class")
            if not child_class:
                value_xml += c_text
            elif "bautore" in child_class:
                # TODO: parse the first name and last name
                author_data, author_xml = self.parse_biblio_author_tag(c, len(authors_list))
                authors_list.append(author_data)
                value_xml += author_xml
            elif "titolo" in child_class:
                value_xml += get_title_xml(c_text)
            elif "rivista" in child_class:
                value_xml += get_source_xml(c_text)
            elif "anno" in child_class:
                value_xml += get_year_xml(c_text)
            elif "volume" in child_class:
                value_xml += get_volume_xml(c_text)
            elif "publisher" in child_class:
                value_xml += get_publisher_xml(c_text)
            else:
                # booktitle
                value_xml += c_text
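        # At this point value_xml may read, e.g.,
        # "{author_0}, {author_1}: <article-title>...</article-title>, ..."
        # (a hypothetical example); the block below wraps everything from the
        # first to the last "{author_N}" placeholder in a single person-group
        # element.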

        # In order to produce a valid JATS XML string, we have to group all
        # authors into the person-group tag.
        authors_occurence = regex.compile(r"{author_\d+}").findall(value_xml)
        if len(authors_occurence) > 0:
            first_author = value_xml.index(authors_occurence[0])
            last_author = value_xml.index(authors_occurence[-1]) + len(authors_occurence[-1])
            value_xml = (
                value_xml[:first_author]
                + get_all_authors_xml(value_xml[first_author:last_author], authors_list)
                + value_xml[last_author:]
            )

        return self.create_crawled_bibitem(value_xml)

    def parse_a_tag(self, a_tag: Tag):
        a_text = escape(a_tag.text)
        href = a_tag.get("href")
        if not href:
            return a_text, False
        elif isinstance(href, list):
            raise ValueError("a tag has multiple href values!")
        else:
            a_type = "uri"
            if a_text.startswith("MR "):
                a_type = "mr-item-id"
                a_text = a_text.removeprefix("MR ")
            elif a_text.startswith("Zbl "):
                a_type = "zbl-item-id"
                a_text = a_text.removeprefix("Zbl ")
            return get_ext_link_xml(escape(href), a_text, a_type), a_type != "uri"
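
    # Builds a ContribAuthor whose "template_str" mirrors the original text with
    # "{given_names}"/"{surname}" placeholders, and returns an "{author_<index>}"
    # placeholder that parse_ref splices into the reference XML.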

    def parse_biblio_author_tag(self, author_tag: Tag, index: int = 0):
        value_xml = ""
        author_data: ContribAuthor = {"template_str": ""}
        for c in author_tag.children:
            c_text = escape(c.text)
            if isinstance(c, str):
                author_data["template_str"] += c_text
                continue

            if not isinstance(c, Tag):
                raise NotImplementedError("author_tag is not a Tag or a string")
            # In the source markup, "cognome" is the family name and "nome" the
            # given name, matching the article author parsing above.
            child_class = c.get("class")
            if not child_class:
                value_xml += c_text
            elif "cognome" in child_class:
                c.replace_with("{surname}")
                author_data["surname"] = c_text
                author_data["template_str"] += "{surname}"
            elif "nome" in child_class:
                c.replace_with("{given_names}")
                author_data["given_names"] = c_text
                author_data["template_str"] += "{given_names}"
        value_xml += "{author_" + str(index) + "}"

        return author_data, value_xml

317 return author_data, value_xml