Coverage for src/crawler/by_source/elibm_crawler.py: 53%

246 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

1 from collections import OrderedDict

2 

3 from bs4 import BeautifulSoup

4 from ptf.model_data import (

5 IssueData, 

6 create_abstract, 

7 create_articledata, 

8 create_contributor, 

9 create_issuedata, 

10 create_subj, 

11 )

12 

13 from crawler.base_crawler import BaseCollectionCrawler

14 from crawler.utils import add_pdf_link_to_xarticle

15 

16 

17 class ElibmCrawler(BaseCollectionCrawler):

18 source_name = "Electronic Library of Mathematics" 

19 source_domain = "ELIBM" 

20 source_website = "https://www.elibm.org" 

21 

22 def __init__(self, *args, **kwargs): 

23 super().__init__(*args, **kwargs) 

24 if self.collection_id == "DOCMA":    # 24 ↛ 25: didn't jump to line 25 because the condition on line 24 was never true

25 self.delimiter_inline_formula = "\\(" 

26 self.delimiter_disp_formula = "\\[" 

27 

28 def parse_collection_content(self, content): 

29 """ 

30 Parse the HTML page of the eLibM collection and return a list of xissues.

31 Each xissue has its pid/volume/number/year metadata + its url 

32 

33 self.periode is set at the end based on the xissue years of the HTML page 

34 """ 

35 soup = BeautifulSoup(content, "html.parser") 

36 xissues = [] 

37 

38 # Extract the list of issues 

39 link_nodes = soup.find_all("a") 

40 

41 # eLibM puts special issue titles as volume number 

42 # to create an issue pid, we use S1, S2...

43 last_special_issue_number = 0 

44 

45 for link_node in link_nodes: 

46 url = link_node.get("href") 

47 text = link_node.get_text() 

48 if url.startswith("/issue"): 

49 xissue, last_special_issue_number = self.create_elibm_xissue( 

50 url, text, last_special_issue_number 

51 ) 

52 

53 # eLibM lists the special issues at the end. 

54 # set periode_begin when the first special issue is found

55 if last_special_issue_number == 1:    # 55 ↛ 56: didn't jump to line 56 because the condition on line 55 was never true

56 self.periode_begin = self.get_first_year(xissues[-1].year) 

57 

58 if xissue:    # 58 ↛ 45: didn't jump to line 45 because the condition on line 58 was always true

59 xissues.append(xissue) 

60 

61 self.periode_end = self.get_first_year(xissues[0].year) 

62 

63 if last_special_issue_number == 0:    # 63 ↛ 66: didn't jump to line 66 because the condition on line 63 was always true

64 self.periode_begin = self.get_first_year(xissues[-1].year) 

65 

66 self.periode = self.get_or_create_periode() 

67 

68 return xissues 
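
A minimal sketch of the link filtering above, assuming issue links of the form "/issue/<id>" labelled "volume (year)" (the HTML fragment is invented for illustration):

from bs4 import BeautifulSoup

html = '<a href="/journals">All journals</a><a href="/issue/123">6 (2000)</a>'
soup = BeautifulSoup(html, "html.parser")
for a in soup.find_all("a"):
    if a.get("href").startswith("/issue"):
        print(a.get("href"), a.get_text())  # prints: /issue/123 6 (2000)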

69 

70 def get_first_year(self, year): 

71 if "/" in year: 71 ↛ 72line 71 didn't jump to line 72 because the condition on line 71 was never true

72 year = year.split("/")[0] 

73 

74 return year 

75 

76 def create_elibm_xissue(self, url, text, last_special_issue_number): 

77 if "(" not in text or ")" not in text: 77 ↛ 78line 77 didn't jump to line 78 because the condition on line 77 was never true

78 return None, last_special_issue_number  # keep the special issue counter unchanged when the text cannot be parsed

79 

80 parts = text.split("(") 

81 

82 year = parts[1].split(")")[0] 

83 year = year.replace("/", "-") 

84 

85 # volume might not be an integer. eLibM puts special issue titles as the volume number.

86 volume = parts[0].strip() 

87 

88 number = "" 

89 if "No. " in volume: 

90 parts = volume.split("No. ") 

91 volume = parts[0].strip() 

92 number = parts[1].strip() 

93 

94 try: 

95 volume_for_pid = int(volume) 

96 except ValueError: 

97 last_special_issue_number += 1 

98 volume_for_pid = f"S{last_special_issue_number}" 

99 

100 xissue = create_issuedata() 

101 xissue.pid = f"{self.collection_id}_{year}__{volume_for_pid}_{number}" 

102 xissue.year = year 

103 xissue.volume = volume 

104 xissue.number = number 

105 xissue.url = self.source_website + url 

106 

107 return xissue, last_special_issue_number 
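
To make the pid scheme concrete, a short trace under assumed inputs (the link texts and the DOCMA collection id are examples, not real eLibM entries):

# "6 No. 2 (2000)"         -> year "2000", volume "6", number "2"
#                             pid "DOCMA_2000__6_2"
# "A Special Title (1998)" -> int("A Special Title") raises ValueError, so
#                             volume_for_pid becomes "S1" and pid "DOCMA_1998__S1_"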

108 

109 def parse_issue_content(self, content, xissue): 

110 soup = BeautifulSoup(content, "html.parser") 

111 article_nodes = soup.find_all("div", {"class": "title"}) 

112 

113 for index_article, article_node in enumerate(article_nodes): 

114 article_link_node = article_node.find("a") 

115 if article_link_node:    # 115 ↛ 113: didn't jump to line 113 because the condition on line 115 was always true

116 url = article_link_node.get("href") 

117 xarticle = create_articledata() 

118 xarticle.pid = "a" + str(index_article) 

119 xarticle.url = self.source_website + url 

120 

121 # eLibM lists the articles in reverse order, except for one special issue

122 if xissue.volume == "Mahler Selecta":    # 122 ↛ 123: didn't jump to line 123 because the condition on line 122 was never true

123 xissue.articles.append(xarticle) 

124 else: 

125 xissue.articles.insert(0, xarticle) 

126 

127 # if the issue has only 1 article, eLibM skips the issue page and directly displays the article page

128 if len(xissue.articles) == 0: 

129 title_node = soup.find("h2", {"class": "document_title"}) 

130 if title_node is not None:    # 130 ↛ exit: didn't return from function 'parse_issue_content' because the condition on line 130 was always true

131 xarticle = create_articledata() 

132 xarticle.pid = "a0" 

133 xarticle.url = xissue.url 

134 

135 xissue.articles.append(xarticle) 
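
Because each article is prepended with insert(0, ...), a page listed newest-first ends up oldest-first in xissue.articles; a minimal illustration:

pids = []
for pid in ("a0", "a1", "a2"):  # order of appearance on the page
    pids.insert(0, pid)
print(pids)  # ['a2', 'a1', 'a0'] -- the page order, reversed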

136 

137 def parse_article_content(self, content, xissue, xarticle, url, pid): 

138 """ 

139 Parse the content with BeautifulSoup and return an ArticleData

140 """ 

141 xarticle.pid = pid 

142 xarticle.lang = "en" 

143 

144 soup = BeautifulSoup(content, "html.parser") 

145 

146 # TITLE 

147 title_node = soup.find("h2", {"class": "document_title"}) 

148 if title_node:    # 148 ↛ 152: didn't jump to line 152 because the condition on line 148 was always true

149 xarticle.title_tex = title_node.get_text() 

150 

151 # AUTHORS 

152 citation_author_node = soup.find("h3", {"class": "document_author"}) 

153 if citation_author_node:    # 153 ↛ 172: didn't jump to line 172 because the condition on line 153 was always true

154 text = citation_author_node.get_text() 

155 if text:    # 155 ↛ 172: didn't jump to line 172 because the condition on line 155 was always true

156 parts = text.split(";") 

157 for part in parts: 

158 text_author = part.strip() 

159 

160 role = "author" 

161 if "(ed.)" in text_author: 

162 role = "editor" 

163 text_author = text_author.split("(ed.)")[0].strip() 

164 

165 author = create_contributor() 

166 author["role"] = role 

167 author["string_name"] = text_author 

168 

169 xarticle.contributors.append(author) 

170 

171 # PDF 

172 link_nodes = soup.find_all("a") 

173 for link_node in link_nodes: 

174 url = link_node.get("href") 

175 if url.startswith("/ft/"): 

176 pdf_url = self.source_website + url 

177 add_pdf_link_to_xarticle(xarticle, pdf_url) 

178 

179 panel_nodes = soup.find_all("h3", {"class": "panel-title"}) 

180 for panel_node in panel_nodes: 

181 text = panel_node.get_text() 

182 content_node = panel_node.parent.parent.find("div", {"class": "panel-body"}) 

183 

184 if text == "Summary": 

185 # ABSTRACT 

186 abstract = content_node.get_text() 

187 xabstract = create_abstract(tag="abstract", value_tex=abstract, lang=xarticle.lang) 

188 xarticle.abstracts.append(xabstract) 

189 

190 elif text == "Mathematics Subject Classification": 

191 # MSC 

192 subjs = content_node.get_text().split(", ") 

193 for subj in subjs: 

194 subject = create_subj() 

195 subject["value"] = subj 

196 subject["type"] = "msc" 

197 subject["lang"] = "en" 

198 xarticle.kwds.append(subject) 

199 

200 elif text == "Keywords/Phrases": 

201 # Keywords 

202 subjs = content_node.get_text().split(", ") 

203 for subj in subjs: 

204 subject = create_subj() 

205 subject["value"] = subj 

206 subject["lang"] = "en" 

207 xarticle.kwds.append(subject) 

208 

209 # PAGES 

210 citation_node = soup.find("h5", {"class": "document_source"}) 

211 if citation_node:    # 211 ↛ 230: didn't jump to line 230 because the condition on line 211 was always true

212 text = citation_node.get_text() 

213 year = f"({xissue.year})" 

214 if year in text:    # 214 ↛ 230: didn't jump to line 230 because the condition on line 214 was always true

215 text = text.split(year)[0] 

216 

217 if "p." in text: 

218 text = text.split("p.")[0].split(",")[-1].strip() 

219 xarticle.size = text 

220 

221 elif "-" in text: 

222 parts = text.split("-") 

223 first_page = parts[-2].split(" ")[-1] 

224 last_page = parts[-1].split(",")[0].split(" ")[0] 

225 

226 xarticle.fpage = first_page 

227 xarticle.lpage = last_page 

228 

229 # DOI 

230 doi_node = citation_node.next_sibling 

231 if doi_node.name == "div":    # 231 ↛ 232: didn't jump to line 232 because the condition on line 231 was never true

232 text = doi_node.get_text() 

233 if text.startswith("DOI: "): 

234 doi = text[5:] 

235 

236 xarticle.doi = doi 

237 xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_") 

238 

239 return xarticle 
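
The page/size extraction above is plain string slicing of the citation line; a short trace with invented citation strings that follow the format the code assumes:

# "Doc. Math. 5, 15-23 (2000)" with xissue.year "2000"
#   -> text "Doc. Math. 5, 15-23 "  -> xarticle.fpage "15", xarticle.lpage "23"
# "Doc. Math. Extra Vol., 120 p. (2000)"
#   -> "p." branch                  -> xarticle.size "120"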

240 

241 def crawl_collection(self): 

242 if self.source is None: 

243 raise RuntimeError("ERROR: the source is not set") 

244 

245 content = self.download_file(self.collection_url) 

246 xissues = self.parse_collection_content(content) 

247 

248 """ 

249 Some collections split the same volume across several pages.

250 Ex: Volume 6 (2000) and Volume 6 (1999)

251 We merge the xissues with the same volume number => Volume 6 (1999-2000)

252 """ 

253 xissues_dict = self.merge_xissues(xissues) 

254 

255 filtered_xissues = xissues_dict 

256 # Filter the issues to crawl if start_pid was set in the constructor 

257 if self.start_pid is not None: 

258 filtered_xissues = {} 

259 start = False 

260 for pid in xissues_dict: 

261 if pid == self.start_pid: 

262 start = True 

263 if start: 

264 filtered_xissues[pid] = xissues_dict[pid] 

265 

266 return filtered_xissues 
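
The start_pid filter keeps every issue from the first matching pid onward; a self-contained sketch with invented pids:

from collections import OrderedDict

xissues_dict = OrderedDict.fromkeys(["DOCMA_1996__1_", "DOCMA_1997__2_", "DOCMA_1998__3_"])
start_pid = "DOCMA_1997__2_"

filtered, start = {}, False
for pid in xissues_dict:
    if pid == start_pid:
        start = True
    if start:
        filtered[pid] = xissues_dict[pid]

print(list(filtered))  # ['DOCMA_1997__2_', 'DOCMA_1998__3_']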

267 

268 def merge_xissues(self, xissues: list[IssueData]): 

269 """ 

270 Some collections split the same volume across several pages.

271 Ex: Volume 6 (2000) and Volume 6 (1999)

272 We merge the xissues with the same volume number => Volume 6 (1999-2000)

273 """ 

274 

275 merged_xissues = OrderedDict() 

276 

277 for xissue in xissues: 

278 xissues_with_same_volume = [ 

279 item 

280 for item in xissues 

281 if xissue.volume == item.volume 

282 and xissue.number == item.number 

283 and xissue.vseries == item.vseries 

284 and (item.volume or item.number) 

285 ] 

286 

287 if len(xissues_with_same_volume) < 2: 

288 if xissue.pid is None: 

289 raise ValueError("Issue does not have a PID") 

290 merged_xissues[xissue.pid] = {"issues": [xissue]} 

291 first_issue = xissue 

292 year = xissue.year 

293 else: 

294 first_issue = xissues_with_same_volume[0] 

295 volume = xissues_with_same_volume[0].volume 

296 number = xissues_with_same_volume[0].number 

297 vseries = xissues_with_same_volume[0].vseries 

298 

299 # Compute the year based on all issues with the same volume/number 

300 begin = end = year = xissues_with_same_volume[0].year 

301 if not year: 

302 raise ValueError("year is not defined") 

303 

304 if "-" in year: 

305 parts = year.split("-") 

306 begin = parts[0] 

307 end = parts[1] 

308 

309 for xissue_with_same_volume in xissues_with_same_volume[1:]: 

310 new_begin = new_end = xissue_with_same_volume.year 

311 

312 if not xissue_with_same_volume.year: 

313 raise ValueError("xissue year is not defined") 

314 

315 if "-" in xissue_with_same_volume.year: 

316 parts = xissue_with_same_volume.year.split("-")  # split this issue's year, not the running one

317 new_begin = parts[0] 

318 new_end = parts[1] 

319 

320 if begin is None or end is None or new_begin is None or new_end is None: 

321 continue 

322 begin_int = int(begin) 

323 end_int = int(end) 

324 new_begin_int = int(new_begin) 

325 new_end_int = int(new_end) 

326 

327 if new_begin_int < begin_int: 

328 begin = new_begin 

329 if new_end_int > end_int: 

330 end = new_end 

331 

332 if begin != end: 

333 year = f"{begin}-{end}" 

334 else: 

335 year = begin 

336 

337 # We can now set the real pid 

338 pid = f"{self.collection_id}_{year}_{vseries}_{volume}_{number}" 

339 for issue in xissues_with_same_volume: 

340 issue.pid = pid 

341 

342 if pid not in merged_xissues: 

343 merged_xissues[pid] = { 

344 "issues": xissues_with_same_volume, 

345 } 

346 

347 # We can set the year only for the first xissue because it is the one used to collect 

348 # all the articles. 

349 # See crawl_issue with merged_xissue = self.crawl_one_issue_url(xissues_to_crawl[0]) 

350 # But we need to use a separate variable (merged_year) because parse_article_content may rely on the year 

351 first_issue.merged_year = year 

352 

353 return merged_xissues 
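
A worked example of the merge, assuming the collection page lists the same volume twice (data invented):

# Volume 6 appears on two pages of the collection:
#   IssueData(volume="6", number="", year="2000")
#   IssueData(volume="6", number="", year="1999")
# -> merge_xissues() keeps a single entry; both issues receive the same pid,
#    and the first one carries merged_year "1999-2000" (min year - max year).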

354 

355 def crawl_issue(self, merged_xissues: dict[str, list[IssueData]]): 

356 """ 

357 Wrapper around crawl_elibm_issue, to handle issues spread across multiple web pages.

358 """ 

359 

360 xissues_to_crawl = merged_xissues["issues"] 

361 

362 merged_xissue = xissues_to_crawl[0] 

363 self.crawl_elibm_issue(merged_xissue) 

364 

365 if len(xissues_to_crawl) > 1: 

366 for raw_xissue in xissues_to_crawl[1:]: 

367 self.crawl_elibm_issue(raw_xissue) 

368 

369 merged_xissue.articles = raw_xissue.articles + merged_xissue.articles 

370 

371 # Updates the article pid 

372 for article_index, xarticle in enumerate(merged_xissue.articles):

373 if raw_xissue.pid in xarticle.pid: 

374 xarticle.pid = f"{raw_xissue.pid}_a{str(article_index)}" 

375 

376 # Now that the issue pages have been downloaded/read, we can set the merged year

377 # The merged_year was set in self.merge_xissues 

378 # merged_xissue.pid 

379 merged_xissue.year = merged_xissue.merged_year 

380 

381 if not self.test_mode and len(merged_xissue.articles) > 0: 

382 self.add_xissue_into_database(merged_xissue) 

383 

384 def crawl_elibm_issue(self, xissue: IssueData): 

385 """ 

386 Crawl one web page of an issue.

387 - get the HTML content of the issue 

388 - parse the HTML content with BeautifulSoup to extract the list of articles and/or the issue metadata

389 - crawl each article 

390 """ 

391 

392 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.

393 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute

394 if hasattr(xissue, "url") and xissue.url: 

395 content = self.download_file(xissue.url) 

396 self.parse_issue_content(content, xissue) 

397 

398 xarticles = xissue.articles 

399 

400 parsed_xarticles = [] 

401 

402 for xarticle in xarticles: 

403 parsed_xarticle = self.crawl_article(xarticle, xissue) 

404 if parsed_xarticle is not None: 

405 parsed_xarticles.append(parsed_xarticle) 

406 

407 xissue.articles = parsed_xarticles