Coverage for src/crawler/base_crawler.py: 49%

393 statements  

coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

1 import time

2 from collections import OrderedDict

3 from collections.abc import Sequence

4 from datetime import timedelta

5

6 import requests

7 from alive_progress import alive_bar

8 from bs4 import BeautifulSoup

9 from django.conf import settings

10 from django.contrib.auth.models import User

11 from django.utils import timezone

12 from ptf.cmds import xml_cmds

13 from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas

14 from ptf.cmds.xml.jats.builder.issue import get_title_xml

15 from ptf.cmds.xml.jats.jats_parser import check_bibitem_xml

16 from ptf.display.resolver import extids_formats, resolve_id

17 from ptf.model_data import (

18 AbstractDict, 

19 ArticleData, 

20 IssueData, 

21 RefData, 

22 create_articledata, 

23 create_contributor, 

24 create_extlink, 

25 create_publisherdata, 

26 )

27 from ptf.model_data_converter import update_data_for_jats

28 from pylatexenc.latex2text import LatexNodes2Text

29 from pysolr import SolrError

30 from requests_cache import CachedSession, FileCache

31 

32 # TODO: pass a class factory instead of a dependency to a site

33

34 from crawler.models import Periode, Source

35 from crawler.utils import get_or_create_collection

36

37 from .crawler_types import CitationLiteral

38 

39 

40 class BaseCollectionCrawler:

41 """ 

42 Base collection for the crawlers. 

43 To create a crawler: 

44 1) derive a class from BaseCollectionCrawler and name it XXXCrawler 

45 2) override the functions parse_collection_content, parse_issue_content and parse_article_content 

46 3) update factory.py so that crawler_factory can return your new crawler 

47 """ 

48 

49 source_name = "" 

50 source_domain = "" 

51 source_website = "" 

52 

53 periode_begin = None 

54 periode_end = None 

55 

56 session: requests.Session | CachedSession 

57 

58 def __init__(self, *args, username: str, collection_id: str, collection_url: str, **kwargs): 

59 self.username = username 

60 self.user = User.objects.get(username=self.username) 

61 

62 self.collection_id = collection_id 

63 self.collection_url = ( 

64 collection_url # url of the collection. Ex: https://eudml.org/journal/10098 

65 ) 

66 self.collection = get_or_create_collection(self.collection_id) 

67 

68 self.source = None 

69 

70 self.issue_href = "" 

71 self.test_mode = kwargs.get("test_mode", False) 

72 self.publisher = kwargs.get("publisher", "mathdoc") 

73 

74 # progress_bar can be set externally, for example if you want to crawl all the collections of a given source. 

75 self.progress_bar = kwargs.get("progress_bar", None) 

76 

77 # EuDML uses JavaScript to fill the journal page with the list of issues.

78 # We need a headless browser (NodeJS/Puppeteer) to handle dynamic content for sources like EuDML.

79 # Set has_dynamic* to True if the Source uses dynamic content in its web pages. 

80 self.has_dynamic_collection_pages = False 

81 self.has_dynamic_issue_pages = False 

82 self.has_dynamic_article_pages = False 

83 

84 # EUDML sets or creates the Periode based on the <meta name="citation_year"> found in the journal page 

85 # AMP sets or creates the Periode during the __init__ 

86 # TODO: see with other sources when to create the Periode 

87 self.periode = None 

88 self.periode_first_issue = None 

89 self.periode_last_issue = None 

90 

91 self.start_pid = kwargs.get("start_pid", None) 

92 

93 # Some sources have multiple pages for 1 issue. We need to merge the content

94 

95 self.latext_parser = LatexNodes2Text() 

96 

97 # Override the values in your concrete crawler if the formulas in text (titles, abstracts) 

98 # do not use the "$" to surround tex formulas 

99 self.delimiter_inline_formula = "$" 

100 self.delimiter_disp_formula = "$" 

101 

102 self.session = CachedSession( 

103 backend=FileCache( 

104 getattr(settings, "HTML_ROOT_FOLDER", "/tmp/ptf_requests_cache"), 

105 decode_content=False, 

106 ), 

107 headers={ 

108 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"), 

109 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"), 

110 }, 

111 expire_after=timedelta(days=30), 

112 ) 

113 # self.session = requests.Session() 

114 

115 def parse_collection_content(self, content: str) -> list[IssueData]: 

116 """ 

117 Parse the HTML content with BeautifulSoup 

118 Returns a list of xissues.

119 Override this function in a derived class.

120 """ 

121 return [] 

122 

123 def parse_issue_content(self, content: str, xissue: IssueData): 

124 """ 

125 Parse the HTML content with BeautifulSoup 

126 Fills the xissue.articles 

127 Override this function in a derived class. 

128 """ 

129 

130 def parse_article_content( 

131 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str, pid: str 

132 ): 

133 """ 

134 Parse the HTML content with BeautifulSoup 

135 Returns the xarticle.

136 Override this function in a derived class. 

137 The xissue is passed to the function in case the article page has issue information (ex: publisher) 

138 The article url is also passed as a parameter 

139 """ 

140 return create_articledata() 

141 

142 def crawl_collection(self, col_only=False): 

143 """ 

144 Crawl an entire collection. ptf.models.Container objects are created. 

145 - get the HTML content of the collection_url

146 - parse the HTML content with BeautifulSoup to extract the list of issues

147 - merge the xissues (some sources can have multiple pages for 1 volume/issue; we create only 1 container)

148 - crawl each issue if col_only is False

149 - Returns the list of merged issues.

150 It is an OrderedDict {pid: {"issues": xissues}}

151 The key is the pid of the merged issues.

152 Ex: the source may have Volume 6 (2000) and Volume 6 (1999);

153 the pid is then made with 1999-2000__6_ (see the example below)

154 """ 

155 

156 if self.source is None: 

157 raise RuntimeError("ERROR: the source is not set") 

158 

159 content = self.get_page_content(self.collection_url) 

160 xissues = self.parse_collection_content(content) 

161 

162 """ 

163 Some collections split the same volume across several pages

164 Ex: Volume 6 (2000) and Volume 6 (1999)

165 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)

166 """ 

167 merged_xissues = self.merge_xissues(xissues) 

168 

169 if col_only: 

170 # TODO: update Celery tasks 

171 return merged_xissues 

172 

173 filtered_xissues = OrderedDict() 

174 

175 # Filter the issues to crawl if start_pid was set in the constructor 

176 start = False 

177 for pid in merged_xissues: 

178 if self.start_pid is None or start or pid == self.start_pid: 

179 start = True 

180 filtered_xissues[pid] = merged_xissues[pid] 

181 

182 def iterate_xissues(filtered_xissues, progress_bar_for_issues): 

183 """ 

184 internal function to be used by alive_bar (see below)

185 to iterate over the issues to crawl.

186 crawl_issue calls addOrUpdateIssueXmlCmd but returns the xissue computed by the crawl 

187 """ 

188 crawled_xissues = [] 

189 for pid in filtered_xissues: 

190 if self.progress_bar is not None and progress_bar_for_issues: 

191 self.progress_bar() 

192 crawled_xissue = self.crawl_issue(pid, filtered_xissues) 

193 crawled_xissues.append(crawled_xissue) 

194 return crawled_xissues 

195 

196 if self.progress_bar is None: 

197 with alive_bar( 

198 len(filtered_xissues), 

199 dual_line=True, 

200 title=f"Crawl {self.collection_id} - {self.collection_url}", 

201 stats="(eta {eta})", 

202 force_tty=True, 

203 ) as self.progress_bar: 

204 crawled_xissues = iterate_xissues(filtered_xissues, progress_bar_for_issues=True) 

205 else: 

206 crawled_xissues = iterate_xissues(filtered_xissues, progress_bar_for_issues=False) 

207 

208 return crawled_xissues 

209 

210 def crawl_issue(self, pid, merged_xissues): 

211 """ 

212 Wrapper around crawl_one_issue_url, to handle issues declared in multiple web pages. 

213 If you want to crawl only 1 issue and not the entire collection, 

214 you need to call crawl_collection(col_only=True) first to get the merged_xissues

215 A ptf.models.Container object is created with its Articles. 

216 Returns the full xissue (with its articles) used to call addOrUpdateIssueXmlCmd 

217 """ 

218 

219 if pid not in merged_xissues: 

220 raise ValueError(f"Error {pid} is not found in the collection") 

221 

222 xissues_to_crawl = merged_xissues[pid]["issues"] 

223 

224 merged_xissue = self.crawl_one_issue_url(xissues_to_crawl[0]) 

225 

226 if len(xissues_to_crawl) > 1: 

227 do_append = merged_xissues[pid]["do_append"] 

228 for index, raw_xissue in enumerate(xissues_to_crawl[1:]): 

229 crawled_xissue = self.crawl_one_issue_url(raw_xissue) 

230 

231 if do_append: 

232 merged_xissue.articles.extend(crawled_xissue.articles) 

233 else: 

234 merged_xissue.articles[:0] = crawled_xissue.articles 

235 

236 # Updates the article pid 

237 for article_index, xarticle in enumerate(merged_xissue.articles):

238 if raw_xissue.pid in xarticle.pid: 

239 xarticle.pid = f"{pid}_a{str(article_index)}" 

240 

241 # Now that the issue pages have been downloaded/read, we can set the merged pid 

242 # The merged_year was set in self.merge_xissues 

243 merged_xissue.pid = pid 

244 merged_xissue.year = merged_xissue.merged_year 

245 if self.test_mode is False or self.test_mode is None: 

246 if len(merged_xissue.articles) > 0: 

247 self.add_xissue_into_database(merged_xissue) 

248 

249 return merged_xissue 

250 

251 def crawl_one_issue_url(self, xissue: IssueData): 

252 """ 

253 Crawl 1 web page of an issue.

254 - get the HTML content of the issue

255 - parse the HTML content with BeautifulSoup to extract the list of articles and/or the issue metadata

256 - crawl each article 

257 """ 

258 

259 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.

260 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute

261 if hasattr(xissue, "url") and xissue.url: 

262 content = self.get_page_content(xissue.url) 

263 self.parse_issue_content(content, xissue) 

264 

265 xarticles = xissue.articles 

266 

267 if self.progress_bar: 

268 self.progress_bar.title = ( 

269 f"Crawl {self.collection_id} - {xissue.year} {xissue.volume} {xissue.number}" 

270 ) 

271 

272 parsed_xarticles = [] 

273 

274 for xarticle in xarticles: 

275 parsed_xarticle = self.crawl_article(xarticle, xissue) 

276 if parsed_xarticle is not None: 

277 parsed_xarticles.append(parsed_xarticle) 

278 

279 xissue.articles = parsed_xarticles 

280 

281 return xissue 

282 

283 def crawl_article(self, xarticle: ArticleData, xissue: IssueData): 

284 if hasattr(xarticle, "url") and xarticle.url: 

285 if self.progress_bar:  [285 ↛ 286: condition was never true]

286 self.progress_bar.text(f"{xarticle.pid} - {xarticle.url}") 

287 

288 url = xarticle.url 

289 

290 content = self.get_page_content(xarticle.url) 

291 pid = f"{xissue.pid}_{xarticle.pid}" 

292 xarticle = self.parse_article_content(content, xissue, xarticle, xarticle.url, pid) 

293 if xarticle is None:  [293 ↛ 294: condition was never true]

294 return None 

295 

296 # ARTICLE URL as an ExtLink (to display the link in the article page)

297 ext_link = create_extlink() 

298 ext_link["rel"] = "source" 

299 ext_link["location"] = url 

300 ext_link["metadata"] = self.source_domain 

301 xarticle.ext_links.append(ext_link) 

302 

303 # The article title may have formulas surrounded with '$' 

304 return self.process_article_metadata(xarticle) 

305 

306 def process_article_metadata(self, xarticle: ArticleData): 

307 html, xml = get_html_and_xml_from_text_with_formulas( 

308 xarticle.title_tex, 

309 delimiter_inline=self.delimiter_inline_formula, 

310 delimiter_disp=self.delimiter_disp_formula, 

311 ) 

312 xml = get_title_xml(xml, with_tex_values=False) 

313 xarticle.title_html = html 

314 xarticle.title_xml = xml 

315 

316 abstracts_to_parse = [ 

317 xabstract for xabstract in xarticle.abstracts if xabstract["tag"] == "abstract" 

318 ] 

319 # abstract may have formulas surrounded with '$' 

320 if len(abstracts_to_parse) > 0: 

321 for xabstract in abstracts_to_parse: 

322 html, xml = get_html_and_xml_from_text_with_formulas( 

323 xabstract["value_tex"], 

324 delimiter_inline=self.delimiter_inline_formula, 

325 delimiter_disp=self.delimiter_disp_formula, 

326 ) 

327 xabstract["value_html"] = html 

328 lang = xabstract["lang"] 

329 if lang == xarticle.lang: 

330 xabstract["value_xml"] = f'<abstract xml:lang="{lang}">{xml}</abstract>' 

331 else: 

332 xabstract[ 

333 "value_xml" 

334 ] = f'<trans-abstract xml:lang="{lang}">{xml}</trans-abstract>' 

335 

336 update_data_for_jats(xarticle) 

337 

338 return xarticle 

339 

340 def download_file(self, url: str): 

341 """ 

342 Downloads a URL and returns its content (the session cache, if any, backs it up on disk).

343 """ 

344 

345 txt = f"Download {url}" 

346 if settings.CRAWLER_LOG_FILE:  [346 ↛ 347: condition was never true]

347 with open(settings.CRAWLER_LOG_FILE, "a") as f_: 

348 f_.write(txt + "\n") 

349 

350 content = "" 

351 attempt = 0 

352 response = None 

353 while not content and attempt < 3: 

354 try: 

355 headers = {"accept_encoding": "utf-8"} 

356 # For SSL Errors, use verify=False kwarg 

357 verify = True 

358 if url.startswith("https://hdml.di.ionio.gr/"):  [358 ↛ 359: condition was never true]

359 verify = False 

360 # self.session.cache.delete(urls=[url]) 

361 response = self.session.get(url, headers=headers, verify=verify) 

362 if not response.ok: 

363 raise requests.exceptions.HTTPError( 

364 f"Endpoint answered with code {response.status_code} : {url}", 

365 response=response, 

366 ) 

367 content = self.decode_response(response) 

368 except (requests.ConnectionError, requests.ConnectTimeout): 

369 attempt += 1 

370 

371 if not content:  [371 ↛ 372: condition was never true]

372 raise requests.exceptions.HTTPError(f"Unable to download {url}") 

373 

374 return content 

375 

376 def decode_response(self, response: requests.Response, encoding: str = "utf-8"): 

377 """Override this if the content-type headers from the sources are advertising something else than the actual content 

378 SASA needs this""" 

379 return response.content.decode(encoding) 

380 

381 def get_page_content(self, url: str, force_download=False): 

382 """ 

383 NOTE: is this needed?

384 Get the HTML content of a given url. 

385 A cache is used to back up the HTML content on disk. By default, the cache is used to read the HTML content. 

386 """ 

387 content = "" 

388 

389 def set_progress_bar_title(): 

390 if not self.progress_bar:  [390 ↛ 392: condition was always true]

391 return 

392 if isinstance(self.session, CachedSession): 

393 if self.session.cache.contains( 

394 url=url, 

395 ): 

396 self.progress_bar.text(f"Get Cached {url}") 

397 return 

398 self.progress_bar.text(f"Download {url}") 

399 

400 set_progress_bar_title() 

401 content = self.download_file(url) 

402 

403 return content 

404 

405 def get_or_create_source(self): 

406 try: 

407 source = Source.objects.get(name=self.source_name) 

408 except Source.DoesNotExist: 

409 source = Source( 

410 name=self.source_name, 

411 domain=self.source_domain, 

412 website=self.source_website, 

413 create_xissue=True, 

414 periode_href="", 

415 article_href="", 

416 pdf_href="", 

417 ) 

418 source.save() 

419 

420 return source 

421 

422 def get_or_create_periode(self): 

423 if self.periode is not None: 

424 return self.periode 

425 

426 if self.collection is None or self.source is None: 

427 raise ValueError("You need to set a collection and a source before creating a periode")

428 

429 qs = Periode.objects.filter(collection=self.collection, source=self.source) 

430 if qs.exists(): 

431 periode = qs.first() 

432 else: 

433 periode = Periode( 

434 collection=self.collection, 

435 source=self.source, 

436 title=self.collection.title_tex, 

437 issue_href=self.issue_href, 

438 collection_href=self.collection_url, 

439 doi_href="", 

440 published=False, 

441 begin=self.periode_begin, 

442 end=self.periode_end, 

443 first_issue=self.periode_first_issue, 

444 last_issue=self.periode_last_issue, 

445 ) 

446 periode.save() 

447 

448 return periode 

449 

450 def merge_xissues(self, xissues: list[IssueData]): 

451 """ 

452 Some collections split the same volume across several pages

453 Ex: Volume 6 (2000) and Volume 6 (1999)

454 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000) (see the example below)

455 """ 

456 

457 merged_xissues = OrderedDict() 

458 

459 for xissue in xissues: 

460 xissues_with_same_volume = [ 

461 item 

462 for item in xissues 

463 if xissue.volume == item.volume 

464 and xissue.number == item.number 

465 and xissue.vseries == item.vseries 

466 and (item.volume or item.number) 

467 ] 

468 

469 do_append = False 

470 

471 if len(xissues_with_same_volume) < 2: 

472 if xissue.pid is None: 

473 raise ValueError("Issue does not have a PID") 

474 merged_xissues[xissue.pid] = {"issues": [xissue], "do_append": True} 

475 first_issue = xissue 

476 year = xissue.year 

477 else: 

478 first_issue = xissues_with_same_volume[0] 

479 volume = xissues_with_same_volume[0].volume 

480 number = xissues_with_same_volume[0].number 

481 vseries = xissues_with_same_volume[0].vseries 

482 

483 # Compute the year based on all issues with the same volume/number 

484 begin = end = year = xissues_with_same_volume[0].year 

485 if not year: 

486 raise ValueError("year is not defined") 

487 

488 if "-" in year: 

489 parts = year.split("-") 

490 begin = parts[0] 

491 end = parts[1] 

492 

493 for xissue_with_same_volume in xissues_with_same_volume[1:]: 

494 new_begin = new_end = xissue_with_same_volume.year 

495 

496 if not xissue_with_same_volume.year: 

497 raise ValueError("xissue year is not defined") 

498 

499 if "-" in xissue_with_same_volume.year: 

500 parts = xissue_with_same_volume.year.split("-")

501 new_begin = parts[0] 

502 new_end = parts[1] 

503 

504 if begin is None or end is None or new_begin is None or new_end is None: 

505 continue 

506 begin_int = int(begin) 

507 end_int = int(end) 

508 new_begin_int = int(new_begin) 

509 new_end_int = int(new_end) 

510 

511 if new_begin_int < begin_int: 

512 begin = new_begin 

513 if new_end_int > end_int: 

514 end = new_end 

515 do_append = True 

516 

517 if begin != end: 

518 year = f"{begin}-{end}" 

519 else: 

520 year = begin 

521 

522 # We can now set the real pid 

523 # Note: We cannot update the pid of each xissue of xissues_with_same_volume 

524 # because the HTML cache relies on the original id 

525 pid = f"{self.collection_id}_{year}_{vseries}_{volume}_{number}" 

526 if pid not in merged_xissues: 

527 merged_xissues[pid] = { 

528 "issues": xissues_with_same_volume, 

529 "do_append": do_append, 

530 } 

531 

532 # We can set the year only for the first xissue because it is the one used to collect 

533 # all the articles. 

534 # See crawl_issue with merged_xissue = self.crawl_one_issue_url(xissues_to_crawl[0]) 

535 # But we need to use a separate variable (merged_year) because parse_article_content may rely on the year 

536 first_issue.merged_year = year 

537 

538 return merged_xissues 

539 

540 def add_xissue_into_database(self, xissue: IssueData): 

541 xissue.journal = self.collection 

542 

543 xpub = create_publisherdata() 

544 xpub.name = self.publisher 

545 xissue.publisher = xpub 

546 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat() 

547 

548 attempt = 1 

549 success = False 

550 

551 while not success and attempt < 4: 

552 try: 

553 params = {"xissue": xissue, "use_body": False} 

554 cmd = xml_cmds.addOrUpdateIssueXmlCmd(params) 

555 cmd.do() 

556 success = True 

557 except SolrError: 

558 attempt += 1 

559 time.sleep(10) 

560 

561 def get_metadata_using_citation_meta( 

562 self, 

563 xarticle: ArticleData, 

564 xissue: IssueData, 

565 soup: BeautifulSoup, 

566 what: list[CitationLiteral] = [], 

567 ): 

568 """ 

569 :param xarticle: the xarticle that will collect the metadata 

570 :param xissue: the xissue that will collect the publisher 

571 :param soup: the BeautifulSoup object of the article page

572 :param what: list of citation_* items to collect.

573 :return: None. The given article is modified 

574 """ 

575 

576 if "title" in what: 

577 # TITLE 

578 citation_title_node = soup.select_one("meta[name='citation_title']") 

579 if citation_title_node:  [579 ↛ 584: condition was always true]

580 title = citation_title_node.get("content") 

581 if isinstance(title, str):  [581 ↛ 584: condition was always true]

582 xarticle.title_tex = title 

583 

584 if "author" in what: 584 ↛ 596line 584 didn't jump to line 596 because the condition on line 584 was always true

585 # AUTHORS 

586 citation_author_nodes = soup.find_all("meta", {"name": "citation_author"}) 

587 for citation_author_node in citation_author_nodes: 

588 text_author = citation_author_node.get("content") 

589 

590 author = create_contributor() 

591 author["role"] = "author" 

592 author["string_name"] = text_author 

593 

594 xarticle.contributors.append(author) 

595 

596 if "pdf" in what: 596 ↛ 604line 596 didn't jump to line 604 because the condition on line 596 was always true

597 # PDF 

598 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]') 

599 if citation_pdf_node: 

600 pdf_url = citation_pdf_node.get("content") 

601 if isinstance(pdf_url, str):  [601 ↛ 604: condition was always true]

602 add_pdf_link_to_xarticle(xarticle, pdf_url) 

603 

604 lang = "en" 

605 if "lang" in what: 

606 # LANG 

607 citation_lang_node = soup.find("meta", {"name": "citation_language"}) 

608 if citation_lang_node:  [608 ↛ 612: condition was always true]

609 # TODO: check other language code 

610 lang = xarticle.lang = citation_lang_node.get("content").strip()[0:2] 

611 

612 if "abstract" in what: 

613 # ABSTRACT 

614 abstract_node = soup.find("div", {"class": "entry-content"}) 

615 if abstract_node is not None: 

616 abstract_section_node = abstract_node.find("p") 

617 if abstract_section_node:  [617 ↛ 629: condition was always true]

618 abstract = str(abstract_section_node) 

619 xarticle.abstracts.append( 

620 { 

621 "tag": "abstract", 

622 "value_html": "", 

623 "value_tex": abstract, 

624 "value_xml": "", 

625 "lang": lang, 

626 } 

627 ) 

628 

629 if "page" in what: 629 ↛ 645line 629 didn't jump to line 645 because the condition on line 629 was always true

630 # PAGES 

631 citation_fpage_node = soup.find("meta", {"name": "citation_firstpage"}) 

632 if citation_fpage_node:  [632 ↛ 638: condition was always true]

633 page = citation_fpage_node.get("content") 

634 page = page.split("(")[0] 

635 if len(page) < 32:  [635 ↛ 638: condition was always true]

636 xarticle.fpage = page 

637 

638 citation_lpage_node = soup.find("meta", {"name": "citation_lastpage"}) 

639 if citation_lpage_node:  [639 ↛ 645: condition was always true]

640 page = citation_lpage_node.get("content")

641 page = page.split("(")[0]

642 if len(page) < 32:  [642 ↛ 645: condition was always true]

643 xarticle.lpage = page

644 

645 if "doi" in what: 

646 # DOI 

647 citation_doi_node = soup.find("meta", {"name": "citation_doi"}) 

648 if citation_doi_node: 

649 doi = citation_doi_node.get("content").strip() 

650 pos = doi.find("10.") 

651 if pos > 0: 

652 doi = doi[pos:] 

653 xarticle.doi = doi 

654 xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_") 

655 

656 if "mr" in what: 

657 # MR 

658 citation_mr_node = soup.find("meta", {"name": "citation_mr"}) 

659 if citation_mr_node:  [659 ↛ 660: condition was never true]

660 mr = citation_mr_node.get("content").strip() 

661 if mr.find("MR") == 0: 

662 mr = mr[2:] 

663 xarticle.extids.append(("mr-item-id", mr)) 

664 

665 if "zbl" in what: 

666 # ZBL 

667 citation_zbl_node = soup.find("meta", {"name": "citation_zbl"}) 

668 if citation_zbl_node: 

669 zbl = citation_zbl_node.get("content").strip() 

670 if zbl.find("Zbl") == 0:  [670 ↛ 674: condition was always true]

671 zbl = zbl[3:].strip() 

672 xarticle.extids.append(("zbl-item-id", zbl)) 

673 

674 if "publisher" in what and (not self.test_mode): 674 ↛ 676line 674 didn't jump to line 676 because the condition on line 674 was never true

675 # PUBLISHER 

676 citation_publisher_node = soup.find("meta", {"name": "citation_publisher"}) 

677 if citation_publisher_node: 

678 pub = citation_publisher_node.get("content").strip() 

679 if pub != "": 

680 xpub = create_publisherdata() 

681 xpub.name = pub 

682 xissue.publisher = xpub 

683 

684 if "keywords" in what: 

685 # KEYWORDS 

686 citation_kwd_node = soup.find("meta", {"name": "citation_keywords"}) 

687 if citation_kwd_node: 

688 kwds = citation_kwd_node.get("content").split(",") 

689 for kwd in kwds: 

690 if kwd == "":  [690 ↛ 691: condition was never true]

691 continue 

692 kwd = kwd.strip() 

693 xarticle.kwds.append({"type": "", "lang": lang, "value": kwd}) 

694 

695 def create_crawled_bibitem(self, value_xml: str): 

696 xref = RefData(lang="en") 

697 # xref.citation_tex = "".join([e["value_tex"] for e in elements]) 

698 

699 value_xml = f'<mixed-citation xml:space="preserve">{value_xml}</mixed-citation>' 

700 xref.citation_xml = value_xml 

701 xref = check_bibitem_xml(xref) 

702 

703 # Bakes extlink badges into the bibliography html 

704 # Maybe we should put this into another file (jats_parser ?) 

705 for extid in xref.extids: 

706 href = resolve_id(extid[0], extid[1]) 

707 if (not href) or (not xref.citation_html):  [707 ↛ 708: condition was never true]

708 continue 

709 str_format = extid[0] 

710 if str_format in extids_formats:  [710 ↛ 712: condition was always true]

711 str_format = extids_formats[str_format] 

712 xref.citation_html += f" | <a href={href} class='badge bg-secondary rounded-pill ref-badge extid-badge'>{str_format}</a>" 

713 

714 return xref 

715 

716 def create_bibliography(self, bibitems: Sequence[RefData]): 

717 xml_str = "<ref-list>\n" 

718 html_str = "<div>\n" 

719 

720 for item in bibitems: 

721 xml_str += f"\t{item.citation_xml}\n" 

722 html_str += f"\t<p>{item.citation_html}</p>\n" 

723 xml_str += "</ref-list>" 

724 

725 # for item in bibitems: 

726 # html_str = 

727 # html_str += f"\t<p>{item.citation_html}</p>\n" 

728 html_str += "</div>" 

729 

730 tex_str = "<div>\n" 

731 for item in bibitems: 

732 tex_str += f"\t<p>{item.citation_tex}</p>\n" 

733 tex_str += "</div>" 

734 

735 biblio_dict: AbstractDict = { 

736 "tag": "biblio", 

737 "value_html": html_str, 

738 "value_tex": tex_str, 

739 "value_xml": xml_str, 

740 "lang": "en", 

741 } 

742 

743 return biblio_dict 

744 
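# Possible use of the two helpers above from a concrete crawler (illustrative; how the
# returned AbstractDict is attached to the article depends on ptf.model_data and is an
# assumption here):
#
#     xrefs = [self.create_crawled_bibitem(xml) for xml in citation_xml_strings]
#     biblio_dict = self.create_bibliography(xrefs)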

745 

746 def add_pdf_link_to_xarticle(xarticle: ArticleData, pdf_url: str):

747 data = { 

748 "rel": "full-text", 

749 "mimetype": "application/pdf", 

750 "location": pdf_url, 

751 "base": "", 

752 "text": "Full Text", 

753 } 

754 xarticle.streams.append(data) 

755 

756 # The pdf url is already added as a stream (just above) but might be replaced by a file later on. 

757 # Keep the pdf url as an ExtLink if we want to propose both options:

758 # - direct download of a local PDF 

759 # - URL to the remote PDF 

760 ext_link = create_extlink(rel="article-pdf", location=pdf_url) 

761 xarticle.ext_links.append(ext_link)