Coverage for src/crawler/base_crawler.py: 75%

360 statements  

coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

1 import time

2 from collections.abc import Sequence

3 from datetime import timedelta

4

5 import regex

6 import requests

7 from bs4 import BeautifulSoup

8 from django.conf import settings

9 from django.contrib.auth.models import User

10 from django.utils import timezone

11 from langcodes import standardize_tag

12 from lingua import LanguageDetector, LanguageDetectorBuilder

13 from ptf.cmds import xml_cmds

14 from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas

15 from ptf.cmds.xml.jats.builder.issue import get_title_xml

16 from ptf.cmds.xml.jats.jats_parser import check_bibitem_xml

17 from ptf.display.resolver import extids_formats, resolve_id

18 from ptf.model_data import (

19 ArticleData, 

20 IssueData, 

21 RefData, 

22 create_abstract, 

23 create_contributor, 

24 create_extlink, 

25 create_issuedata, 

26 create_publisherdata, 

27) 

28 from ptf.model_data_converter import update_data_for_jats

29 from pylatexenc.latex2text import LatexNodes2Text

30 from pysolr import SolrError

31 from requests_cache import CachedSession, FileCache

32

33 from crawler.models import Periode, Source

34 from crawler.types import CitationLiteral

35 from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection

36

37 # TODO: pass a class factory instead of a dependency to a site

38 # TODO: pass a class factory instead of a dependency to a site

39

40

41 class BaseCollectionCrawler:

42 """ 

43 Base class for the collection crawlers.

44 To create a crawler: 

45 1) derive a class from BaseCollectionCrawler and name it XXXCrawler 

46 2) override the functions parse_collection_content, parse_issue_content and parse_article_content 

47 3) update factory.py so that crawler_factory can return your new crawler 

48 """ 

49 
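For illustration, here is a minimal sketch of the three-step recipe above. It is an editorial example, not part of base_crawler.py: the CSS selectors, the sample values and the ptf.model_data.create_articledata helper are assumptions that will differ for a real source.

from bs4 import BeautifulSoup
from ptf.model_data import create_articledata  # assumed helper; adjust to your ptf version

from crawler.base_crawler import BaseCollectionCrawler


class XXXCrawler(BaseCollectionCrawler):
    source_name = "Example Digital Library"
    source_domain = "XXX"
    source_website = "https://example.org"

    def parse_collection_content(self, content):
        # One xissue per issue link found on the collection page
        soup = BeautifulSoup(content, "html.parser")
        return [
            self.create_xissue(a.get("href"), "2000", "6", "1")  # hypothetical values
            for a in soup.select("a.issue-link")  # hypothetical selector
        ]

    def parse_issue_content(self, content, xissue):
        # Create the articles and give each one a PID (a0, a1, ...)
        soup = BeautifulSoup(content, "html.parser")
        for index, a in enumerate(soup.select("a.article-link")):  # hypothetical selector
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            xarticle.url = a.get("href")
            xissue.articles.append(xarticle)

parse_article_content is sketched further down, after get_metadata_using_citation_meta. The remaining step, registering XXXCrawler in factory.py so that crawler_factory can return it, is source-specific and not shown.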

50 source_name = "" 

51 source_domain = "" 

52 source_website = "" 

53 

54 periode_begin: int = 0 

55 periode_end: int = 9999 

56 

57 issue_href = "" 

58 

59 source = None 

60 session: requests.Session | CachedSession 

61 

62 next_allowed_request: float = time.time() 

63 

64 latext_parser = LatexNodes2Text() 

65 

66 # Override the values in your concrete crawler if the formulas in text (titles, abstracts) 

67 # do not use the "$" to surround tex formulas 

68 delimiter_inline_formula = "$" 

69 delimiter_disp_formula = "$" 

70 

71 # HACK : Workaround for tests (monkeypatching) 

72 # We store the class here, so we can monkeypatch it when running tests 

73 # subCrawlers = { 

74 # LofplCrawler: None 

75 # } 

76 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {} 

77 

78 language_detector: LanguageDetector 

79 

80 def __init__( 

81 self, 

82 *args, 

83 username: str, 

84 collection_id: str, 

85 collection_url: str, 

86 test_mode: bool = False, 

87 publisher: str = "mathdoc", 

88 start_pid: str | None = None, 

89 ): 

90 for CrawlerClass in self.subCrawlers: 90 ↛ 91  line 90 didn't jump to line 91 because the loop on line 90 never started

91 self.subCrawlers[CrawlerClass] = CrawlerClass( 

92 *args, 

93 username=username, 

94 collection_id=collection_id, 

95 collection_url=collection_url, 

96 test_mode=test_mode, 

97 publisher=publisher, 

98 start_pid=start_pid, 

99 ) 

100 

101 self.username = username 

102 self.user = User.objects.get(username=self.username) 

103 

104 self.collection_id = collection_id 

105 self.collection_url = ( 

106 collection_url # url of the collection. Ex: https://eudml.org/journal/10098 

107 ) 

108 self.collection = get_or_create_collection(self.collection_id) 

109 

110 self.test_mode = test_mode 

111 self.publisher = publisher 

112 

113 # EUDML sets or creates the Periode based on the <meta name="citation_year"> found in the journal page 

114 # AMP sets or creates the Periode during the __init__ 

115 # TODO: see with other sources when to create the Periode 

116 self.periode = None 

117 self.periode_first_issue = None 

118 self.periode_last_issue = None 

119 

120 self.start_pid = start_pid 

121 

122 # Some sources have multiple pages for 1 issue. We need to merge the content

123 self.build_language_detector() 

124 

125 self.session = CachedSession( 

126 backend=FileCache( 

127 getattr(settings, "REQUESTS_CACHE_LOCATION", None) or "/tmp/ptf_requests_cache", 

128 decode_content=False, 

129 ), 

130 headers={ 

131 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", None) or "Mathdoc/1.0.0", 

132 "From": getattr(settings, "REQUESTS_EMAIL", None) or "accueil@listes.mathdoc.fr", 

133 }, 

134 expire_after=timedelta(days=30), 

135 ) 

136 

137 self.source = self.get_or_create_source() 

138 self.periode = self.get_or_create_periode() 

139 

140 def build_language_detector(self): 

141 self.language_detector = LanguageDetectorBuilder.from_all_languages().build() 

142 

143 def parse_collection_content(self, content: str) -> list[IssueData]: 

144 """ 

145 Parse the HTML content with BeautifulSoup 

146 returns a list of xissues.

147 Override this function in a derived class 

148 """ 

149 return [] 

150 

151 def parse_issue_content(self, content: str, xissue: IssueData): 

152 """ 

153 Parse the HTML content with BeautifulSoup 

154 Fills the xissue.articles 

155 Override this function in a derived class. 

156 

157 Caveat: you are supposed to create the articles here. Please assign a PID to each article.

158 The PID can be `a` + article_index, e.g. `a0`, `a21`

159 """ 

160 

161 def parse_article_content( 

162 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str, pid: str 

163 ): 

164 """ 

165 Parse the HTML content with BeautifulSoup 

166 returns the xarticle. 

167 Override this function in a derived class. 

168 The xissue is passed to the function in case the article page has issue information (ex: publisher) 

169 The article url is also passed as a parameter 

170 

171 Caveat: you are supposed to assign the article pid again here

172 """ 

173 xarticle.pid = pid 

174 return xarticle 

175 

176 def crawl_collection(self): 

177 # TODO: Comments, filter 

178 """ 

179 Crawl an entire collection. ptf.models.Container objects are created. 

180 - get the HTML content of the collection_url 

181 - parse the HTML content with beautifulsoup to extract the list of issues 

182 - merge the xissues (some Sources can have multiple pages for 1 volume/issue; we create only 1 container)

183 - crawl each issue if col_only is False 

184 - Returns the list of merged issues. 

185 It is an OrderedDict {pid: {"issues": xissues}} 

186 The key is the pid of the merged issues. 

187 Ex: the source may have Volume 6 (2000) and Volume 6 (1999)

188 the pid is then made with 1999-2000__6_ 

189 """ 

190 

191 if self.source is None: 

192 raise RuntimeError("ERROR: the source is not set") 

193 

194 content = self.download_file(self.collection_url) 

195 xissues = self.parse_collection_content(content) 

196 

197 # xissues = [ 

198 # issue 

199 # for issue in xissues 

200 # if int(issue.year) >= self.periode_begin and int(issue.year) <= self.periode_end 

201 # ] 

202 

203 """ 

204 Some collections split the same volume across different pages

205 Ex: Volume 6 (2000) and Volume 6 (1999) 

206 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000) 

207 """ 

208 # merged_xissues = self.merge_xissues(xissues) 

209 

210 xissues_dict = {str(i.pid): i for i in xissues} 

211 

212 filtered_xissues = xissues_dict 

213 # Filter the issues to crawl if start_pid was set in the constructor 

214 if self.start_pid is not None: 

215 filtered_xissues = {} 

216 start = False 

217 for pid in sorted(xissues_dict): 

218 if pid == self.start_pid: 

219 start = True 

220 if start: 

221 filtered_xissues[pid] = xissues_dict[pid] 

222 

223 return filtered_xissues 

224 

225 def crawl_issue(self, xissue: IssueData): 

226 """ 

227 Crawl 1 web page of an issue.

228 - get the HTML content of the issue 

229 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata 

230 - crawl each article 

231 """ 

232 

233 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.

234 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute

235 if hasattr(xissue, "url") and xissue.url: 

236 content = self.download_file(xissue.url) 

237 self.parse_issue_content(content, xissue) 

238 

239 xarticles = xissue.articles 

240 

241 parsed_xarticles = [] 

242 

243 for xarticle in xarticles: 

244 parsed_xarticle = self.crawl_article(xarticle, xissue) 

245 if parsed_xarticle is not None: 

246 parsed_xarticles.append(parsed_xarticle) 

247 

248 xissue.articles = parsed_xarticles 

249 

250 if not self.test_mode and len(xissue.articles) > 0: 

251 self.add_xissue_into_database(xissue) 

252 
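As a usage sketch of the two crawl entry points above, assuming a concrete XXXCrawler (see the example near the top of the class), a configured Django environment and an existing user:

crawler = XXXCrawler(
    username="crawler_bot",                       # hypothetical Django user
    collection_id="XXX",                          # hypothetical collection id
    collection_url="https://example.org/journal",
)
xissues = crawler.crawl_collection()              # dict {pid: xissue}, filtered by start_pid if set
for pid, xissue in xissues.items():
    crawler.crawl_issue(xissue)                   # crawls the articles; stores the issue unless test_mode is True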

253 def crawl_article(self, xarticle: ArticleData, xissue: IssueData): 

254 # TODO : set pid in xarticle here instead of passing it to `parse_article_content` 

255 parsed_xarticle = xarticle 

256 if hasattr(xarticle, "url") and xarticle.url: 

257 url = xarticle.url 

258 

259 content = self.download_file(xarticle.url) 

260 pid = f"{xissue.pid}_{xarticle.pid}" 

261 

262 parsed_xarticle = self.parse_article_content( 

263 content, xissue, xarticle, xarticle.url, pid 

264 ) 

265 if parsed_xarticle.url is not None: 

266 url = parsed_xarticle.url 

267 # ARTICLE URL as an ExtLink (to display the link in the article page)

268 ext_link = create_extlink() 

269 ext_link["rel"] = "source" 

270 ext_link["location"] = url 

271 ext_link["metadata"] = self.source_domain 

272 parsed_xarticle.ext_links.append(ext_link) 

273 

274 # The article title may have formulas surrounded with '$' 

275 return self.process_article_metadata(parsed_xarticle) 

276 

277 def process_article_metadata(self, xarticle: ArticleData): 

278 html, xml = get_html_and_xml_from_text_with_formulas( 

279 xarticle.title_tex, 

280 delimiter_inline=self.delimiter_inline_formula, 

281 delimiter_disp=self.delimiter_disp_formula, 

282 ) 

283 xml = get_title_xml(xml, with_tex_values=False) 

284 xarticle.title_html = html 

285 xarticle.title_xml = xml 

286 

287 abstracts_to_parse = [ 

288 xabstract for xabstract in xarticle.abstracts if xabstract["tag"] == "abstract" 

289 ] 

290 # abstract may have formulas surrounded with '$' 

291 if len(abstracts_to_parse) > 0: 

292 for xabstract in abstracts_to_parse: 

293 html, xml = get_html_and_xml_from_text_with_formulas( 

294 xabstract["value_tex"], 

295 delimiter_inline=self.delimiter_inline_formula, 

296 delimiter_disp=self.delimiter_disp_formula, 

297 ) 

298 xabstract["value_html"] = html 

299 lang = xabstract["lang"] 

300 if lang == xarticle.lang: 

301 xabstract["value_xml"] = f'<abstract xml:lang="{lang}">{xml}</abstract>' 

302 else: 

303 xabstract[ 

304 "value_xml" 

305 ] = f'<trans-abstract xml:lang="{lang}">{xml}</trans-abstract>' 

306 

307 update_data_for_jats(xarticle) 

308 

309 return xarticle 

310 

311 def get(self, url: str): 

312 attempt = 0 

313 response = None 

314 

315 while attempt < 3: 

316 # If the url is already in the cache, we can skip the rate-limiting delay

317 if isinstance(self.session, CachedSession): 317 ↛ 318  line 317 didn't jump to line 318 because the condition on line 317 was never true

318 if not self.session.cache.contains(url=url): 

319 delta = self.next_allowed_request - time.time() 

320 if delta > 0: 

321 time.sleep(delta) 

322 self.next_allowed_request = time.time() + 5 

323 try: 

324 headers = {"accept_encoding": "utf-8"} 

325 # For SSL Errors, use verify=False kwarg 

326 verify = True 

327 if url.startswith("https://hdml.di.ionio.gr/"): 327 ↛ 328  line 327 didn't jump to line 328 because the condition on line 327 was never true

328 verify = False 

329 # self.session.cache.delete(urls=[url]) 

330 response = self.session.get(url, headers=headers, verify=verify) 

331 if not response.ok: 

332 raise requests.exceptions.HTTPError( 

333 f"Endpoint answered with code {response.status_code} : {url}", 

334 response=response, 

335 ) 

336 return response 

337 except ( 

338 requests.ConnectionError, 

339 requests.ConnectTimeout, 

340 requests.exceptions.HTTPError, 

341 ): 

342 attempt += 1 

343 raise requests.exceptions.HTTPError(f"Unable to download {url}") 

344 

345 def download_file(self, url: str): 

346 """ 

347 Downloads a URL and returns its content.

348 """ 

349 response = self.get(url) 

350 content = self.decode_response(response) 

351 if content == "" or not content: 351 ↛ 352  line 351 didn't jump to line 352 because the condition on line 351 was never true

352 raise requests.exceptions.HTTPError(response) 

353 return content 

354 

355 def decode_response(self, response: requests.Response, encoding: str = "utf-8"): 

356 """Override this if the content-type headers from the sources are advertising something else than the actual content 

357 SASA needs this""" 

358 response.encoding = encoding 

359 return response.text 

360 

361 def add_xissue_into_database(self, xissue: IssueData): 

362 xissue.journal = self.collection 

363 

364 xpub = create_publisherdata() 

365 xpub.name = self.publisher 

366 xissue.publisher = xpub 

367 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat() 

368 

369 attempt = 1 

370 success = False 

371 

372 while not success and attempt < 4: 

373 try: 

374 params = {"xissue": xissue, "use_body": False} 

375 cmd = xml_cmds.addOrUpdateIssueXmlCmd(params) 

376 cmd.do() 

377 success = True 

378 except SolrError: 

379 attempt += 1 

380 time.sleep(10) 

381 

382 def get_metadata_using_citation_meta( 

383 self, 

384 xarticle: ArticleData, 

385 xissue: IssueData, 

386 soup: BeautifulSoup, 

387 what: list[CitationLiteral] = [], 

388 ): 

389 """ 

390 :param xarticle: the xarticle that will collect the metadata 

391 :param xissue: the xissue that will collect the publisher 

392 :param soup: the BeautifulSoup object of the article page

393 :param what: list of citation_* metadata items to collect.

394 :return: None. The given article is modified 

395 """ 

396 

397 if "title" in what: 

398 # TITLE 

399 citation_title_node = soup.select_one("meta[name='citation_title']") 

400 if citation_title_node: 400 ↛ 405  line 400 didn't jump to line 405 because the condition on line 400 was always true

401 title = citation_title_node.get("content") 

402 if isinstance(title, str): 402 ↛ 405  line 402 didn't jump to line 405 because the condition on line 402 was always true

403 xarticle.title_tex = title 

404 

405 if "author" in what: 405 ↛ 418  line 405 didn't jump to line 418 because the condition on line 405 was always true

406 # AUTHORS 

407 citation_author_nodes = soup.select("meta[name='citation_author']") 

408 for citation_author_node in citation_author_nodes: 

409 text_author = citation_author_node.get("content") 

410 if not isinstance(text_author, str): 410 ↛ 411  line 410 didn't jump to line 411 because the condition on line 410 was never true

411 continue 

412 author = create_contributor() 

413 author["role"] = "author" 

414 author["string_name"] = text_author 

415 

416 xarticle.contributors.append(author) 

417 

418 if "pdf" in what: 418 ↛ 426  line 418 didn't jump to line 426 because the condition on line 418 was always true

419 # PDF 

420 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]') 

421 if citation_pdf_node: 

422 pdf_url = citation_pdf_node.get("content") 

423 if isinstance(pdf_url, str): 423 ↛ 426  line 423 didn't jump to line 426 because the condition on line 423 was always true

424 add_pdf_link_to_xarticle(xarticle, pdf_url) 

425 

426 if "lang" in what: 

427 # LANG 

428 citation_lang_node = soup.select_one("meta[name='citation_language']") 

429 if citation_lang_node: 429 ↛ 435  line 429 didn't jump to line 435 because the condition on line 429 was always true

430 # TODO: check other language code 

431 content_text = citation_lang_node.get("content") 

432 if isinstance(content_text, str): 432 ↛ 435  line 432 didn't jump to line 435 because the condition on line 432 was always true

433 xarticle.lang = standardize_tag(content_text) 

434 

435 if "abstract" in what: 

436 # ABSTRACT 

437 abstract_node = soup.select_one("div.entry-content") 

438 if abstract_node is not None: 

439 abstract_section_node = abstract_node.select_one("p") 

440 if abstract_section_node: 440 ↛ 452  line 440 didn't jump to line 452 because the condition on line 440 was always true

441 abstract = str(abstract_section_node) 

442 xarticle.abstracts.append( 

443 { 

444 "tag": "abstract", 

445 "value_html": "", 

446 "value_tex": abstract, 

447 "value_xml": "", 

448 "lang": self.detect_language(abstract, xarticle), 

449 } 

450 ) 

451 

452 if "page" in what: 452 ↛ 470  line 452 didn't jump to line 470 because the condition on line 452 was always true

453 # PAGES 

454 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']") 

455 if citation_fpage_node: 

456 page = citation_fpage_node.get("content") 

457 if isinstance(page, str): 457 ↛ 462  line 457 didn't jump to line 462 because the condition on line 457 was always true

458 page = page.split("(")[0] 

459 if len(page) < 32: 459 ↛ 462  line 459 didn't jump to line 462 because the condition on line 459 was always true

460 xarticle.fpage = page 

461 

462 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']") 

463 if citation_lpage_node: 

464 page = citation_lpage_node.get("content") 

465 if isinstance(page, str): 465 ↛ 470  line 465 didn't jump to line 470 because the condition on line 465 was always true

466 page = page.split("(")[0] 

467 if len(page) < 32: 467 ↛ 470  line 467 didn't jump to line 470 because the condition on line 467 was always true

468 xarticle.lpage = page 

469 

470 if "doi" in what: 

471 # DOI 

472 citation_doi_node = soup.select_one("meta[name='citation_doi']") 

473 if citation_doi_node: 

474 doi = citation_doi_node.get("content") 

475 if isinstance(doi, str): 475 ↛ 483  line 475 didn't jump to line 483 because the condition on line 475 was always true

476 doi = doi.strip() 

477 pos = doi.find("10.") 

478 if pos > 0: 

479 doi = doi[pos:] 

480 xarticle.doi = doi 

481 xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_") 

482 

483 if "mr" in what: 

484 # MR 

485 citation_mr_node = soup.select_one("meta[name='citation_mr']") 

486 if citation_mr_node: 486 ↛ 487  line 486 didn't jump to line 487 because the condition on line 486 was never true

487 mr = citation_mr_node.get("content") 

488 if isinstance(mr, str): 

489 mr = mr.strip() 

490 if mr.find("MR") == 0: 

491 mr = mr[2:] 

492 xarticle.extids.append(("mr-item-id", mr)) 

493 

494 if "zbl" in what: 

495 # ZBL 

496 citation_zbl_node = soup.select_one("meta[name='citation_zbl']") 

497 if citation_zbl_node: 

498 zbl = citation_zbl_node.get("content") 

499 if isinstance(zbl, str): 499 ↛ 505  line 499 didn't jump to line 505 because the condition on line 499 was always true

500 zbl = zbl.strip() 

501 if zbl.find("Zbl") == 0: 501 ↛ 505  line 501 didn't jump to line 505 because the condition on line 501 was always true

502 zbl = zbl[3:].strip() 

503 xarticle.extids.append(("zbl-item-id", zbl)) 

504 

505 if "publisher" in what: 

506 # PUBLISHER 

507 citation_publisher_node = soup.select_one("meta[name='citation_publisher']") 

508 if citation_publisher_node: 

509 pub = citation_publisher_node.get("content") 

510 if isinstance(pub, str): 510 ↛ 517  line 510 didn't jump to line 517 because the condition on line 510 was always true

511 pub = pub.strip() 

512 if pub != "": 512 ↛ 517  line 512 didn't jump to line 517 because the condition on line 512 was always true

513 xpub = create_publisherdata() 

514 xpub.name = pub 

515 xissue.publisher = xpub 

516 

517 if "keywords" in what: 

518 # KEYWORDS 

519 citation_kwd_node = soup.select_one("meta[name='citation_keywords']") 

520 if citation_kwd_node: 

521 kwds = citation_kwd_node.get("content") 

522 if isinstance(kwds, str): 522 ↛ exit  line 522 didn't return from function 'get_metadata_using_citation_meta' because the condition on line 522 was always true

523 kwds = kwds.split(",") 

524 for kwd in kwds: 

525 if kwd == "": 

526 continue 

527 kwd = kwd.strip() 

528 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd}) 

529 
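A hedged sketch of how a derived crawler typically calls the helper above from parse_article_content; the `what` values are CitationLiteral names handled by the method, the rest of the body is illustrative:

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        soup = BeautifulSoup(content, "html.parser")
        # Only ask for the citation_* meta tags this source actually exposes
        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            ["title", "author", "pdf", "lang", "page", "doi", "keywords"],
        )
        xarticle.pid = pid
        return xarticle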

530 def create_xissue( 

531 self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1" 

532 ): 

533 if url is not None and url.endswith("/"): 533 ↛ 534  line 533 didn't jump to line 534 because the condition on line 533 was never true

534 url = url[:-1] 

535 xissue = create_issuedata() 

536 xissue.url = url 

537 

538 xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number) 

539 

540 xissue.year = year 

541 

542 if volume_number is not None: 

543 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number) 

544 

545 if issue_number is not None: 

546 xissue.number = issue_number.replace(",", "-") 

547 return xissue 

548 

549 def detect_language(self, text: str, article: ArticleData | None = None): 

550 if article and article.lang is not None and article.lang != "und": 

551 return article.lang 

552 

553 language = self.language_detector.detect_language_of(text) 

554 

555 if not language: 555 ↛ 556  line 555 didn't jump to line 556 because the condition on line 555 was never true

556 return "und" 

557 return language.iso_code_639_1.name.lower() 

558 

559 def get_or_create_periode(self): 

560 if self.periode is not None: 560 ↛ 561  line 560 didn't jump to line 561 because the condition on line 560 was never true

561 return self.periode 

562 

563 if self.collection is None or self.source is None: 563 ↛ 564  line 563 didn't jump to line 564 because the condition on line 563 was never true

564 raise ValueError("You need to set a collection or a source before creating a periode") 

565 

566 qs = Periode.objects.filter(collection=self.collection, source=self.source) 

567 if qs.exists(): 567 ↛ 570  line 567 didn't jump to line 570 because the condition on line 567 was always true

568 periode = qs.first() 

569 else: 

570 periode = Periode( 

571 collection=self.collection, 

572 source=self.source, 

573 title=self.collection.title_tex, 

574 issue_href=self.issue_href, 

575 collection_href=self.collection_url, 

576 doi_href="", 

577 published=False, 

578 begin=self.periode_begin, 

579 end=self.periode_end, 

580 first_issue=self.periode_first_issue, 

581 last_issue=self.periode_last_issue, 

582 ) 

583 periode.save() 

584 

585 return periode 

586 

587 @classmethod 

588 def get_or_create_source(cls): 

589 source, created = Source.objects.get_or_create( 

590 domain=cls.source_domain, 

591 defaults={ 

592 "name": cls.source_name, 

593 "website": cls.source_website, 

594 "create_xissue": True, 

595 "periode_href": "", 

596 "article_href": "", 

597 "pdf_href": "", 

598 }, 

599 ) 

600 if created: 600 ↛ 601  line 600 didn't jump to line 601 because the condition on line 600 was never true

601 source.save() 

602 return source 

603 

604 @staticmethod 

605 def create_crawled_bibitem(value_xml: str): 

606 xref = RefData(lang="en") 

607 # xref.citation_tex = "".join([e["value_tex"] for e in elements]) 

608 

609 value_xml = f'<mixed-citation xml:space="preserve">{value_xml}</mixed-citation>' 

610 xref.citation_xml = value_xml 

611 xref = check_bibitem_xml(xref) 

612 

613 # Bakes extlink badges into the bibliography html 

614 # Maybe we should put this into another file (jats_parser ?) 

615 for extid in xref.extids: 

616 href = resolve_id(extid[0], extid[1]) 

617 if (not href) or (not xref.citation_html): 617 ↛ 618  line 617 didn't jump to line 618 because the condition on line 617 was never true

618 continue 

619 str_format = extid[0] 

620 if str_format in extids_formats: 620 ↛ 622  line 620 didn't jump to line 622 because the condition on line 620 was always true

621 str_format = extids_formats[str_format] 

622 xref.citation_html += f" | <a href={href} class='badge bg-secondary rounded-pill ref-badge extid-badge'>{str_format}</a>" 

623 

624 return xref 

625 

626 @staticmethod 

627 def create_bibliography(bibitems: Sequence[RefData]): 

628 xml_str = "<ref-list>\n" 

629 html_str = "<div>\n" 

630 

631 for item in bibitems: 

632 xml_str += f"\t{item.citation_xml}\n" 

633 html_str += f"\t<p>{item.citation_html}</p>\n" 

634 xml_str += "</ref-list>" 

635 

636 # for item in bibitems: 

637 # html_str = 

638 # html_str += f"\t<p>{item.citation_html}</p>\n" 

639 html_str += "</div>" 

640 

641 tex_str = "<div>\n" 

642 for item in bibitems: 

643 tex_str += f"\t<p>{item.citation_tex}</p>\n" 

644 tex_str += "</div>" 

645 

646 biblio_dict = create_abstract( 

647 tag="biblio", 

648 value_html=html_str, 

649 value_tex=tex_str, 

650 value_xml=xml_str, 

651 lang="en", 

652 ) 

653 

654 return biblio_dict 

655 
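A small sketch of how the two bibliography helpers above combine; the citation string is illustrative:

refs = [
    BaseCollectionCrawler.create_crawled_bibitem(
        "A. Author, Some paper, J. Example Math. 1 (2000), 1-10."
    ),
]
biblio = BaseCollectionCrawler.create_bibliography(refs)
# biblio is the dict returned by create_abstract(tag="biblio", ...); its value_xml wraps the items in <ref-list>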

656 @staticmethod 

657 def get_issue_pid( 

658 collection_id: str, 

659 year: str, 

660 volume_number: str | None = None, 

661 issue_number: str | None = None, 

662 ): 

663 # Replace any non-word character with an underscore 

664 pid = f"{collection_id}_{year}" 

665 if volume_number is not None: 

666 pid += f"_{volume_number}" 

667 if issue_number is not None: 

668 pid += f"_{issue_number}" 

669 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid)) 

670 return pid 

671 
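Worked examples for the pid helper above, assuming cleanup_str only normalizes whitespace; any run of characters outside [a-zA-Z0-9-] collapses to a single underscore:

BaseCollectionCrawler.get_issue_pid("XXX", "2000", "6", "1")      # -> "XXX_2000_6_1"
BaseCollectionCrawler.get_issue_pid("XXX", "1999-2000", "6 bis")  # -> "XXX_1999-2000_6_bis"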

672 @staticmethod 

673 def set_pages(article: ArticleData, pages: str, separator: str = "-"): 

674 pages_split = pages.split(separator) 

675 if len(pages_split) == 0: 675 ↛ 676  line 675 didn't jump to line 676 because the condition on line 675 was never true

676 article.page_range = pages 

677 if len(pages_split) > 0: 677 ↛ exit  line 677 didn't return from function 'set_pages' because the condition on line 677 was always true

678 if pages[0].isnumeric(): 

679 article.fpage = pages_split[0] 

680 if ( 

681 len(pages_split) > 1 

682 and pages_split[0] != pages_split[1] 

683 and pages_split[1].isnumeric() 

684 ): 

685 article.lpage = pages_split[1]
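Finally, a short sketch of set_pages on a freshly created article (create_articledata is the same assumed helper as in the earlier sketch):

article = create_articledata()
BaseCollectionCrawler.set_pages(article, "12-34")
# article.fpage == "12", article.lpage == "34"
BaseCollectionCrawler.set_pages(article, "45--67", separator="--")  # custom separator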