Coverage for src/crawler/base_crawler.py: 79%

430 statements  

coverage.py v7.7.0, created at 2025-04-02 15:25 +0000

1import time 

2from collections.abc import Sequence 

3from datetime import timedelta 

4 

5import regex 

6import requests 

7from bs4 import BeautifulSoup 

8from django.conf import settings 

9from django.contrib.auth.models import User 

10from django.utils import timezone 

11from langcodes import standardize_tag 

12from lingua import LanguageDetectorBuilder 

13from ptf.cmds import xml_cmds 

14from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas 

15from ptf.cmds.xml.jats.builder.citation import ( 

16 get_article_title_xml, 

17 get_author_xml, 

18 get_fpage_xml, 

19 get_lpage_xml, 

20 get_source_xml, 

21 get_year_xml, 

22) 

23from ptf.cmds.xml.jats.builder.issue import get_title_xml 

24from ptf.cmds.xml.jats.jats_parser import JatsRef, check_bibitem_xml 

25from ptf.display.resolver import extids_formats, resolve_id 

26from ptf.model_data import ( 

27 ArticleData, 

28 ContributorDict, 

29 IssueData, 

30 RefData, 

31 ResourceData, 

32 create_abstract, 

33 create_contributor, 

34 create_extlink, 

35 create_issuedata, 

36 create_publisherdata, 

37) 

38from ptf.model_data_converter import update_data_for_jats 

39from pylatexenc.latex2text import LatexNodes2Text 

40from pysolr import SolrError 

41from requests_cache import CachedSession, FileCache 

42 

43from crawler.models import Source 

44from crawler.models.container_source import ContainerSource 

45from crawler.types import CitationLiteral 

46from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection 

47 

48# TODO: pass a class factory instead of a dependency to a site 

49# TODO: pass a class factory instead of a dependency to a site 

50 

51 

52class BaseCollectionCrawler: 

53 """ 

54 Base class for the collection crawlers.

55 To create a crawler: 

56 1) derive a class from BaseCollectionCrawler and name it XXXCrawler 

57 2) override the functions parse_collection_content, parse_issue_content and parse_article_content 

58 3) update factory.py so that crawler_factory can return your new crawler 

59 """ 
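# Illustrative sketch only: a minimal derived crawler, with a hypothetical class name,
# source values and CSS selector (none of these are taken from this repository):
#
#     class ExampleCrawler(BaseCollectionCrawler):
#         source_name = "Example Digital Library"
#         source_domain = "EXAMPLE"
#         source_website = "https://example.org"
#
#         def parse_collection_content(self, content):
#             soup = BeautifulSoup(content, "html.parser")
#             return [
#                 self.create_xissue(link.get("href"), "2000", "6", None)
#                 for link in soup.select("a.issue-link")
#             ]
#
#         # ...plus parse_issue_content / parse_article_content overrides, and a
#         # registration in factory.py as described in step 3 above.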

60 

61 source_name = "" 

62 source_domain = "" 

63 source_website = "" 

64 

65 issue_href = "" 

66 

67 collection = None 

68 source = None 

69 user = None 

70 session: requests.Session | CachedSession 

71 # Updated in constructor with user agent from settings_local 

72 headers = {"accept_encoding": "utf-8"} 

73 

74 next_allowed_request: float = time.time() 

75 

76 # seconds to wait between two http requests 

77 requests_interval = 5 

78 

79 latext_parser = LatexNodes2Text() 

80 

81 # Override the values in your concrete crawler if the formulas in text (titles, abstracts) 

82 # do not use the "$" to surround tex formulas 

83 delimiter_inline_formula = "$" 

84 delimiter_disp_formula = "$" 
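# For instance, a subclass whose sources wrap display formulas in "$$" could simply set
# delimiter_disp_formula = "$$" (hypothetical example, not one of the existing crawlers).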

85 

86 # HACK : Workaround for tests (monkeypatching) 

87 # We store the class here, so we can monkeypatch it when running tests 

88 # subCrawlers = { 

89 # LofplCrawler: None 

90 # } 

91 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {} 

92 

93 language_detector = LanguageDetectorBuilder.from_all_languages().build() 

94 

95 force_refresh = False 

96 

97 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})" 

98 

99 # Set this to False on a per-crawler basis to allow inserting articles without PDFs

100 ignore_missing_pdf = True 

101 

102 def __init__( 

103 self, 

104 *args, 

105 username: str, 

106 collection_id: str, 

107 collection_url: str, 

108 test_mode: bool = False, 

109 publisher: str = "mathdoc", 

110 force_refresh=False, 

111 ): 

112 for CrawlerClass in self.subCrawlers: 

113 self.subCrawlers[CrawlerClass] = CrawlerClass( 

114 *args, 

115 username=username, 

116 collection_id=collection_id, 

117 collection_url=collection_url, 

118 test_mode=test_mode, 

119 publisher=publisher, 

120 ) 

121 

122 self.username = username 

123 

124 self.collection_id = collection_id 

125 self.collection_url = ( 

126 collection_url # url of the collection. Ex: https://eudml.org/journal/10098 

127 ) 

128 

129 self.test_mode = test_mode 

130 self.publisher = publisher 

131 

132 # Skipped when running tests 

133 self.initialize() 

134 

135 self.session = CachedSession( 

136 backend=FileCache( 

137 getattr(settings, "REQUESTS_CACHE_LOCATION", "/tmp/ptf_requests_cache"), 

138 decode_content=False, 

139 ), 

140 expire_after=timedelta(days=30), 

141 ) 

142 self.headers.update( 

143 { 

144 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"), 

145 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"), 

146 } 

147 ) 

148 

149 self.force_refresh = force_refresh 

150 

151 def initialize(self): 

152 """ 

153 Acts as a "second" init function to skip model accesses during test data generation 

154 """ 

155 self.collection = get_or_create_collection(self.collection_id) 

156 self.source = self.get_or_create_source() 

157 self.user = User.objects.get(username=self.username) 

158 

159 @classmethod 

160 def can_crawl(cls, pid: str) -> bool: 

161 return True 

162 

163 def parse_collection_content(self, content: str) -> list[IssueData]: 

164 """ 

165 Parse the HTML content with BeautifulSoup 

166 returns a list of xissues.

167 Override this function in a derived class 

168 """ 

169 return [] 

170 

171 def parse_issue_content(self, content: str, xissue: IssueData): 

172 """ 

173 Parse the HTML content with BeautifulSoup 

174 Fills the xissue.articles 

175 Override this function in a derived class. 

176 

177 Caveat: you are supposed to create the articles here. Please assign a PID to each article.

178 The PID can be `a + article_index`, e.g. `a0`, `a21`

179 """ 

180 

181 def parse_article_content( 

182 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str 

183 ) -> ArticleData | None: 

184 """ 

185 Parse the HTML content with BeautifulSoup 

186 returns the xarticle. 

187 Override this function in a derived class. 

188 The xissue is passed to the function in case the article page has issue information (ex: publisher) 

189 The article url is also passed as a parameter 

190 

191 Caveat: you are supposed to assign the article PID again here

192 """ 

193 return xarticle 

194 

195 def crawl_collection(self): 

196 # TODO: Comments, filter 

197 """ 

198 Crawl an entire collection. ptf.models.Container objects are created. 

199 - get the HTML content of the collection_url 

200 - parse the HTML content with beautifulsoup to extract the list of issues 

201 - merge the xissues (some sources can have multiple pages for 1 volume/issue. We create only 1 container)

202 - crawl each issue if col_only is False 

203 - Returns the merged issues.

204 It is a dict {pid: xissue}.

205 The key is the pid of the merged issues. 

206 Ex: the source may have Volume 6 (2000) and Volume 6 (1999);

207 the pid is then made with 1999-2000__6_ 

208 """ 
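# Hedged usage sketch (the crawler class and argument values are hypothetical):
#
#     crawler = ExampleCrawler(
#         username="admin",
#         collection_id="EXAMPLE_COL",
#         collection_url="https://example.org/journal",
#     )
#     for pid, xissue in crawler.crawl_collection().items():
#         crawler.crawl_issue(xissue)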

209 

210 if self.source is None: 

211 raise RuntimeError("ERROR: the source is not set") 

212 

213 content = self.download_file(self.collection_url) 

214 xissues = self.parse_collection_content(content) 

215 

216 """ 

217 Some collections split the same volume across different pages

218 Ex: Volume 6 (2000) and Volume 6 (1999) 

219 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000) 

220 """ 

221 # merged_xissues = self.merge_xissues(xissues) 

222 

223 xissues_dict = {str(i.pid): i for i in xissues} 

224 

225 return xissues_dict 

226 

227 def crawl_issue(self, xissue: IssueData): 

228 """ 

229 Crawl one web page of an issue.

230 - get the HTML content of the issue 

231 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata 

232 - crawl each article 

233 """ 

234 

235 # Some sources, like EuDML, do not have separate HTML pages for an issue's table of contents.

236 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute

237 

238 issue_url = xissue.url 

239 if issue_url is not None: 

240 if issue_url.endswith(".pdf"): 

241 add_pdf_link_to_xarticle(xissue, issue_url) 

242 xissue.url = None 

243 else: 

244 content = self.download_file(issue_url) 

245 self.parse_issue_content(content, xissue) 

246 

247 xarticles = xissue.articles 

248 

249 parsed_xarticles = [] 

250 

251 for xarticle in xarticles: 

252 parsed_xarticle = self.crawl_article(xarticle, xissue) 

253 if parsed_xarticle is not None: 

254 parsed_xarticles.append(parsed_xarticle) 

255 

256 xissue.articles = parsed_xarticles 

257 

258 article_has_pdf = self.article_has_pdf(xissue) 

259 

260 if self.ignore_missing_pdf: 

261 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)] 

262 

263 if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf): 

264 self.process_resource_metadata(xissue) 

265 self.add_xissue_into_database(xissue) 

266 

267 @staticmethod 

268 def article_has_source(art: ArticleData | IssueData): 

269 return ( 

270 next( 

271 (e_link for e_link in art.ext_links if e_link["rel"] == "source"), 

272 None, 

273 ) 

274 is not None 

275 ) 

276 

277 @staticmethod 

278 def article_has_pdf(art: ArticleData | IssueData): 

279 return ( 

280 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None) 

281 is not None 

282 ) 

283 

284 def crawl_article(self, xarticle: ArticleData, xissue: IssueData): 

285 # ARTICLE URL as an ExtLink (to display the link in the article page)

286 if xarticle.url is None: 

287 if not self.article_has_source(xarticle):  [287 ↛ 297: didn't jump because the condition was always true]

288 if xissue.url: 

289 article_source = xissue.url 

290 else: 

291 article_source = self.collection_url 

292 ext_link = create_extlink() 

293 ext_link["rel"] = "source" 

294 ext_link["location"] = article_source 

295 ext_link["metadata"] = self.source_domain 

296 xarticle.ext_links.append(ext_link) 

297 return self.process_resource_metadata(xarticle) 

298 

299 content = self.download_file(xarticle.url) 

300 

301 parsed_xarticle = self.parse_article_content(content, xissue, xarticle, xarticle.url) 

302 if parsed_xarticle is None:  [302 ↛ 303: didn't jump because the condition was never true]

303 return None 

304 

305 if parsed_xarticle.doi: 

306 parsed_xarticle.pid = ( 

307 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

308 ) 

309 else: 

310 parsed_xarticle.pid = f"{xissue.pid}_{xarticle.pid}" 

311 

312 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url: 

313 ext_link = create_extlink() 

314 ext_link["rel"] = "source" 

315 ext_link["location"] = parsed_xarticle.url 

316 ext_link["metadata"] = self.source_domain 

317 parsed_xarticle.ext_links.append(ext_link) 

318 

319 # The article title may have formulas surrounded with '$' 

320 return self.process_resource_metadata(parsed_xarticle) 

321 

322 def process_resource_metadata(self, xresource: ResourceData): 

323 html, xml = get_html_and_xml_from_text_with_formulas( 

324 xresource.title_tex, 

325 delimiter_inline=self.delimiter_inline_formula, 

326 delimiter_disp=self.delimiter_disp_formula, 

327 ) 

328 xml = get_title_xml(xml, with_tex_values=False) 

329 xresource.title_html = html 

330 xresource.title_xml = xml 

331 

332 abstracts_to_parse = [ 

333 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract" 

334 ] 

335 # abstract may have formulas surrounded with '$' 

336 if len(abstracts_to_parse) > 0: 

337 for xabstract in abstracts_to_parse: 

338 html, xml = get_html_and_xml_from_text_with_formulas( 

339 xabstract["value_tex"], 

340 delimiter_inline=self.delimiter_inline_formula, 

341 delimiter_disp=self.delimiter_disp_formula, 

342 ) 

343 xabstract["value_html"] = html 

344 lang = xabstract["lang"] 

345 if lang == xresource.lang: 

346 xabstract["value_xml"] = f'<abstract xml:lang="{lang}">{xml}</abstract>' 

347 else: 

348 xabstract[ 

349 "value_xml" 

350 ] = f'<trans-abstract xml:lang="{lang}">{xml}</trans-abstract>' 

351 

352 if isinstance(xresource, ArticleData): 

353 update_data_for_jats(xresource) 

354 return xresource 

355 

356 def get(self, url: str, force_refresh=False): 

357 attempt = 0 

358 response = None 

359 

360 while attempt < 3: 

361 # If the URL is already in the cache, we can skip the rate-limiting wait

362 if isinstance(self.session, CachedSession):  [362 ↛ 367: didn't jump because the condition was always true]

363 if not self.session.cache.contains(url=url): 

364 delta = self.next_allowed_request - time.time() 

365 if delta > 0: 

366 time.sleep(delta) 

367 self.next_allowed_request = time.time() + self.requests_interval 

368 try: 

369 # For SSL Errors, use verify=False kwarg 

370 verify = True 

371 if url.startswith("https://hdml.di.ionio.gr/"):  [371 ↛ 372: didn't jump because the condition was never true]

372 verify = False 

373 # self.session.cache.delete(urls=[url]) 

374 if isinstance(self.session, CachedSession):  [374 ↛ 379: didn't jump because the condition was always true]

375 response = self.session.get( 

376 url, headers=self.headers, verify=verify, force_refresh=force_refresh 

377 ) 

378 else: 

379 response = self.session.get(url, headers=self.headers, verify=verify) 

380 if not response.ok: 

381 raise requests.exceptions.HTTPError( 

382 f"Endpoint answered with code {response.status_code} : {url}", 

383 response=response, 

384 ) 

385 return response 

386 except ( 

387 requests.ConnectionError, 

388 requests.ConnectTimeout, 

389 requests.exceptions.HTTPError, 

390 ): 

391 attempt += 1 

392 raise requests.exceptions.HTTPError(f"Unable to download {url}") 

393 

394 def download_file(self, url: str, force_refresh=False): 

395 """ 

396 Downloads a URL and returns its content (the HTTP response is cached on disk by the session).

397 """ 

398 response = self.get(url, force_refresh=force_refresh or self.force_refresh) 

399 content = self.decode_response(response) 

400 if content == "" or not content:  [400 ↛ 401: didn't jump because the condition was never true]

401 raise requests.exceptions.HTTPError(response) 

402 return content 

403 

404 def decode_response(self, response: requests.Response, encoding: str = "utf-8"): 

405 Override this if the content-type headers from the sources advertise something other than the actual content.

406 SASA needs this.

407 response.encoding = encoding 

408 return response.text 

409 

410 def add_xissue_into_database(self, xissue: IssueData): 

411 xissue.journal = self.collection 

412 

413 if xissue.year == "": 

414 raise ValueError("Failsafe : Cannot insert issue without a year") 

415 

416 xpub = create_publisherdata() 

417 xpub.name = self.publisher 

418 xissue.publisher = xpub 

419 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat() 

420 

421 attempt = 1 

422 success = False 

423 

424 while not success and attempt < 4: 

425 try: 

426 params = {"xissue": xissue, "use_body": False} 

427 cmd = xml_cmds.addOrUpdateIssueXmlCmd(params) 

428 container = cmd.do() 

429 success = True 

430 ContainerSource.objects.create(source=self.source, container=container) 

431 except SolrError: 

432 attempt += 1 

433 time.sleep(10) 

434 

435 def get_metadata_using_citation_meta( 

436 self, 

437 xarticle: ArticleData, 

438 xissue: IssueData, 

439 soup: BeautifulSoup, 

440 what: list[CitationLiteral] = [], 

441 ): 

442 """ 

443 :param xarticle: the xarticle that will collect the metadata 

444 :param xissue: the xissue that will collect the publisher 

445 :param soup: the BeautifulSoup object of the article page

446 :param what: list of citation_* meta tags to collect.

447 :return: None. The given article is modified 

448 """ 
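# Hedged usage sketch (hypothetical call site): a derived crawler's parse_article_content
# would typically do something like
#
#     soup = BeautifulSoup(content, "html.parser")
#     self.get_metadata_using_citation_meta(
#         xarticle, xissue, soup, ["title", "author", "pdf", "doi", "references"]
#     )
#
# and then fill in whatever the citation_* meta tags do not provide.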

449 

450 if "title" in what: 

451 # TITLE 

452 citation_title_node = soup.select_one("meta[name='citation_title']") 

453 if citation_title_node:  [453 ↛ 458: didn't jump because the condition was always true]

454 title = citation_title_node.get("content") 

455 if isinstance(title, str):  [455 ↛ 458: didn't jump because the condition was always true]

456 xarticle.title_tex = title 

457 

458 if "author" in what:  [458 ↛ 487: didn't jump because the condition was always true]

459 # AUTHORS 

460 citation_author_nodes = soup.select("meta[name^='citation_author']") 

461 current_author: ContributorDict | None = None 

462 for citation_author_node in citation_author_nodes: 

463 if citation_author_node.get("name") == "citation_author": 

464 text_author = citation_author_node.get("content") 

465 if not isinstance(text_author, str):  [465 ↛ 466: didn't jump because the condition was never true]

466 raise ValueError("Cannot parse author") 

467 if text_author == "":  [467 ↛ 468: didn't jump because the condition was never true]

468 current_author = None 

469 continue 

470 current_author = create_contributor(role="author", string_name=text_author) 

471 xarticle.contributors.append(current_author) 

472 continue 

473 if current_author is None:  [473 ↛ 474: didn't jump because the condition was never true]

474 print("Couldn't parse citation author") 

475 continue 

476 if citation_author_node.get("name") == "citation_author_institution": 

477 text_institution = citation_author_node.get("content") 

478 if not isinstance(text_institution, str):  [478 ↛ 479: didn't jump because the condition was never true]

479 continue 

480 current_author["addresses"].append(text_institution) 

481 if citation_author_node.get("name") == "citation_author_ocrid":  [481 ↛ 482: didn't jump because the condition was never true]

482 text_orcid = citation_author_node.get("content") 

483 if not isinstance(text_orcid, str): 

484 continue 

485 current_author["orcid"] = text_orcid 

486 

487 if "pdf" in what: 

488 # PDF 

489 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]') 

490 if citation_pdf_node: 

491 pdf_url = citation_pdf_node.get("content") 

492 if isinstance(pdf_url, str):  [492 ↛ 495: didn't jump because the condition was always true]

493 add_pdf_link_to_xarticle(xarticle, pdf_url) 

494 

495 if "lang" in what: 

496 # LANG 

497 citation_lang_node = soup.select_one("meta[name='citation_language']") 

498 if citation_lang_node:  [498 ↛ 504: didn't jump because the condition was always true]

499 # TODO: check other language code 

500 content_text = citation_lang_node.get("content") 

501 if isinstance(content_text, str):  [501 ↛ 504: didn't jump because the condition was always true]

502 xarticle.lang = standardize_tag(content_text) 

503 

504 if "abstract" in what: 

505 # ABSTRACT 

506 abstract_node = soup.select_one("meta[name='citation_abstract']") 

507 if abstract_node is not None: 

508 abstract = abstract_node.get("content") 

509 if not isinstance(abstract, str):  [509 ↛ 510: didn't jump because the condition was never true]

510 raise ValueError("Couldn't parse abstract from meta") 

511 abstract = BeautifulSoup(abstract, "html.parser").text 

512 lang = abstract_node.get("lang") 

513 if not isinstance(lang, str):  [513 ↛ 514: didn't jump because the condition was never true]

514 lang = self.detect_language(abstract, xarticle) 

515 xarticle.abstracts.append( 

516 { 

517 "tag": "abstract", 

518 "value_html": "", 

519 "value_tex": abstract, 

520 "value_xml": "", 

521 "lang": lang, 

522 } 

523 ) 

524 

525 if "page" in what: 

526 # PAGES 

527 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']") 

528 if citation_fpage_node: 

529 page = citation_fpage_node.get("content") 

530 if isinstance(page, str):  [530 ↛ 535: didn't jump because the condition was always true]

531 page = page.split("(")[0] 

532 if len(page) < 32:  [532 ↛ 535: didn't jump because the condition was always true]

533 xarticle.fpage = page 

534 

535 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']") 

536 if citation_lpage_node: 

537 page = citation_lpage_node.get("content") 

538 if isinstance(page, str):  [538 ↛ 543: didn't jump because the condition was always true]

539 page = page.split("(")[0] 

540 if len(page) < 32:  [540 ↛ 543: didn't jump because the condition was always true]

541 xarticle.lpage = page 

542 

543 if "doi" in what: 

544 # DOI 

545 citation_doi_node = soup.select_one("meta[name='citation_doi']") 

546 if citation_doi_node: 

547 doi = citation_doi_node.get("content") 

548 if isinstance(doi, str):  [548 ↛ 555: didn't jump because the condition was always true]

549 doi = doi.strip() 

550 pos = doi.find("10.") 

551 if pos > 0: 

552 doi = doi[pos:] 

553 xarticle.doi = doi 

554 

555 if "mr" in what: 

556 # MR 

557 citation_mr_node = soup.select_one("meta[name='citation_mr']") 

558 if citation_mr_node:  [558 ↛ 559: didn't jump because the condition was never true]

559 mr = citation_mr_node.get("content") 

560 if isinstance(mr, str): 

561 mr = mr.strip() 

562 if mr.find("MR") == 0: 

563 mr = mr[2:] 

564 xarticle.extids.append(("mr-item-id", mr)) 

565 

566 if "zbl" in what: 

567 # ZBL 

568 citation_zbl_node = soup.select_one("meta[name='citation_zbl']") 

569 if citation_zbl_node: 

570 zbl = citation_zbl_node.get("content") 

571 if isinstance(zbl, str):  [571 ↛ 577: didn't jump because the condition was always true]

572 zbl = zbl.strip() 

573 if zbl.find("Zbl") == 0:  [573 ↛ 577: didn't jump because the condition was always true]

574 zbl = zbl[3:].strip() 

575 xarticle.extids.append(("zbl-item-id", zbl)) 

576 

577 if "publisher" in what: 

578 # PUBLISHER 

579 citation_publisher_node = soup.select_one("meta[name='citation_publisher']") 

580 if citation_publisher_node: 

581 pub = citation_publisher_node.get("content") 

582 if isinstance(pub, str):  [582 ↛ 589: didn't jump because the condition was always true]

583 pub = pub.strip() 

584 if pub != "":  [584 ↛ 589: didn't jump because the condition was always true]

585 xpub = create_publisherdata() 

586 xpub.name = pub 

587 xissue.publisher = xpub 

588 

589 if "keywords" in what: 

590 # KEYWORDS 

591 citation_kwd_nodes = soup.select("meta[name='citation_keywords']") 

592 for kwd_node in citation_kwd_nodes: 

593 kwds = kwd_node.get("content") 

594 if isinstance(kwds, str):  [594 ↛ 592: didn't jump because the condition was always true]

595 kwds = kwds.split(",") 

596 for kwd in kwds: 

597 if kwd == "": 

598 continue 

599 kwd = kwd.strip() 

600 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd}) 

601 

602 if "references" in what: 

603 citation_references = soup.select("meta[name='citation_reference']") 

604 for index, tag in enumerate(citation_references): 

605 content = tag.get("content") 

606 if not isinstance(content, str):  [606 ↛ 607: didn't jump because the condition was never true]

607 raise ValueError("Cannot parse citation_reference meta") 

608 xarticle.bibitems.append( 

609 self.__parse_meta_citation_reference(content, str(index + 1)) 

610 ) 

611 

612 def create_xissue( 

613 self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1" 

614 ): 

615 if url is not None and url.endswith("/"): 

616 url = url[:-1] 

617 xissue = create_issuedata() 

618 xissue.url = url 

619 

620 xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number) 

621 

622 xissue.year = year 

623 

624 if volume_number is not None:  [624 ↛ 627: didn't jump because the condition was always true]

625 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number) 

626 

627 if issue_number is not None: 

628 xissue.number = issue_number.replace(",", "-") 

629 return xissue 

630 

631 def detect_language(self, text: str, article: ArticleData | None = None): 

632 if article and article.lang is not None and article.lang != "und": 

633 return article.lang 

634 

635 language = self.language_detector.detect_language_of(text) 

636 

637 if not language:  [637 ↛ 638: didn't jump because the condition was never true]

638 return "und" 

639 return language.iso_code_639_1.name.lower() 

640 

641 references_mapping = { 

642 "citation_title": get_article_title_xml, 

643 "citation_journal_title": get_source_xml, 

644 "citation_publication_date": get_year_xml, 

645 "citation_firstpage": get_fpage_xml, 

646 "citation_lastpage": get_lpage_xml, 

647 } 
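# Illustrative (hypothetical) citation_reference meta content handled by the helper below:
#
#     "citation_author=Doe, J.; citation_title=On widgets; citation_journal_title=J. Hyp.;
#      citation_publication_date=2001; citation_firstpage=1; citation_lastpage=10"
#
# Author entries are converted to author XML first; every other recognised key is passed
# to the matching builder in references_mapping above and appended to the citation XML.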

648 

649 @classmethod 

650 def __parse_meta_citation_reference(cls, content: str, label=None): 

651 categories = content.split(";") 

652 

653 if len(categories) == 1: 

654 return cls.create_crawled_bibitem(content, label=label) 

655 

656 citation_data = [c.split("=") for c in categories if "=" in c] 

657 del categories 

658 

659 xml_string = "" 

660 authors_parsed = False 

661 authors_strings = [] 

662 for data in citation_data: 

663 key = data[0].strip() 

664 citation_content = data[1] 

665 if key == "citation_author": 

666 authors_strings.append(get_author_xml(template_str=citation_content)) 

667 continue 

668 elif not authors_parsed: 

669 xml_string += ", ".join(authors_strings) 

670 authors_parsed = True 

671 

672 if key in cls.references_mapping: 

673 xml_string += " " + cls.references_mapping[key](citation_content) 

674 

675 return cls.create_crawled_bibitem(xml_string, label=label) 

676 

677 @classmethod 

678 def get_or_create_source(cls): 

679 source, created = Source.objects.get_or_create( 

680 domain=cls.source_domain, 

681 defaults={ 

682 "name": cls.source_name, 

683 "website": cls.source_website, 

684 }, 

685 ) 

686 if created:  [686 ↛ 687: didn't jump because the condition was never true]

687 source.save() 

688 return source 

689 

690 @staticmethod 

691 def create_crawled_bibitem(ref_value: str | JatsRef, label=None): 

692 if isinstance(ref_value, str): 

693 xref = RefData(lang="en") 

694 value_xml = "" 

695 if label: 

696 value_xml += f"<label>{label}</label>" 

697 # xref.citation_tex = "".join([e["value_tex"] for e in elements]) 

698 value_xml += f'<mixed-citation xml:space="preserve">{ref_value}</mixed-citation>' 

699 xref.citation_xml = value_xml 

700 else: 

701 xref = ref_value 

702 

703 xref = check_bibitem_xml(xref) 

704 

705 # Bakes extlink badges into the bibliography html 

706 # Maybe we should put this into another file (jats_parser ?) 

707 for extid in xref.extids: 

708 href = resolve_id(extid[0], extid[1]) 

709 if (not href) or (not xref.citation_html):  [709 ↛ 710: didn't jump because the condition was never true]

710 continue 

711 str_format = extid[0] 

712 if str_format in extids_formats:  [712 ↛ 714: didn't jump because the condition was always true]

713 str_format = extids_formats[str_format] 

714 xref.citation_html += f" | <a href={href} class='badge bg-secondary rounded-pill ref-badge extid-badge'>{str_format}</a>" 

715 

716 return xref 

717 

718 @staticmethod 

719 def create_bibliography(bibitems: Sequence[RefData]): 

720 xml_str = "<ref-list>\n" 

721 html_str = "<div>\n" 

722 

723 for item in bibitems: 

724 xml_str += f"\t{item.citation_xml}\n" 

725 html_str += f"\t<p>{item.citation_html}</p>\n" 

726 xml_str += "</ref-list>" 

727 

728 # for item in bibitems: 

729 # html_str = 

730 # html_str += f"\t<p>{item.citation_html}</p>\n" 

731 html_str += "</div>" 

732 

733 tex_str = "<div>\n" 

734 for item in bibitems: 

735 tex_str += f"\t<p>{item.citation_tex}</p>\n" 

736 tex_str += "</div>" 

737 

738 biblio_dict = create_abstract( 

739 tag="biblio", 

740 value_html=html_str, 

741 value_tex=tex_str, 

742 value_xml=xml_str, 

743 lang="en", 

744 ) 

745 

746 return biblio_dict 

747 

748 @staticmethod 

749 def get_issue_pid( 

750 collection_id: str, 

751 year: str, 

752 volume_number: str | None = None, 

753 issue_number: str | None = None, 

754 ): 

755 # Replace any character other than letters, digits and '-' with an underscore

756 pid = f"{collection_id}_{year}" 

757 if volume_number is not None: 

758 pid += f"_{volume_number}" 

759 if issue_number is not None: 

760 pid += f"_{issue_number}" 

761 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid)) 

762 return pid 
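# Example with hypothetical values, assuming cleanup_str leaves the string unchanged:
#     get_issue_pid("AMBP", "1999-2000", "6", None)  ->  "AMBP_1999-2000_6"
# (any character outside [a-zA-Z0-9-] is collapsed into an underscore)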

763 

764 @staticmethod 

765 def set_pages(article: ArticleData, pages: str, separator: str = "-"): 

766 pages_split = pages.split(separator) 

767 if len(pages_split) == 0:  [767 ↛ 768: didn't jump because the condition was never true]

768 article.page_range = pages 

769 if len(pages_split) > 0:  [769 ↛ exit: didn't return from 'set_pages' because the condition was always true]

770 if pages[0].isnumeric(): 

771 article.fpage = pages_split[0] 

772 if ( 

773 len(pages_split) > 1 

774 and pages_split[0] != pages_split[1] 

775 and pages_split[1].isnumeric() 

776 ): 

777 article.lpage = pages_split[1]
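# Example (hypothetical): set_pages(article, "123-145") sets article.fpage = "123" and
# article.lpage = "145"; set_pages(article, "123--145", separator="--") gives the same result.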