Coverage for src/crawler/base_crawler.py: 78%

428 statements  

coverage.py v7.7.0, created at 2025-03-28 11:29 +0000

1import time 

2from collections.abc import Sequence 

3from datetime import timedelta 

4 

5import regex 

6import requests 

7from bs4 import BeautifulSoup 

8from django.conf import settings 

9from django.contrib.auth.models import User 

10from django.utils import timezone 

11from langcodes import standardize_tag 

12from lingua import LanguageDetectorBuilder 

13from ptf.cmds import xml_cmds 

14from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas 

15from ptf.cmds.xml.jats.builder.citation import ( 

16 get_article_title_xml, 

17 get_author_xml, 

18 get_fpage_xml, 

19 get_lpage_xml, 

20 get_source_xml, 

21 get_year_xml, 

22) 

23from ptf.cmds.xml.jats.builder.issue import get_title_xml 

24from ptf.cmds.xml.jats.jats_parser import JatsRef, check_bibitem_xml 

25from ptf.display.resolver import extids_formats, resolve_id 

26from ptf.model_data import ( 

27 ArticleData, 

28 ContributorDict, 

29 IssueData, 

30 RefData, 

31 create_abstract, 

32 create_contributor, 

33 create_extlink, 

34 create_issuedata, 

35 create_publisherdata, 

36) 

37from ptf.model_data_converter import update_data_for_jats 

38from pylatexenc.latex2text import LatexNodes2Text 

39from pysolr import SolrError 

40from requests_cache import CachedSession, FileCache 

41 

42from crawler.models import Source 

43from crawler.models.container_source import ContainerSource 

44from crawler.types import CitationLiteral 

45from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection 

46 

47# TODO: pass a class factory instead of a dependency to a site 

49 

50 

51class BaseCollectionCrawler: 

52 """ 

53 Base collection for the crawlers. 

54 To create a crawler: 

55 1) derive a class from BaseCollectionCrawler and name it XXXCrawler 

56 2) override the functions parse_collection_content, parse_issue_content and parse_article_content 

57 3) update factory.py so that crawler_factory can return your new crawler 

58 """ 

59 

60 source_name = "" 

61 source_domain = "" 

62 source_website = "" 

63 

64 issue_href = "" 

65 

66 collection = None 

67 source = None 

68 user = None 

69 session: requests.Session | CachedSession 

70 # Updated in constructor with user agent from settings_local 

71 headers = {"accept_encoding": "utf-8"} 

72 

73 next_allowed_request: float = time.time() 

74 

75 # seconds to wait between two http requests 

76 requests_interval = 5 

77 

78 latext_parser = LatexNodes2Text() 

79 

80 # Override the values in your concrete crawler if the formulas in text (titles, abstracts) 

81 # do not use the "$" to surround tex formulas 

82 delimiter_inline_formula = "$" 

83 delimiter_disp_formula = "$" 

84 

85 # HACK: Workaround for tests (monkeypatching) 

86 # We store the class here, so we can monkeypatch it when running tests 

87 # subCrawlers = { 

88 # LofplCrawler: None 

89 # } 

90 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {} 

91 

92 language_detector = LanguageDetectorBuilder.from_all_languages().build() 

93 

94 force_refresh = False 

95 

96 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})" 
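# Example (sketch): the pattern above captures the identifier part of an ORCID URL, e.g.
# regex.match(BaseCollectionCrawler.orcid_re, "https://orcid.org/0000-0001-2345-6789").group("orcid")
# evaluates to "0000-0001-2345-6789".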

97 

98 # Set this to False on a per-crawler basis to allow inserting articles without PDFs 

99 ignore_missing_pdf = True 

100 

101 def __init__( 

102 self, 

103 *args, 

104 username: str, 

105 collection_id: str, 

106 collection_url: str, 

107 test_mode: bool = False, 

108 publisher: str = "mathdoc", 

109 force_refresh=False, 

110 ): 

111 for CrawlerClass in self.subCrawlers: 

112 self.subCrawlers[CrawlerClass] = CrawlerClass( 

113 *args, 

114 username=username, 

115 collection_id=collection_id, 

116 collection_url=collection_url, 

117 test_mode=test_mode, 

118 publisher=publisher, 

119 ) 

120 

121 self.username = username 

122 

123 self.collection_id = collection_id 

124 self.collection_url = ( 

125 collection_url # url of the collection. Ex: https://eudml.org/journal/10098 

126 ) 

127 

128 self.test_mode = test_mode 

129 self.publisher = publisher 

130 

131 # Skipped when running tests 

132 self.initialize() 

133 

134 self.session = CachedSession( 

135 backend=FileCache( 

136 getattr(settings, "REQUESTS_CACHE_LOCATION", "/tmp/ptf_requests_cache"), 

137 decode_content=False, 

138 ), 

139 expire_after=timedelta(days=30), 

140 ) 

141 self.headers.update( 

142 { 

143 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"), 

144 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"), 

145 } 

146 ) 

147 

148 self.force_refresh = force_refresh 

149 

150 def initialize(self): 

151 """ 

152 Acts as a "second" init function to skip model accesses during test data generation 

153 """ 

154 self.collection = get_or_create_collection(self.collection_id) 

155 self.source = self.get_or_create_source() 

156 self.user = User.objects.get(username=self.username) 

157 

158 @classmethod 

159 def can_crawl(cls, pid: str) -> bool: 

160 return True 

161 

162 def parse_collection_content(self, content: str) -> list[IssueData]: 

163 """ 

164 Parse the HTML content with BeautifulSoup 

165 returns a list of xissue. 

166 Override this function in a derived class 

167 """ 

168 return [] 
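# Illustrative override (sketch only; the CSS selector and the year/volume/issue values are
# hypothetical, a real crawler extracts them from the page):
#
# def parse_collection_content(self, content):
#     soup = BeautifulSoup(content, "html.parser")
#     xissues = []
#     for link in soup.select("a.issue-link"):  # hypothetical selector
#         xissues.append(self.create_xissue(link.get("href"), "2000", "6", "1"))
#     return xissues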

169 

170 def parse_issue_content(self, content: str, xissue: IssueData): 

171 """ 

172 Parse the HTML content with BeautifulSoup 

173 Fills the xissue.articles 

174 Override this function in a derived class. 

175 

176 Caveat: you are supposed to create the articles here. Please assign a PID to each article. 

177 The PID can be `a` + article_index, e.g. `a0`, `a21` (see the sketch after this method). 

178 """ 

179 

180 def parse_article_content( 

181 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str 

182 ) -> ArticleData | None: 

183 """ 

184 Parse the HTML content with BeautifulSoup 

185 returns the xarticle. 

186 Override this function in a derived class. 

187 The xissue is passed to the function in case the article page has issue information (ex: publisher) 

188 The article url is also passed as a parameter 

189 

190 Caveat: you are supposed to assign the article PID again here (see the sketch after this method). 

191 """ 

192 return xarticle 
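# Illustrative override (sketch only): a typical implementation delegates most of the metadata
# extraction to get_metadata_using_citation_meta (defined below); the values in "what" are
# CitationLiteral entries handled by that method.
#
# def parse_article_content(self, content, xissue, xarticle, url):
#     soup = BeautifulSoup(content, "html.parser")
#     self.get_metadata_using_citation_meta(
#         xarticle, xissue, soup, ["title", "author", "pdf", "abstract", "page"]
#     )
#     return xarticle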

193 

194 def crawl_collection(self): 

195 # TODO: Comments, filter 

196 """ 

197 Crawl an entire collection. ptf.models.Container objects are created. 

198 - get the HTML content of the collection_url 

199 - parse the HTML content with BeautifulSoup to extract the list of issues 

200 - merge the xissues (some sources can have multiple pages for 1 volume/issue; we create only 1 container) 

201 - crawl each issue if col_only is False 

202 - Returns the merged issues as a dict 

203 {pid: xissue} 

204 The key is the pid of the merged issues. 

205 Ex: the source may have Volume 6 (2000) and Volume 6 (1999); 

206 the pid is then made with 1999-2000__6_ 

207 """ 

208 

209 if self.source is None: 

210 raise RuntimeError("ERROR: the source is not set") 

211 

212 content = self.download_file(self.collection_url) 

213 xissues = self.parse_collection_content(content) 

214 

215 """ 

216 Some collections split the same volume across different pages 

217 Ex: Volume 6 (2000) and Volume 6 (1999) 

218 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000) 

219 """ 

220 # merged_xissues = self.merge_xissues(xissues) 

221 

222 xissues_dict = {str(i.pid): i for i in xissues} 

223 

224 return xissues_dict 

225 

226 def crawl_issue(self, xissue: IssueData): 

227 """ 

228 Crawl one web page of an issue. 

229 - get the HTML content of the issue 

230 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata 

231 - crawl each article 

232 """ 

233 

234 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents. 

235 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute. 

236 

237 issue_url = xissue.url 

238 if issue_url is not None: 

239 if issue_url.endswith(".pdf"): 

240 add_pdf_link_to_xarticle(xissue, issue_url) 

241 xissue.url = None 

242 else: 

243 content = self.download_file(issue_url) 

244 self.parse_issue_content(content, xissue) 

245 

246 xarticles = xissue.articles 

247 

248 parsed_xarticles = [] 

249 

250 for xarticle in xarticles: 

251 parsed_xarticle = self.crawl_article(xarticle, xissue) 

252 if parsed_xarticle is not None: 

253 parsed_xarticles.append(parsed_xarticle) 

254 

255 xissue.articles = parsed_xarticles 

256 

257 article_has_pdf = self.article_has_pdf(xissue) 

258 

259 if self.ignore_missing_pdf: 

260 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)] 

261 

262 if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf): 

263 self.add_xissue_into_database(xissue) 

264 

265 @staticmethod 

266 def article_has_source(art: ArticleData | IssueData): 

267 return ( 

268 next( 

269 (e_link for e_link in art.ext_links if e_link["rel"] == "source"), 

270 None, 

271 ) 

272 is not None 

273 ) 

274 

275 @staticmethod 

276 def article_has_pdf(art: ArticleData | IssueData): 

277 return ( 

278 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None) 

279 is not None 

280 ) 

281 

282 def crawl_article(self, xarticle: ArticleData, xissue: IssueData): 

283 # ARTICLE URL as an ExtLink (to display the link in the article page) 

284 if xarticle.url is None: 

285 if not self.article_has_source(xarticle): 285 ↛ 295, line 285 didn't jump to line 295 because the condition on line 285 was always true

286 if xissue.url: 

287 article_source = xissue.url 

288 else: 

289 article_source = self.collection_url 

290 ext_link = create_extlink() 

291 ext_link["rel"] = "source" 

292 ext_link["location"] = article_source 

293 ext_link["metadata"] = self.source_domain 

294 xarticle.ext_links.append(ext_link) 

295 return self.process_article_metadata(xarticle) 

296 

297 content = self.download_file(xarticle.url) 

298 

299 parsed_xarticle = self.parse_article_content(content, xissue, xarticle, xarticle.url) 

300 if parsed_xarticle is None: 300 ↛ 301, line 300 didn't jump to line 301 because the condition on line 300 was never true

301 return None 

302 

303 if parsed_xarticle.doi: 

304 parsed_xarticle.pid = ( 

305 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

306 ) 

307 else: 

308 parsed_xarticle.pid = f"{xissue.pid}_{xarticle.pid}" 

309 

310 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url: 

311 ext_link = create_extlink() 

312 ext_link["rel"] = "source" 

313 ext_link["location"] = parsed_xarticle.url 

314 ext_link["metadata"] = self.source_domain 

315 parsed_xarticle.ext_links.append(ext_link) 

316 

317 # The article title may have formulas surrounded with '$' 

318 return self.process_article_metadata(parsed_xarticle) 

319 

320 def process_article_metadata(self, xarticle: ArticleData): 

321 html, xml = get_html_and_xml_from_text_with_formulas( 

322 xarticle.title_tex, 

323 delimiter_inline=self.delimiter_inline_formula, 

324 delimiter_disp=self.delimiter_disp_formula, 

325 ) 

326 xml = get_title_xml(xml, with_tex_values=False) 

327 xarticle.title_html = html 

328 xarticle.title_xml = xml 

329 

330 abstracts_to_parse = [ 

331 xabstract for xabstract in xarticle.abstracts if xabstract["tag"] == "abstract" 

332 ] 

333 # abstract may have formulas surrounded with '$' 

334 if len(abstracts_to_parse) > 0: 

335 for xabstract in abstracts_to_parse: 

336 html, xml = get_html_and_xml_from_text_with_formulas( 

337 xabstract["value_tex"], 

338 delimiter_inline=self.delimiter_inline_formula, 

339 delimiter_disp=self.delimiter_disp_formula, 

340 ) 

341 xabstract["value_html"] = html 

342 lang = xabstract["lang"] 

343 if lang == xarticle.lang: 

344 xabstract["value_xml"] = f'<abstract xml:lang="{lang}">{xml}</abstract>' 

345 else: 

346 xabstract[ 

347 "value_xml" 

348 ] = f'<trans-abstract xml:lang="{lang}">{xml}</trans-abstract>' 

349 

350 update_data_for_jats(xarticle) 

351 

352 return xarticle 

353 

354 def get(self, url: str, force_refresh=False): 

355 attempt = 0 

356 response = None 

357 

358 while attempt < 3: 

359 # If we already have a key, we can skip the timeout 

360 if isinstance(self.session, CachedSession): 360 ↛ 365, line 360 didn't jump to line 365 because the condition on line 360 was always true

361 if not self.session.cache.contains(url=url): 

362 delta = self.next_allowed_request - time.time() 

363 if delta > 0: 

364 time.sleep(delta) 

365 self.next_allowed_request = time.time() + self.requests_interval 

366 try: 

367 # For SSL Errors, use verify=False kwarg 

368 verify = True 

369 if url.startswith("https://hdml.di.ionio.gr/"): 369 ↛ 370, line 369 didn't jump to line 370 because the condition on line 369 was never true

370 verify = False 

371 # self.session.cache.delete(urls=[url]) 

372 if isinstance(self.session, CachedSession): 372 ↛ 377, line 372 didn't jump to line 377 because the condition on line 372 was always true

373 response = self.session.get( 

374 url, headers=self.headers, verify=verify, force_refresh=force_refresh 

375 ) 

376 else: 

377 response = self.session.get(url, headers=self.headers, verify=verify) 

378 if not response.ok: 

379 raise requests.exceptions.HTTPError( 

380 f"Endpoint answered with code {response.status_code} : {url}", 

381 response=response, 

382 ) 

383 return response 

384 except ( 

385 requests.ConnectionError, 

386 requests.ConnectTimeout, 

387 requests.exceptions.HTTPError, 

388 ): 

389 attempt += 1 

390 raise requests.exceptions.HTTPError(f"Unable to download {url}") 

391 

392 def download_file(self, url: str, force_refresh=False): 

393 """ 

394 Downloads a URL (the response may be cached on disk by the session) and returns its decoded content. 

395 """ 

396 response = self.get(url, force_refresh=force_refresh or self.force_refresh) 

397 content = self.decode_response(response) 

398 if content == "" or not content: 398 ↛ 399, line 398 didn't jump to line 399 because the condition on line 398 was never true

399 raise requests.exceptions.HTTPError(response) 

400 return content 

401 

402 def decode_response(self, response: requests.Response, encoding: str = "utf-8"): 

403 """Override this if the content-type headers from the sources are advertising something else than the actual content 

404 SASA needs this""" 

405 response.encoding = encoding 

406 return response.text 

407 

408 def add_xissue_into_database(self, xissue: IssueData): 

409 xissue.journal = self.collection 

410 

411 if xissue.year == "": 

412 raise ValueError("Failsafe : Cannot insert issue without a year") 

413 

414 xpub = create_publisherdata() 

415 xpub.name = self.publisher 

416 xissue.publisher = xpub 

417 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat() 

418 

419 attempt = 1 

420 success = False 

421 

422 while not success and attempt < 4: 

423 try: 

424 params = {"xissue": xissue, "use_body": False} 

425 cmd = xml_cmds.addOrUpdateIssueXmlCmd(params) 

426 container = cmd.do() 

427 success = True 

428 ContainerSource.objects.create(source=self.source, container=container) 

429 except SolrError: 

430 attempt += 1 

431 time.sleep(10) 

432 

433 def get_metadata_using_citation_meta( 

434 self, 

435 xarticle: ArticleData, 

436 xissue: IssueData, 

437 soup: BeautifulSoup, 

438 what: list[CitationLiteral] = [], 

439 ): 

440 """ 

441 :param xarticle: the xarticle that will collect the metadata 

442 :param xissue: the xissue that will collect the publisher 

443 :param soup: the BeautifulSoup object of the article page 

444 :param what: list of citation_* meta items to collect. 

445 :return: None. The given article is modified in place. 

446 """ 

447 

448 if "title" in what: 

449 # TITLE 

450 citation_title_node = soup.select_one("meta[name='citation_title']") 

451 if citation_title_node: 451 ↛ 456, line 451 didn't jump to line 456 because the condition on line 451 was always true

452 title = citation_title_node.get("content") 

453 if isinstance(title, str): 453 ↛ 456, line 453 didn't jump to line 456 because the condition on line 453 was always true

454 xarticle.title_tex = title 

455 

456 if "author" in what: 456 ↛ 485line 456 didn't jump to line 485 because the condition on line 456 was always true

457 # AUTHORS 

458 citation_author_nodes = soup.select("meta[name^='citation_author']") 

459 current_author: ContributorDict | None = None 

460 for citation_author_node in citation_author_nodes: 

461 if citation_author_node.get("name") == "citation_author": 

462 text_author = citation_author_node.get("content") 

463 if not isinstance(text_author, str): 463 ↛ 464, line 463 didn't jump to line 464 because the condition on line 463 was never true

464 raise ValueError("Cannot parse author") 

465 if text_author == "": 465 ↛ 466, line 465 didn't jump to line 466 because the condition on line 465 was never true

466 current_author = None 

467 continue 

468 current_author = create_contributor(role="author", string_name=text_author) 

469 xarticle.contributors.append(current_author) 

470 continue 

471 if current_author is None: 471 ↛ 472, line 471 didn't jump to line 472 because the condition on line 471 was never true

472 print("Couldn't parse citation author") 

473 continue 

474 if citation_author_node.get("name") == "citation_author_institution": 

475 text_institution = citation_author_node.get("content") 

476 if not isinstance(text_institution, str): 476 ↛ 477, line 476 didn't jump to line 477 because the condition on line 476 was never true

477 continue 

478 current_author["addresses"].append(text_institution) 

479 if citation_author_node.get("name") == "citation_author_ocrid": 479 ↛ 480, line 479 didn't jump to line 480 because the condition on line 479 was never true

480 text_orcid = citation_author_node.get("content") 

481 if not isinstance(text_orcid, str): 

482 continue 

483 current_author["orcid"] = text_orcid 

484 

485 if "pdf" in what: 

486 # PDF 

487 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]') 

488 if citation_pdf_node: 

489 pdf_url = citation_pdf_node.get("content") 

490 if isinstance(pdf_url, str): 490 ↛ 493, line 490 didn't jump to line 493 because the condition on line 490 was always true

491 add_pdf_link_to_xarticle(xarticle, pdf_url) 

492 

493 if "lang" in what: 

494 # LANG 

495 citation_lang_node = soup.select_one("meta[name='citation_language']") 

496 if citation_lang_node: 496 ↛ 502, line 496 didn't jump to line 502 because the condition on line 496 was always true

497 # TODO: check other language code 

498 content_text = citation_lang_node.get("content") 

499 if isinstance(content_text, str): 499 ↛ 502, line 499 didn't jump to line 502 because the condition on line 499 was always true

500 xarticle.lang = standardize_tag(content_text) 

501 

502 if "abstract" in what: 

503 # ABSTRACT 

504 abstract_node = soup.select_one("meta[name='citation_abstract']") 

505 if abstract_node is not None: 

506 abstract = abstract_node.get("content") 

507 if not isinstance(abstract, str): 507 ↛ 508, line 507 didn't jump to line 508 because the condition on line 507 was never true

508 raise ValueError("Couldn't parse abstract from meta") 

509 abstract = BeautifulSoup(abstract, "html.parser").text 

510 lang = abstract_node.get("lang") 

511 if not isinstance(lang, str): 511 ↛ 512, line 511 didn't jump to line 512 because the condition on line 511 was never true

512 lang = self.detect_language(abstract, xarticle) 

513 xarticle.abstracts.append( 

514 { 

515 "tag": "abstract", 

516 "value_html": "", 

517 "value_tex": abstract, 

518 "value_xml": "", 

519 "lang": lang, 

520 } 

521 ) 

522 

523 if "page" in what: 

524 # PAGES 

525 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']") 

526 if citation_fpage_node: 

527 page = citation_fpage_node.get("content") 

528 if isinstance(page, str): 528 ↛ 533, line 528 didn't jump to line 533 because the condition on line 528 was always true

529 page = page.split("(")[0] 

530 if len(page) < 32: 530 ↛ 533, line 530 didn't jump to line 533 because the condition on line 530 was always true

531 xarticle.fpage = page 

532 

533 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']") 

534 if citation_lpage_node: 

535 page = citation_lpage_node.get("content") 

536 if isinstance(page, str): 536 ↛ 541, line 536 didn't jump to line 541 because the condition on line 536 was always true

537 page = page.split("(")[0] 

538 if len(page) < 32: 538 ↛ 541, line 538 didn't jump to line 541 because the condition on line 538 was always true

539 xarticle.lpage = page 

540 

541 if "doi" in what: 

542 # DOI 

543 citation_doi_node = soup.select_one("meta[name='citation_doi']") 

544 if citation_doi_node: 

545 doi = citation_doi_node.get("content") 

546 if isinstance(doi, str): 546 ↛ 553, line 546 didn't jump to line 553 because the condition on line 546 was always true

547 doi = doi.strip() 

548 pos = doi.find("10.") 

549 if pos > 0: 

550 doi = doi[pos:] 

551 xarticle.doi = doi 

552 

553 if "mr" in what: 

554 # MR 

555 citation_mr_node = soup.select_one("meta[name='citation_mr']") 

556 if citation_mr_node: 556 ↛ 557, line 556 didn't jump to line 557 because the condition on line 556 was never true

557 mr = citation_mr_node.get("content") 

558 if isinstance(mr, str): 

559 mr = mr.strip() 

560 if mr.find("MR") == 0: 

561 mr = mr[2:] 

562 xarticle.extids.append(("mr-item-id", mr)) 

563 

564 if "zbl" in what: 

565 # ZBL 

566 citation_zbl_node = soup.select_one("meta[name='citation_zbl']") 

567 if citation_zbl_node: 

568 zbl = citation_zbl_node.get("content") 

569 if isinstance(zbl, str): 569 ↛ 575, line 569 didn't jump to line 575 because the condition on line 569 was always true

570 zbl = zbl.strip() 

571 if zbl.find("Zbl") == 0: 571 ↛ 575, line 571 didn't jump to line 575 because the condition on line 571 was always true

572 zbl = zbl[3:].strip() 

573 xarticle.extids.append(("zbl-item-id", zbl)) 

574 

575 if "publisher" in what: 

576 # PUBLISHER 

577 citation_publisher_node = soup.select_one("meta[name='citation_publisher']") 

578 if citation_publisher_node: 

579 pub = citation_publisher_node.get("content") 

580 if isinstance(pub, str): 580 ↛ 587, line 580 didn't jump to line 587 because the condition on line 580 was always true

581 pub = pub.strip() 

582 if pub != "": 582 ↛ 587, line 582 didn't jump to line 587 because the condition on line 582 was always true

583 xpub = create_publisherdata() 

584 xpub.name = pub 

585 xissue.publisher = xpub 

586 

587 if "keywords" in what: 

588 # KEYWORDS 

589 citation_kwd_nodes = soup.select("meta[name='citation_keywords']") 

590 for kwd_node in citation_kwd_nodes: 

591 kwds = kwd_node.get("content") 

592 if isinstance(kwds, str): 592 ↛ 590, line 592 didn't jump to line 590 because the condition on line 592 was always true

593 kwds = kwds.split(",") 

594 for kwd in kwds: 

595 if kwd == "": 

596 continue 

597 kwd = kwd.strip() 

598 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd}) 

599 

600 if "references" in what: 

601 citation_references = soup.select("meta[name='citation_reference']") 

602 for index, tag in enumerate(citation_references): 

603 content = tag.get("content") 

604 if not isinstance(content, str): 604 ↛ 605, line 604 didn't jump to line 605 because the condition on line 604 was never true

605 raise ValueError("Cannot parse citation_reference meta") 

606 xarticle.bibitems.append( 

607 self.__parse_meta_citation_reference(content, str(index + 1)) 

608 ) 

609 

610 def create_xissue( 

611 self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1" 

612 ): 

613 if url is not None and url.endswith("/"): 

614 url = url[:-1] 

615 xissue = create_issuedata() 

616 xissue.url = url 

617 

618 xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number) 

619 

620 xissue.year = year 

621 

622 if volume_number is not None: 622 ↛ 625, line 622 didn't jump to line 625 because the condition on line 622 was always true

623 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number) 

624 

625 if issue_number is not None: 

626 xissue.number = issue_number.replace(",", "-") 

627 return xissue 

628 

629 def detect_language(self, text: str, article: ArticleData | None = None): 

630 if article and article.lang is not None and article.lang != "und": 

631 return article.lang 

632 

633 language = self.language_detector.detect_language_of(text) 

634 

635 if not language: 635 ↛ 636, line 635 didn't jump to line 636 because the condition on line 635 was never true

636 return "und" 

637 return language.iso_code_639_1.name.lower() 

638 

639 references_mapping = { 

640 "citation_title": get_article_title_xml, 

641 "citation_journal_title": get_source_xml, 

642 "citation_publication_date": get_year_xml, 

643 "citation_firstpage": get_fpage_xml, 

644 "citation_lastpage": get_lpage_xml, 

645 } 

646 

647 @classmethod 

648 def __parse_meta_citation_reference(cls, content: str, label=None): 

649 categories = content.split(";") 

650 

651 if len(categories) == 1: 

652 return cls.create_crawled_bibitem(content, label=label) 

653 

654 citation_data = [c.split("=") for c in categories if "=" in c] 

655 del categories 

656 

657 xml_string = "" 

658 authors_parsed = False 

659 authors_strings = [] 

660 for data in citation_data: 

661 key = data[0].strip() 

662 citation_content = data[1] 

663 if key == "citation_author": 

664 authors_strings.append(get_author_xml(template_str=citation_content)) 

665 continue 

666 elif not authors_parsed: 

667 xml_string += ", ".join(authors_strings) 

668 authors_parsed = True 

669 

670 if key in cls.references_mapping: 

671 xml_string += " " + cls.references_mapping[key](citation_content) 

672 

673 return cls.create_crawled_bibitem(xml_string, label=label) 
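# Example of the meta "content" string handled above (all values illustrative):
#   "citation_author=Doe, J.; citation_title=On examples; citation_journal_title=J. Example Math.;
#    citation_publication_date=2000; citation_firstpage=1; citation_lastpage=10"
# Each "key=value" pair is turned into JATS XML (authors via get_author_xml, the other keys via
# references_mapping) and the result is wrapped in a <mixed-citation> by create_crawled_bibitem.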

674 

675 @classmethod 

676 def get_or_create_source(cls): 

677 source, created = Source.objects.get_or_create( 

678 domain=cls.source_domain, 

679 defaults={ 

680 "name": cls.source_name, 

681 "website": cls.source_website, 

682 }, 

683 ) 

684 if created: 684 ↛ 685, line 684 didn't jump to line 685 because the condition on line 684 was never true

685 source.save() 

686 return source 

687 

688 @staticmethod 

689 def create_crawled_bibitem(ref_value: str | JatsRef, label=None): 

690 if isinstance(ref_value, str): 

691 xref = RefData(lang="en") 

692 value_xml = "" 

693 if label: 

694 value_xml += f"<label>{label}</label>" 

695 # xref.citation_tex = "".join([e["value_tex"] for e in elements]) 

696 value_xml += f'<mixed-citation xml:space="preserve">{ref_value}</mixed-citation>' 

697 xref.citation_xml = value_xml 

698 else: 

699 xref = ref_value 

700 

701 xref = check_bibitem_xml(xref) 

702 

703 # Bakes extlink badges into the bibliography html 

704 # Maybe we should put this into another file (jats_parser ?) 

705 for extid in xref.extids: 

706 href = resolve_id(extid[0], extid[1]) 

707 if (not href) or (not xref.citation_html): 707 ↛ 708, line 707 didn't jump to line 708 because the condition on line 707 was never true

708 continue 

709 str_format = extid[0] 

710 if str_format in extids_formats: 710 ↛ 712, line 710 didn't jump to line 712 because the condition on line 710 was always true

711 str_format = extids_formats[str_format] 

712 xref.citation_html += f" | <a href={href} class='badge bg-secondary rounded-pill ref-badge extid-badge'>{str_format}</a>" 

713 

714 return xref 

715 

716 @staticmethod 

717 def create_bibliography(bibitems: Sequence[RefData]): 

718 xml_str = "<ref-list>\n" 

719 html_str = "<div>\n" 

720 

721 for item in bibitems: 

722 xml_str += f"\t{item.citation_xml}\n" 

723 html_str += f"\t<p>{item.citation_html}</p>\n" 

724 xml_str += "</ref-list>" 

725 

726 # for item in bibitems: 

727 # html_str = 

728 # html_str += f"\t<p>{item.citation_html}</p>\n" 

729 html_str += "</div>" 

730 

731 tex_str = "<div>\n" 

732 for item in bibitems: 

733 tex_str += f"\t<p>{item.citation_tex}</p>\n" 

734 tex_str += "</div>" 

735 

736 biblio_dict = create_abstract( 

737 tag="biblio", 

738 value_html=html_str, 

739 value_tex=tex_str, 

740 value_xml=xml_str, 

741 lang="en", 

742 ) 

743 

744 return biblio_dict 

745 

746 @staticmethod 

747 def get_issue_pid( 

748 collection_id: str, 

749 year: str, 

750 volume_number: str | None = None, 

751 issue_number: str | None = None, 

752 ): 

753 # Replace any character that is not alphanumeric or a hyphen with an underscore 

754 pid = f"{collection_id}_{year}" 

755 if volume_number is not None: 755 ↛ 757, line 755 didn't jump to line 757 because the condition on line 755 was always true

756 pid += f"_{volume_number}" 

757 if issue_number is not None: 

758 pid += f"_{issue_number}" 

759 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid)) 

760 return pid 
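# Example (illustrative collection id; assumes cleanup_str leaves this string unchanged):
# get_issue_pid("EXMPL", "1999-2000", "6", None) returns "EXMPL_1999-2000_6"
# (hyphens are kept; any other run of non-alphanumeric characters becomes a single "_").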

761 

762 @staticmethod 

763 def set_pages(article: ArticleData, pages: str, separator: str = "-"): 

764 pages_split = pages.split(separator) 

765 if len(pages_split) == 0: 765 ↛ 766, line 765 didn't jump to line 766 because the condition on line 765 was never true

766 article.page_range = pages 

767 if len(pages_split) > 0: 767 ↛ exit, line 767 didn't return from function 'set_pages' because the condition on line 767 was always true

768 if pages[0].isnumeric(): 

769 article.fpage = pages_split[0] 

770 if ( 

771 len(pages_split) > 1 

772 and pages_split[0] != pages_split[1] 

773 and pages_split[1].isnumeric() 

774 ): 

775 article.lpage = pages_split[1]
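# Example: set_pages(article, "123-145") sets article.fpage = "123" and article.lpage = "145";
# set_pages(article, "123--145", separator="--") produces the same result.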