Coverage for src/crawler/base_crawler.py: 79%

433 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

1import time 

2from collections.abc import Sequence 

3from datetime import timedelta 

4 

5import regex 

6import requests 

7from bs4 import BeautifulSoup 

8from django.conf import settings 

9from django.contrib.auth.models import User 

10from django.utils import timezone 

11from langcodes import standardize_tag 

12from lingua import LanguageDetector, LanguageDetectorBuilder 

13from ptf.cmds import xml_cmds 

14from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas 

15from ptf.cmds.xml.jats.builder.citation import ( 

16 get_article_title_xml, 

17 get_author_xml, 

18 get_fpage_xml, 

19 get_lpage_xml, 

20 get_source_xml, 

21 get_year_xml, 

22) 

23from ptf.cmds.xml.jats.builder.issue import get_title_xml 

24from ptf.cmds.xml.jats.jats_parser import JatsRef, check_bibitem_xml 

25from ptf.display.resolver import extids_formats, resolve_id 

26from ptf.model_data import ( 

27 ArticleData, 

28 ContributorDict, 

29 IssueData, 

30 RefData, 

31 create_abstract, 

32 create_contributor, 

33 create_extlink, 

34 create_issuedata, 

35 create_publisherdata, 

36) 

37from ptf.model_data_converter import update_data_for_jats 

38from pylatexenc.latex2text import LatexNodes2Text 

39from pysolr import SolrError 

40from requests_cache import CachedSession, FileCache 

41 

42from crawler.models import Periode, Source 

43from crawler.types import CitationLiteral 

44from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection 

45 

46# TODO: pass a class factory instead of a dependency to a site 

48 

49 

50class BaseCollectionCrawler: 

51 """ 

52 Base class for the collection crawlers.

53 To create a crawler: 

54 1) derive a class from BaseCollectionCrawler and name it XXXCrawler 

55 2) override the functions parse_collection_content, parse_issue_content and parse_article_content 

56 3) update factory.py so that crawler_factory can return your new crawler 

57 """ 

58 

59 source_name = "" 

60 source_domain = "" 

61 source_website = "" 

62 

63 periode_begin: int = 0 

64 periode_end: int = 9999 

65 

66 issue_href = "" 

67 

68 collection = None 

69 source = None 

70 periode = None 

71 user = None 

72 session: requests.Session | CachedSession 

73 # Updated in constructor with user agent from settings_local 

74 headers = {"accept_encoding": "utf-8"} 

75 

76 next_allowed_request: float = time.time() 

77 

78 # seconds to wait between two http requests 

79 requests_interval = 5 

80 

81 latext_parser = LatexNodes2Text() 

82 

83 # Override the values in your concrete crawler if the formulas in text (titles, abstracts) 

84 # do not use the "$" to surround tex formulas 

85 delimiter_inline_formula = "$" 

86 delimiter_disp_formula = "$" 

87 

88 # HACK : Workaround for tests (monkeypatching) 

89 # We store the class here, so we can monkeypatch it when running tests 

90 # subCrawlers = { 

91 # LofplCrawler: None 

92 # } 

93 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {} 

94 

95 language_detector: LanguageDetector 

96 

97 force_refresh = False 

98 

99 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})" 

100 

101 def __init__( 

102 self, 

103 *args, 

104 username: str, 

105 collection_id: str, 

106 collection_url: str, 

107 test_mode: bool = False, 

108 publisher: str = "mathdoc", 

109 force_refresh=False, 

110 ): 

111 for CrawlerClass in self.subCrawlers: 

112 self.subCrawlers[CrawlerClass] = CrawlerClass( 

113 *args, 

114 username=username, 

115 collection_id=collection_id, 

116 collection_url=collection_url, 

117 test_mode=test_mode, 

118 publisher=publisher, 

119 ) 

120 

121 self.username = username 

122 

123 self.collection_id = collection_id 

124 self.collection_url = ( 

125 collection_url # url of the collection. Ex: https://eudml.org/journal/10098 

126 ) 

127 

128 self.test_mode = test_mode 

129 self.publisher = publisher 

130 

131 # EUDML sets or creates the Periode based on the <meta name="citation_year"> found in the journal page 

132 # AMP sets or creates the Periode during the __init__ 

133 # TODO: see with other sources when to create the Periode 

134 self.periode = None 

135 self.periode_first_issue = None 

136 self.periode_last_issue = None 

137 

138 self.language_detector = LanguageDetectorBuilder.from_all_languages().build() 

139 

140 # Skipped when running tests 

141 self.initialize() 

142 

143 self.session = CachedSession( 

144 backend=FileCache( 

145 getattr(settings, "REQUESTS_CACHE_LOCATION", "/tmp/ptf_requests_cache"), 

146 decode_content=False, 

147 ), 

148 expire_after=timedelta(days=30), 

149 ) 

150 self.headers.update( 

151 { 

152 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"), 

153 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"), 

154 } 

155 ) 

156 

157 self.force_refresh = force_refresh 

158 

159 def initialize(self): 

160 """ 

161 Acts as a "second" init function to skip model accesses during test data generation 

162 """ 

163 self.collection = get_or_create_collection(self.collection_id) 

164 self.source = self.get_or_create_source() 

165 self.periode = self.get_or_create_periode() 

166 self.user = User.objects.get(username=self.username) 

167 

168 def parse_collection_content(self, content: str) -> list[IssueData]: 

169 """ 

170 Parse the HTML content with BeautifulSoup 

171 returns a list of xissues.

172 Override this function in a derived class 

173 """ 

174 return [] 

175 

176 def parse_issue_content(self, content: str, xissue: IssueData): 

177 """ 

178 Parse the HTML content with BeautifulSoup 

179 Fills the xissue.articles 

180 Override this function in a derived class. 

181 

182 Caveat: you are supposed to create the articles here. Please assign a PID to each article.

183 The PID can be `a` + article_index, e.g. `a0`, `a21`.

184 """ 

185 

186 def parse_article_content( 

187 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str, pid: str 

188 ) -> ArticleData | None: 

189 """ 

190 Parse the HTML content with BeautifulSoup 

191 returns the xarticle. 

192 Override this function in a derived class. 

193 The xissue is passed to the function in case the article page has issue information (ex: publisher) 

194 The article url is also passed as a parameter 

195 

196 Caveat: you are supposed to assign the article pid again here.

197 """ 

198 xarticle.pid = pid 

199 return xarticle 

200 

201 def crawl_collection(self): 

202 # TODO: Comments, filter 

203 """ 

204 Crawl an entire collection. ptf.models.Container objects are created. 

205 - get the HTML content of the collection_url 

206 - parse the HTML content with beautifulsoup to extract the list of issues 

207 - merge the xissues (some Source can have multiple pages for 1 volume/issue. We create only 1 container) 

208 - (the issues themselves are crawled separately with crawl_issue)

209 - Returns the issues found.

210 It is a dict {pid: xissue}

211 The key is the pid of the merged issues. 

212 Ex: the source may have Volume 6 (2000) and Volume 6 (1999);

213 the pid is then made with 1999-2000__6_ 

214 """ 

215 

216 if self.source is None: 

217 raise RuntimeError("ERROR: the source is not set") 

218 

219 content = self.download_file(self.collection_url) 

220 xissues = self.parse_collection_content(content) 

221 

222 # xissues = [ 

223 # issue 

224 # for issue in xissues 

225 # if int(issue.year) >= self.periode_begin and int(issue.year) <= self.periode_end 

226 # ] 

227 

228 """ 

229 Some collections split the same volumes in different pages 

230 Ex: Volume 6 (2000) and Volume 6 (1999) 

231 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000) 

232 """ 

233 # merged_xissues = self.merge_xissues(xissues) 

234 

235 xissues_dict = {str(i.pid): i for i in xissues} 

236 

237 return xissues_dict 
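
# Typical driving code (a hedged sketch; the real entry point is whatever instantiates
# the crawler, e.g. crawler_factory, which is not part of this file). XXXCrawler and the
# URL are hypothetical, and the Django user "bot" is assumed to exist:
#
# crawler = XXXCrawler(
#     username="bot",
#     collection_id="MYCOLL",
#     collection_url="https://example.org/journal/42",
# )
# xissues = crawler.crawl_collection()
# for pid, xissue in xissues.items():
#     crawler.crawl_issue(xissue)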

238 

239 def crawl_issue(self, xissue: IssueData): 

240 """ 

241 Crawl one web page of an issue.

242 - get the HTML content of the issue 

243 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata 

244 - crawl each article 

245 """ 

246 

247 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.

248 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute.

249 

250 issue_url = xissue.url 

251 if issue_url is not None: 

252 if issue_url.endswith(".pdf"): 

253 add_pdf_link_to_xarticle(xissue, issue_url) 

254 xissue.url = None 

255 else: 

256 content = self.download_file(issue_url) 

257 self.parse_issue_content(content, xissue) 

258 

259 xarticles = xissue.articles 

260 

261 parsed_xarticles = [] 

262 

263 for xarticle in xarticles: 

264 parsed_xarticle = self.crawl_article(xarticle, xissue) 

265 if parsed_xarticle is not None: 

266 parsed_xarticles.append(parsed_xarticle) 

267 

268 xissue.articles = parsed_xarticles 

269 

270 article_has_pdf = ( 

271 next((link for link in xissue.ext_links if link["mimetype"] == "application/pdf"), None)

272 is not None 

273 ) 

274 if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf): 

275 self.add_xissue_into_database(xissue) 

276 

277 def crawl_article(self, xarticle: ArticleData, xissue: IssueData): 

278 # TODO : set pid in xarticle here instead of passing it to `parse_article_content` 

279 def article_has_source(art): 

280 return ( 

281 next( 

282 (e_link for e_link in art.ext_links if e_link["rel"] == "source"), 

283 None, 

284 ) 

285 is not None 

286 ) 

287 

288 # ARTICLE URL as an ExtLink (to display the link in the article page)

289 if xarticle.url is None: 

290 if not article_has_source(xarticle):  [290 ↛ 300: condition was always true]

291 if xissue.url: 

292 article_source = xissue.url 

293 else: 

294 article_source = self.collection_url 

295 ext_link = create_extlink() 

296 ext_link["rel"] = "source" 

297 ext_link["location"] = article_source 

298 ext_link["metadata"] = self.source_domain 

299 xarticle.ext_links.append(ext_link) 

300 return self.process_article_metadata(xarticle) 

301 

302 content = self.download_file(xarticle.url) 

303 pid = f"{xissue.pid}_{xarticle.pid}" 

304 

305 parsed_xarticle = self.parse_article_content(content, xissue, xarticle, xarticle.url, pid) 

306 if parsed_xarticle is None:  [306 ↛ 307: condition was never true]

307 return None 

308 

309 if not article_has_source(parsed_xarticle) and parsed_xarticle.url: 

310 ext_link = create_extlink() 

311 ext_link["rel"] = "source" 

312 ext_link["location"] = parsed_xarticle.url 

313 ext_link["metadata"] = self.source_domain 

314 parsed_xarticle.ext_links.append(ext_link) 

315 

316 # The article title may have formulas surrounded with '$' 

317 return self.process_article_metadata(parsed_xarticle) 

318 

319 def process_article_metadata(self, xarticle: ArticleData): 

320 html, xml = get_html_and_xml_from_text_with_formulas( 

321 xarticle.title_tex, 

322 delimiter_inline=self.delimiter_inline_formula, 

323 delimiter_disp=self.delimiter_disp_formula, 

324 ) 

325 xml = get_title_xml(xml, with_tex_values=False) 

326 xarticle.title_html = html 

327 xarticle.title_xml = xml 

328 

329 abstracts_to_parse = [ 

330 xabstract for xabstract in xarticle.abstracts if xabstract["tag"] == "abstract" 

331 ] 

332 # abstract may have formulas surrounded with '$' 

333 if len(abstracts_to_parse) > 0: 

334 for xabstract in abstracts_to_parse: 

335 html, xml = get_html_and_xml_from_text_with_formulas( 

336 xabstract["value_tex"], 

337 delimiter_inline=self.delimiter_inline_formula, 

338 delimiter_disp=self.delimiter_disp_formula, 

339 ) 

340 xabstract["value_html"] = html 

341 lang = xabstract["lang"] 

342 if lang == xarticle.lang: 

343 xabstract["value_xml"] = f'<abstract xml:lang="{lang}">{xml}</abstract>' 

344 else: 

345 xabstract[ 

346 "value_xml" 

347 ] = f'<trans-abstract xml:lang="{lang}">{xml}</trans-abstract>' 

348 

349 update_data_for_jats(xarticle) 

350 

351 return xarticle 

352 

353 def get(self, url: str, force_refresh=False): 

354 attempt = 0 

355 response = None 

356 

357 while attempt < 3: 

358 # If the URL is already in the cache, we can skip the rate-limit delay

359 if isinstance(self.session, CachedSession):  [359 ↛ 364: condition was always true]

360 if not self.session.cache.contains(url=url): 

361 delta = self.next_allowed_request - time.time() 

362 if delta > 0: 

363 time.sleep(delta) 

364 self.next_allowed_request = time.time() + self.requests_interval 

365 try: 

366 # For SSL Errors, use verify=False kwarg 

367 verify = True 

368 if url.startswith("https://hdml.di.ionio.gr/"):  [368 ↛ 369: condition was never true]

369 verify = False 

370 # self.session.cache.delete(urls=[url]) 

371 if isinstance(self.session, CachedSession):  [371 ↛ 376: condition was always true]

372 response = self.session.get( 

373 url, headers=self.headers, verify=verify, force_refresh=force_refresh 

374 ) 

375 else: 

376 response = self.session.get(url, headers=self.headers, verify=verify) 

377 if not response.ok: 

378 raise requests.exceptions.HTTPError( 

379 f"Endpoint answered with code {response.status_code} : {url}", 

380 response=response, 

381 ) 

382 return response 

383 except ( 

384 requests.ConnectionError, 

385 requests.ConnectTimeout, 

386 requests.exceptions.HTTPError, 

387 ): 

388 attempt += 1 

389 raise requests.exceptions.HTTPError(f"Unable to download {url}") 

390 

391 def download_file(self, url: str, force_refresh=False): 

392 """ 

393 Downloads a URL and returns its content (responses are cached via the CachedSession).

394 """ 

395 response = self.get(url, force_refresh=force_refresh or self.force_refresh) 

396 content = self.decode_response(response) 

397 if content == "" or not content:  [397 ↛ 398: condition was never true]

398 raise requests.exceptions.HTTPError(response) 

399 return content 

400 

401 def decode_response(self, response: requests.Response, encoding: str = "utf-8"): 

402 """Override this if the content-type headers from the sources are advertising something else than the actual content 

403 SASA needs this""" 

404 response.encoding = encoding 

405 return response.text 
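
# Example override (a sketch) for a source whose pages are actually encoded in
# windows-1250 even though the HTTP headers advertise another charset:
#
# class SomeCrawler(BaseCollectionCrawler):
#     def decode_response(self, response, encoding="windows-1250"):
#         return super().decode_response(response, encoding)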

406 

407 def add_xissue_into_database(self, xissue: IssueData): 

408 xissue.journal = self.collection 

409 

410 xpub = create_publisherdata() 

411 xpub.name = self.publisher 

412 xissue.publisher = xpub 

413 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat() 

414 

415 attempt = 1 

416 success = False 

417 

418 while not success and attempt < 4: 

419 try: 

420 params = {"xissue": xissue, "use_body": False} 

421 cmd = xml_cmds.addOrUpdateIssueXmlCmd(params) 

422 cmd.do() 

423 success = True 

424 except SolrError: 

425 attempt += 1 

426 time.sleep(10) 

427 

428 def get_metadata_using_citation_meta( 

429 self, 

430 xarticle: ArticleData, 

431 xissue: IssueData, 

432 soup: BeautifulSoup, 

433 what: list[CitationLiteral] = [], 

434 ): 

435 """ 

436 :param xarticle: the xarticle that will collect the metadata 

437 :param xissue: the xissue that will collect the publisher 

438 :param soup: the BeautifulSoup object of the article page

439 :param what: list of citation_* meta items to collect.

440 :return: None. The given xarticle is modified in place.

441 """ 

442 

443 if "title" in what: 

444 # TITLE 

445 citation_title_node = soup.select_one("meta[name='citation_title']") 

446 if citation_title_node:  [446 ↛ 451: condition was always true]

447 title = citation_title_node.get("content") 

448 if isinstance(title, str):  [448 ↛ 451: condition was always true]

449 xarticle.title_tex = title 

450 

451 if "author" in what: 451 ↛ 480line 451 didn't jump to line 480 because the condition on line 451 was always true

452 # AUTHORS 

453 citation_author_nodes = soup.select("meta[name^='citation_author']") 

454 current_author: ContributorDict | None = None 

455 for citation_author_node in citation_author_nodes: 

456 if citation_author_node.get("name") == "citation_author": 

457 text_author = citation_author_node.get("content") 

458 if not isinstance(text_author, str):  [458 ↛ 459: condition was never true]

459 raise ValueError("Cannot parse author") 

460 if text_author == "": 460 ↛ 461line 460 didn't jump to line 461 because the condition on line 460 was never true

461 current_author = None 

462 continue 

463 current_author = create_contributor(role="author", string_name=text_author) 

464 xarticle.contributors.append(current_author) 

465 continue 

466 if current_author is None:  [466 ↛ 467: condition was never true]

467 print("Couldn't parse citation author") 

468 continue 

469 if citation_author_node.get("name") == "citation_author_institution": 

470 text_institution = citation_author_node.get("content") 

471 if not isinstance(text_institution, str):  [471 ↛ 472: condition was never true]

472 continue 

473 current_author["addresses"].append(text_institution) 

474 if citation_author_node.get("name") == "citation_author_orcid":  [474 ↛ 475: condition was never true]

475 text_orcid = citation_author_node.get("content") 

476 if not isinstance(text_orcid, str): 

477 continue 

478 current_author["orcid"] = text_orcid 

479 

480 if "pdf" in what: 480 ↛ 488line 480 didn't jump to line 488 because the condition on line 480 was always true

481 # PDF 

482 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]') 

483 if citation_pdf_node: 

484 pdf_url = citation_pdf_node.get("content") 

485 if isinstance(pdf_url, str):  [485 ↛ 488: condition was always true]

486 add_pdf_link_to_xarticle(xarticle, pdf_url) 

487 

488 if "lang" in what: 

489 # LANG 

490 citation_lang_node = soup.select_one("meta[name='citation_language']") 

491 if citation_lang_node:  [491 ↛ 497: condition was always true]

492 # TODO: check other language code 

493 content_text = citation_lang_node.get("content") 

494 if isinstance(content_text, str):  [494 ↛ 497: condition was always true]

495 xarticle.lang = standardize_tag(content_text) 

496 

497 if "abstract" in what: 

498 # ABSTRACT 

499 abstract_node = soup.select_one("meta[name='citation_abstract']") 

500 if abstract_node is not None: 

501 abstract = abstract_node.get("content") 

502 if not isinstance(abstract, str):  [502 ↛ 503: condition was never true]

503 raise ValueError("Couldn't parse abstract from meta") 

504 abstract = BeautifulSoup(abstract, "html.parser").text 

505 lang = abstract_node.get("lang") 

506 if not isinstance(lang, str):  [506 ↛ 507: condition was never true]

507 lang = self.detect_language(abstract, xarticle) 

508 xarticle.abstracts.append( 

509 { 

510 "tag": "abstract", 

511 "value_html": "", 

512 "value_tex": abstract, 

513 "value_xml": "", 

514 "lang": lang, 

515 } 

516 ) 

517 

518 if "page" in what: 

519 # PAGES 

520 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']") 

521 if citation_fpage_node: 

522 page = citation_fpage_node.get("content") 

523 if isinstance(page, str):  [523 ↛ 528: condition was always true]

524 page = page.split("(")[0] 

525 if len(page) < 32:  [525 ↛ 528: condition was always true]

526 xarticle.fpage = page 

527 

528 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']") 

529 if citation_lpage_node: 

530 page = citation_lpage_node.get("content") 

531 if isinstance(page, str):  [531 ↛ 536: condition was always true]

532 page = page.split("(")[0] 

533 if len(page) < 32:  [533 ↛ 536: condition was always true]

534 xarticle.lpage = page 

535 

536 if "doi" in what: 

537 # DOI 

538 citation_doi_node = soup.select_one("meta[name='citation_doi']") 

539 if citation_doi_node: 

540 doi = citation_doi_node.get("content") 

541 if isinstance(doi, str):  [541 ↛ 549: condition was always true]

542 doi = doi.strip() 

543 pos = doi.find("10.") 

544 if pos > 0: 

545 doi = doi[pos:] 

546 xarticle.doi = doi 

547 xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_") 

548 

549 if "mr" in what: 

550 # MR 

551 citation_mr_node = soup.select_one("meta[name='citation_mr']") 

552 if citation_mr_node:  [552 ↛ 553: condition was never true]

553 mr = citation_mr_node.get("content") 

554 if isinstance(mr, str): 

555 mr = mr.strip() 

556 if mr.find("MR") == 0: 

557 mr = mr[2:] 

558 xarticle.extids.append(("mr-item-id", mr)) 

559 

560 if "zbl" in what: 

561 # ZBL 

562 citation_zbl_node = soup.select_one("meta[name='citation_zbl']") 

563 if citation_zbl_node: 

564 zbl = citation_zbl_node.get("content") 

565 if isinstance(zbl, str):  [565 ↛ 571: condition was always true]

566 zbl = zbl.strip() 

567 if zbl.find("Zbl") == 0: 567 ↛ 571line 567 didn't jump to line 571 because the condition on line 567 was always true

568 zbl = zbl[3:].strip() 

569 xarticle.extids.append(("zbl-item-id", zbl)) 

570 

571 if "publisher" in what: 

572 # PUBLISHER 

573 citation_publisher_node = soup.select_one("meta[name='citation_publisher']") 

574 if citation_publisher_node: 

575 pub = citation_publisher_node.get("content") 

576 if isinstance(pub, str):  [576 ↛ 583: condition was always true]

577 pub = pub.strip() 

578 if pub != "": 578 ↛ 583line 578 didn't jump to line 583 because the condition on line 578 was always true

579 xpub = create_publisherdata() 

580 xpub.name = pub 

581 xissue.publisher = xpub 

582 

583 if "keywords" in what: 

584 # KEYWORDS 

585 citation_kwd_nodes = soup.select("meta[name='citation_keywords']") 

586 for kwd_node in citation_kwd_nodes: 

587 kwds = kwd_node.get("content") 

588 if isinstance(kwds, str):  [588 ↛ 586: condition was always true]

589 kwds = kwds.split(",") 

590 for kwd in kwds: 

591 if kwd == "": 

592 continue 

593 kwd = kwd.strip() 

594 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd}) 

595 

596 if "references" in what: 

597 citation_references = soup.select("meta[name='citation_reference']") 

598 for index, tag in enumerate(citation_references): 

599 content = tag.get("content") 

600 if not isinstance(content, str):  [600 ↛ 601: condition was never true]

601 raise ValueError("Cannot parse citation_reference meta") 

602 xarticle.bibitems.append( 

603 self.__parse_meta_citation_reference(content, str(index + 1)) 

604 ) 
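
# Illustrative usage from a parse_article_content override (the <meta> values below are
# made up; only the meta names match what this method looks for):
#
# soup = BeautifulSoup(
#     '<meta name="citation_title" content="On a class of examples"/>'
#     '<meta name="citation_author" content="Doe, Jane"/>'
#     '<meta name="citation_author_institution" content="Example University"/>'
#     '<meta name="citation_pdf_url" content="https://example.org/a0.pdf"/>'
#     '<meta name="citation_language" content="en"/>',
#     "html.parser",
# )
# self.get_metadata_using_citation_meta(xarticle, xissue, soup, ["title", "author", "pdf", "lang"])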

605 

606 def create_xissue( 

607 self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1" 

608 ): 

609 if url is not None and url.endswith("/"): 

610 url = url[:-1] 

611 xissue = create_issuedata() 

612 xissue.url = url 

613 

614 xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number) 

615 

616 xissue.year = year 

617 

618 if volume_number is not None: 

619 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number) 

620 

621 if issue_number is not None: 

622 xissue.number = issue_number.replace(",", "-") 

623 return xissue 
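
# Worked example (illustrative, assuming collection_id == "MYCOLL" and that cleanup_str
# only normalizes whitespace):
#
# xissue = self.create_xissue("https://example.org/vol6/", "2000", "6 bis", "1,2")
# xissue.url     -> "https://example.org/vol6"   (trailing "/" stripped)
# xissue.pid     -> "MYCOLL_2000_6_bis_1_2"      (built by get_issue_pid)
# xissue.volume  -> "6_bis"                      (runs outside [a-zA-Z0-9-] replaced by "_")
# xissue.number  -> "1-2"                        (commas replaced by "-")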

624 

625 def detect_language(self, text: str, article: ArticleData | None = None): 

626 if article and article.lang is not None and article.lang != "und": 

627 return article.lang 

628 

629 language = self.language_detector.detect_language_of(text) 

630 

631 if not language:  [631 ↛ 632: condition was never true]

632 return "und" 

633 return language.iso_code_639_1.name.lower() 
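
# Illustrative behaviour: if the article already has a language other than "und",
# that language is returned; otherwise lingua guesses from the given text, e.g.
#
# self.detect_language("Sur les groupes de Galois")  # typically "fr"
# self.detect_language("anything", xarticle)         # xarticle.lang if already set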

634 

635 def get_or_create_periode(self): 

636 if self.periode is not None:  [636 ↛ 637: condition was never true]

637 return self.periode 

638 

639 if self.collection is None or self.source is None:  [639 ↛ 640: condition was never true]

640 raise ValueError("You need to set a collection and a source before creating a periode")

641 

642 qs = Periode.objects.filter(collection=self.collection, source=self.source) 

643 if qs.exists():  [643 ↛ 646: condition was always true]

644 periode = qs.first() 

645 else: 

646 periode = Periode( 

647 collection=self.collection, 

648 source=self.source, 

649 title=self.collection.title_tex, 

650 issue_href=self.issue_href, 

651 collection_href=self.collection_url, 

652 doi_href="", 

653 published=False, 

654 begin=self.periode_begin, 

655 end=self.periode_end, 

656 first_issue=self.periode_first_issue, 

657 last_issue=self.periode_last_issue, 

658 ) 

659 periode.save() 

660 

661 return periode 

662 

663 references_mapping = { 

664 "citation_title": get_article_title_xml, 

665 "citation_journal_title": get_source_xml, 

666 "citation_publication_date": get_year_xml, 

667 "citation_firstpage": get_fpage_xml, 

668 "citation_lastpage": get_lpage_xml, 

669 } 

670 

671 @classmethod 

672 def __parse_meta_citation_reference(cls, content: str, label=None): 

673 categories = content.split(";") 

674 

675 if len(categories) == 1: 

676 return cls.create_crawled_bibitem(content, label=label) 

677 

678 citation_data = [c.split("=") for c in categories if "=" in c] 

679 del categories 

680 

681 xml_string = "" 

682 authors_parsed = False 

683 authors_strings = [] 

684 for data in citation_data: 

685 key = data[0].strip() 

686 citation_content = data[1] 

687 if key == "citation_author": 

688 authors_strings.append(get_author_xml(template_str=citation_content)) 

689 continue 

690 elif not authors_parsed: 

691 xml_string += ", ".join(authors_strings) 

692 authors_parsed = True 

693 

694 if key in cls.references_mapping: 

695 xml_string += " " + cls.references_mapping[key](citation_content) 

696 

697 return cls.create_crawled_bibitem(xml_string, label=label) 
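
# Example of a citation_reference meta content handled here (values are made up):
#
# content = (
#     "citation_author=Doe, J.; citation_author=Roe, R.; "
#     "citation_title=On a class of examples; "
#     "citation_journal_title=Example J. Math.; "
#     "citation_publication_date=2001; citation_firstpage=1; citation_lastpage=10"
# )
#
# For a string like this, the authors are assembled first, then each recognised key is
# appended in the order it appears (see references_mapping), and the result is wrapped
# in a <mixed-citation> by create_crawled_bibitem. A content string without
# ";"-separated fields is kept verbatim as the citation text.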

698 

699 @classmethod 

700 def get_or_create_source(cls): 

701 source, created = Source.objects.get_or_create( 

702 domain=cls.source_domain, 

703 defaults={ 

704 "name": cls.source_name, 

705 "website": cls.source_website, 

706 "create_xissue": True, 

707 "periode_href": "", 

708 "article_href": "", 

709 "pdf_href": "", 

710 }, 

711 ) 

712 if created:  [712 ↛ 713: condition was never true]

713 source.save() 

714 return source 

715 

716 @staticmethod 

717 def create_crawled_bibitem(ref_value: str | JatsRef, label=None): 

718 if isinstance(ref_value, str): 

719 xref = RefData(lang="en") 

720 value_xml = "" 

721 if label: 

722 value_xml += f"<label>{label}</label>" 

723 # xref.citation_tex = "".join([e["value_tex"] for e in elements]) 

724 value_xml += f'<mixed-citation xml:space="preserve">{ref_value}</mixed-citation>' 

725 xref.citation_xml = value_xml 

726 else: 

727 xref = ref_value 

728 

729 xref = check_bibitem_xml(xref) 

730 

731 # Bakes extlink badges into the bibliography html 

732 # Maybe we should put this into another file (jats_parser ?) 

733 for extid in xref.extids: 

734 href = resolve_id(extid[0], extid[1]) 

735 if (not href) or (not xref.citation_html):  [735 ↛ 736: condition was never true]

736 continue 

737 str_format = extid[0] 

738 if str_format in extids_formats:  [738 ↛ 740: condition was always true]

739 str_format = extids_formats[str_format] 

740 xref.citation_html += f" | <a href={href} class='badge bg-secondary rounded-pill ref-badge extid-badge'>{str_format}</a>" 

741 

742 return xref 

743 

744 @staticmethod 

745 def create_bibliography(bibitems: Sequence[RefData]): 

746 xml_str = "<ref-list>\n" 

747 html_str = "<div>\n" 

748 

749 for item in bibitems: 

750 xml_str += f"\t{item.citation_xml}\n" 

751 html_str += f"\t<p>{item.citation_html}</p>\n" 

752 xml_str += "</ref-list>" 

753 

754 # for item in bibitems: 

755 # html_str = 

756 # html_str += f"\t<p>{item.citation_html}</p>\n" 

757 html_str += "</div>" 

758 

759 tex_str = "<div>\n" 

760 for item in bibitems: 

761 tex_str += f"\t<p>{item.citation_tex}</p>\n" 

762 tex_str += "</div>" 

763 

764 biblio_dict = create_abstract( 

765 tag="biblio", 

766 value_html=html_str, 

767 value_tex=tex_str, 

768 value_xml=xml_str, 

769 lang="en", 

770 ) 

771 

772 return biblio_dict 

773 

774 @staticmethod 

775 def get_issue_pid( 

776 collection_id: str, 

777 year: str, 

778 volume_number: str | None = None, 

779 issue_number: str | None = None, 

780 ): 

781 # Replace any non-word character with an underscore 

782 pid = f"{collection_id}_{year}" 

783 if volume_number is not None: 

784 pid += f"_{volume_number}" 

785 if issue_number is not None: 

786 pid += f"_{issue_number}" 

787 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid)) 

788 return pid 
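
# Examples (illustrative collection_id, assuming cleanup_str only normalizes whitespace):
#
# get_issue_pid("MYCOLL", "2000")              -> "MYCOLL_2000"
# get_issue_pid("MYCOLL", "1999-2000", "6")    -> "MYCOLL_1999-2000_6"
# get_issue_pid("MYCOLL", "2000", "6", "1,2")  -> "MYCOLL_2000_6_1_2"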

789 

790 @staticmethod 

791 def set_pages(article: ArticleData, pages: str, separator: str = "-"): 

792 pages_split = pages.split(separator) 

793 if len(pages_split) == 0:  [793 ↛ 794: condition was never true]

794 article.page_range = pages 

795 if len(pages_split) > 0:  [795 ↛ exit: condition was always true]

796 if pages[0].isnumeric(): 

797 article.fpage = pages_split[0] 

798 if ( 

799 len(pages_split) > 1 

800 and pages_split[0] != pages_split[1] 

801 and pages_split[1].isnumeric() 

802 ): 

803 article.lpage = pages_split[1]
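
# Examples (illustrative; xarticle is an ArticleData):
#
# set_pages(xarticle, "12-34")   -> fpage = "12", lpage = "34"
# set_pages(xarticle, "12")      -> fpage = "12", no lpage
# set_pages(xarticle, "iv-xii")  -> neither page is set (first character is not numeric)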