Coverage for src/crawler/base_crawler.py: 79%

439 statements  

coverage.py v7.8.0, created at 2025-04-24 10:35 +0000

1import time 

2from collections.abc import Sequence 

3from datetime import timedelta 

4 

5import regex 

6import requests 

7from bs4 import BeautifulSoup 

8from django.conf import settings 

9from django.contrib.auth.models import User 

10from django.utils import timezone 

11from langcodes import standardize_tag 

12from lingua import LanguageDetectorBuilder 

13from ptf.cmds import xml_cmds 

14from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas 

15from ptf.cmds.xml.jats.builder.citation import ( 

16 get_article_title_xml, 

17 get_author_xml, 

18 get_fpage_xml, 

19 get_lpage_xml, 

20 get_source_xml, 

21 get_year_xml, 

22) 

23from ptf.cmds.xml.jats.builder.issue import get_title_xml 

24from ptf.cmds.xml.jats.jats_parser import JatsRef, check_bibitem_xml 

25from ptf.display.resolver import extids_formats, resolve_id 

26from ptf.model_data import ( 

27 ArticleData, 

28 ContributorDict, 

29 IssueData, 

30 RefData, 

31 ResourceData, 

32 create_abstract, 

33 create_contributor, 

34 create_extlink, 

35 create_issuedata, 

36 create_publisherdata, 

37) 

38from ptf.model_data_converter import update_data_for_jats 

39from pylatexenc.latex2text import LatexNodes2Text 

40from pysolr import SolrError 

41from requests_cache import CachedSession, FileCache 

42 

43from crawler.models import Source 

44from crawler.models.container_source import ContainerSource 

45from crawler.types import CitationLiteral 

46from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection 

47 

48# TODO: pass a class factory instead of a dependency to a site 

50 

51 

52class BaseCollectionCrawler: 

53 """ 

54 Base class for the collection crawlers. 

55 To create a crawler: 

56 1) derive a class from BaseCollectionCrawler and name it XXXCrawler 

57 2) override the functions parse_collection_content, parse_issue_content and parse_article_content 

58 3) update factory.py so that crawler_factory can return your new crawler 

59 """ 

60 

61 source_name = "" 

62 source_domain = "" 

63 source_website = "" 

64 

65 issue_href = "" 

66 

67 collection = None 

68 source = None 

69 user = None 

70 session: requests.Session | CachedSession 

71 # Updated in constructor with user agent from settings_local 

72 headers = {"accept_encoding": "utf-8"} 

73 

74 next_allowed_request: float = time.time() 

75 

76 # seconds to wait between two http requests 

77 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90) 

78 

79 latext_parser = LatexNodes2Text() 

80 

81 # Override the values in your concrete crawler if the formulas in text (titles, abstracts) 

82 # do not use the "$" to surround tex formulas 

83 delimiter_inline_formula = "$" 

84 delimiter_disp_formula = "$" 

85 

86 # HACK : Workaround for tests (monkeypatching) 

87 # We store the class here, so we can monkeypatch it when running tests 

88 # subCrawlers = { 

89 # LofplCrawler: None 

90 # } 

91 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {} 

92 

93 language_detector = LanguageDetectorBuilder.from_all_languages().build() 

94 

95 force_refresh = False 

96 

97 # Whether to include headers in the requests cache key 

98 match_headers = False 

99 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})" 

100 

101 # Set this to False on a per-crawler basis to allow inserting articles without PDFs 

102 ignore_missing_pdf = True 

103 

104 def __init__( 

105 self, 

106 *args, 

107 username: str, 

108 collection_id: str, 

109 collection_url: str, 

110 test_mode: bool = False, 

111 publisher: str = "mathdoc", 

112 force_refresh=False, 

113 ): 

114 for CrawlerClass in self.subCrawlers: 

115 self.subCrawlers[CrawlerClass] = CrawlerClass( 

116 *args, 

117 username=username, 

118 collection_id=collection_id, 

119 collection_url=collection_url, 

120 test_mode=test_mode, 

121 publisher=publisher, 

122 ) 

123 

124 self.username = username 

125 

126 self.collection_id = collection_id 

127 self.collection_url = ( 

128 collection_url # url of the collection. Ex: https://eudml.org/journal/10098 

129 ) 

130 

131 self.test_mode = test_mode 

132 self.publisher = publisher 

133 

134 # Skipped when running tests 

135 self.initialize() 

136 

137 self.session = CachedSession( 

138 match_headers=self.match_headers, 

139 backend=FileCache( 

140 getattr(settings, "REQUESTS_CACHE_LOCATION", "/tmp/ptf_requests_cache"), 

141 decode_content=False, 

142 ), 

143 expire_after=timedelta(days=30), 

144 ) 

145 self.headers.update( 

146 { 

147 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"), 

148 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"), 

149 } 

150 ) 

151 

152 self.force_refresh = force_refresh 

153 

154 def initialize(self): 

155 """ 

156 Acts as a "second" init function to skip model accesses during test data generation 

157 """ 

158 self.collection = get_or_create_collection(self.collection_id) 

159 self.source = self.get_or_create_source() 

160 self.user = User.objects.get(username=self.username) 

161 

162 @classmethod 

163 def can_crawl(cls, pid: str) -> bool: 

164 return True 

165 

166 def parse_collection_content(self, content: str) -> list[IssueData]: 

167 """ 

168 Parse the HTML content with BeautifulSoup 

169 Returns a list of xissues. 

170 Override this function in a derived class 

171 """ 

172 return [] 

173 

174 def parse_issue_content(self, content: str, xissue: IssueData): 

175 """ 

176 Parse the HTML content with BeautifulSoup 

177 Fills the xissue.articles 

178 Override this function in a derived class. 

179 

180 Caveat: you are supposed to create the articles here. Please assign a PID to each article. 

181 The PID can be `a + article_index`, e.g. `a0`, `a21`. 

182 """ 

183 

184 def parse_article_content( 

185 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str 

186 ) -> ArticleData | None: 

187 """ 

188 Parse the HTML content with BeautifulSoup 

189 returns the xarticle. 

190 Override this function in a derived class. 

191 The xissue is passed to the function in case the article page has issue information (ex: publisher) 

192 The article url is also passed as a parameter 

193 

194 Caveat: you are supposed to assign the article pid again here. 

195 """ 

196 return xarticle 

197 

198 def crawl_collection(self): 

199 # TODO: Comments, filter 

200 """ 

201 Crawl an entire collection. ptf.models.Container objects are created. 

202 - get the HTML content of the collection_url 

203 - parse the HTML content with beautifulsoup to extract the list of issues 

204 - merge the xissues (some Source can have multiple pages for 1 volume/issue. We create only 1 container) 

205 - crawl each issue if col_only is False 

206 - Returns the list of merged issues. 

207 It is a dict {pid: xissue}. 

208 The key is the pid of the merged issues. 

209 Ex: the source may have Volume 6 (2000) and Volume 6 (1999); 

210 the pid is then made with 1999-2000__6_ 

211 """ 

212 

213 if self.source is None: 

214 raise RuntimeError("ERROR: the source is not set") 

215 

216 content = self.download_file(self.collection_url) 

217 xissues = self.parse_collection_content(content) 

218 

219 """ 

220 Some collections split the same volumes in different pages 

221 Ex: Volume 6 (2000) and Volume 6 (1999) 

222 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000) 

223 """ 

224 # merged_xissues = self.merge_xissues(xissues) 

225 

226 xissues_dict = {str(i.pid): i for i in xissues} 

227 

228 return xissues_dict 

229 

230 def crawl_issue(self, xissue: IssueData): 

231 """ 

232 Crawl one web page of an issue. 

233 - get the HTML content of the issue 

234 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata 

235 - crawl each article 

236 """ 

237 

238 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents. 

239 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute. 

240 

241 issue_url = xissue.url 

242 if issue_url is not None: 

243 if issue_url.endswith(".pdf"): 

244 add_pdf_link_to_xarticle(xissue, issue_url) 

245 xissue.url = None 

246 else: 

247 content = self.download_file(issue_url) 

248 self.parse_issue_content(content, xissue) 

249 

250 xarticles = xissue.articles 

251 

252 parsed_xarticles = [] 

253 

254 for xarticle in xarticles: 

255 parsed_xarticle = self.crawl_article(xarticle, xissue) 

256 if parsed_xarticle is not None: 

257 parsed_xarticles.append(parsed_xarticle) 

258 

259 xissue.articles = parsed_xarticles 

260 

261 article_has_pdf = self.article_has_pdf(xissue) 

262 

263 if self.ignore_missing_pdf: 

264 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)] 

265 

266 if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf): 

267 self.process_resource_metadata(xissue) 

268 self.add_xissue_into_database(xissue) 

269 

270 @staticmethod 

271 def article_has_source(art: ArticleData | IssueData): 

272 return ( 

273 next( 

274 (e_link for e_link in art.ext_links if e_link["rel"] == "source"), 

275 None, 

276 ) 

277 is not None 

278 ) 

279 

280 @staticmethod 

281 def article_has_pdf(art: ArticleData | IssueData): 

282 return ( 

283 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None) 

284 is not None 

285 ) 

286 

287 def crawl_article(self, xarticle: ArticleData, xissue: IssueData): 

288 # ARTICLE URL as an ExtLink (to display the link in the article page) 

289 if xarticle.url is None: 

290 if not self.article_has_source(xarticle): 290 ↛ 300: line 290 didn't jump to line 300 because the condition on line 290 was always true

291 if xissue.url: 

292 article_source = xissue.url 

293 else: 

294 article_source = self.collection_url 

295 ext_link = create_extlink() 

296 ext_link["rel"] = "source" 

297 ext_link["location"] = article_source 

298 ext_link["metadata"] = self.source_domain 

299 xarticle.ext_links.append(ext_link) 

300 return self.process_resource_metadata(xarticle) 

301 

302 content = self.download_file(xarticle.url) 

303 

304 parsed_xarticle = self.parse_article_content(content, xissue, xarticle, xarticle.url) 

305 if parsed_xarticle is None: 305 ↛ 306: line 305 didn't jump to line 306 because the condition on line 305 was never true

306 return None 

307 

308 if parsed_xarticle.doi: 

309 parsed_xarticle.pid = ( 

310 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

311 ) 

312 else: 

313 parsed_xarticle.pid = f"{xissue.pid}_{xarticle.pid}" 

314 

315 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url: 

316 ext_link = create_extlink() 

317 ext_link["rel"] = "source" 

318 ext_link["location"] = parsed_xarticle.url 

319 ext_link["metadata"] = self.source_domain 

320 parsed_xarticle.ext_links.append(ext_link) 

321 

322 # The article title may have formulas surrounded with '$' 

323 return self.process_resource_metadata(parsed_xarticle) 

324 

325 def process_resource_metadata(self, xresource: ResourceData): 

326 # Process title tex 

327 html, xml = get_html_and_xml_from_text_with_formulas( 

328 xresource.title_tex, 

329 delimiter_inline=self.delimiter_inline_formula, 

330 delimiter_disp=self.delimiter_disp_formula, 

331 ) 

332 xml = get_title_xml(xml, with_tex_values=False) 

333 xresource.title_html = html 

334 xresource.title_xml = xml 

335 del xml 

336 del html 

337 

338 # Process trans_title tex 

339 html, xml = get_html_and_xml_from_text_with_formulas( 

340 xresource.trans_title_tex, 

341 delimiter_inline=self.delimiter_inline_formula, 

342 delimiter_disp=self.delimiter_disp_formula, 

343 ) 

344 xml = get_title_xml(xml, with_tex_values=False) 

345 xresource.trans_title_html = html 

346 xresource.trans_title_xml = xml 

347 del xml 

348 del html 

349 

350 abstracts_to_parse = [ 

351 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract" 

352 ] 

353 # abstract may have formulas surrounded with '$' 

354 if len(abstracts_to_parse) > 0: 

355 for xabstract in abstracts_to_parse: 

356 html, xml = get_html_and_xml_from_text_with_formulas( 

357 xabstract["value_tex"], 

358 delimiter_inline=self.delimiter_inline_formula, 

359 delimiter_disp=self.delimiter_disp_formula, 

360 ) 

361 xabstract["value_html"] = html 

362 lang = xabstract["lang"] 

363 if lang == xresource.lang: 

364 xabstract["value_xml"] = f'<abstract xml:lang="{lang}">{xml}</abstract>' 

365 else: 

366 xabstract["value_xml"] = ( 

367 f'<trans-abstract xml:lang="{lang}">{xml}</trans-abstract>' 

368 ) 

369 

370 if isinstance(xresource, ArticleData): 

371 update_data_for_jats(xresource) 

372 return xresource 

373 

374 def get(self, url: str, force_refresh=False, headers={}): 

375 attempt = 0 

376 response = None 

377 

378 while attempt < 3: 

379 # If the URL is already cached, we can skip the rate-limit delay 

380 if isinstance(self.session, CachedSession): 380 ↛ 385: line 380 didn't jump to line 385 because the condition on line 380 was always true

381 if not self.session.cache.contains(url=url) or force_refresh: 

382 delta = self.next_allowed_request - time.time() 

383 if delta > 0: 

384 time.sleep(delta) 

385 self.next_allowed_request = time.time() + self.requests_interval 

386 try: 

387 # For SSL Errors, use verify=False kwarg 

388 verify = True 

389 if url.startswith("https://hdml.di.ionio.gr/"): 389 ↛ 390: line 389 didn't jump to line 390 because the condition on line 389 was never true

390 verify = False 

391 # self.session.cache.delete(urls=[url]) 

392 if isinstance(self.session, CachedSession): 392 ↛ 400: line 392 didn't jump to line 400 because the condition on line 392 was always true

393 response = self.session.get( 

394 url, 

395 headers={**self.headers, **headers}, 

396 verify=verify, 

397 force_refresh=force_refresh, 

398 ) 

399 else: 

400 response = self.session.get( 

401 url, headers={**self.headers, **headers}, verify=verify 

402 ) 

403 if not response.ok: 

404 raise requests.exceptions.HTTPError( 

405 f"Endpoint answered with code {response.status_code} : {url}", 

406 response=response, 

407 ) 

408 return response 

409 except ( 

410 requests.ConnectionError, 

411 requests.ConnectTimeout, 

412 requests.exceptions.HTTPError, 

413 ): 

414 attempt += 1 

415 raise requests.exceptions.HTTPError(f"Unable to download {url}") 

416 

417 def download_file(self, url: str, force_refresh=False, headers={}): 

418 """ 

419 Downloads a URL and returns its content (served from the requests cache when available). 

420 """ 

421 response = self.get( 

422 url, force_refresh=force_refresh or self.force_refresh, headers=headers 

423 ) 

424 content = self.decode_response(response) 

425 if content == "" or not content: 425 ↛ 426: line 425 didn't jump to line 426 because the condition on line 425 was never true

426 raise requests.exceptions.HTTPError(response) 

427 return content 

428 

429 def decode_response(self, response: requests.Response, encoding: str = "utf-8"): 

430 """Override this if the content-type headers from the sources are advertising something else than the actual content 

431 SASA needs this""" 

432 response.encoding = encoding 

433 return response.text 

434 

435 def add_xissue_into_database(self, xissue: IssueData): 

436 xissue.journal = self.collection 

437 

438 if xissue.year == "": 

439 raise ValueError("Failsafe : Cannot insert issue without a year") 

440 

441 xpub = create_publisherdata() 

442 xpub.name = self.publisher 

443 xissue.publisher = xpub 

444 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat() 

445 

446 attempt = 1 

447 success = False 

448 

449 while not success and attempt < 4: 

450 try: 

451 params = {"xissue": xissue, "use_body": False} 

452 cmd = xml_cmds.addOrUpdateIssueXmlCmd(params) 

453 container = cmd.do() 

454 success = True 

455 ContainerSource.objects.create(source=self.source, container=container) 

456 except SolrError: 

457 attempt += 1 

458 time.sleep(10) 

459 

460 def get_metadata_using_citation_meta( 

461 self, 

462 xarticle: ArticleData, 

463 xissue: IssueData, 

464 soup: BeautifulSoup, 

465 what: list[CitationLiteral] = [], 

466 ): 

467 """ 

468 :param xarticle: the xarticle that will collect the metadata 

469 :param xissue: the xissue that will collect the publisher 

470 :param soup: the BeautifulSoup object of the article page 

471 :param what: list of citation_* meta items to collect. 

472 :return: None. The given article is modified 

473 """ 

474 

475 if "title" in what: 

476 # TITLE 

477 citation_title_node = soup.select_one("meta[name='citation_title']") 

478 if citation_title_node: 478 ↛ 483: line 478 didn't jump to line 483 because the condition on line 478 was always true

479 title = citation_title_node.get("content") 

480 if isinstance(title, str): 480 ↛ 483: line 480 didn't jump to line 483 because the condition on line 480 was always true

481 xarticle.title_tex = title 

482 

483 if "author" in what: 483 ↛ 512line 483 didn't jump to line 512 because the condition on line 483 was always true

484 # AUTHORS 

485 citation_author_nodes = soup.select("meta[name^='citation_author']") 

486 current_author: ContributorDict | None = None 

487 for citation_author_node in citation_author_nodes: 

488 if citation_author_node.get("name") == "citation_author": 

489 text_author = citation_author_node.get("content") 

490 if not isinstance(text_author, str): 490 ↛ 491: line 490 didn't jump to line 491 because the condition on line 490 was never true

491 raise ValueError("Cannot parse author") 

492 if text_author == "": 492 ↛ 493: line 492 didn't jump to line 493 because the condition on line 492 was never true

493 current_author = None 

494 continue 

495 current_author = create_contributor(role="author", string_name=text_author) 

496 xarticle.contributors.append(current_author) 

497 continue 

498 if current_author is None: 498 ↛ 499: line 498 didn't jump to line 499 because the condition on line 498 was never true

499 print("Couldn't parse citation author") 

500 continue 

501 if citation_author_node.get("name") == "citation_author_institution": 

502 text_institution = citation_author_node.get("content") 

503 if not isinstance(text_institution, str): 503 ↛ 504: line 503 didn't jump to line 504 because the condition on line 503 was never true

504 continue 

505 current_author["addresses"].append(text_institution) 

506 if citation_author_node.get("name") == "citation_author_orcid": 506 ↛ 507: line 506 didn't jump to line 507 because the condition on line 506 was never true

507 text_orcid = citation_author_node.get("content") 

508 if not isinstance(text_orcid, str): 

509 continue 

510 current_author["orcid"] = text_orcid 

511 

512 if "pdf" in what: 

513 # PDF 

514 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]') 

515 if citation_pdf_node: 

516 pdf_url = citation_pdf_node.get("content") 

517 if isinstance(pdf_url, str): 517 ↛ 520: line 517 didn't jump to line 520 because the condition on line 517 was always true

518 add_pdf_link_to_xarticle(xarticle, pdf_url) 

519 

520 if "lang" in what: 

521 # LANG 

522 citation_lang_node = soup.select_one("meta[name='citation_language']") 

523 if citation_lang_node: 523 ↛ 529: line 523 didn't jump to line 529 because the condition on line 523 was always true

524 # TODO: check other language code 

525 content_text = citation_lang_node.get("content") 

526 if isinstance(content_text, str): 526 ↛ 529: line 526 didn't jump to line 529 because the condition on line 526 was always true

527 xarticle.lang = standardize_tag(content_text) 

528 

529 if "abstract" in what: 

530 # ABSTRACT 

531 abstract_node = soup.select_one("meta[name='citation_abstract']") 

532 if abstract_node is not None: 

533 abstract = abstract_node.get("content") 

534 if not isinstance(abstract, str): 534 ↛ 535: line 534 didn't jump to line 535 because the condition on line 534 was never true

535 raise ValueError("Couldn't parse abstract from meta") 

536 abstract = BeautifulSoup(abstract, "html.parser").text 

537 lang = abstract_node.get("lang") 

538 if not isinstance(lang, str): 538 ↛ 539: line 538 didn't jump to line 539 because the condition on line 538 was never true

539 lang = self.detect_language(abstract, xarticle) 

540 xarticle.abstracts.append( 

541 { 

542 "tag": "abstract", 

543 "value_html": "", 

544 "value_tex": abstract, 

545 "value_xml": "", 

546 "lang": lang, 

547 } 

548 ) 

549 

550 if "page" in what: 

551 # PAGES 

552 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']") 

553 if citation_fpage_node: 

554 page = citation_fpage_node.get("content") 

555 if isinstance(page, str): 555 ↛ 560: line 555 didn't jump to line 560 because the condition on line 555 was always true

556 page = page.split("(")[0] 

557 if len(page) < 32: 557 ↛ 560: line 557 didn't jump to line 560 because the condition on line 557 was always true

558 xarticle.fpage = page 

559 

560 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']") 

561 if citation_lpage_node: 

562 page = citation_lpage_node.get("content") 

563 if isinstance(page, str): 563 ↛ 568: line 563 didn't jump to line 568 because the condition on line 563 was always true

564 page = page.split("(")[0] 

565 if len(page) < 32: 565 ↛ 568: line 565 didn't jump to line 568 because the condition on line 565 was always true

566 xarticle.lpage = page 

567 

568 if "doi" in what: 

569 # DOI 

570 citation_doi_node = soup.select_one("meta[name='citation_doi']") 

571 if citation_doi_node: 

572 doi = citation_doi_node.get("content") 

573 if isinstance(doi, str): 573 ↛ 580: line 573 didn't jump to line 580 because the condition on line 573 was always true

574 doi = doi.strip() 

575 pos = doi.find("10.") 

576 if pos > 0: 

577 doi = doi[pos:] 

578 xarticle.doi = doi 

579 

580 if "mr" in what: 

581 # MR 

582 citation_mr_node = soup.select_one("meta[name='citation_mr']") 

583 if citation_mr_node: 583 ↛ 584: line 583 didn't jump to line 584 because the condition on line 583 was never true

584 mr = citation_mr_node.get("content") 

585 if isinstance(mr, str): 

586 mr = mr.strip() 

587 if mr.find("MR") == 0: 

588 mr = mr[2:] 

589 xarticle.extids.append(("mr-item-id", mr)) 

590 

591 if "zbl" in what: 

592 # ZBL 

593 citation_zbl_node = soup.select_one("meta[name='citation_zbl']") 

594 if citation_zbl_node: 

595 zbl = citation_zbl_node.get("content") 

596 if isinstance(zbl, str): 596 ↛ 602: line 596 didn't jump to line 602 because the condition on line 596 was always true

597 zbl = zbl.strip() 

598 if zbl.find("Zbl") == 0: 598 ↛ 602: line 598 didn't jump to line 602 because the condition on line 598 was always true

599 zbl = zbl[3:].strip() 

600 xarticle.extids.append(("zbl-item-id", zbl)) 

601 

602 if "publisher" in what: 

603 # PUBLISHER 

604 citation_publisher_node = soup.select_one("meta[name='citation_publisher']") 

605 if citation_publisher_node: 

606 pub = citation_publisher_node.get("content") 

607 if isinstance(pub, str): 607 ↛ 614: line 607 didn't jump to line 614 because the condition on line 607 was always true

608 pub = pub.strip() 

609 if pub != "": 609 ↛ 614: line 609 didn't jump to line 614 because the condition on line 609 was always true

610 xpub = create_publisherdata() 

611 xpub.name = pub 

612 xissue.publisher = xpub 

613 

614 if "keywords" in what: 

615 # KEYWORDS 

616 citation_kwd_nodes = soup.select("meta[name='citation_keywords']") 

617 for kwd_node in citation_kwd_nodes: 

618 kwds = kwd_node.get("content") 

619 if isinstance(kwds, str): 619 ↛ 617: line 619 didn't jump to line 617 because the condition on line 619 was always true

620 kwds = kwds.split(",") 

621 for kwd in kwds: 

622 if kwd == "": 

623 continue 

624 kwd = kwd.strip() 

625 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd}) 

626 

627 if "references" in what: 

628 citation_references = soup.select("meta[name='citation_reference']") 

629 for index, tag in enumerate(citation_references): 

630 content = tag.get("content") 

631 if not isinstance(content, str): 631 ↛ 632: line 631 didn't jump to line 632 because the condition on line 631 was never true

632 raise ValueError("Cannot parse citation_reference meta") 

633 xarticle.bibitems.append( 

634 self.__parse_meta_citation_reference(content, str(index + 1)) 

635 ) 

636 

637 def create_xissue( 

638 self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1" 

639 ): 

640 if url is not None and url.endswith("/"): 

641 url = url[:-1] 

642 xissue = create_issuedata() 

643 xissue.url = url 

644 

645 xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number) 

646 

647 xissue.year = year 

648 

649 if volume_number is not None: 649 ↛ 652: line 649 didn't jump to line 652 because the condition on line 649 was always true

650 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number) 

651 

652 if issue_number is not None: 

653 xissue.number = issue_number.replace(",", "-") 

654 return xissue 

655 

656 def detect_language(self, text: str, article: ArticleData | None = None): 

657 if article and article.lang is not None and article.lang != "und": 

658 return article.lang 

659 

660 language = self.language_detector.detect_language_of(text) 

661 

662 if not language: 662 ↛ 663: line 662 didn't jump to line 663 because the condition on line 662 was never true

663 return "und" 

664 return language.iso_code_639_1.name.lower() 
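# Illustrative example (not in the file): detect_language("Bonjour tout le monde") would typically
# return "fr", while an article whose lang is already set (and not "und") short-circuits detection.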

665 

666 references_mapping = { 

667 "citation_title": get_article_title_xml, 

668 "citation_journal_title": get_source_xml, 

669 "citation_publication_date": get_year_xml, 

670 "citation_firstpage": get_fpage_xml, 

671 "citation_lastpage": get_lpage_xml, 

672 } 

673 

674 @classmethod 

675 def __parse_meta_citation_reference(cls, content: str, label=None): 

676 categories = content.split(";") 

677 

678 if len(categories) == 1: 

679 return cls.create_crawled_bibitem(content, label=label) 

680 

681 citation_data = [c.split("=") for c in categories if "=" in c] 

682 del categories 

683 

684 xml_string = "" 

685 authors_parsed = False 

686 authors_strings = [] 

687 for data in citation_data: 

688 key = data[0].strip() 

689 citation_content = data[1] 

690 if key == "citation_author": 

691 authors_strings.append(get_author_xml(template_str=citation_content)) 

692 continue 

693 elif not authors_parsed: 

694 xml_string += ", ".join(authors_strings) 

695 authors_parsed = True 

696 

697 if key in cls.references_mapping: 

698 xml_string += " " + cls.references_mapping[key](citation_content) 

699 

700 return cls.create_crawled_bibitem(xml_string, label=label) 

701 

702 @classmethod 

703 def get_or_create_source(cls): 

704 source, created = Source.objects.get_or_create( 

705 domain=cls.source_domain, 

706 defaults={ 

707 "name": cls.source_name, 

708 "website": cls.source_website, 

709 }, 

710 ) 

711 if created: 711 ↛ 712: line 711 didn't jump to line 712 because the condition on line 711 was never true

712 source.save() 

713 return source 

714 

715 @staticmethod 

716 def create_crawled_bibitem(ref_value: str | JatsRef, label=None): 

717 if isinstance(ref_value, str): 

718 xref = RefData(lang="en") 

719 value_xml = "" 

720 if label: 

721 value_xml += f"<label>{label}</label>" 

722 # xref.citation_tex = "".join([e["value_tex"] for e in elements]) 

723 value_xml += f'<mixed-citation xml:space="preserve">{ref_value}</mixed-citation>' 

724 xref.citation_xml = value_xml 

725 else: 

726 xref = ref_value 

727 

728 xref = check_bibitem_xml(xref) 

729 

730 # Bakes extlink badges into the bibliography html 

731 # Maybe we should put this into another file (jats_parser ?) 

732 for extid in xref.extids: 

733 href = resolve_id(extid[0], extid[1]) 

734 if (not href) or (not xref.citation_html): 734 ↛ 735: line 734 didn't jump to line 735 because the condition on line 734 was never true

735 continue 

736 str_format = extid[0] 

737 if str_format in extids_formats: 737 ↛ 739: line 737 didn't jump to line 739 because the condition on line 737 was always true

738 str_format = extids_formats[str_format] 

739 xref.citation_html += f" | <a href={href} class='badge bg-secondary rounded-pill ref-badge extid-badge'>{str_format}</a>" 

740 

741 return xref 

742 

743 @staticmethod 

744 def create_bibliography(bibitems: Sequence[RefData]): 

745 xml_str = "<ref-list>\n" 

746 html_str = "<div>\n" 

747 

748 for item in bibitems: 

749 xml_str += f"\t{item.citation_xml}\n" 

750 html_str += f"\t<p>{item.citation_html}</p>\n" 

751 xml_str += "</ref-list>" 

752 

753 # for item in bibitems: 

754 # html_str = 

755 # html_str += f"\t<p>{item.citation_html}</p>\n" 

756 html_str += "</div>" 

757 

758 tex_str = "<div>\n" 

759 for item in bibitems: 

760 tex_str += f"\t<p>{item.citation_tex}</p>\n" 

761 tex_str += "</div>" 

762 

763 biblio_dict = create_abstract( 

764 tag="biblio", 

765 value_html=html_str, 

766 value_tex=tex_str, 

767 value_xml=xml_str, 

768 lang="en", 

769 ) 

770 

771 return biblio_dict 

772 

773 @staticmethod 

774 def get_issue_pid( 

775 collection_id: str, 

776 year: str, 

777 volume_number: str | None = None, 

778 issue_number: str | None = None, 

779 ): 

780 # Replace any non-word character with an underscore 

781 pid = f"{collection_id}_{year}" 

782 if volume_number is not None: 782 ↛ 784: line 782 didn't jump to line 784 because the condition on line 782 was always true

783 pid += f"_{volume_number}" 

784 if issue_number is not None: 

785 pid += f"_{issue_number}" 

786 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid)) 

787 return pid 
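# Illustrative example (not in the file): get_issue_pid("AMS", "2000", "6", "2") yields "AMS_2000_6_2",
# assuming cleanup_str leaves an already-clean ASCII string unchanged; runs of characters outside
# [a-zA-Z0-9-] are collapsed to a single underscore.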

788 

789 @staticmethod 

790 def set_pages(article: ArticleData, pages: str, separator: str = "-"): 

791 pages_split = pages.split(separator) 

792 if len(pages_split) == 0: 792 ↛ 793: line 792 didn't jump to line 793 because the condition on line 792 was never true

793 article.page_range = pages 

794 if len(pages_split) > 0: 794 ↛ exit: line 794 didn't return from function 'set_pages' because the condition on line 794 was always true

795 if pages[0].isnumeric(): 

796 article.fpage = pages_split[0] 

797 if ( 

798 len(pages_split) > 1 

799 and pages_split[0] != pages_split[1] 

800 and pages_split[1].isnumeric() 

801 ): 

802 article.lpage = pages_split[1]
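
# Minimal sketch (not part of base_crawler.py): a concrete crawler following the three steps from the
# class docstring, for a hypothetical source "example.org". The class name, CSS selectors and the
# create_articledata() helper (assumed to live in ptf.model_data next to create_issuedata) are
# illustrative only; it relies on the imports already present at the top of this module.

class ExampleCrawler(BaseCollectionCrawler):
    source_name = "Example Digital Library"
    source_domain = "EXAMPLE"
    source_website = "https://example.org"

    def parse_collection_content(self, content: str) -> list[IssueData]:
        # One IssueData per issue link found on the collection page.
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        for link in soup.select("a.issue-link"):
            # Year and volume would normally be parsed from the link text.
            xissues.append(self.create_xissue(link.get("href"), "2000", "6"))
        return xissues

    def parse_issue_content(self, content: str, xissue: IssueData):
        # Create the articles and give each one a pid (a0, a1, ...).
        soup = BeautifulSoup(content, "html.parser")
        for index, link in enumerate(soup.select("a.article-link")):
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            xarticle.url = link.get("href")
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        # Reuse the citation_* meta parsing provided by the base class.
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["title", "author", "abstract", "pdf", "lang"]
        )
        return xarticle

# Step 3 from the docstring (registering the new crawler in factory.py) is not shown here.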