Coverage for src/crawler/base_crawler.py: 76%

402 statements  

coverage.py v7.8.2, created at 2025-06-03 13:39 +0000

1import time 

2from datetime import timedelta 

3 

4import regex 

5import requests 

6from bs4 import BeautifulSoup 

7from django.conf import settings 

8from django.contrib.auth.models import User 

9from django.utils import timezone 

10from langcodes import standardize_tag 

11from lingua import LanguageDetectorBuilder 

12from ptf.cmds import xml_cmds 

13from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas 

14from ptf.cmds.xml.jats.builder.citation import ( 

15 get_article_title_xml, 

16 get_author_xml, 

17 get_fpage_xml, 

18 get_lpage_xml, 

19 get_source_xml, 

20 get_year_xml, 

21) 

22from ptf.cmds.xml.jats.builder.issue import get_title_xml 

23from ptf.cmds.xml.jats.jats_parser import JatsBase 

24from ptf.model_data import ( 

25 ArticleData, 

26 ContributorDict, 

27 IssueData, 

28 ResourceData, 

29 create_contributor, 

30 create_extlink, 

31 create_issuedata, 

32 create_publisherdata, 

33) 

34from ptf.model_data_converter import update_data_for_jats 

35from pylatexenc.latex2text import LatexNodes2Text 

36from pysolr import SolrError 

37from requests_cache import CachedSession, MongoCache 

38 

39from crawler.models import Source 

40from crawler.models.container_source import ContainerSource 

41from crawler.types import CitationLiteral 

42from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection 

43 

44# TODO: pass a class factory instead of a dependency to a site 

45# TODO: pass a class factory instead of a dependency to a site 

46 

47 

48class BaseCollectionCrawler: 

49 """ 

50 Base class for the collection crawlers.

51 To create a crawler: 

52 1) derive a class from BaseCollectionCrawler and name it XXXCrawler 

53 2) override the functions parse_collection_content, parse_issue_content and parse_article_content 

54 3) update factory.py so that crawler_factory can return your new crawler 

55 """ 

56 

57 source_name = "" 

58 source_domain = "" 

59 source_website = "" 

60 

61 issue_href = "" 

62 

63 collection = None 

64 source = None 

65 user = None 

66 session: requests.Session | CachedSession 

67 # Updated in constructor with user agent from settings_local 

68 headers = {"accept_encoding": "utf-8"} 

69 

70 next_allowed_request: float = time.time() 

71 

72 # seconds to wait between two http requests 

73 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90) 

74 

75 latext_parser = LatexNodes2Text() 

76 

77 # Override these values in your concrete crawler if the formulas in text (titles, abstracts)

78 # do not use "$" to surround TeX formulas

79 delimiter_inline_formula = "$" 

80 delimiter_disp_formula = "$" 

81 

82 # HACK : Workaround for tests (monkeypatching) 

83 # We store the class here, so we can monkeypatch it when running tests 

84 # subCrawlers = { 

85 # LofplCrawler: None 

86 # } 

87 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {} 

88 

89 language_detector = LanguageDetectorBuilder.from_all_languages().build() 

90 

91 force_refresh = False 

92 

93 # Whether to include headers in the requests cache key

94 match_headers = False 

95 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})" 
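# For example, this pattern captures orcid="0000-0002-1825-0097" from
# "https://orcid.org/0000-0002-1825-0097" (illustrative identifier).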

96 

97 # Set this to False on a per-crawler basis to allow inserting articles without PDFs

98 ignore_missing_pdf = True 

99 

100 def __init__( 

101 self, 

102 *args, 

103 username: str, 

104 collection_id: str, 

105 collection_url: str, 

106 test_mode: bool = False, 

107 publisher: str = "mathdoc", 

108 force_refresh=False, 

109 ): 

110 for CrawlerClass in self.subCrawlers: 

111 self.subCrawlers[CrawlerClass] = CrawlerClass( 

112 *args, 

113 username=username, 

114 collection_id=collection_id, 

115 collection_url=collection_url, 

116 test_mode=test_mode, 

117 publisher=publisher, 

118 ) 

119 

120 self.username = username 

121 

122 self.collection_id = collection_id 

123 self.collection_url = ( 

124 collection_url # url of the collection. Ex: https://eudml.org/journal/10098 

125 ) 

126 

127 self.test_mode = test_mode 

128 self.publisher = publisher 

129 

130 # Skipped when running tests 

131 self.initialize() 

132 

133 self.session = requests.session() 

134 

135 self.force_refresh = force_refresh 

136 

137 def initialize(self): 

138 """ 

139 Acts as a "second" init function to skip model accesses during test data generation 

140 """ 

141 self.collection = get_or_create_collection(self.collection_id) 

142 self.source = self.get_or_create_source() 

143 self.user = User.objects.get(username=self.username) 

144 self.session = CachedSession( 

145 match_headers=self.match_headers, 

146 backend=MongoCache( 

147 getattr(settings, "MONGO_HOSTNAME", "localhost"), 

148 ), 

149 expire_after=timedelta(days=30), 

150 ) 

151 

152 @classmethod 

153 def can_crawl(cls, pid: str) -> bool: 

154 return True 

155 

156 def parse_collection_content(self, content: str) -> list[IssueData]: 

157 """ 

158 Parse the HTML content with BeautifulSoup 

159 returns a list of xissues.

160 Override this function in a derived class.

161 """ 

162 return [] 

163 

164 def parse_issue_content(self, content: str, xissue: IssueData): 

165 """ 

166 Parse the HTML content with BeautifulSoup 

167 Fills the xissue.articles 

168 Override this function in a derived class. 

169 

170 Caveat: you are supposed to create the articles here. Please assign a PID to each article.

171 The PID can be `a` + article_index, like this: `a0`, `a21`

172 """ 

173 

174 def parse_article_content( 

175 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str 

176 ) -> ArticleData | None: 

177 """ 

178 Parse the HTML content with BeautifulSoup 

179 returns the xarticle. 

180 Override this function in a derived class. 

181 The xissue is passed to the function in case the article page has issue information (ex: publisher) 

182 The article url is also passed as a parameter 

183 

184 Caveat: you are supposed to assign the article PIDs again here

185 """ 

186 return xarticle 

187 

188 def crawl_collection(self): 

189 # TODO: Comments, filter 

190 """ 

191 Crawl an entire collection. ptf.models.Container objects are created. 

192 - get the HTML content of the collection_url 

193 - parse the HTML content with beautifulsoup to extract the list of issues 

194 - merge the xissues (some sources can have multiple pages for 1 volume/issue; we create only 1 container)

195 - crawl each issue if col_only is False 

196 - Returns the list of merged issues. 

197 It is a dict {pid: xissue}.

198 The key is the pid of the merged issue.

199 Ex: the source may have Volume 6 (2000) and Volume 6 (1999);

200 the pid is then made with 1999-2000__6_

201 """ 

202 

203 if self.source is None: 

204 raise RuntimeError("ERROR: the source is not set") 

205 

206 content = self.download_file(self.collection_url) 

207 xissues = self.parse_collection_content(content) 

208 

209 """ 

210 Some collections split the same volumes in different pages 

211 Ex: Volume 6 (2000) and Volume 6 (1999) 

212 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000) 

213 """ 

214 # merged_xissues = self.merge_xissues(xissues) 

215 

216 xissues_dict = {str(i.pid): i for i in xissues} 

217 

218 return xissues_dict 

219 

220 def crawl_issue(self, xissue: IssueData): 

221 """ 

222 Crawl 1 web page of an issue.

223 - get the HTML content of the issue 

224 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata 

225 - crawl each article 

226 """ 

227 

228 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.

229 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute

230 

231 issue_url = xissue.url 

232 if issue_url is not None: 

233 if issue_url.endswith(".pdf"): 

234 add_pdf_link_to_xarticle(xissue, issue_url) 

235 xissue.url = None 

236 else: 

237 content = self.download_file(issue_url) 

238 self.parse_issue_content(content, xissue) 

239 

240 xarticles = xissue.articles 

241 

242 parsed_xarticles = [] 

243 

244 for xarticle in xarticles: 

245 parsed_xarticle = self.crawl_article(xarticle, xissue) 

246 if parsed_xarticle is not None: 

247 parsed_xarticles.append(parsed_xarticle) 

248 

249 xissue.articles = parsed_xarticles 

250 

251 article_has_pdf = self.article_has_pdf(xissue) 

252 

253 if self.ignore_missing_pdf: 

254 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)] 

255 

256 if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf): 

257 self.process_resource_metadata(xissue) 

258 self.add_xissue_into_database(xissue) 

259 

260 @staticmethod 

261 def article_has_source(art: ArticleData | IssueData): 

262 return ( 

263 next( 

264 (e_link for e_link in art.ext_links if e_link["rel"] == "source"), 

265 None, 

266 ) 

267 is not None 

268 ) 

269 

270 @staticmethod 

271 def article_has_pdf(art: ArticleData | IssueData): 

272 return ( 

273 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None) 

274 is not None 

275 ) 

276 

277 def crawl_article(self, xarticle: ArticleData, xissue: IssueData): 

278 # ARTICLE URL as an ExtLink (to display the link in the article page)

279 if xarticle.url is None: 

280 if not self.article_has_source(xarticle):    [280 ↛ 290: condition was always true]

281 if xissue.url: 

282 article_source = xissue.url 

283 else: 

284 article_source = self.collection_url 

285 ext_link = create_extlink() 

286 ext_link["rel"] = "source" 

287 ext_link["location"] = article_source 

288 ext_link["metadata"] = self.source_domain 

289 xarticle.ext_links.append(ext_link) 

290 return self.process_resource_metadata(xarticle) 

291 

292 content = self.download_file(xarticle.url) 

293 

294 parsed_xarticle = self.parse_article_content(content, xissue, xarticle, xarticle.url) 

295 if parsed_xarticle is None:    [295 ↛ 296: condition was never true]

296 return None 

297 

298 if parsed_xarticle.doi: 

299 parsed_xarticle.pid = ( 

300 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

301 ) 

302 else: 

303 parsed_xarticle.pid = f"{xissue.pid}_{xarticle.pid}" 

304 

305 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url: 

306 ext_link = create_extlink() 

307 ext_link["rel"] = "source" 

308 ext_link["location"] = parsed_xarticle.url 

309 ext_link["metadata"] = self.source_domain 

310 parsed_xarticle.ext_links.append(ext_link) 

311 

312 # The article title may have formulas surrounded with '$' 

313 return self.process_resource_metadata(parsed_xarticle) 

314 

315 def process_resource_metadata(self, xresource: ResourceData): 

316 # Process title tex 

317 html, xml = get_html_and_xml_from_text_with_formulas( 

318 xresource.title_tex, 

319 delimiter_inline=self.delimiter_inline_formula, 

320 delimiter_disp=self.delimiter_disp_formula, 

321 ) 

322 xml = get_title_xml(xml, with_tex_values=False) 

323 xresource.title_html = html 

324 xresource.title_xml = xml 

325 del xml 

326 del html 

327 

328 # Process trans_title tex 

329 html, xml = get_html_and_xml_from_text_with_formulas( 

330 xresource.trans_title_tex, 

331 delimiter_inline=self.delimiter_inline_formula, 

332 delimiter_disp=self.delimiter_disp_formula, 

333 ) 

334 xml = get_title_xml(xml, with_tex_values=False) 

335 xresource.trans_title_html = html 

336 xresource.trans_title_xml = xml 

337 del xml 

338 del html 

339 

340 abstracts_to_parse = [ 

341 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract" 

342 ] 

343 # abstract may have formulas surrounded with '$' 

344 if len(abstracts_to_parse) > 0: 

345 for xabstract in abstracts_to_parse: 

346 html, xml = get_html_and_xml_from_text_with_formulas( 

347 xabstract["value_tex"], 

348 delimiter_inline=self.delimiter_inline_formula, 

349 delimiter_disp=self.delimiter_disp_formula, 

350 ) 

351 xabstract["value_html"] = html 

352 lang = xabstract["lang"] 

353 if lang == xresource.lang: 

354 xabstract["value_xml"] = f'<abstract xml:lang="{lang}">{xml}</abstract>' 

355 else: 

356 xabstract["value_xml"] = ( 

357 f'<trans-abstract xml:lang="{lang}">{xml}</trans-abstract>' 

358 ) 

359 

360 if isinstance(xresource, ArticleData): 

361 update_data_for_jats(xresource) 

362 return xresource 

363 

364 def get(self, url: str, force_refresh=False, headers={}): 

365 attempt = 0 

366 response = None 

367 

368 while attempt < 3: 

369 # If the URL is already in the cache, we can skip the rate-limit wait

370 if isinstance(self.session, CachedSession):    [370 ↛ 371: condition was never true]

371 if not self.session.cache.contains(url=url) or force_refresh: 

372 delta = self.next_allowed_request - time.time() 

373 if delta > 0: 

374 time.sleep(delta) 

375 self.next_allowed_request = time.time() + self.requests_interval 

376 try: 

377 # For SSL Errors, use verify=False kwarg 

378 verify = True 

379 if url.startswith("https://hdml.di.ionio.gr/"):    [379 ↛ 380: condition was never true]

380 verify = False 

381 # self.session.cache.delete(urls=[url]) 

382 if isinstance(self.session, CachedSession):    [382 ↛ 383: condition was never true]

383 response = self.session.get( 

384 url, 

385 headers={**self.headers, **headers}, 

386 verify=verify, 

387 force_refresh=force_refresh, 

388 ) 

389 else: 

390 response = self.session.get( 

391 url, headers={**self.headers, **headers}, verify=verify 

392 ) 

393 if not response.ok: 

394 raise requests.exceptions.HTTPError( 

395 f"Endpoint answered with code {response.status_code} : {url}", 

396 response=response, 

397 ) 

398 return response 

399 except ( 

400 requests.ConnectionError, 

401 requests.ConnectTimeout, 

402 requests.exceptions.HTTPError, 

403 ): 

404 attempt += 1 

405 raise requests.exceptions.HTTPError(f"Unable to download {url}") 
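# Behaviour sketch of get() as written above: with a CachedSession and an uncached URL,
# the call sleeps until next_allowed_request before hitting the network; connection and
# HTTP errors trigger another attempt, and after 3 failed attempts an
# HTTPError("Unable to download <url>") is raised.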

406 

407 def download_file(self, url: str, force_refresh=False, headers={}): 

408 """ 

409 Downloads a URL and returns its content.

410 """ 

411 response = self.get( 

412 url, force_refresh=force_refresh or self.force_refresh, headers=headers 

413 ) 

414 content = self.decode_response(response) 

415 if content == "" or not content:    [415 ↛ 416: condition was never true]

416 raise requests.exceptions.HTTPError(response) 

417 return content 

418 

419 def decode_response(self, response: requests.Response, encoding: str = "utf-8"): 

420 """Override this if the content-type headers from the sources are advertising something else than the actual content 

421 SASA needs this""" 

422 response.encoding = encoding 

423 return response.text 

424 

425 def add_xissue_into_database(self, xissue: IssueData): 

426 xissue.journal = self.collection 

427 

428 if xissue.year == "": 

429 raise ValueError("Failsafe : Cannot insert issue without a year") 

430 

431 xpub = create_publisherdata() 

432 xpub.name = self.publisher 

433 xissue.publisher = xpub 

434 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat() 

435 

436 attempt = 1 

437 success = False 

438 

439 while not success and attempt < 4: 

440 try: 

441 params = {"xissue": xissue, "use_body": False} 

442 cmd = xml_cmds.addOrUpdateIssueXmlCmd(params) 

443 container = cmd.do() 

444 success = True 

445 ContainerSource.objects.create(source=self.source, container=container) 

446 except SolrError: 

447 attempt += 1 

448 time.sleep(10) 

449 

450 def get_metadata_using_citation_meta( 

451 self, 

452 xarticle: ArticleData, 

453 xissue: IssueData, 

454 soup: BeautifulSoup, 

455 what: list[CitationLiteral] = [], 

456 ): 

457 """ 

458 :param xarticle: the xarticle that will collect the metadata 

459 :param xissue: the xissue that will collect the publisher 

460 :param soup: the BeautifulSoup object of the article page

461 :param what: list of citation_* meta items to collect.

462 :return: None. The given article is modified 

463 """ 

464 

465 if "title" in what: 

466 # TITLE 

467 citation_title_node = soup.select_one("meta[name='citation_title']") 

468 if citation_title_node:    [468 ↛ 473: condition was always true]

469 title = citation_title_node.get("content") 

470 if isinstance(title, str):    [470 ↛ 473: condition was always true]

471 xarticle.title_tex = title 

472 

473 if "author" in what: 473 ↛ 502line 473 didn't jump to line 502 because the condition on line 473 was always true

474 # AUTHORS 

475 citation_author_nodes = soup.select("meta[name^='citation_author']") 

476 current_author: ContributorDict | None = None 

477 for citation_author_node in citation_author_nodes: 

478 if citation_author_node.get("name") == "citation_author": 

479 text_author = citation_author_node.get("content") 

480 if not isinstance(text_author, str):    [480 ↛ 481: condition was never true]

481 raise ValueError("Cannot parse author") 

482 if text_author == "":    [482 ↛ 483: condition was never true]

483 current_author = None 

484 continue 

485 current_author = create_contributor(role="author", string_name=text_author) 

486 xarticle.contributors.append(current_author) 

487 continue 

488 if current_author is None:    [488 ↛ 489: condition was never true]

489 print("Couldn't parse citation author") 

490 continue 

491 if citation_author_node.get("name") == "citation_author_institution": 

492 text_institution = citation_author_node.get("content") 

493 if not isinstance(text_institution, str):    [493 ↛ 494: condition was never true]

494 continue 

495 current_author["addresses"].append(text_institution) 

496 if citation_author_node.get("name") == "citation_author_ocrid":    [496 ↛ 497: condition was never true]

497 text_orcid = citation_author_node.get("content") 

498 if not isinstance(text_orcid, str): 

499 continue 

500 current_author["orcid"] = text_orcid 

501 

502 if "pdf" in what: 

503 # PDF 

504 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]') 

505 if citation_pdf_node: 

506 pdf_url = citation_pdf_node.get("content") 

507 if isinstance(pdf_url, str):    [507 ↛ 510: condition was always true]

508 add_pdf_link_to_xarticle(xarticle, pdf_url) 

509 

510 if "lang" in what: 

511 # LANG 

512 citation_lang_node = soup.select_one("meta[name='citation_language']") 

513 if citation_lang_node:    [513 ↛ 519: condition was always true]

514 # TODO: check other language code 

515 content_text = citation_lang_node.get("content") 

516 if isinstance(content_text, str):    [516 ↛ 519: condition was always true]

517 xarticle.lang = standardize_tag(content_text) 

518 

519 if "abstract" in what: 

520 # ABSTRACT 

521 abstract_node = soup.select_one("meta[name='citation_abstract']") 

522 if abstract_node is not None: 

523 abstract = abstract_node.get("content") 

524 if not isinstance(abstract, str):    [524 ↛ 525: condition was never true]

525 raise ValueError("Couldn't parse abstract from meta") 

526 abstract = BeautifulSoup(abstract, "html.parser").text 

527 lang = abstract_node.get("lang") 

528 if not isinstance(lang, str):    [528 ↛ 529: condition was never true]

529 lang = self.detect_language(abstract, xarticle) 

530 xarticle.abstracts.append( 

531 { 

532 "tag": "abstract", 

533 "value_html": "", 

534 "value_tex": abstract, 

535 "value_xml": "", 

536 "lang": lang, 

537 } 

538 ) 

539 

540 if "page" in what: 

541 # PAGES 

542 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']") 

543 if citation_fpage_node: 

544 page = citation_fpage_node.get("content") 

545 if isinstance(page, str):    [545 ↛ 550: condition was always true]

546 page = page.split("(")[0] 

547 if len(page) < 32:    [547 ↛ 550: condition was always true]

548 xarticle.fpage = page 

549 

550 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']") 

551 if citation_lpage_node: 

552 page = citation_lpage_node.get("content") 

553 if isinstance(page, str):    [553 ↛ 558: condition was always true]

554 page = page.split("(")[0] 

555 if len(page) < 32:    [555 ↛ 558: condition was always true]

556 xarticle.lpage = page 

557 

558 if "doi" in what: 

559 # DOI 

560 citation_doi_node = soup.select_one("meta[name='citation_doi']") 

561 if citation_doi_node: 

562 doi = citation_doi_node.get("content") 

563 if isinstance(doi, str):    [563 ↛ 570: condition was always true]

564 doi = doi.strip() 

565 pos = doi.find("10.") 

566 if pos > 0: 

567 doi = doi[pos:] 

568 xarticle.doi = doi 

569 

570 if "mr" in what: 

571 # MR 

572 citation_mr_node = soup.select_one("meta[name='citation_mr']") 

573 if citation_mr_node:    [573 ↛ 574: condition was never true]

574 mr = citation_mr_node.get("content") 

575 if isinstance(mr, str): 

576 mr = mr.strip() 

577 if mr.find("MR") == 0: 

578 mr = mr[2:] 

579 xarticle.extids.append(("mr-item-id", mr)) 

580 

581 if "zbl" in what: 

582 # ZBL 

583 citation_zbl_node = soup.select_one("meta[name='citation_zbl']") 

584 if citation_zbl_node: 

585 zbl = citation_zbl_node.get("content") 

586 if isinstance(zbl, str):    [586 ↛ 592: condition was always true]

587 zbl = zbl.strip() 

588 if zbl.find("Zbl") == 0:    [588 ↛ 592: condition was always true]

589 zbl = zbl[3:].strip() 

590 xarticle.extids.append(("zbl-item-id", zbl)) 

591 

592 if "publisher" in what: 

593 # PUBLISHER 

594 citation_publisher_node = soup.select_one("meta[name='citation_publisher']") 

595 if citation_publisher_node: 

596 pub = citation_publisher_node.get("content") 

597 if isinstance(pub, str):    [597 ↛ 604: condition was always true]

598 pub = pub.strip() 

599 if pub != "":    [599 ↛ 604: condition was always true]

600 xpub = create_publisherdata() 

601 xpub.name = pub 

602 xissue.publisher = xpub 

603 

604 if "keywords" in what: 

605 # KEYWORDS 

606 citation_kwd_nodes = soup.select("meta[name='citation_keywords']") 

607 for kwd_node in citation_kwd_nodes: 

608 kwds = kwd_node.get("content") 

609 if isinstance(kwds, str):    [609 ↛ 607: condition was always true]

610 kwds = kwds.split(",") 

611 for kwd in kwds: 

612 if kwd == "": 

613 continue 

614 kwd = kwd.strip() 

615 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd}) 

616 

617 if "references" in what: 

618 citation_references = soup.select("meta[name='citation_reference']") 

619 for index, tag in enumerate(citation_references): 

620 content = tag.get("content") 

621 if not isinstance(content, str):    [621 ↛ 622: condition was never true]

622 raise ValueError("Cannot parse citation_reference meta") 

623 xarticle.bibitems.append( 

624 self.__parse_meta_citation_reference(content, str(index + 1)) 

625 ) 

626 

627 def create_xissue( 

628 self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1" 

629 ): 

630 if url is not None and url.endswith("/"): 

631 url = url[:-1] 

632 xissue = create_issuedata() 

633 xissue.url = url 

634 

635 xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number) 

636 

637 xissue.year = year 

638 

639 if volume_number is not None: 

640 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number) 

641 

642 if issue_number is not None: 

643 xissue.number = issue_number.replace(",", "-") 

644 return xissue 

645 

646 def detect_language(self, text: str, article: ArticleData | None = None): 

647 if article and article.lang is not None and article.lang != "und": 

648 return article.lang 

649 

650 language = self.language_detector.detect_language_of(text) 

651 

652 if not language:    [652 ↛ 653: condition was never true]

653 return "und" 

654 return language.iso_code_639_1.name.lower() 
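# Illustrative: detect_language("Ceci est un résumé en français") should return "fr";
# if the given article already has a known lang, that value is returned instead, and
# "und" is the fallback when detection fails.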

655 

656 references_mapping = { 

657 "citation_title": get_article_title_xml, 

658 "citation_journal_title": get_source_xml, 

659 "citation_publication_date": get_year_xml, 

660 "citation_firstpage": get_fpage_xml, 

661 "citation_lastpage": get_lpage_xml, 

662 } 

663 

664 @classmethod 

665 def __parse_meta_citation_reference(cls, content: str, label=None): 

666 categories = content.split(";") 

667 

668 if len(categories) == 1: 

669 return JatsBase.bake_ref(content, label=label) 

670 

671 citation_data = [c.split("=") for c in categories if "=" in c] 

672 del categories 

673 

674 xml_string = "" 

675 authors_parsed = False 

676 authors_strings = [] 

677 for data in citation_data: 

678 key = data[0].strip() 

679 citation_content = data[1] 

680 if key == "citation_author": 

681 authors_strings.append(get_author_xml(template_str=citation_content)) 

682 continue 

683 elif not authors_parsed: 

684 xml_string += ", ".join(authors_strings) 

685 authors_parsed = True 

686 

687 if key in cls.references_mapping: 

688 xml_string += " " + cls.references_mapping[key](citation_content) 

689 

690 return JatsBase.bake_ref(xml_string, label=label) 
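# Hypothetical citation_reference content handled above (Highwire-style citation_* fields):
#     "citation_author=Doe, J.; citation_title=On examples; citation_journal_title=Ex. J.;
#      citation_publication_date=1999; citation_firstpage=1; citation_lastpage=10"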

691 

692 @classmethod 

693 def get_or_create_source(cls): 

694 source, created = Source.objects.get_or_create( 

695 domain=cls.source_domain, 

696 defaults={ 

697 "name": cls.source_name, 

698 "website": cls.source_website, 

699 }, 

700 ) 

701 if created:    [701 ↛ 702: condition was never true]

702 source.save() 

703 return source 

704 

705 @staticmethod 

706 def get_issue_pid( 

707 collection_id: str, 

708 year: str, 

709 volume_number: str | None = None, 

710 issue_number: str | None = None, 

711 ): 

712 # Replace any character that is not alphanumeric or '-' with an underscore

713 pid = f"{collection_id}_{year}" 

714 if volume_number is not None: 

715 pid += f"_{volume_number}" 

716 if issue_number is not None: 

717 pid += f"_{issue_number}" 

718 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid)) 

719 return pid 
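# e.g. get_issue_pid("EXAMPLE", "1999-2000", "6") -> "EXAMPLE_1999-2000_6"
#      (illustrative collection id; assumes cleanup_str leaves the string unchanged)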

720 

721 @staticmethod 

722 def set_pages(article: ArticleData, pages: str, separator: str = "-"): 

723 pages_split = pages.split(separator) 

724 if len(pages_split) == 0:    [724 ↛ 725: condition was never true]

725 article.page_range = pages 

726 if len(pages_split) > 0:    [726 ↛ exit: didn't return from 'set_pages' because the condition was always true]

727 if pages[0].isnumeric(): 

728 article.fpage = pages_split[0] 

729 if ( 

730 len(pages_split) > 1 

731 and pages_split[0] != pages_split[1] 

732 and pages_split[1].isnumeric() 

733 ): 

734 article.lpage = pages_split[1]
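# e.g. set_pages(xarticle, "12-34") sets fpage="12" and lpage="34",
#      while set_pages(xarticle, "12") only sets fpage="12" (illustrative values).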