Coverage for src/crawler/base_crawler.py: 75%

396 statements  

coverage.py v7.8.2, created at 2025-06-16 07:44 +0000

1import time 

2from datetime import timedelta 

3 

4import regex 

5import requests 

6from bs4 import BeautifulSoup 

7from django.conf import settings 

8from django.contrib.auth.models import User 

9from django.utils import timezone 

10from langcodes import standardize_tag 

11from lingua import LanguageDetectorBuilder 

12from ptf.cmds import xml_cmds 

13from ptf.cmds.xml.ckeditor.utils import ( 

14 build_jats_data_from_html_field, 

15) 

16from ptf.cmds.xml.jats.builder.citation import ( 

17 get_article_title_xml, 

18 get_author_xml, 

19 get_fpage_xml, 

20 get_lpage_xml, 

21 get_source_xml, 

22 get_year_xml, 

23) 

24from ptf.cmds.xml.jats.jats_parser import JatsBase 

25from ptf.model_data import ( 

26 ArticleData, 

27 ContributorDict, 

28 IssueData, 

29 ResourceData, 

30 create_contributor, 

31 create_extlink, 

32 create_issuedata, 

33 create_publisherdata, 

34) 

35from ptf.model_data_converter import update_data_for_jats 

36from pylatexenc.latex2text import LatexNodes2Text 

37from pysolr import SolrError 

38from requests_cache import CachedSession, MongoCache 

39 

40from crawler.models import Source 

41from crawler.models.container_source import ContainerSource 

42from crawler.types import CitationLiteral 

43from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection 

44 

45# TODO: pass a class factory instead of a dependency to a site 


47 

48 

49class BaseCollectionCrawler: 

50 """ 

51 Base collection for the crawlers. 

52 To create a crawler: 

53 1) derive a class from BaseCollectionCrawler and name it XXXCrawler 

54 2) override the functions parse_collection_content, parse_issue_content and parse_article_content 

55 3) update factory.py so that crawler_factory can return your new crawler 

56 """ 

57 
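# Illustrative sketch only (not part of this module): a minimal derived crawler
# following the three steps above. The class name, source_* values and CSS selectors
# are hypothetical, and create_articledata is assumed to be available from
# ptf.model_data; the overridden signatures match the base class below.
#
#   class XXXCrawler(BaseCollectionCrawler):
#       source_name = "Example digital library"
#       source_domain = "EXAMPLE"
#       source_website = "https://example.org"
#
#       def parse_collection_content(self, content):
#           soup = BeautifulSoup(content, "html.parser")
#           return [
#               self.create_xissue(a.get("href"), "2000", "6", None)
#               for a in soup.select("a.issue-link")  # hypothetical selector
#           ]
#
#       def parse_issue_content(self, content, xissue):
#           soup = BeautifulSoup(content, "html.parser")
#           for index, a in enumerate(soup.select("a.article-link")):  # hypothetical selector
#               xarticle = create_articledata()
#               xarticle.pid = f"a{index}"
#               xarticle.url = a.get("href")
#               xissue.articles.append(xarticle)
#
#       def parse_article_content(self, content, xissue, xarticle, url):
#           soup = BeautifulSoup(content, "html.parser")
#           self.get_metadata_using_citation_meta(
#               xarticle, xissue, soup, ["title", "author", "pdf", "abstract"]
#           )
#           return xarticle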

58 source_name = "" 

59 source_domain = "" 

60 source_website = "" 

61 

62 issue_href = "" 

63 

64 collection = None 

65 source = None 

66 user = None 

67 session: requests.Session | CachedSession 

68 # Updated in constructor with user agent from settings_local 

69 headers = {"accept_encoding": "utf-8"} 

70 

71 next_allowed_request: float = time.time() 

72 

73 # seconds to wait between two http requests 

74 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90) 

75 

76 latext_parser = LatexNodes2Text() 

77 

78 # Override the values in your concrete crawler if the formulas in text (titles, abstracts) 

79 # do not use the "$" to surround tex formulas 

80 delimiter_inline_formula = "$" 

81 delimiter_disp_formula = "$" 

82 

83 # HACK : Workaround for tests (monkeypatching) 

84 # We store the class here, so we can monkeypatch it when running tests 

85 # subCrawlers = { 

86 # LofplCrawler: None 

87 # } 

88 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {} 

89 

90 language_detector = LanguageDetectorBuilder.from_all_languages().build() 

91 

92 force_refresh = False 

93 

94 # Whether to include headers in the requests cache key

95 match_headers = False 

96 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})" 
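# Illustrative example: orcid_re matches "https://orcid.org/0000-0002-1825-0097"
# and exposes the identifier through the named group "orcid".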

97 

98 # Set this to False on a per-crawler basis to allow inserting articles without PDFs

99 ignore_missing_pdf = True 

100 

101 def __init__( 

102 self, 

103 *args, 

104 username: str, 

105 collection_id: str, 

106 collection_url: str, 

107 test_mode: bool = False, 

108 publisher: str = "mathdoc", 

109 force_refresh=False, 

110 ): 

111 for CrawlerClass in self.subCrawlers: 

112 self.subCrawlers[CrawlerClass] = CrawlerClass( 

113 *args, 

114 username=username, 

115 collection_id=collection_id, 

116 collection_url=collection_url, 

117 test_mode=test_mode, 

118 publisher=publisher, 

119 ) 

120 

121 self.username = username 

122 

123 self.collection_id = collection_id 

124 self.collection_url = ( 

125 collection_url # url of the collection. Ex: https://eudml.org/journal/10098 

126 ) 

127 

128 self.test_mode = test_mode 

129 self.publisher = publisher 

130 

131 # Skipped when running tests 

132 self.initialize() 

133 

134 self.session = requests.session() 

135 

136 self.force_refresh = force_refresh 

137 

138 def initialize(self): 

139 """ 

140 Acts as a "second" init function to skip model accesses during test data generation 

141 """ 

142 self.collection = get_or_create_collection(self.collection_id) 

143 self.source = self.get_or_create_source() 

144 self.user = User.objects.get(username=self.username) 

145 self.session = CachedSession( 

146 match_headers=self.match_headers, 

147 backend=MongoCache( 

148 getattr(settings, "MONGO_HOSTNAME", "localhost"), 

149 ), 

150 expire_after=timedelta(days=30), 

151 ) 

152 

153 @classmethod 

154 def can_crawl(cls, pid: str) -> bool: 

155 return True 

156 

157 def parse_collection_content(self, content: str) -> list[IssueData]: 

158 """ 

159 Parse the HTML content with BeautifulSoup 

160 returns a list of xissues.

161 Override this function in a derived class 

162 """ 

163 return [] 

164 

165 def parse_issue_content(self, content: str, xissue: IssueData): 

166 """ 

167 Parse the HTML content with BeautifulSoup 

168 Fills the xissue.articles 

169 Override this function in a derived class. 

170 

171 CAVEAT: You are supposed to create the articles here. Please assign a PID to each article.

172 The PID can be `a` + article_index, like this: `a0`, `a21`

173 """ 

174 

175 def parse_article_content( 

176 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str 

177 ) -> ArticleData | None: 

178 """ 

179 Parse the HTML content with BeautifulSoup 

180 returns the xarticle. 

181 Override this function in a derived class. 

182 The xissue is passed to the function in case the article page has issue information (ex: publisher) 

183 The article url is also passed as a parameter 

184 

185 CAVEAT: You are supposed to assign the article PIDs again here

186 """ 

187 return xarticle 

188 

189 def crawl_collection(self): 

190 # TODO: Comments, filter 

191 """ 

192 Crawl an entire collection. ptf.models.Container objects are created. 

193 - get the HTML content of the collection_url 

194 - parse the HTML content with beautifulsoup to extract the list of issues 

195 - merge the xissues (some Source can have multiple pages for 1 volume/issue. We create only 1 container) 

196 - crawl each issue if col_only is False 

197 - Returns the list of merged issues. 

198 It is a dict {pid: xissue}

199 The key is the pid of the merged issues. 

200 Ex: The source may have Volume 6 (2000) and Volume 6 (1999)

201 the pid is then made with 1999-2000__6_ 

202 """ 

203 

204 if self.source is None: 

205 raise RuntimeError("ERROR: the source is not set") 

206 

207 content = self.download_file(self.collection_url) 

208 xissues = self.parse_collection_content(content) 

209 

210 """ 

211 Some collections split the same volumes in different pages 

212 Ex: Volume 6 (2000) and Volume 6 (1999) 

213 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000) 

214 """ 

215 # merged_xissues = self.merge_xissues(xissues) 

216 

217 xissues_dict = {str(i.pid): i for i in xissues} 

218 

219 return xissues_dict 

220 

221 def crawl_issue(self, xissue: IssueData): 

222 """ 

223 Crawl one web page of an issue.

224 - get the HTML content of the issue 

225 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata 

226 - crawl each article 

227 """ 

228 

229 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.

230 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute

231 

232 issue_url = xissue.url 

233 if issue_url is not None: 

234 if issue_url.endswith(".pdf"): 

235 add_pdf_link_to_xarticle(xissue, issue_url) 

236 xissue.url = None 

237 else: 

238 content = self.download_file(issue_url) 

239 self.parse_issue_content(content, xissue) 

240 

241 xarticles = xissue.articles 

242 

243 parsed_xarticles = [] 

244 

245 for xarticle in xarticles: 

246 parsed_xarticle = self.crawl_article(xarticle, xissue) 

247 if parsed_xarticle is not None: 

248 parsed_xarticles.append(parsed_xarticle) 

249 

250 xissue.articles = parsed_xarticles 

251 

252 article_has_pdf = self.article_has_pdf(xissue) 

253 

254 if self.ignore_missing_pdf: 

255 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)] 

256 

257 if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf): 

258 self.process_resource_metadata(xissue, resource_type="issue") 

259 self.add_xissue_into_database(xissue) 

260 

261 @staticmethod 

262 def article_has_source(art: ArticleData | IssueData): 

263 return ( 

264 next( 

265 (e_link for e_link in art.ext_links if e_link["rel"] == "source"), 

266 None, 

267 ) 

268 is not None 

269 ) 

270 

271 @staticmethod 

272 def article_has_pdf(art: ArticleData | IssueData): 

273 return ( 

274 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None) 

275 is not None 

276 ) 

277 

278 def crawl_article(self, xarticle: ArticleData, xissue: IssueData): 

279 # ARTICLE URL as an ExtLink (to display the link on the article page)

280 if xarticle.url is None: 

281 if not self.article_has_source(xarticle): 281 ↛ 291 (line 281 didn't jump to line 291 because the condition on line 281 was always true)

282 if xissue.url: 

283 article_source = xissue.url 

284 else: 

285 article_source = self.collection_url 

286 ext_link = create_extlink() 

287 ext_link["rel"] = "source" 

288 ext_link["location"] = article_source 

289 ext_link["metadata"] = self.source_domain 

290 xarticle.ext_links.append(ext_link) 

291 return self.process_article_metadata(xarticle) 

292 

293 content = self.download_file(xarticle.url) 

294 

295 parsed_xarticle = self.parse_article_content(content, xissue, xarticle, xarticle.url) 

296 if parsed_xarticle is None: 296 ↛ 297 (line 296 didn't jump to line 297 because the condition on line 296 was never true)

297 return None 

298 

299 if parsed_xarticle.doi: 

300 parsed_xarticle.pid = ( 

301 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

302 ) 

303 else: 

304 parsed_xarticle.pid = f"{xissue.pid}_{xarticle.pid}" 

305 

306 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url: 

307 ext_link = create_extlink() 

308 ext_link["rel"] = "source" 

309 ext_link["location"] = parsed_xarticle.url 

310 ext_link["metadata"] = self.source_domain 

311 parsed_xarticle.ext_links.append(ext_link) 

312 

313 # The article title may have formulas surrounded with '$' 

314 return self.process_article_metadata(parsed_xarticle) 

315 

316 def process_resource_metadata(self, xresource: ResourceData, resource_type="article"): 

317 tag = "article-title" if resource_type == "article" else "issue-title" 

318 

319 # Process title tex 

320 ckeditor_data = build_jats_data_from_html_field( 

321 xresource.title_tex, 

322 tag=tag, 

323 text_lang=xresource.lang, 

324 delimiter_inline=self.delimiter_inline_formula, 

325 delimiter_disp=self.delimiter_disp_formula, 

326 ) 

327 

328 xresource.title_html = ckeditor_data["value_html"] 

329 # xresource.title_tex = ckeditor_data["value_tex"] 

330 xresource.title_xml = ckeditor_data["value_xml"] 

331 

332 # Process trans_title tex 

333 if xresource.trans_title_tex: 333 ↛ 334 (line 333 didn't jump to line 334 because the condition on line 333 was never true)

334 tag = "trans-article" if resource_type == "article" else "issue-title" 

335 

336 ckeditor_data = build_jats_data_from_html_field( 

337 xresource.trans_title_tex, 

338 tag=tag, 

339 text_lang=xresource.trans_lang, 

340 resource_lang=xresource.lang, 

341 delimiter_inline=self.delimiter_inline_formula, 

342 delimiter_disp=self.delimiter_disp_formula, 

343 ) 

344 

345 xresource.titles.append(ckeditor_data["title"]) 

346 

347 abstracts_to_parse = [ 

348 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract" 

349 ] 

350 # abstract may have formulas surrounded with '$' 

351 if len(abstracts_to_parse) > 0: 

352 for xabstract in abstracts_to_parse: 

353 ckeditor_data = build_jats_data_from_html_field( 

354 xabstract["value_tex"], 

355 tag="abstract", 

356 text_lang=xabstract["lang"], 

357 resource_lang=xresource.lang, 

358 field_type="abstract", 

359 delimiter_inline=self.delimiter_inline_formula, 

360 delimiter_disp=self.delimiter_disp_formula, 

361 ) 

362 

363 xabstract["value_html"] = ckeditor_data["value_html"] 

364 # xabstract["value_tex"] = ckeditor_data["value_tex"] 

365 xabstract["value_xml"] = ckeditor_data["value_xml"] 

366 

367 return xresource 

368 

369 def process_article_metadata(self, xresource: ResourceData): 

370 self.process_resource_metadata(xresource) 

371 update_data_for_jats(xresource) 

372 

373 return xresource 

374 

375 def get(self, url: str, force_refresh=False, headers={}): 

376 attempt = 0 

377 response = None 

378 

379 while attempt < 3: 

380 # If we already have a key, we can skip the timeout 

381 if isinstance(self.session, CachedSession): 381 ↛ 382 (line 381 didn't jump to line 382 because the condition on line 381 was never true)

382 if not self.session.cache.contains(url=url) or force_refresh: 

383 delta = self.next_allowed_request - time.time() 

384 if delta > 0: 

385 time.sleep(delta) 

386 self.next_allowed_request = time.time() + self.requests_interval 

387 try: 

388 # For SSL Errors, use verify=False kwarg 

389 verify = True 

390 if url.startswith("https://hdml.di.ionio.gr/"): 390 ↛ 391 (line 390 didn't jump to line 391 because the condition on line 390 was never true)

391 verify = False 

392 # self.session.cache.delete(urls=[url]) 

393 if isinstance(self.session, CachedSession): 393 ↛ 394 (line 393 didn't jump to line 394 because the condition on line 393 was never true)

394 response = self.session.get( 

395 url, 

396 headers={**self.headers, **headers}, 

397 verify=verify, 

398 force_refresh=force_refresh, 

399 ) 

400 else: 

401 response = self.session.get( 

402 url, headers={**self.headers, **headers}, verify=verify 

403 ) 

404 if not response.ok: 

405 raise requests.exceptions.HTTPError( 

406 f"Endpoint answered with code {response.status_code} : {url}", 

407 response=response, 

408 ) 

409 return response 

410 except ( 

411 requests.ConnectionError, 

412 requests.ConnectTimeout, 

413 requests.exceptions.HTTPError, 

414 ): 

415 attempt += 1 

416 raise requests.exceptions.HTTPError(f"Unable to download {url}") 
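# Throttling note (illustrative numbers): with the default requests_interval of 90 seconds,
# two consecutive uncached downloads are spaced by at least 90 seconds. If
# next_allowed_request is 1000.0 and time.time() returns 940.0, delta is 60.0, so get()
# sleeps 60 seconds and then sets next_allowed_request to about 1000.0 + 90. The wait only
# applies when the session is a CachedSession and the URL is not already cached (or
# force_refresh is set); a plain requests.Session is not throttled by this code.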

417 

418 def download_file(self, url: str, force_refresh=False, headers={}): 

419 """ 

420 Downloads a URL and returns its decoded content.

421 """ 

422 response = self.get( 

423 url, force_refresh=force_refresh or self.force_refresh, headers=headers 

424 ) 

425 content = self.decode_response(response) 

426 if content == "" or not content: 426 ↛ 427 (line 426 didn't jump to line 427 because the condition on line 426 was never true)

427 raise requests.exceptions.HTTPError(response) 

428 return content 

429 

430 def decode_response(self, response: requests.Response, encoding: str = "utf-8"): 

431 """Override this if the content-type headers from the source advertise something other than the actual content.

432 SASA needs this."""

433 response.encoding = encoding 

434 return response.text 

435 

436 def add_xissue_into_database(self, xissue: IssueData): 

437 xissue.journal = self.collection 

438 

439 if xissue.year == "": 

440 raise ValueError("Failsafe : Cannot insert issue without a year") 

441 

442 xpub = create_publisherdata() 

443 xpub.name = self.publisher 

444 xissue.publisher = xpub 

445 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat() 

446 

447 attempt = 1 

448 success = False 

449 

450 while not success and attempt < 4: 

451 try: 

452 params = {"xissue": xissue, "use_body": False} 

453 cmd = xml_cmds.addOrUpdateIssueXmlCmd(params) 

454 container = cmd.do() 

455 success = True 

456 ContainerSource.objects.create(source=self.source, container=container) 

457 except SolrError: 

458 attempt += 1 

459 time.sleep(10) 

460 

461 def get_metadata_using_citation_meta( 

462 self, 

463 xarticle: ArticleData, 

464 xissue: IssueData, 

465 soup: BeautifulSoup, 

466 what: list[CitationLiteral] = [], 

467 ): 

468 """ 

469 :param xarticle: the xarticle that will collect the metadata 

470 :param xissue: the xissue that will collect the publisher 

471 :param soup: the BeautifulSoup object of the article page

472 :param what: list of citation_* meta items to collect.

473 :return: None. The given article is modified 

474 """ 
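# Illustrative usage from a concrete crawler's parse_article_content (the selection
# of citation_* items is up to the caller):
#
#   self.get_metadata_using_citation_meta(
#       xarticle, xissue, soup, ["title", "author", "pdf", "abstract", "page", "doi"]
#   )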

475 

476 if "title" in what: 

477 # TITLE 

478 citation_title_node = soup.select_one("meta[name='citation_title']") 

479 if citation_title_node: 479 ↛ 484 (line 479 didn't jump to line 484 because the condition on line 479 was always true)

480 title = citation_title_node.get("content") 

481 if isinstance(title, str): 481 ↛ 484 (line 481 didn't jump to line 484 because the condition on line 481 was always true)

482 xarticle.title_tex = title 

483 

484 if "author" in what: 484 ↛ 513 (line 484 didn't jump to line 513 because the condition on line 484 was always true)

485 # AUTHORS 

486 citation_author_nodes = soup.select("meta[name^='citation_author']") 

487 current_author: ContributorDict | None = None 

488 for citation_author_node in citation_author_nodes: 

489 if citation_author_node.get("name") == "citation_author": 

490 text_author = citation_author_node.get("content") 

491 if not isinstance(text_author, str): 491 ↛ 492 (line 491 didn't jump to line 492 because the condition on line 491 was never true)

492 raise ValueError("Cannot parse author") 

493 if text_author == "": 493 ↛ 494 (line 493 didn't jump to line 494 because the condition on line 493 was never true)

494 current_author = None 

495 continue 

496 current_author = create_contributor(role="author", string_name=text_author) 

497 xarticle.contributors.append(current_author) 

498 continue 

499 if current_author is None: 499 ↛ 500 (line 499 didn't jump to line 500 because the condition on line 499 was never true)

500 print("Couldn't parse citation author") 

501 continue 

502 if citation_author_node.get("name") == "citation_author_institution": 

503 text_institution = citation_author_node.get("content") 

504 if not isinstance(text_institution, str): 504 ↛ 505 (line 504 didn't jump to line 505 because the condition on line 504 was never true)

505 continue 

506 current_author["addresses"].append(text_institution) 

507 if citation_author_node.get("name") == "citation_author_ocrid": 507 ↛ 508 (line 507 didn't jump to line 508 because the condition on line 507 was never true)

508 text_orcid = citation_author_node.get("content") 

509 if not isinstance(text_orcid, str): 

510 continue 

511 current_author["orcid"] = text_orcid 

512 

513 if "pdf" in what: 

514 # PDF 

515 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]') 

516 if citation_pdf_node: 

517 pdf_url = citation_pdf_node.get("content") 

518 if isinstance(pdf_url, str): 518 ↛ 521 (line 518 didn't jump to line 521 because the condition on line 518 was always true)

519 add_pdf_link_to_xarticle(xarticle, pdf_url) 

520 

521 if "lang" in what: 

522 # LANG 

523 citation_lang_node = soup.select_one("meta[name='citation_language']") 

524 if citation_lang_node: 524 ↛ 530 (line 524 didn't jump to line 530 because the condition on line 524 was always true)

525 # TODO: check other language code 

526 content_text = citation_lang_node.get("content") 

527 if isinstance(content_text, str): 527 ↛ 530 (line 527 didn't jump to line 530 because the condition on line 527 was always true)

528 xarticle.lang = standardize_tag(content_text) 

529 

530 if "abstract" in what: 

531 # ABSTRACT 

532 abstract_node = soup.select_one("meta[name='citation_abstract']") 

533 if abstract_node is not None: 

534 abstract = abstract_node.get("content") 

535 if not isinstance(abstract, str): 535 ↛ 536 (line 535 didn't jump to line 536 because the condition on line 535 was never true)

536 raise ValueError("Couldn't parse abstract from meta") 

537 abstract = BeautifulSoup(abstract, "html.parser").text 

538 lang = abstract_node.get("lang") 

539 if not isinstance(lang, str): 539 ↛ 540 (line 539 didn't jump to line 540 because the condition on line 539 was never true)

540 lang = self.detect_language(abstract, xarticle) 

541 xarticle.abstracts.append( 

542 { 

543 "tag": "abstract", 

544 "value_html": "", 

545 "value_tex": abstract, 

546 "value_xml": "", 

547 "lang": lang, 

548 } 

549 ) 

550 

551 if "page" in what: 

552 # PAGES 

553 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']") 

554 if citation_fpage_node: 

555 page = citation_fpage_node.get("content") 

556 if isinstance(page, str): 556 ↛ 561 (line 556 didn't jump to line 561 because the condition on line 556 was always true)

557 page = page.split("(")[0] 

558 if len(page) < 32: 558 ↛ 561 (line 558 didn't jump to line 561 because the condition on line 558 was always true)

559 xarticle.fpage = page 

560 

561 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']") 

562 if citation_lpage_node: 

563 page = citation_lpage_node.get("content") 

564 if isinstance(page, str): 564 ↛ 569 (line 564 didn't jump to line 569 because the condition on line 564 was always true)

565 page = page.split("(")[0] 

566 if len(page) < 32: 566 ↛ 569 (line 566 didn't jump to line 569 because the condition on line 566 was always true)

567 xarticle.lpage = page 

568 

569 if "doi" in what: 

570 # DOI 

571 citation_doi_node = soup.select_one("meta[name='citation_doi']") 

572 if citation_doi_node: 

573 doi = citation_doi_node.get("content") 

574 if isinstance(doi, str): 574 ↛ 581 (line 574 didn't jump to line 581 because the condition on line 574 was always true)

575 doi = doi.strip() 

576 pos = doi.find("10.") 

577 if pos > 0: 

578 doi = doi[pos:] 

579 xarticle.doi = doi 

580 

581 if "mr" in what: 

582 # MR 

583 citation_mr_node = soup.select_one("meta[name='citation_mr']") 

584 if citation_mr_node: 584 ↛ 585 (line 584 didn't jump to line 585 because the condition on line 584 was never true)

585 mr = citation_mr_node.get("content") 

586 if isinstance(mr, str): 

587 mr = mr.strip() 

588 if mr.find("MR") == 0: 

589 mr = mr[2:] 

590 xarticle.extids.append(("mr-item-id", mr)) 

591 

592 if "zbl" in what: 

593 # ZBL 

594 citation_zbl_node = soup.select_one("meta[name='citation_zbl']") 

595 if citation_zbl_node: 

596 zbl = citation_zbl_node.get("content") 

597 if isinstance(zbl, str): 597 ↛ 603 (line 597 didn't jump to line 603 because the condition on line 597 was always true)

598 zbl = zbl.strip() 

599 if zbl.find("Zbl") == 0: 599 ↛ 603 (line 599 didn't jump to line 603 because the condition on line 599 was always true)

600 zbl = zbl[3:].strip() 

601 xarticle.extids.append(("zbl-item-id", zbl)) 

602 

603 if "publisher" in what: 

604 # PUBLISHER 

605 citation_publisher_node = soup.select_one("meta[name='citation_publisher']") 

606 if citation_publisher_node: 

607 pub = citation_publisher_node.get("content") 

608 if isinstance(pub, str): 608 ↛ 615 (line 608 didn't jump to line 615 because the condition on line 608 was always true)

609 pub = pub.strip() 

610 if pub != "": 610 ↛ 615 (line 610 didn't jump to line 615 because the condition on line 610 was always true)

611 xpub = create_publisherdata() 

612 xpub.name = pub 

613 xissue.publisher = xpub 

614 

615 if "keywords" in what: 

616 # KEYWORDS 

617 citation_kwd_nodes = soup.select("meta[name='citation_keywords']") 

618 for kwd_node in citation_kwd_nodes: 

619 kwds = kwd_node.get("content") 

620 if isinstance(kwds, str): 620 ↛ 618 (line 620 didn't jump to line 618 because the condition on line 620 was always true)

621 kwds = kwds.split(",") 

622 for kwd in kwds: 

623 if kwd == "": 

624 continue 

625 kwd = kwd.strip() 

626 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd}) 

627 

628 if "references" in what: 

629 citation_references = soup.select("meta[name='citation_reference']") 

630 for index, tag in enumerate(citation_references): 

631 content = tag.get("content") 

632 if not isinstance(content, str): 632 ↛ 633 (line 632 didn't jump to line 633 because the condition on line 632 was never true)

633 raise ValueError("Cannot parse citation_reference meta") 

634 xarticle.bibitems.append( 

635 self.__parse_meta_citation_reference(content, str(index + 1)) 

636 ) 

637 

638 def create_xissue( 

639 self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1" 

640 ): 

641 if url is not None and url.endswith("/"): 

642 url = url[:-1] 

643 xissue = create_issuedata() 

644 xissue.url = url 

645 

646 xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number) 

647 

648 xissue.year = year 

649 

650 if volume_number is not None: 

651 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number) 

652 

653 if issue_number is not None: 

654 xissue.number = issue_number.replace(",", "-") 

655 return xissue 
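# Illustrative example (the collection id "XYZ" is hypothetical): with
# self.collection_id == "XYZ",
#   self.create_xissue("https://example.org/vol6/", "2000", "6", "2")
# strips the trailing "/", sets year "2000", volume "6" and number "2", and
# gets the pid "XYZ_2000_6_2" from get_issue_pid below.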

656 

657 def detect_language(self, text: str, article: ArticleData | None = None): 

658 if article and article.lang is not None and article.lang != "und": 

659 return article.lang 

660 

661 language = self.language_detector.detect_language_of(text) 

662 

663 if not language: 663 ↛ 664 (line 663 didn't jump to line 664 because the condition on line 663 was never true)

664 return "und" 

665 return language.iso_code_639_1.name.lower() 
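# Illustrative example: detect_language("Les mathématiques sont belles") is expected
# to return "fr" (lingua's ISO 639-1 code, lowercased); a failed detection falls back
# to "und".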

666 

667 references_mapping = { 

668 "citation_title": get_article_title_xml, 

669 "citation_journal_title": get_source_xml, 

670 "citation_publication_date": get_year_xml, 

671 "citation_firstpage": get_fpage_xml, 

672 "citation_lastpage": get_lpage_xml, 

673 } 

674 

675 @classmethod 

676 def __parse_meta_citation_reference(cls, content: str, label=None): 

677 categories = content.split(";") 

678 

679 if len(categories) == 1: 

680 return JatsBase.bake_ref(content, label=label) 

681 

682 citation_data = [c.split("=") for c in categories if "=" in c] 

683 del categories 

684 

685 xml_string = "" 

686 authors_parsed = False 

687 authors_strings = [] 

688 for data in citation_data: 

689 key = data[0].strip() 

690 citation_content = data[1] 

691 if key == "citation_author": 

692 authors_strings.append(get_author_xml(template_str=citation_content)) 

693 continue 

694 elif not authors_parsed: 

695 xml_string += ", ".join(authors_strings) 

696 authors_parsed = True 

697 

698 if key in cls.references_mapping: 

699 xml_string += " " + cls.references_mapping[key](citation_content) 

700 

701 return JatsBase.bake_ref(xml_string, label=label) 
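# Illustrative input (hypothetical, not taken from a real page): a citation_reference
# content such as
#   "citation_author=Doe, J.; citation_title=On widgets; citation_firstpage=1; citation_lastpage=10"
# is split on ";" and "=", the citation_author parts are rendered first with
# get_author_xml, and the remaining keys are mapped through references_mapping above.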

702 

703 @classmethod 

704 def get_or_create_source(cls): 

705 source, created = Source.objects.get_or_create( 

706 domain=cls.source_domain, 

707 defaults={ 

708 "name": cls.source_name, 

709 "website": cls.source_website, 

710 }, 

711 ) 

712 if created: 712 ↛ 713 (line 712 didn't jump to line 713 because the condition on line 712 was never true)

713 source.save() 

714 return source 

715 

716 @staticmethod 

717 def get_issue_pid( 

718 collection_id: str, 

719 year: str, 

720 volume_number: str | None = None, 

721 issue_number: str | None = None, 

722 ): 

723 # Replace any character other than ASCII letters, digits and "-" with an underscore

724 pid = f"{collection_id}_{year}" 

725 if volume_number is not None: 

726 pid += f"_{volume_number}" 

727 if issue_number is not None: 

728 pid += f"_{issue_number}" 

729 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid)) 

730 return pid 
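# Illustrative example: get_issue_pid("XYZ", "1999-2000", "6", None) returns
# "XYZ_1999-2000_6"; any run of characters outside [a-zA-Z0-9-] is collapsed to a
# single "_" by the regex above.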

731 

732 @staticmethod 

733 def set_pages(article: ArticleData, pages: str, separator: str = "-"): 

734 pages_split = pages.split(separator) 

735 if len(pages_split) == 0: 735 ↛ 736 (line 735 didn't jump to line 736 because the condition on line 735 was never true)

736 article.page_range = pages 

737 if len(pages_split) > 0: 737 ↛ exit (line 737 didn't return from function 'set_pages' because the condition on line 737 was always true)

738 if pages[0].isnumeric(): 

739 article.fpage = pages_split[0] 

740 if ( 

741 len(pages_split) > 1 

742 and pages_split[0] != pages_split[1] 

743 and pages_split[1].isnumeric() 

744 ): 

745 article.lpage = pages_split[1]
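# Illustrative example: set_pages(article, "12-34") sets article.fpage = "12" and
# article.lpage = "34", while set_pages(article, "12-12") only sets fpage because
# identical first and last pages are not duplicated.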