Coverage for src/crawler/base_crawler.py: 68%

520 statements  

coverage.py v7.9.0, created at 2025-11-21 14:41 +0000

1import logging 

2import time 

3from concurrent.futures import ( 

4 Executor, 

5 ThreadPoolExecutor, 

6) 

7from datetime import datetime, timedelta 

8from email.policy import EmailPolicy 

9 

10import regex 

11import requests 

12from bs4 import BeautifulSoup 

13from django.conf import settings 

14from django.contrib.auth.models import User 

15from django.utils import timezone 

16from langcodes import standardize_tag 

17from lingua import LanguageDetector, LanguageDetectorBuilder 

18from opentelemetry import trace 

19from ptf.cmds.xml.ckeditor.utils import ( 

20 build_jats_data_from_html_field, 

21) 

22from ptf.cmds.xml.jats.builder.references import ( 

23 get_article_title_xml, 

24 get_author_xml, 

25 get_fpage_xml, 

26 get_lpage_xml, 

27 get_source_xml, 

28 get_year_xml, 

29) 

30from ptf.cmds.xml.jats.jats_parser import JatsBase 

31from ptf.model_data import ( 

32 ArticleData, 

33 ContributorDict, 

34 IssueData, 

35 ResourceData, 

36 TitleDict, 

37 create_abstract, 

38 create_contributor, 

39 create_extlink, 

40 create_issuedata, 

41 create_publisherdata, 

42 create_titledata, 

43) 

44from ptf.model_data_converter import update_data_for_jats 

45from pylatexenc.latex2text import LatexNodes2Text 

46from pymongo.errors import DocumentTooLarge 

47from pysolr import SolrError 

48from requests.adapters import HTTPAdapter 

49from requests_cache import CachedSession, MongoCache 

50from urllib3 import Retry 

51 

52from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd 

53from crawler.models import Source 

54from crawler.types import CitationLiteral 

55from crawler.utils import ( 

56 add_pdf_link_to_xarticle, 

57 cleanup_str, 

58 get_all_cols, 

59 get_or_create_collection, 

60) 

61 

62 

63class CrawlerTitleDict(TitleDict): 

64 title_tex: str | None 

65 

66 

67class BaseCollectionCrawler: 

68 """ 

69 Base class for the collection crawlers.

70 To create a crawler: 

71 1) derive a class from BaseCollectionCrawler and name it XXXCrawler 

72 2) override the functions parse_collection_content, parse_issue_content and parse_article_content 

73 3) update factory.py so that crawler_factory can return your new crawler 

74 """ 
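
A minimal sketch of such a derived crawler, to illustrate the three steps above (not part of this file; the class name, selectors and URL are assumptions, and factory.py is not shown):

    class XXXCrawler(BaseCollectionCrawler):
        source_name = "Example source"           # assumed
        source_domain = "XXX"                    # assumed
        source_website = "https://example.org"   # assumed

        def parse_collection_content(self, content):
            # Build one xissue per issue link found on the collection page (assumed markup).
            soup = BeautifulSoup(content, "html.parser")
            return [
                self.create_xissue(a.get("href"), "2000", "6")
                for a in soup.select("a.issue")
            ]

        def parse_issue_content(self, content, xissue):
            ...  # see the sketch after parse_issue_content below

        def parse_article_content(self, content, xissue, xarticle, url):
            ...  # see the sketch after get_metadata_using_citation_meta below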

75 

76 logger = logging.getLogger(__name__) 

77 tracer = trace.get_tracer(__name__) 

78 

79 source_name = "" 

80 source_domain = "" 

81 source_website = "" 

82 

83 issue_href = "" 

84 

85 collection = None 

86 source = None 

87 user = None 

88 session: requests.Session | CachedSession 

89 

90 verify = True 

91 headers = { 

92 "accept_encoding": "utf-8", 

93 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"), 

94 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"), 

95 } 

96 

97 next_allowed_request: float = time.time() 

98 

99 # seconds to wait between two http requests 

100 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90) 

101 # seconds to wait before aborting the connection (if no bytes are received) 

102 requests_timeout = 60 

103 

104 latext_parser = LatexNodes2Text() 

105 

106 # Override the values in your concrete crawler if the formulas in text (titles, abstracts) 

107 # do not use the "$" to surround tex formulas 

108 delimiter_inline_formula = "$" 

109 delimiter_disp_formula = "$" 

110 

111 # HACK : Workaround for tests (monkeypatching) 

112 # We store the class here, so we can monkeypatch it when running tests 

113 # subCrawlers = { 

114 # LofplCrawler: None 

115 # } 

116 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {} 

117 

118 _language_detector: LanguageDetector | None = None 

119 _language_detector_builder = LanguageDetectorBuilder.from_all_languages() 

120 

121 force_refresh = False 

122 

123 # Whether to include headers in the requests cache key 

124 match_headers = False 

125 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})" 

126 

127 # Set this to False on a per-crawler basis to allow inserting articles without PDFs 

128 ignore_missing_pdf = True 

129 

130 database_executor: Executor 

131 

132 @classmethod 

133 def get_view_id(cls): 

134 return cls.source_domain 

135 

136 @property 

137 def language_detector(self): 

138 """Per-crawler-instance singleton for the language detector. 

139 LanguageDetector is initialized lazily to save memory""" 

140 if not self._language_detector: 

141 self._language_detector = self._language_detector_builder.build() 

142 return self._language_detector 

143 

144 def __init__( 

145 self, 

146 *args, 

147 username: str, 

148 collection_id: str, 

149 dry: bool = False, 

150 publisher: str = "", 

151 force_refresh=False, 

152 collection_url: str | None = None, 

153 ): 

154 if not collection_url: 154 ↛ 155 line 154 didn't jump to line 155 because the condition on line 154 was never true

155 all_cols = get_all_cols() 

156 col = all_cols[collection_id] 

157 collection_url = col["sources"][self.source_domain] 

158 self.collection_url = collection_url 

159 for CrawlerClass in self.subCrawlers: 159 ↛ 160 line 159 didn't jump to line 160 because the loop on line 159 never started

160 self.subCrawlers[CrawlerClass] = CrawlerClass( 

161 *args, 

162 username=username, 

163 collection_id=collection_id, 

164 dry=dry, 

165 publisher=publisher, 

166 collection_url=collection_url, 

167 ) 

168 self.logger = logging.getLogger(__name__ + "." + self.source_domain) 

169 

170 self.username = username 

171 

172 self.collection_id = collection_id 

173 

174 self.dry = dry 

175 self.publisher = publisher 

176 

177 self.session = requests.session() 

178 

179 # Skipped when running tests 

180 self.initialize() 

181 self.session.verify = self.verify 

182 self.force_refresh = force_refresh 

183 

184 # We implemented custom retry behaviour, so we don't want to make extra requests here 

185 retries = Retry( 

186 total=0, 

187 ) 

188 self.session.mount("https://", HTTPAdapter(max_retries=retries)) 

189 self.session.mount("http://", HTTPAdapter(max_retries=retries)) 

190 

191 self.database_executor = ThreadPoolExecutor( 

192 max_workers=2, thread_name_prefix="crawler_database_thread" 

193 ) 

194 

195 def initialize(self): 

196 """ 

197 Acts as a "second" init function to skip model accesses during test data generation 

198 """ 

199 self.collection = get_or_create_collection(self.collection_id) 

200 self.source = self.get_or_create_source() 

201 self.user = User.objects.get(username=self.username) 

202 self.session = CachedSession( 

203 match_headers=self.match_headers, 

204 headers=self.headers, 

205 backend=MongoCache( 

206 host=getattr(settings, "MONGO_HOSTNAME", "localhost"), 

207 ), 

208 expire_after=timedelta(days=30), 

209 ) 

210 

211 @classmethod 

212 def can_crawl(cls, pid: str) -> bool: 

213 return True 

214 

215 def parse_collection_content(self, content: str) -> list[IssueData]: 

216 """ 

217 Parse the HTML content with BeautifulSoup 

218 returns a list of xissues. 

219 Override this function in a derived class 

220 """ 

221 return [] 

222 

223 def parse_issue_content(self, content: str, xissue: IssueData): 

224 """ 

225 Parse the HTML content with BeautifulSoup 

226 Fills the xissue.articles 

227 Override this function in a derived class. 

228 

229 Caveat: you are supposed to create the articles here. Please assign a PID to each article. 

230 The PID can be `a` + article_index, like this: `a0`, `a21` 

231 """ 
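
A sketch of what such an override might look like (not part of this file; the selector is an assumption, and create_articledata is assumed to be the ptf.model_data factory for ArticleData, analogous to create_issuedata):

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        for index, link in enumerate(soup.select("a.article-title")):  # assumed markup
            xarticle = create_articledata()  # assumed factory from ptf.model_data
            xarticle.pid = "a" + str(index)
            xarticle.url = link.get("href")
            xissue.articles.append(xarticle)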

232 

233 def parse_article_content( 

234 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str 

235 ) -> ArticleData | None: 

236 """ 

237 Parse the HTML content with BeautifulSoup 

238 returns the xarticle. 

239 Override this function in a derived class. 

240 The xissue is passed to the function in case the article page has issue information (ex: publisher) 

241 The article url is also passed as a parameter 

242 

243 Caveat: you are supposed to assign the article PIDs again here 

244 """ 

245 return xarticle 

246 

247 @tracer.start_as_current_span("crawl_collection") 

248 def crawl_collection(self): 

249 # TODO: Comments, filter 

250 """ 

251 Crawl an entire collection. ptf.models.Container objects are created. 

252 - get the HTML content of the collection_url 

253 - parse the HTML content with beautifulsoup to extract the list of issues 

254 - merge the xissues (some Source can have multiple pages for 1 volume/issue. We create only 1 container) 

255 - crawl each issue if col_only is False 

256 - Returns the merged issues. 

257 They are returned as a dict {pid: xissue}. 

258 The key is the pid of the merged issue. 

259 Ex: the source may have Volume 6 (2000) and Volume 6 (1999); 

260 the pid is then made with 1999-2000__6_ 

261 """ 

262 

263 if self.source is None: 

264 raise RuntimeError("ERROR: the source is not set") 

265 

266 content = self.download_file(self.collection_url) 

267 if content: 

268 xissues = self.parse_collection_content(content) 

269 else: 

270 # download_file returns None (404) 

271 return None 

272 

273 """ 

274 Some collections split the same volumes in different pages 

275 Ex: Volume 6 (2000) and Volume 6 (1999) 

276 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000) 

277 """ 

278 # merged_xissues = self.merge_xissues(xissues) 

279 

280 xissues_dict = {str(i.pid): i for i in xissues} 

281 

282 return xissues_dict 

283 

284 @tracer.start_as_current_span("crawl_issue") 

285 def crawl_issue(self, xissue: IssueData): 

286 """ 

287 Crawl one web page of an issue. 

288 - get the HTML content of the issue 

289 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata 

290 - crawl each article 

291 """ 

292 

293 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents. 

294 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute 

295 

296 issue_url = xissue.url 

297 if issue_url is not None: 

298 if issue_url.endswith(".pdf"): 

299 add_pdf_link_to_xarticle(xissue, issue_url) 

300 xissue.url = None 

301 else: 

302 content = self.download_file(issue_url) 

303 with self.tracer.start_as_current_span("parse_issue_content"): 

304 self.parse_issue_content(content, xissue) 

305 

306 xarticles = xissue.articles 

307 

308 parsed_xarticles = [] 

309 

310 for xarticle in xarticles: 

311 parsed_xarticle = self.crawl_article(xarticle, xissue) 

312 if parsed_xarticle is not None: 

313 parsed_xarticles.append(parsed_xarticle) 

314 

315 xissue.articles = parsed_xarticles 

316 

317 article_has_pdf = self.article_has_pdf(xissue) 

318 

319 if self.ignore_missing_pdf: 

320 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)] 

321 

322 if not self.dry and (len(xissue.articles) > 0 or article_has_pdf): 

323 self.process_resource_metadata(xissue, resource_type="issue") 

324 self.database_executor.submit(self.add_xissue_into_database, xissue) 

325 

326 @staticmethod 

327 def article_has_source(art: ArticleData | IssueData): 

328 return ( 

329 next( 

330 (e_link for e_link in art.ext_links if e_link["rel"] == "source"), 

331 None, 

332 ) 

333 is not None 

334 ) 

335 

336 @staticmethod 

337 def article_has_pdf(art: ArticleData | IssueData): 

338 return ( 

339 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None) 

340 is not None 

341 ) 

342 

343 def crawl_article(self, xarticle: ArticleData, xissue: IssueData): 

344 # ARTICLE URL as an ExtLink (to display the link in the article page) 

345 if xarticle.url is None: 

346 if not self.article_has_source(xarticle): 346 ↛ 356 line 346 didn't jump to line 356 because the condition on line 346 was always true

347 if xissue.url: 

348 article_source = xissue.url 

349 else: 

350 article_source = self.collection_url 

351 ext_link = create_extlink() 

352 ext_link["rel"] = "source" 

353 ext_link["location"] = article_source 

354 ext_link["metadata"] = self.source_domain 

355 xarticle.ext_links.append(ext_link) 

356 return self.process_article_metadata(xarticle) 

357 

358 content = self.download_file(xarticle.url) 

359 if not content: 359 ↛ 360 line 359 didn't jump to line 360 because the condition on line 359 was never true

360 return None 

361 xarticle.pid = f"{xissue.pid}_{xarticle.pid}" 

362 

363 try: 

364 with self.tracer.start_as_current_span("parse_article_content"): 

365 parsed_xarticle = self.parse_article_content( 

366 content, xissue, xarticle, xarticle.url 

367 ) 

368 except ValueError as e: 

369 self.logger.warning(e) 

370 self.logger.warning("Retrying in 5 mins while invalidating cache") 

371 time.sleep(5 * 60) 

372 content = self.download_file(xarticle.url, force_refresh=True) 

373 with self.tracer.start_as_current_span("parse_article_content"): 

374 parsed_xarticle = self.parse_article_content( 

375 content, xissue, xarticle, xarticle.url 

376 ) 

377 

378 if parsed_xarticle is None: 378 ↛ 379 line 378 didn't jump to line 379 because the condition on line 378 was never true

379 return None 

380 

381 if parsed_xarticle.doi: 

382 parsed_xarticle.pid = ( 

383 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

384 ) 

385 

386 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url: 

387 ext_link = create_extlink() 

388 ext_link["rel"] = "source" 

389 ext_link["location"] = parsed_xarticle.url 

390 ext_link["metadata"] = self.source_domain 

391 parsed_xarticle.ext_links.append(ext_link) 

392 

393 # The article title may have formulas surrounded with '$' 

394 return self.process_article_metadata(parsed_xarticle) 

395 

396 def process_resource_metadata(self, xresource: ResourceData, resource_type="article"): 

397 tag = "article-title" if resource_type == "article" else "issue-title" 

398 

399 # Process title tex 

400 ckeditor_data = build_jats_data_from_html_field( 

401 xresource.title_tex, 

402 tag=tag, 

403 text_lang=xresource.lang, 

404 delimiter_inline=self.delimiter_inline_formula, 

405 delimiter_disp=self.delimiter_disp_formula, 

406 ) 

407 

408 xresource.title_html = ckeditor_data["value_html"] 

409 # xresource.title_tex = ckeditor_data["value_tex"] 

410 xresource.title_xml = ckeditor_data["value_xml"] 

411 

412 # Process trans_title tex 

413 if xresource.trans_title_tex: 413 ↛ 414 line 413 didn't jump to line 414 because the condition on line 413 was never true

414 self.logger.warning( 

415 "Deprecation Notice : prefer using xresource.title directly instead of xresource.trans_title_tex" 

416 ) 

417 trans_title = self.create_trans_title( 

418 xresource_lang=xresource.lang, 

419 resource_type=resource_type, 

420 title_tex=xresource.trans_title_tex, 

421 lang=xresource.trans_lang, 

422 ) 

423 xresource.titles.append(trans_title) 

424 

425 abstracts_to_parse = [ 

426 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract" 

427 ] 

428 # abstract may have formulas surrounded with '$' 

429 if len(abstracts_to_parse) > 0: 

430 for xabstract in abstracts_to_parse: 

431 ckeditor_data = build_jats_data_from_html_field( 

432 xabstract["value_tex"], 

433 tag="abstract", 

434 text_lang=xabstract["lang"], 

435 resource_lang=xresource.lang, 

436 field_type="abstract", 

437 delimiter_inline=self.delimiter_inline_formula, 

438 delimiter_disp=self.delimiter_disp_formula, 

439 ) 

440 

441 xabstract["value_html"] = ckeditor_data["value_html"] 

442 # xabstract["value_tex"] = ckeditor_data["value_tex"] 

443 xabstract["value_xml"] = ckeditor_data["value_xml"] 

444 

445 return xresource 

446 

447 def process_article_metadata(self, xarticle: ArticleData): 

448 self.process_resource_metadata(xarticle) 

449 for bibitem in xarticle.bibitems: 

450 bibitem.type = "unknown" 

451 update_data_for_jats(xarticle, with_label=False) 

452 

453 return xarticle 

454 

455 def _wait_download_delay(self): 

456 delta = self.next_allowed_request - time.time() 

457 self.next_allowed_request = time.time() + self.requests_interval 

458 if delta > 0: 

459 self.logger.info(f"Waiting {int(delta)}s before making another request") 

460 time.sleep(delta) 

461 

462 def _get(self, url: str, force_refresh=False, headers={}) -> requests.Response: 

463 """ 

464 Wrapper around requests.get with delay based on the crawler class instance 

465 """ 

466 

467 kwargs = {} 

468 # self.session.cache.delete(urls=[url]) 

469 if isinstance(self.session, CachedSession): 

470 kwargs["force_refresh"] = force_refresh 

471 

472 try: 

473 response = self.session.get( 

474 url, 

475 headers={**self.headers, **headers}, 

476 timeout=self.requests_timeout, 

477 **kwargs, 

478 ) 

479 except DocumentTooLarge as e: 

480 self.logger.error(e) 

481 response = requests.get( 

482 url, headers={**self.headers, **headers}, timeout=self.requests_timeout 

483 ) 

484 

485 if not response.ok: 

486 raise requests.exceptions.HTTPError( 

487 f"Endpoint answered with code {response.status_code} : {url}", 

488 response=response, 

489 ) 

490 

491 if not getattr(response, "from_cache", False): 

492 self._wait_download_delay() 

493 return response 

494 

495 def download_file(self, url: str, force_refresh=False, headers={}): 

496 """ 

497 Downloads a page and returns its content (decoded string). 

498 This function handles retries and decoding 

499 """ 

500 attempts = 0 

501 while True: 

502 try: 

503 if attempts > 0: 

504 force_refresh = True 

505 response = self._get( 

506 url, force_refresh=force_refresh or self.force_refresh, headers=headers 

507 ) 

508 

509 if getattr(response, "from_cache", False): 

510 return response.text 

511 

512 content = self.decode_response(response) 

513 if content == "" or not content: 

514 raise requests.exceptions.HTTPError(response) 

515 

516 if isinstance(self.session, CachedSession): 

517 if "Expires" in response.headers: 

518 del response.headers["Expires"] 

519 del response.headers["Cache-Control"] 

520 try: 

521 self.session.cache.save_response(response) 

522 except DocumentTooLarge as e: 

523 self.logger.warning(e) 

524 return content 

525 except ( 

526 requests.ConnectionError, 

527 requests.ConnectTimeout, 

528 requests.exceptions.HTTPError, 

529 ) as e: 

530 if isinstance(e, requests.exceptions.HTTPError): 

531 # if Error 404 (resource not found) we skip 

532 status_code = e.response.status_code 

533 if status_code == 404: 

534 return None 

535 else: 

536 raise e 

537 if attempts > 3: 

538 raise e 

539 self.logger.debug(f"Caught error : {e}", extra={"url": url}) 

540 attempts += 1 

541 # 15 mins, 30 mins, 45 mins, 60 mins 

542 delay_minutes = attempts * 15 

543 self.logger.debug( 

544 f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})", 

545 extra={"url": url}, 

546 ) 

547 time.sleep(delay_minutes * 60) 

548 

549 def decode_response(self, response: requests.Response, encoding: str | None = None): 

550 """Override this if the Content-Type headers from the source advertise an encoding different from the actual content. 

551 SASA needs this""" 

552 # Force the encoding when one is explicitly provided 

553 if encoding: 

554 response.encoding = encoding 

555 return response.text 

556 

557 # Attempt to get encoding using HTTP headers 

558 content_type_tag = response.headers.get("Content-Type", None) 

559 

560 if content_type_tag: 560 ↛ 567 line 560 didn't jump to line 567 because the condition on line 560 was always true

561 charset = self.parse_content_type_charset(content_type_tag) 

562 if charset: 562 ↛ 563 line 562 didn't jump to line 563 because the condition on line 562 was never true

563 response.encoding = charset 

564 return response.text 

565 

566 # Attempt to get encoding using HTML meta charset tag 

567 soup = BeautifulSoup(response.text, "html5lib") 

568 charset = soup.select_one("meta[charset]") 

569 if charset: 

570 htmlencoding = charset.get("charset") 

571 if isinstance(htmlencoding, str): 571 ↛ 576 line 571 didn't jump to line 576 because the condition on line 571 was always true

572 response.encoding = htmlencoding 

573 return response.text 

574 

575 # Attempt to get encoding using HTML meta content type tag 

576 content_type_tag = soup.select_one('meta[http-equiv="Content-Type"]') 

577 if content_type_tag: 

578 content_type = content_type_tag.get("content") 

579 if isinstance(content_type, str): 579 ↛ 585 line 579 didn't jump to line 585 because the condition on line 579 was always true

580 charset = self.parse_content_type_charset(content_type) 

581 if charset: 581 ↛ 585 line 581 didn't jump to line 585 because the condition on line 581 was always true

582 response.encoding = charset 

583 return response.text 

584 

585 return response.text 
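
A sketch of the kind of override the docstring above has in mind, on a derived crawler (not part of this file; the charset is an assumption about one misbehaving source):

    def decode_response(self, response, encoding=None):
        # This source always serves windows-1250, whatever its headers claim.
        return super().decode_response(response, encoding="windows-1250")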

586 

587 @staticmethod 

588 def parse_content_type_charset(content_type: str): 

589 header = EmailPolicy.header_factory("content-type", content_type) 

590 if "charset" in header.params: 

591 return header.params.get("charset") 

592 

593 @tracer.start_as_current_span("add_xissue_to_database") 

594 def add_xissue_into_database(self, xissue: IssueData): 

595 xissue.journal = self.collection 

596 xissue.source = self.source_domain 

597 

598 if xissue.year == "": 

599 raise ValueError("Failsafe : Cannot insert issue without a year") 

600 

601 xpub = create_publisherdata() 

602 xpub.name = self.publisher 

603 xissue.publisher = xpub 

604 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat() 

605 

606 attempt = 1 

607 success = False 

608 

609 while not success and attempt < 4: 

610 try: 

611 params = {"xissue": xissue, "use_body": False} 

612 cmd = addOrUpdateGDMLIssueXmlCmd(params) 

613 cmd.do() 

614 success = True 

615 self.logger.debug(f"Issue {xissue.pid} inserted in database") 

616 except SolrError: 

617 self.logger.warning( 

618 f"Encountered SolrError while inserting issue {xissue.pid} in database" 

619 ) 

620 attempt += 1 

621 self.logger.debug(f"Attempt {attempt}. sleeping 10 seconds.") 

622 time.sleep(10) 

623 except Exception as e: 

624 self.logger.error( 

625 f"Got exception while attempting to insert {xissue.pid} in database : {e}" 

626 ) 

627 raise e 

628 

629 if success is False: 

630 raise ConnectionRefusedError("Cannot connect to SolR") 

631 

632 def get_metadata_using_citation_meta( 

633 self, 

634 xarticle: ArticleData, 

635 xissue: IssueData, 

636 soup: BeautifulSoup, 

637 what: list[CitationLiteral] = [], 

638 ): 

639 """ 

640 :param xarticle: the xarticle that will collect the metadata 

641 :param xissue: the xissue that will collect the publisher 

642 :param soup: the BeautifulSoup object of the article page 

643 :param what: list of citation_* items to collect. 

644 :return: None. The given article is modified 

645 """ 

646 

647 if "title" in what: 

648 # TITLE 

649 citation_title_node = soup.select_one("meta[name='citation_title']") 

650 if citation_title_node: 650 ↛ 655 line 650 didn't jump to line 655 because the condition on line 650 was always true

651 title = citation_title_node.get("content") 

652 if isinstance(title, str): 652 ↛ 655 line 652 didn't jump to line 655 because the condition on line 652 was always true

653 xarticle.title_tex = title 

654 

655 if "author" in what: 655 ↛ 684 line 655 didn't jump to line 684 because the condition on line 655 was always true

656 # AUTHORS 

657 citation_author_nodes = soup.select("meta[name^='citation_author']") 

658 current_author: ContributorDict | None = None 

659 for citation_author_node in citation_author_nodes: 

660 if citation_author_node.get("name") == "citation_author": 

661 text_author = citation_author_node.get("content") 

662 if not isinstance(text_author, str): 662 ↛ 663 line 662 didn't jump to line 663 because the condition on line 662 was never true

663 raise ValueError("Cannot parse author") 

664 if text_author == "": 664 ↛ 665 line 664 didn't jump to line 665 because the condition on line 664 was never true

665 current_author = None 

666 continue 

667 current_author = create_contributor(role="author", string_name=text_author) 

668 xarticle.contributors.append(current_author) 

669 continue 

670 if current_author is None: 670 ↛ 671 line 670 didn't jump to line 671 because the condition on line 670 was never true

671 self.logger.warning("Couldn't parse citation author") 

672 continue 

673 if citation_author_node.get("name") == "citation_author_institution": 

674 text_institution = citation_author_node.get("content") 

675 if not isinstance(text_institution, str): 675 ↛ 676 line 675 didn't jump to line 676 because the condition on line 675 was never true

676 continue 

677 current_author["addresses"].append(text_institution) 

678 if citation_author_node.get("name") == "citation_author_ocrid": 678 ↛ 679 line 678 didn't jump to line 679 because the condition on line 678 was never true

679 text_orcid = citation_author_node.get("content") 

680 if not isinstance(text_orcid, str): 

681 continue 

682 current_author["orcid"] = text_orcid 

683 

684 if "pdf" in what: 

685 # PDF 

686 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]') 

687 if citation_pdf_node: 

688 pdf_url = citation_pdf_node.get("content") 

689 if isinstance(pdf_url, str): 689 ↛ 692 line 689 didn't jump to line 692 because the condition on line 689 was always true

690 add_pdf_link_to_xarticle(xarticle, pdf_url) 

691 

692 if "lang" in what: 

693 # LANG 

694 citation_lang_node = soup.select_one("meta[name='citation_language']") 

695 if citation_lang_node: 695 ↛ 701 line 695 didn't jump to line 701 because the condition on line 695 was always true

696 # TODO: check other language code 

697 content_text = citation_lang_node.get("content") 

698 if isinstance(content_text, str): 698 ↛ 701 line 698 didn't jump to line 701 because the condition on line 698 was always true

699 xarticle.lang = standardize_tag(content_text) 

700 

701 if "abstract" in what: 

702 # ABSTRACT 

703 abstract_node = soup.select_one("meta[name='citation_abstract']") 

704 if abstract_node is not None: 

705 abstract = abstract_node.get("content") 

706 if not isinstance(abstract, str): 706 ↛ 707 line 706 didn't jump to line 707 because the condition on line 706 was never true

707 raise ValueError("Couldn't parse abstract from meta") 

708 abstract = BeautifulSoup(abstract, "html.parser").text 

709 lang = abstract_node.get("lang") 

710 if not isinstance(lang, str): 

711 lang = self.detect_language(abstract, xarticle) 

712 xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract)) 

713 

714 if "page" in what: 

715 # PAGES 

716 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']") 

717 if citation_fpage_node: 

718 page = citation_fpage_node.get("content") 

719 if isinstance(page, str): 719 ↛ 724 line 719 didn't jump to line 724 because the condition on line 719 was always true

720 page = page.split("(")[0] 

721 if len(page) < 32: 721 ↛ 724 line 721 didn't jump to line 724 because the condition on line 721 was always true

722 xarticle.fpage = page 

723 

724 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']") 

725 if citation_lpage_node: 

726 page = citation_lpage_node.get("content") 

727 if isinstance(page, str): 727 ↛ 732 line 727 didn't jump to line 732 because the condition on line 727 was always true

728 page = page.split("(")[0] 

729 if len(page) < 32: 729 ↛ 732 line 729 didn't jump to line 732 because the condition on line 729 was always true

730 xarticle.lpage = page 

731 

732 if "doi" in what: 

733 # DOI 

734 citation_doi_node = soup.select_one("meta[name='citation_doi']") 

735 if citation_doi_node: 

736 doi = citation_doi_node.get("content") 

737 if isinstance(doi, str): 737 ↛ 744 line 737 didn't jump to line 744 because the condition on line 737 was always true

738 doi = doi.strip() 

739 pos = doi.find("10.") 

740 if pos > 0: 

741 doi = doi[pos:] 

742 xarticle.doi = doi 

743 

744 if "mr" in what: 

745 # MR 

746 citation_mr_node = soup.select_one("meta[name='citation_mr']") 

747 if citation_mr_node: 

748 mr = citation_mr_node.get("content") 

749 if isinstance(mr, str): 749 ↛ 755 line 749 didn't jump to line 755 because the condition on line 749 was always true

750 mr = mr.strip() 

751 if mr.find("MR") == 0: 751 ↛ 755 line 751 didn't jump to line 755 because the condition on line 751 was always true

752 mr = mr[2:] 

753 xarticle.extids.append(("mr-item-id", mr)) 

754 

755 if "zbl" in what: 

756 # ZBL 

757 citation_zbl_node = soup.select_one("meta[name='citation_zbl']") 

758 if citation_zbl_node: 

759 zbl = citation_zbl_node.get("content") 

760 if isinstance(zbl, str): 760 ↛ 766 line 760 didn't jump to line 766 because the condition on line 760 was always true

761 zbl = zbl.strip() 

762 if zbl.find("Zbl") == 0: 762 ↛ 766 line 762 didn't jump to line 766 because the condition on line 762 was always true

763 zbl = zbl[3:].strip() 

764 xarticle.extids.append(("zbl-item-id", zbl)) 

765 

766 if "publisher" in what: 

767 # PUBLISHER 

768 citation_publisher_node = soup.select_one("meta[name='citation_publisher']") 

769 if citation_publisher_node: 

770 pub = citation_publisher_node.get("content") 

771 if isinstance(pub, str): 771 ↛ 778 line 771 didn't jump to line 778 because the condition on line 771 was always true

772 pub = pub.strip() 

773 if pub != "": 773 ↛ 778 line 773 didn't jump to line 778 because the condition on line 773 was always true

774 xpub = create_publisherdata() 

775 xpub.name = pub 

776 xissue.publisher = xpub 

777 

778 if "keywords" in what: 

779 # KEYWORDS 

780 citation_kwd_nodes = soup.select("meta[name='citation_keywords']") 

781 for kwd_node in citation_kwd_nodes: 

782 kwds = kwd_node.get("content") 

783 if isinstance(kwds, str): 783 ↛ 781 line 783 didn't jump to line 781 because the condition on line 783 was always true

784 kwds = kwds.split(",") 

785 for kwd in kwds: 

786 if kwd == "": 

787 continue 

788 kwd = kwd.strip() 

789 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd}) 

790 

791 if "references" in what: 

792 citation_references = soup.select("meta[name='citation_reference']") 

793 for index, tag in enumerate(citation_references): 

794 content = tag.get("content") 

795 if not isinstance(content, str): 795 ↛ 796 line 795 didn't jump to line 796 because the condition on line 795 was never true

796 raise ValueError("Cannot parse citation_reference meta") 

797 label = str(index + 1) 

798 if regex.match(r"^\[\d+\].*", content): 798 ↛ 799 line 798 didn't jump to line 799 because the condition on line 798 was never true

799 label = None 

800 xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label)) 
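
A sketch of a typical caller on a derived crawler (not part of this file); the `what` list is an assumption about which citation_* meta tags the source actually exposes:

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            ["title", "author", "pdf", "lang", "abstract", "doi"],
        )
        return xarticle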

801 

802 def create_xissue( 

803 self, 

804 url: str | None, 

805 year: str, 

806 volume_number: str | None, 

807 issue_number: str | None = "1", 

808 vseries: str | None = None, 

809 ): 

810 if url is not None and url.endswith("/"): 

811 url = url[:-1] 

812 xissue = create_issuedata() 

813 xissue.url = url 

814 

815 xissue.pid = self.get_issue_pid( 

816 self.collection_id, year, volume_number, issue_number, vseries 

817 ) 

818 

819 xissue.year = year 

820 

821 if volume_number is not None: 

822 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number) 

823 

824 if issue_number is not None: 

825 xissue.number = issue_number.replace(",", "-") 

826 

827 if vseries is not None: 827 ↛ 828 line 827 didn't jump to line 828 because the condition on line 827 was never true

828 xissue.vseries = vseries 

829 return xissue 

830 

831 def detect_language(self, text: str, article: ArticleData | None = None): 

832 if article and article.lang is not None and article.lang != "und": 

833 return article.lang 

834 

835 language = self.language_detector.detect_language_of(text) 

836 

837 if not language: 837 ↛ 838 line 837 didn't jump to line 838 because the condition on line 837 was never true

838 return "und" 

839 return language.iso_code_639_1.name.lower() 

840 

841 def create_trans_title( 

842 self, 

843 resource_type: str, 

844 title_tex: str, 

845 lang: str, 

846 xresource_lang: str, 

847 title_type: str = "main", 

848 ): 

849 tag = "trans-title" if resource_type == "article" else "issue-title" 

850 

851 ckeditor_data = build_jats_data_from_html_field( 

852 title_tex, 

853 tag=tag, 

854 text_lang=lang, 

855 resource_lang=xresource_lang, 

856 delimiter_inline=self.delimiter_inline_formula, 

857 delimiter_disp=self.delimiter_disp_formula, 

858 ) 

859 

860 titledata = create_titledata( 

861 lang=lang, 

862 type="main", 

863 title_html=ckeditor_data["value_html"], 

864 title_xml=ckeditor_data["value_xml"], 

865 ) 

866 

867 return titledata 

868 

869 references_mapping = { 

870 "citation_title": get_article_title_xml, 

871 "citation_journal_title": get_source_xml, 

872 "citation_publication_date": get_year_xml, 

873 "citation_firstpage": get_fpage_xml, 

874 "citation_lastpage": get_lpage_xml, 

875 } 

876 

877 @classmethod 

878 def __parse_meta_citation_reference(cls, content: str, label=None): 

879 categories = content.split(";") 

880 

881 if len(categories) == 1: 

882 return JatsBase.bake_ref(content, label=label) 

883 

884 citation_data = [c.split("=") for c in categories if "=" in c] 

885 del categories 

886 

887 xml_string = "" 

888 authors_parsed = False 

889 authors_strings = [] 

890 for data in citation_data: 

891 key = data[0].strip() 

892 citation_content = data[1] 

893 if key == "citation_author": 

894 authors_strings.append(get_author_xml(template_str=citation_content)) 

895 continue 

896 elif not authors_parsed: 

897 xml_string += ", ".join(authors_strings) 

898 authors_parsed = True 

899 

900 if key in cls.references_mapping: 

901 xml_string += " " + cls.references_mapping[key](citation_content) 

902 

903 return JatsBase.bake_ref(xml_string, label=label) 
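
A worked example of the meta content this method parses (invented values, for illustration only):

    # <meta name="citation_reference"
    #       content="citation_author=Doe, J.; citation_author=Roe, R.;
    #                citation_title=On examples; citation_journal_title=J. Ex.;
    #                citation_publication_date=2001; citation_firstpage=1; citation_lastpage=10">
    #
    # The citation_author entries are turned into author XML and emitted first; every other
    # known key is mapped through references_mapping, and the concatenated XML fragments are
    # handed to JatsBase.bake_ref(). Content without ";" separators is baked as plain text.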

904 

905 @classmethod 

906 def get_or_create_source(cls): 

907 source, created = Source.objects.get_or_create( 

908 domain=cls.source_domain, 

909 defaults={ 

910 "name": cls.source_name, 

911 "website": cls.source_website, 

912 "view_id": cls.get_view_id(), 

913 }, 

914 ) 

915 if created: 915 ↛ 916 line 915 didn't jump to line 916 because the condition on line 915 was never true

916 source.save() 

917 return source 

918 

919 @staticmethod 

920 def get_issue_pid( 

921 collection_id: str, 

922 year: str, 

923 volume_number: str | None = None, 

924 issue_number: str | None = None, 

925 series: str | None = None, 

926 ): 

927 # Build the pid, then replace any character other than letters, digits and '-' with an underscore 

928 pid = f"{collection_id}_{year}" 

929 if series is not None: 929 ↛ 930 line 929 didn't jump to line 930 because the condition on line 929 was never true

930 pid += f"_{series}" 

931 if volume_number is not None: 

932 pid += f"_{volume_number}" 

933 if issue_number is not None: 

934 pid += f"_{issue_number}" 

935 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid)) 

936 return pid 
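
Worked examples for the pid construction above (the collection id is arbitrary, and cleanup_str is assumed to leave these strings unchanged):

    # get_issue_pid("AMBP", "2000", "6", "1")  -> "AMBP_2000_6_1"
    # get_issue_pid("AMBP", "2000", "6 bis")   -> "AMBP_2000_6_bis"
    # get_issue_pid("AMBP", "1999-2000", "6")  -> "AMBP_1999-2000_6"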

937 

938 @staticmethod 

939 def set_pages(article: ArticleData, pages: str, separator: str = "-"): 

940 pages_split = pages.split(separator) 

941 if len(pages_split) == 0: 941 ↛ 942 line 941 didn't jump to line 942 because the condition on line 941 was never true

942 article.page_range = pages 

943 if len(pages_split) > 0: 943 ↛ exit line 943 didn't return from function 'set_pages' because the condition on line 943 was always true

944 if pages[0].isnumeric(): 944 ↛ exit line 944 didn't return from function 'set_pages' because the condition on line 944 was always true

945 article.fpage = pages_split[0] 

946 if ( 

947 len(pages_split) > 1 

948 and pages_split[0] != pages_split[1] 

949 and pages_split[1].isnumeric() 

950 ): 

951 article.lpage = pages_split[1]
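
Worked examples for set_pages (xarticle being any ArticleData instance):

    # set_pages(xarticle, "123-145")                 -> fpage = "123", lpage = "145"
    # set_pages(xarticle, "27--31", separator="--")  -> fpage = "27", lpage = "31"
    # set_pages(xarticle, "123")                     -> fpage = "123" only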