Coverage for src/crawler/base_crawler.py: 67%

553 statements  

coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

1import logging 

2import time 

3from concurrent.futures import ( 

4 Executor, 

5 ThreadPoolExecutor, 

6) 

7from datetime import datetime, timedelta 

8from email.policy import EmailPolicy 

9from typing import TYPE_CHECKING, Any 

10 

11import regex 

12import requests 

13from bs4 import BeautifulSoup 

14from django.conf import settings 

15from django.contrib.auth.models import User 

16from django.utils import timezone 

17from langcodes import standardize_tag 

18from lingua import LanguageDetector, LanguageDetectorBuilder 

19from opentelemetry import trace 

20from ptf.cmds.xml.ckeditor.utils import ( 

21 build_jats_data_from_html_field, 

22) 

23from ptf.cmds.xml.jats.builder.references import ( 

24 get_article_title_xml, 

25 get_author_xml, 

26 get_fpage_xml, 

27 get_lpage_xml, 

28 get_source_xml, 

29 get_year_xml, 

30) 

31from ptf.cmds.xml.jats.jats_parser import JatsBase 

32from ptf.model_data import ( 

33 ArticleData, 

34 ContributorDict, 

35 IssueData, 

36 ResourceData, 

37 TitleDict, 

38 create_abstract, 

39 create_contributor, 

40 create_extlink, 

41 create_issuedata, 

42 create_publisherdata, 

43 create_titledata, 

44) 

45from ptf.model_data_converter import update_data_for_jats 

46from ptf.models import ExtLink 

47from pylatexenc.latex2text import LatexNodes2Text 

48from pymongo.errors import DocumentTooLarge 

49from pysolr import SolrError 

50from requests.adapters import HTTPAdapter 

51from requests.models import Response 

52from requests_cache import CachedSession, MongoCache 

53from urllib3 import Retry 

54 

55from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd 

56from crawler.models import Source 

57from crawler.models.extlink_checked import ExtlinkChecked 

58from crawler.types import CitationLiteral 

59from crawler.utils import ( 

60 add_pdf_link_to_xarticle, 

61 cleanup_str, 

62 get_all_cols, 

63 get_or_create_collection, 

64) 

65 

66if TYPE_CHECKING: 

67 from concurrent.futures import Future 

68 

69 

70class CrawlerTitleDict(TitleDict): 

71 title_tex: str | None 

72 

73 

74class BaseCollectionCrawler: 

75 """ 

76 Base class for the collection crawlers.

77 To create a crawler: 

78 1) derive a class from BaseCollectionCrawler and name it XXXCrawler 

79 2) override the functions parse_collection_content, parse_issue_content and parse_article_content 

80 3) update factory.py so that crawler_factory can return your new crawler (a sketch of steps 1 and 2 follows at the end of this listing)

81 """ 

82 

83 logger = logging.getLogger(__name__) 

84 tracer = trace.get_tracer(__name__) 

85 

86 source_name = "" 

87 source_domain = "" 

88 source_website = "" 

89 

90 issue_href = "" 

91 

92 collection = None 

93 source = None 

94 user = None 

95 session: requests.Session | CachedSession 

96 

97 verify = True 

98 headers = { 

99 "accept_encoding": "utf-8", 

100 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"), 

101 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"), 

102 } 

103 

104 next_allowed_request: float = time.time() 

105 

106 # seconds to wait between two http requests 

107 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90) 

108 # seconds to wait before aborting the connection (if no bytes are received)

109 requests_timeout = 60 

110 

111 latext_parser = LatexNodes2Text() 

112 

113 # Override the values in your concrete crawler if the formulas in text (titles, abstracts) 

114 # do not use the "$" to surround tex formulas 

115 delimiter_inline_formula = "$" 

116 delimiter_disp_formula = "$" 

117 

118 # HACK : Workaround for tests (monkeypatching) 

119 # We store the class here, so we can monkeypatch it when running tests 

120 # subCrawlers = { 

121 # LofplCrawler: None 

122 # } 

123 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {} 

124 

125 _language_detector: LanguageDetector | None = None 

126 _language_detector_builder = LanguageDetectorBuilder.from_all_languages() 

127 

128 force_refresh = False 

129 

130 # Whether to include headers in the requests cache key

131 match_headers = False 

132 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})" 

133 

134 # Set this to False on a per-crawler basis to allow inserting articles without PDFs

135 ignore_missing_pdf = True 

136 

137 database_executor: Executor 

138 exception: Exception | None = None 

139 

140 @classmethod 

141 def get_view_id(cls): 

142 return cls.source_domain 

143 

144 @property 

145 def language_detector(self): 

146 """Crawler Instance singleton for language builder. 

147 Late init of LanguageDetector to save on memory""" 

148 if not self._language_detector: 

149 self._language_detector = self._language_detector_builder.build() 

150 return self._language_detector 

151 

152 def __init__( 

153 self, 

154 *args, 

155 username: str, 

156 collection_id: str, 

157 dry: bool = False, 

158 publisher: str = "", 

159 force_refresh=False, 

160 collection_url: str | None = None, 

161 ): 

162 if not collection_url:  # 162 ↛ 163: condition was never true

163 all_cols = get_all_cols() 

164 col = all_cols[collection_id] 

165 

166 collection_url = col["sources"].get(self.source_domain, None) 

167 if collection_url is None: 

168 raise ValueError( 

169 f"Source {self.source_domain} not found for collection {collection_id}" 

170 ) 

171 self.collection_url = collection_url 

172 for CrawlerClass in self.subCrawlers:  # 172 ↛ 173: loop never started

173 self.subCrawlers[CrawlerClass] = CrawlerClass( 

174 *args, 

175 username=username, 

176 collection_id=collection_id, 

177 dry=dry, 

178 publisher=publisher, 

179 collection_url=collection_url, 

180 ) 

181 self.logger = logging.getLogger(__name__ + "." + self.source_domain) 

182 

183 self.username = username 

184 

185 self.collection_id = collection_id 

186 

187 self.dry = dry 

188 self.publisher = publisher 

189 

190 self.session = requests.session() 

191 

192 # Skipped when running tests 

193 self.initialize() 

194 self.session.verify = self.verify 

195 self.force_refresh = force_refresh 

196 

197 # We implemented custom retry behaviour, so we don't want to make extra requests here 

198 retries = Retry( 

199 total=0, 

200 ) 

201 self.session.mount("https://", HTTPAdapter(max_retries=retries)) 

202 self.session.mount("http://", HTTPAdapter(max_retries=retries)) 

203 

204 self.database_executor = ThreadPoolExecutor( 

205 max_workers=1, thread_name_prefix="crawler_database_thread" 

206 ) 

207 

208 def initialize(self): 

209 """ 

210 Acts as a "second" init function to skip model accesses during test data generation 

211 """ 

212 self.collection = get_or_create_collection(self.collection_id) 

213 self.source = self.get_or_create_source() 

214 self.user = User.objects.get(username=self.username) 

215 self.session = CachedSession( 

216 match_headers=self.match_headers, 

217 headers=self.headers, 

218 backend=MongoCache( 

219 host=getattr(settings, "MONGO_HOSTNAME", "localhost"), decode_content=False 

220 ), 

221 expire_after=timedelta(days=30), 

222 ) 

223 

224 @classmethod 

225 def can_crawl(cls, pid: str) -> bool: 

226 return True 

227 

228 def parse_collection_content(self, content: str) -> list[IssueData]: 

229 """ 

230 Parse the HTML content with BeautifulSoup 

231 returns a list of xissue. 

232 Override this function in a derived class 

233 """ 

234 return [] 

235 

236 def parse_issue_content(self, content: str, xissue: IssueData): 

237 """ 

238 Parse the HTML content with BeautifulSoup 

239 Fills the xissue.articles 

240 Override this function in a derived class. 

241 

242 Caveat: you are supposed to create the articles here. Please assign a PID to each article.

243 The PID can be `"a" + article_index`, like this: `a0`, `a21` (see the sketch after this method).

244 """ 

245 

246 def parse_article_content( 

247 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str 

248 ) -> ArticleData | None: 

249 """ 

250 Parse the HTML content with BeautifulSoup 

251 returns the xarticle. 

252 Override this function in a derived class. 

253 The xissue is passed to the function in case the article page has issue information (ex: publisher) 

254 The article url is also passed as a parameter 

255 

256 Caveat: you are supposed to assign the article PIDs again here

257 """ 

258 return xarticle 

259 

260 @tracer.start_as_current_span("crawl_collection") 

261 def crawl_collection(self): 

262 # TODO: Comments, filter 

263 """ 

264 Crawl an entire collection. ptf.models.Container objects are created. 

265 - get the HTML content of the collection_url 

266 - parse the HTML content with beautifulsoup to extract the list of issues 

267 - merge the xissues (some sources can have multiple pages for 1 volume/issue; we create only 1 container)

268 - crawl each issue if col_only is False 

269 - Returns the list of merged issues. 

270 It is an OrderedDict {pid: {"issues": xissues}} 

271 The key is the pid of the merged issues. 

272 Ex: the source may have Volume 6 (2000) and Volume 6 (1999);

273 the pid is then made with 1999-2000__6_

274 """ 

275 

276 if self.source is None: 

277 raise RuntimeError("ERROR: the source is not set") 

278 

279 content = self.download_file(self.collection_url) 

280 if content: 

281 xissues = self.parse_collection_content(content) 

282 else: 

283 # download_file returns None (404) 

284 return None 

285 

286 """ 

287 Some collections split the same volume across different pages

288 Ex: Volume 6 (2000) and Volume 6 (1999) 

289 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000) 

290 """ 

291 # merged_xissues = self.merge_xissues(xissues) 

292 

293 xissues_dict = {str(i.pid): i for i in xissues} 

294 

295 return xissues_dict 

296 

297 @tracer.start_as_current_span("crawl_issue") 

298 def crawl_issue(self, xissue: IssueData): 

299 """ 

300 Crawl one web page of an issue.

301 - get the HTML content of the issue 

302 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata 

303 - crawl each article 

304 """ 

305 

306 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.

307 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute

308 issue_url = xissue.url 

309 if issue_url is not None: 

310 if issue_url.endswith(".pdf"): 

311 add_pdf_link_to_xarticle(xissue, issue_url) 

312 xissue.url = None 

313 else: 

314 content = self.download_file(issue_url) 

315 with self.tracer.start_as_current_span("parse_issue_content"): 

316 self.parse_issue_content(content, xissue) 

317 

318 xarticles = xissue.articles 

319 

320 parsed_xarticles = [] 

321 

322 for xarticle in xarticles: 

323 parsed_xarticle = self.crawl_article(xarticle, xissue) 

324 if parsed_xarticle is not None: 

325 parsed_xarticles.append(parsed_xarticle) 

326 

327 xissue.articles = parsed_xarticles 

328 

329 article_has_pdf = self.article_has_pdf(xissue) 

330 

331 if self.ignore_missing_pdf: 

332 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)] 

333 

334 if not self.dry and (len(xissue.articles) > 0 or article_has_pdf): 

335 self.process_resource_metadata(xissue, resource_type="issue") 

336 if self.exception: 

337 raise self.exception 

338 self.database_executor.submit(self.add_xissue_into_database, xissue).add_done_callback( 

339 self._issue_added_callback 

340 ) 

341 

342 def _issue_added_callback(self, future: "Future"): 

343 exception = future.exception() 

344 if exception: 

345 self.exception = exception 

346 self.database_executor.shutdown(wait=False, cancel_futures=True) 

347 

348 @staticmethod 

349 def article_has_source(art: ArticleData | IssueData): 

350 return ( 

351 next( 

352 (e_link for e_link in art.ext_links if e_link["rel"] == "source"), 

353 None, 

354 ) 

355 is not None 

356 ) 

357 

358 @staticmethod 

359 def article_has_pdf(art: ArticleData | IssueData): 

360 return ( 

361 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None) 

362 is not None 

363 ) 

364 

365 def crawl_article(self, xarticle: ArticleData, xissue: IssueData): 

366 # ARTICLE URL as an ExtLink (to display the link in the article page)

367 if xarticle.url is None: 

368 if not self.article_has_source(xarticle):  # 368 ↛ 378: condition was always true

369 if xissue.url: 

370 article_source = xissue.url 

371 else: 

372 article_source = self.collection_url 

373 ext_link = create_extlink() 

374 ext_link["rel"] = "source" 

375 ext_link["location"] = article_source 

376 ext_link["metadata"] = self.source_domain 

377 xarticle.ext_links.append(ext_link) 

378 return self.process_article_metadata(xarticle) 

379 

380 content = self.download_file(xarticle.url) 

381 xarticle.pid = f"{xissue.pid}_{xarticle.pid}" 

382 

383 try: 

384 with self.tracer.start_as_current_span("parse_article_content"): 

385 parsed_xarticle = self.parse_article_content( 

386 content, xissue, xarticle, xarticle.url 

387 ) 

388 except ValueError as e: 

389 self.logger.warning(e) 

390 self.logger.warning("Retrying in 5 mins while invalidating cache") 

391 time.sleep(5 * 60) 

392 content = self.download_file(xarticle.url, force_refresh=True) 

393 with self.tracer.start_as_current_span("parse_article_content"): 

394 parsed_xarticle = self.parse_article_content( 

395 content, xissue, xarticle, xarticle.url 

396 ) 

397 

398 if parsed_xarticle is None:  # 398 ↛ 399: condition was never true

399 return None 

400 

401 if parsed_xarticle.doi: 

402 parsed_xarticle.pid = ( 

403 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

404 ) 

405 

406 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url: 

407 ext_link = create_extlink() 

408 ext_link["rel"] = "source" 

409 ext_link["location"] = parsed_xarticle.url 

410 ext_link["metadata"] = self.source_domain 

411 parsed_xarticle.ext_links.append(ext_link) 

412 

413 # The article title may have formulas surrounded with '$' 

414 return self.process_article_metadata(parsed_xarticle) 

415 

416 def process_resource_metadata(self, xresource: ResourceData, resource_type="article"): 

417 tag = "article-title" if resource_type == "article" else "issue-title" 

418 

419 # Process title tex 

420 ckeditor_data = build_jats_data_from_html_field( 

421 xresource.title_tex, 

422 tag=tag, 

423 text_lang=xresource.lang, 

424 delimiter_inline=self.delimiter_inline_formula, 

425 delimiter_disp=self.delimiter_disp_formula, 

426 ) 

427 

428 xresource.title_html = ckeditor_data["value_html"] 

429 # xresource.title_tex = ckeditor_data["value_tex"] 

430 xresource.title_xml = ckeditor_data["value_xml"] 

431 

432 # Process trans_title tex 

433 if xresource.trans_title_tex:  # 433 ↛ 434: condition was never true

434 self.logger.warning( 

435 "Deprecation Notice : prefer using xresource.title directly instead of xresource.trans_title_tex" 

436 ) 

437 trans_title = self.create_trans_title( 

438 xresource_lang=xresource.lang, 

439 resource_type=resource_type, 

440 title_tex=xresource.trans_title_tex, 

441 lang=xresource.trans_lang, 

442 ) 

443 xresource.titles.append(trans_title) 

444 

445 abstracts_to_parse = [ 

446 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract" 

447 ] 

448 # abstract may have formulas surrounded with '$' 

449 if len(abstracts_to_parse) > 0: 

450 for xabstract in abstracts_to_parse: 

451 ckeditor_data = build_jats_data_from_html_field( 

452 xabstract["value_tex"], 

453 tag="abstract", 

454 text_lang=xabstract["lang"], 

455 resource_lang=xresource.lang, 

456 field_type="abstract", 

457 delimiter_inline=self.delimiter_inline_formula, 

458 delimiter_disp=self.delimiter_disp_formula, 

459 ) 

460 

461 xabstract["value_html"] = ckeditor_data["value_html"] 

462 # xabstract["value_tex"] = ckeditor_data["value_tex"] 

463 xabstract["value_xml"] = ckeditor_data["value_xml"] 

464 

465 return xresource 

466 

467 def process_article_metadata(self, xarticle: ArticleData): 

468 self.process_resource_metadata(xarticle) 

469 for bibitem in xarticle.bibitems: 

470 bibitem.type = "unknown" 

471 update_data_for_jats(xarticle, with_label=False) 

472 

473 return xarticle 

474 

475 def _wait_download_delay(self): 

476 delta = self.next_allowed_request - time.time() 

477 self.next_allowed_request = time.time() + self.requests_interval 

478 if delta > 0:  # 478 ↛ 479: condition was never true

479 self.logger.info(f"Waiting {int(delta)}s before making another request") 

480 time.sleep(delta) 

481 

482 def _get(self, url: str, force_refresh=False, headers={}) -> requests.Response: 

483 """ 

484 Wrapper around requests.get with delay based on the crawler class instance 

485 """ 

486 

487 kwargs = {} 

488 # self.session.cache.delete(urls=[url]) 

489 if isinstance(self.session, CachedSession): 

490 kwargs["force_refresh"] = force_refresh 

491 

492 try: 

493 response = self.session.get( 

494 url, 

495 headers={**self.headers, **headers}, 

496 timeout=self.requests_timeout, 

497 **kwargs, 

498 ) 

499 except DocumentTooLarge as e: 

500 self.logger.error(e) 

501 response = requests.get( 

502 url, headers={**self.headers, **headers}, timeout=self.requests_timeout 

503 ) 

504 

505 if not response.ok: 

506 raise requests.exceptions.HTTPError( 

507 f"Endpoint answered with code {response.status_code} : {url}", 

508 response=response, 

509 ) 

510 

511 if not getattr(response, "from_cache", False): 

512 self._wait_download_delay() 

513 return response 

514 

515 def download_file(self, url: str, force_refresh=False, headers={}): 

516 """ 

517 Downloads a page and returns its content (decoded string). 

518 This function handles retries and decoding 

519 """ 

520 attempts = 0 

521 while True: 

522 try: 

523 if attempts > 0: 

524 force_refresh = True 

525 response = self._get( 

526 url, force_refresh=force_refresh or self.force_refresh, headers=headers 

527 ) 

528 

529 content = self.decode_response(response) 

530 if content == "" or not content: 

531 raise requests.exceptions.HTTPError(response) 

532 

533 return content 

534 except ( 

535 requests.ConnectionError, 

536 requests.ConnectTimeout, 

537 requests.exceptions.HTTPError, 

538 ) as e: 

539 if attempts > 3: 

540 raise e 

541 self.logger.debug(f"Caught error : {e}", extra={"url": url}) 

542 attempts += 1 

543 # linear backoff: 15 mins, 30 mins, 45 mins, 60 mins (see the sketch after this method)

544 delay_minutes = attempts * 15 

545 self.logger.debug( 

546 f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})", 

547 extra={"url": url}, 

548 ) 

549 time.sleep(delay_minutes * 60) 
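# The retry loop above retries up to four times with a linear backoff; a minimal sketch of
# the resulting schedule, derived directly from `delay_minutes = attempts * 15` above:

backoff_minutes = [attempt * 15 for attempt in range(1, 5)]  # [15, 30, 45, 60]
# a fifth consecutive failure (attempts > 3) re-raises the exception to the caller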

550 

551 def decode_response(self, response: requests.Response, encoding: str | None = None): 

552 """Override this if the content-type headers from the sources are advertising something else than the actual content 

553 SASA needs this""" 

554 # Force the given encoding if one is provided

555 if encoding: 

556 response.encoding = encoding 

557 return response.text 

558 

559 # Attempt to get encoding using HTTP headers 

560 content_type_tag = response.headers.get("Content-Type", None) 

561 

562 if content_type_tag:  # 562 ↛ 569: condition was always true

563 charset = self.parse_content_type_charset(content_type_tag) 

564 if charset:  # 564 ↛ 565: condition was never true

565 response.encoding = charset 

566 return response.text 

567 

568 # Attempt to get encoding using HTML meta charset tag 

569 soup = BeautifulSoup(response.text, "html5lib") 

570 charset = soup.select_one("meta[charset]") 

571 if charset: 

572 htmlencoding = charset.get("charset") 

573 if isinstance(htmlencoding, str):  # 573 ↛ 578: condition was always true

574 response.encoding = htmlencoding 

575 return response.text 

576 

577 # Attempt to get encoding using HTML meta content type tag 

578 content_type_tag = soup.select_one( 

579 'meta[http-equiv="Content-Type"],meta[http-equiv="content-type"]' 

580 ) 

581 if content_type_tag: 

582 content_type = content_type_tag.get("content") 

583 if isinstance(content_type, str):  # 583 ↛ 589: condition was always true

584 charset = self.parse_content_type_charset(content_type) 

585 if charset:  # 585 ↛ 589: condition was always true

586 response.encoding = charset 

587 return response.text 

588 

589 return response.text 

590 

591 @staticmethod 

592 def parse_content_type_charset(content_type: str): 

593 header = EmailPolicy.header_factory("content-type", content_type) 

594 if "charset" in header.params: 

595 return header.params.get("charset") 
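# Example of the charset extraction above (header value illustrative):

charset = BaseCollectionCrawler.parse_content_type_charset("text/html; charset=utf-8")
# charset == "utf-8"; a Content-Type without a charset parameter yields None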

596 

597 @tracer.start_as_current_span("add_xissue_to_database") 

598 def add_xissue_into_database(self, xissue: IssueData): 

599 xissue.journal = self.collection 

600 xissue.source = self.source_domain 

601 

602 if xissue.year == "": 

603 raise ValueError("Failsafe : Cannot insert issue without a year") 

604 

605 xpub = create_publisherdata() 

606 xpub.name = self.publisher 

607 xissue.publisher = xpub 

608 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat() 

609 

610 attempt = 1 

611 success = False 

612 

613 while not success and attempt < 4: 

614 try: 

615 params = {"xissue": xissue, "use_body": False} 

616 cmd = addOrUpdateGDMLIssueXmlCmd(params) 

617 cmd.do() 

618 success = True 

619 self.logger.debug(f"Issue {xissue.pid} inserted in database") 

620 except SolrError: 

621 self.logger.warning( 

622 f"Encoutered SolrError while inserting issue {xissue.pid} in database" 

623 ) 

624 attempt += 1 

625 self.logger.debug(f"Attempt {attempt}. sleeping 10 seconds.") 

626 time.sleep(10) 

627 except Exception as e: 

628 self.logger.error( 

629 f"Got exception while attempting to insert {xissue.pid} in database : {e}" 

630 ) 

631 raise e 

632 

633 if success is False: 

634 raise ConnectionRefusedError("Cannot connect to SolR") 

635 

636 def get_metadata_using_citation_meta( 

637 self, 

638 xarticle: ArticleData, 

639 xissue: IssueData, 

640 soup: BeautifulSoup, 

641 what: list[CitationLiteral] = [], 

642 ): 

643 """ 

644 :param xarticle: the xarticle that will collect the metadata 

645 :param xissue: the xissue that will collect the publisher 

646 :param soup: the BeautifulSoup object of the article page

647 :param what: list of citation_* meta items to collect (a usage sketch follows this method).

648 :return: None. The given article is modified 

649 """ 

650 

651 if "title" in what: 

652 # TITLE 

653 citation_title_node = soup.select_one("meta[name='citation_title']") 

654 if citation_title_node:  # 654 ↛ 659: condition was always true

655 title = citation_title_node.get("content") 

656 if isinstance(title, str):  # 656 ↛ 659: condition was always true

657 xarticle.title_tex = title 

658 

659 if "author" in what: 659 ↛ 688line 659 didn't jump to line 688 because the condition on line 659 was always true

660 # AUTHORS 

661 citation_author_nodes = soup.select("meta[name^='citation_author']") 

662 current_author: ContributorDict | None = None 

663 for citation_author_node in citation_author_nodes: 

664 if citation_author_node.get("name") == "citation_author": 

665 text_author = citation_author_node.get("content") 

666 if not isinstance(text_author, str):  # 666 ↛ 667: condition was never true

667 raise ValueError("Cannot parse author") 

668 if text_author == "":  # 668 ↛ 669: condition was never true

669 current_author = None 

670 continue 

671 current_author = create_contributor(role="author", string_name=text_author) 

672 xarticle.contributors.append(current_author) 

673 continue 

674 if current_author is None:  # 674 ↛ 675: condition was never true

675 self.logger.warning("Couldn't parse citation author") 

676 continue 

677 if citation_author_node.get("name") == "citation_author_institution": 

678 text_institution = citation_author_node.get("content") 

679 if not isinstance(text_institution, str):  # 679 ↛ 680: condition was never true

680 continue 

681 current_author["addresses"].append(text_institution) 

682 if citation_author_node.get("name") == "citation_author_ocrid":  # 682 ↛ 683: condition was never true

683 text_orcid = citation_author_node.get("content") 

684 if not isinstance(text_orcid, str): 

685 continue 

686 current_author["orcid"] = text_orcid 

687 

688 if "pdf" in what: 

689 # PDF 

690 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]') 

691 if citation_pdf_node: 

692 pdf_url = citation_pdf_node.get("content") 

693 if isinstance(pdf_url, str):  # 693 ↛ 696: condition was always true

694 add_pdf_link_to_xarticle(xarticle, pdf_url) 

695 

696 if "lang" in what: 

697 # LANG 

698 citation_lang_node = soup.select_one("meta[name='citation_language']") 

699 if citation_lang_node:  # 699 ↛ 705: condition was always true

700 # TODO: check other language code 

701 content_text = citation_lang_node.get("content") 

702 if isinstance(content_text, str):  # 702 ↛ 705: condition was always true

703 xarticle.lang = standardize_tag(content_text) 

704 

705 if "abstract" in what: 

706 # ABSTRACT 

707 abstract_node = soup.select_one("meta[name='citation_abstract']") 

708 if abstract_node is not None: 

709 abstract = abstract_node.get("content") 

710 if not isinstance(abstract, str):  # 710 ↛ 711: condition was never true

711 raise ValueError("Couldn't parse abstract from meta") 

712 abstract = BeautifulSoup(abstract, "html.parser").text 

713 lang = abstract_node.get("lang") 

714 if not isinstance(lang, str): 

715 lang = self.detect_language(abstract, xarticle) 

716 xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract)) 

717 

718 if "page" in what: 

719 # PAGES 

720 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']") 

721 if citation_fpage_node: 

722 page = citation_fpage_node.get("content") 

723 if isinstance(page, str):  # 723 ↛ 728: condition was always true

724 page = page.split("(")[0] 

725 if len(page) < 32:  # 725 ↛ 728: condition was always true

726 xarticle.fpage = page 

727 

728 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']") 

729 if citation_lpage_node: 

730 page = citation_lpage_node.get("content") 

731 if isinstance(page, str):  # 731 ↛ 736: condition was always true

732 page = page.split("(")[0] 

733 if len(page) < 32:  # 733 ↛ 736: condition was always true

734 xarticle.lpage = page 

735 

736 if "doi" in what: 

737 # DOI 

738 citation_doi_node = soup.select_one("meta[name='citation_doi']") 

739 if citation_doi_node: 

740 doi = citation_doi_node.get("content") 

741 if isinstance(doi, str):  # 741 ↛ 748: condition was always true

742 doi = doi.strip() 

743 pos = doi.find("10.") 

744 if pos > 0: 

745 doi = doi[pos:] 

746 xarticle.doi = doi 

747 

748 if "mr" in what: 

749 # MR 

750 citation_mr_node = soup.select_one("meta[name='citation_mr']") 

751 if citation_mr_node: 

752 mr = citation_mr_node.get("content") 

753 if isinstance(mr, str):  # 753 ↛ 759: condition was always true

754 mr = mr.strip() 

755 if mr.find("MR") == 0: 755 ↛ 759line 755 didn't jump to line 759 because the condition on line 755 was always true

756 mr = mr[2:] 

757 xarticle.extids.append(("mr-item-id", mr)) 

758 

759 if "zbl" in what: 

760 # ZBL 

761 citation_zbl_node = soup.select_one("meta[name='citation_zbl']") 

762 if citation_zbl_node: 

763 zbl = citation_zbl_node.get("content") 

764 if isinstance(zbl, str):  # 764 ↛ 770: condition was always true

765 zbl = zbl.strip() 

766 if zbl.find("Zbl") == 0: 766 ↛ 770line 766 didn't jump to line 770 because the condition on line 766 was always true

767 zbl = zbl[3:].strip() 

768 xarticle.extids.append(("zbl-item-id", zbl)) 

769 

770 if "publisher" in what: 

771 # PUBLISHER 

772 citation_publisher_node = soup.select_one("meta[name='citation_publisher']") 

773 if citation_publisher_node: 

774 pub = citation_publisher_node.get("content") 

775 if isinstance(pub, str):  # 775 ↛ 782: condition was always true

776 pub = pub.strip() 

777 if pub != "": 777 ↛ 782line 777 didn't jump to line 782 because the condition on line 777 was always true

778 xpub = create_publisherdata() 

779 xpub.name = pub 

780 xissue.publisher = xpub 

781 

782 if "keywords" in what: 

783 # KEYWORDS 

784 citation_kwd_nodes = soup.select("meta[name='citation_keywords']") 

785 for kwd_node in citation_kwd_nodes: 

786 kwds = kwd_node.get("content") 

787 if isinstance(kwds, str):  # 787 ↛ 785: condition was always true

788 kwds = kwds.split(",") 

789 for kwd in kwds: 

790 if kwd == "": 

791 continue 

792 kwd = kwd.strip() 

793 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd}) 

794 

795 if "references" in what: 

796 citation_references = soup.select("meta[name='citation_reference']") 

797 for index, tag in enumerate(citation_references): 

798 content = tag.get("content") 

799 if not isinstance(content, str):  # 799 ↛ 800: condition was never true

800 raise ValueError("Cannot parse citation_reference meta") 

801 label = str(index + 1) 

802 if regex.match(r"^\[\d+\].*", content): 802 ↛ 803line 802 didn't jump to line 803 because the condition on line 802 was never true

803 label = None 

804 xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label)) 
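# A sketch of how a concrete crawler typically calls the helper above from its
# parse_article_content override (the selection of `what` keys depends on which
# citation_* meta tags the source actually emits):

def parse_article_content(self, content, xissue, xarticle, url):
    soup = BeautifulSoup(content, "html5lib")
    self.get_metadata_using_citation_meta(
        xarticle,
        xissue,
        soup,
        what=["title", "author", "pdf", "lang", "abstract", "page", "doi", "keywords"],
    )
    return xarticle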

805 

806 def create_xissue( 

807 self, 

808 url: str | None, 

809 year: str, 

810 volume_number: str | None, 

811 issue_number: str | None = None, 

812 vseries: str | None = None, 

813 ): 

814 if url is not None and url.endswith("/"): 

815 url = url[:-1] 

816 xissue = create_issuedata() 

817 xissue.url = url 

818 

819 xissue.pid = self.get_issue_pid( 

820 self.collection_id, year, volume_number, issue_number, vseries 

821 ) 

822 

823 xissue.year = year 

824 

825 if volume_number is not None: 

826 xissue.volume = regex.sub(r"[^\w-]+", "_", volume_number) 

827 

828 if issue_number is not None: 

829 xissue.number = issue_number.replace(",", "-") 

830 

831 if vseries is not None:  # 831 ↛ 832: condition was never true

832 xissue.vseries = vseries 

833 return xissue 

834 

835 def detect_language(self, text: str, article: ArticleData | None = None): 

836 if article and article.lang is not None and article.lang != "und": 

837 return article.lang 

838 

839 language = self.language_detector.detect_language_of(text) 

840 

841 if not language:  # 841 ↛ 842: condition was never true

842 return "und" 

843 return language.iso_code_639_1.name.lower() 
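# Example (text and result illustrative): when the article language is unset or "und",
# the lingua detector is used and the ISO 639-1 code is returned in lowercase.
# `crawler` stands for any BaseCollectionCrawler instance.

lang = crawler.detect_language("This paper studies semisimple Lie groups.")  # typically "en"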

844 

845 def create_trans_title( 

846 self, 

847 resource_type: str, 

848 title_tex: str, 

849 lang: str, 

850 xresource_lang: str, 

851 title_type: str = "main", 

852 ): 

853 tag = "trans-title" if resource_type == "article" else "issue-title" 

854 

855 ckeditor_data = build_jats_data_from_html_field( 

856 title_tex, 

857 tag=tag, 

858 text_lang=lang, 

859 resource_lang=xresource_lang, 

860 delimiter_inline=self.delimiter_inline_formula, 

861 delimiter_disp=self.delimiter_disp_formula, 

862 ) 

863 

864 titledata = create_titledata( 

865 lang=lang, 

866 type="main", 

867 title_html=ckeditor_data["value_html"], 

868 title_xml=ckeditor_data["value_xml"], 

869 ) 

870 

871 return titledata 

872 

873 references_mapping = { 

874 "citation_title": get_article_title_xml, 

875 "citation_journal_title": get_source_xml, 

876 "citation_publication_date": get_year_xml, 

877 "citation_firstpage": get_fpage_xml, 

878 "citation_lastpage": get_lpage_xml, 

879 } 

880 

881 @classmethod 

882 def __parse_meta_citation_reference(cls, content: str, label=None): 

883 categories = content.split(";") 

884 

885 if len(categories) == 1: 

886 return JatsBase.bake_ref(content, label=label) 

887 

888 citation_data = [c.split("=") for c in categories if "=" in c] 

889 del categories 

890 

891 xml_string = "" 

892 authors_parsed = False 

893 authors_strings = [] 

894 for data in citation_data: 

895 key = data[0].strip() 

896 citation_content = data[1] 

897 if key == "citation_author": 

898 authors_strings.append(get_author_xml(template_str=citation_content)) 

899 continue 

900 elif not authors_parsed: 

901 xml_string += ", ".join(authors_strings) 

902 authors_parsed = True 

903 

904 if key in cls.references_mapping: 

905 xml_string += " " + cls.references_mapping[key](citation_content) 

906 

907 return JatsBase.bake_ref(xml_string, label=label) 
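# The citation_reference meta values handled above look roughly like the string below
# (illustrative); keys are mapped through references_mapping into JATS reference fragments,
# and the leading spaces after each ";" explain the .strip() on the key:

content = "citation_author=J. Doe; citation_title=On widgets; citation_firstpage=1"  # illustrative
pairs = [c.split("=") for c in content.split(";") if "=" in c]
# -> [["citation_author", "J. Doe"], [" citation_title", "On widgets"], [" citation_firstpage", "1"]]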

908 

909 @classmethod 

910 def get_or_create_source(cls): 

911 source, created = Source.objects.get_or_create( 

912 domain=cls.source_domain, 

913 defaults={ 

914 "name": cls.source_name, 

915 "website": cls.source_website, 

916 "view_id": cls.get_view_id(), 

917 }, 

918 ) 

919 if created:  # 919 ↛ 920: condition was never true

920 source.save() 

921 return source 

922 

923 @staticmethod 

924 def get_issue_pid( 

925 collection_id: str, 

926 year: str, 

927 volume_number: str | None = None, 

928 issue_number: str | None = None, 

929 series: str | None = None, 

930 ): 

931 # Replace any non-word character with an underscore 

932 pid = f"{collection_id}_{year}" 

933 if series is not None:  # 933 ↛ 934: condition was never true

934 pid += f"_{series}" 

935 if volume_number is not None: 

936 pid += f"_{volume_number}" 

937 if issue_number is not None: 

938 pid += f"_{issue_number}" 

939 pid = regex.sub(r"[^\w-]+", "_", cleanup_str(pid)) 

940 return pid 
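# Examples of the resulting issue PID scheme (values illustrative, assuming cleanup_str
# leaves plain ASCII untouched):

BaseCollectionCrawler.get_issue_pid("CM", "1999-2000", "6")      # -> "CM_1999-2000_6"
BaseCollectionCrawler.get_issue_pid("CM", "2001", "12", "3")     # -> "CM_2001_12_3"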

941 

942 @staticmethod 

943 def set_pages(article: ArticleData, pages: str, separator: str = "-"): 

944 pages_split = pages.split(separator) 

945 if len(pages_split) == 0:  # 945 ↛ 946: condition was never true

946 article.page_range = pages 

947 if len(pages_split) > 0:  # 947 ↛ exit: condition was always true

948 if pages[0].isnumeric():  # 948 ↛ exit: condition was always true

949 article.fpage = pages_split[0] 

950 if (  # 950 ↛ 955: condition was never true

951 len(pages_split) > 1 

952 and pages_split[0] != pages_split[1] 

953 and pages_split[1].isnumeric() 

954 ): 

955 article.lpage = pages_split[1] 
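# Example of the page-range parsing above, assuming create_articledata exists in
# ptf.model_data (it is not imported in this module):

from ptf.model_data import create_articledata

art = create_articledata()
BaseCollectionCrawler.set_pages(art, "101-125")
# art.fpage == "101", art.lpage == "125"; a single page like "17" only sets fpage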

956 

957 @classmethod 

958 def check_pdf_link_validity( 

959 cls, url: str, verify=True, session=requests.Session() 

960 ) -> "tuple[bool, Response, dict[str, Any]]": 

961 # Avoid downloading the whole PDF 

962 CHUNK_SIZE = 10  # number of bytes fetched

963 header = { 

964 "Range": f"bytes=0-{CHUNK_SIZE}", 

965 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0", 

966 } 

967 with session.get( 

968 url, stream=True, allow_redirects=True, headers=header, verify=verify 

969 ) as response: 

970 content_type = response.headers.get("Content-Type") 

971 if not content_type or "application/pdf" not in content_type:  # 971 ↛ 973: condition was never true

972 # Content type is wrong, let's check the PDF header

973 try: 

974 pdf_header = next(response.iter_lines(chunk_size=CHUNK_SIZE)) 

975 if regex.match(r"^%PDF-\d\.\d", pdf_header.decode()) is None: 

976 return ( 

977 False, 

978 response, 

979 { 

980 "status": ExtlinkChecked.Status.ERROR, 

981 "message": f"Content-Type header: {content_type}; PDF Header not found : got {pdf_header}", 

982 }, 

983 ) 

984 else: 

985 return ( 

986 True, 

987 response, 

988 { 

989 "status": ExtlinkChecked.Status.WARNING, 

990 "message": f"Content-Type header: {content_type}", 

991 }, 

992 ) 

993 except StopIteration: 

994 return ( 

995 False, 

996 response, 

997 { 

998 "status": ExtlinkChecked.Status.ERROR, 

999 "message": f"Content-Type header: {content_type}.", 

1000 }, 

1001 ) 

1002 try: 

1003 pdf_header = next(response.iter_lines(chunk_size=CHUNK_SIZE)) 

1004 if regex.match(r"^%PDF-\d\.\d", pdf_header.decode()) is None: 1004 ↛ 1005line 1004 didn't jump to line 1005 because the condition on line 1004 was never true

1005 return ( 

1006 False, 

1007 response, 

1008 { 

1009 "status": ExtlinkChecked.Status.ERROR, 

1010 "message": "PDF Header not found : got {pdf_header}", 

1011 }, 

1012 ) 

1013 except StopIteration: 

1014 return ( 

1015 False, 

1016 response, 

1017 { 

1018 "status": ExtlinkChecked.Status.ERROR, 

1019 "message": f"Content-Type header: {content_type}.", 

1020 }, 

1021 ) 

1022 

1023 # if response.status_code not in (200, 206): 

1024 # raise ValueError("Invalid status code") 

1025 

1026 return ( 

1027 True, 

1028 response, 

1029 { 

1030 "status": ExtlinkChecked.Status.OK, 

1031 "message": "", 

1032 }, 

1033 ) 
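# Example use of the check above (URL illustrative); the third element of the returned
# tuple feeds the ExtlinkChecked defaults in check_extlink_validity below:

ok, response, details = BaseCollectionCrawler.check_pdf_link_validity(
    "https://example.org/article.pdf"
)
if not ok:
    print(details["status"], details["message"])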

1034 

1035 @classmethod 

1036 def check_extlink_validity(cls, extlink: "ExtLink"): 

1037 """ 

1038 Method used by rot_monitoring to check if links have expired 

1039 """ 

1040 defaults: dict = {"date": time.time(), "status": ExtlinkChecked.Status.OK} 

1041 # CHUNK_SIZE = 100 

1042 header = { 

1043 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0" 

1044 } 

1045 verify = True 

1046 if not cls.verify: 

1047 verify = False 

1048 if extlink.rel == "article-pdf": 

1049 isok, response, message = cls.check_pdf_link_validity(extlink.location, verify) 

1050 defaults.update(message) 

1051 else: 

1052 # check the article page 

1053 response = requests.get( 

1054 url=extlink.location, 

1055 headers=header, 

1056 stream=False, 

1057 allow_redirects=True, 

1058 verify=verify, 

1059 ) 

1060 

1061 defaults["http_status"] = response.status_code 

1062 

1063 if response.status_code not in (200, 206): 

1064 defaults["status"] = ExtlinkChecked.Status.ERROR 

1065 

1066 ExtlinkChecked.objects.update_or_create(extlink=extlink, defaults=defaults)