Coverage for src/crawler/base_crawler.py: 66%

575 statements  

coverage.py v7.12.0, created at 2026-02-02 15:55 +0000

1import logging 

2import time 

3from datetime import datetime, timedelta 

4from email.policy import EmailPolicy 

5from typing import TYPE_CHECKING, Iterable, Literal 

6 

7import aiohttp 

8import regex 

9import requests 

10from bs4 import BeautifulSoup 

11from django.conf import settings 

12from django.contrib.auth.models import User 

13from django.utils import timezone 

14from langcodes import standardize_tag 

15from lingua import LanguageDetector, LanguageDetectorBuilder 

16from opentelemetry import trace 

17from ptf.cmds.xml.ckeditor.utils import ( 

18 build_jats_data_from_html_field, 

19) 

20from ptf.cmds.xml.jats.builder.references import ( 

21 get_article_title_xml, 

22 get_author_xml, 

23 get_fpage_xml, 

24 get_lpage_xml, 

25 get_source_xml, 

26 get_year_xml, 

27) 

28from ptf.cmds.xml.jats.jats_parser import JatsBase 

29from ptf.model_data import ( 

30 ArticleData, 

31 ContributorDict, 

32 IssueData, 

33 ResourceData, 

34 TitleDict, 

35 create_abstract, 

36 create_contributor, 

37 create_extlink, 

38 create_issuedata, 

39 create_publisherdata, 

40 create_subj, 

41 create_titledata, 

42) 

43from ptf.model_data_converter import update_data_for_jats 

44from ptf.models import ExtLink 

45from pylatexenc.latex2text import LatexNodes2Text 

46from pysolr import SolrError 

47from requests.adapters import HTTPAdapter 

48from requests_cache import CachedSession 

49from urllib3 import Retry 

50 

51from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd 

52from crawler.models import Source 

53from crawler.models.extlink_checked import ExtlinkChecked 

54from crawler.types import CitationLiteral 

55from crawler.utils import ( 

56 add_pdf_link_to_xarticle, 

57 cleanup_str, 

58 get_all_cols, 

59 get_or_create_collection, 

60 get_session, 

61) 

62 

63if TYPE_CHECKING: 

64 from bs4 import Tag 

65 

66 

67class CrawlerTitleDict(TitleDict): 

68 title_tex: str | None 

69 

70 

71class BaseCollectionCrawler: 

72 """ 

73 Base class for the collection crawlers. 

74 To create a crawler: 

75 1) derive a class from BaseCollectionCrawler and name it XXXCrawler 

76 2) override the functions parse_collection_content, parse_issue_content and parse_article_content 

77 3) update factory.py so that crawler_factory can return your new crawler 

78 """ 

79 

80 logger = logging.getLogger(__name__) 

81 tracer = trace.get_tracer(__name__) 

82 

83 source_name = "" 

84 source_domain = "" 

85 source_website = "" 

86 

87 issue_href = "" 

88 

89 collection = None 

90 source = None 

91 user = None 

92 session: requests.Session | CachedSession 

93 async_session: aiohttp.ClientSession 

94 is_checkable = True 

95 verify = True 

96 headers = { 

97 "accept_encoding": "utf-8", 

98 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"), 

99 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"), 

100 } 

101 

102 # seconds to wait between two http requests 

103 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90) 

104 # seconds to wait before aborting the connection (if no bytes are received) 

105 requests_timeout = 60 

106 

107 latext_parser = LatexNodes2Text() 

108 

109 # Override the values in your concrete crawler if the formulas in text (titles, abstracts) 

110 # do not use the "$" to surround tex formulas 

111 delimiter_inline_formula = "$" 

112 delimiter_disp_formula = "$" 

113 

114 # HACK : Workaround for tests (monkeypatching) 

115 # We store the class here, so we can monkeypatch it when running tests 

116 # subCrawlers = { 

117 # LofplCrawler: None 

118 # } 

119 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {} 

120 

121 _language_detector: LanguageDetector | None = None 

122 _language_detector_builder = LanguageDetectorBuilder.from_all_languages() 

123 

124 force_refresh = False 

125 

126 # Whether to include headers in the requests cache key 

127 match_headers = False 

128 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})" 

129 

130 # Set this to False on a Crawler-basis to allow inserting articles without PDFs 

131 ignore_missing_pdf = True 

132 

133 @classmethod 

134 def get_view_id(cls): 

135 return cls.source_domain 

136 

137 @property 

138 def language_detector(self): 

139 """Crawler Instance singleton for language builder. 

140 Late init of LanguageDetector to save on memory""" 

141 if not self._language_detector: 

142 self._language_detector = self._language_detector_builder.build() 

143 return self._language_detector 

144 

145 def __init__( 

146 self, 

147 *args, 

148 username: str, 

149 collection_id: str, 

150 dry: bool = False, 

151 publisher: str = "", 

152 force_refresh=False, 

153 collection_url: str | None = None, 

154 ): 

155 if not collection_url: 155 ↛ 156: line 155 didn't jump to line 156 because the condition on line 155 was never true

156 all_cols = get_all_cols() 

157 col = all_cols[collection_id] 

158 

159 collection_url = col["sources"].get(self.source_domain, None) 

160 if collection_url is None: 

161 raise ValueError( 

162 f"Source {self.source_domain} not found for collection {collection_id}" 

163 ) 

164 self.collection_url = collection_url 

165 for CrawlerClass in self.subCrawlers: 165 ↛ 166: line 165 didn't jump to line 166 because the loop on line 165 never started

166 self.subCrawlers[CrawlerClass] = CrawlerClass( 

167 *args, 

168 username=username, 

169 collection_id=collection_id, 

170 dry=dry, 

171 publisher=publisher, 

172 collection_url=collection_url, 

173 ) 

174 self.logger = logging.getLogger(__name__ + "." + self.source_domain) 

175 # self.logger = logging.getLogger(__name__) 

176 

177 self.username = username 

178 

179 self.collection_id = collection_id 

180 

181 self.dry = dry 

182 self.publisher = publisher 

183 

184 # Classproperty : We sometimes want to use the session without initializing the class (rot monitoring) 

185 BaseCollectionCrawler.session: requests.Session 

186 

187 # Skipped when running tests 

188 self.initialize() 

189 

190 self.force_refresh = force_refresh 

191 

192 # We implemented custom retry behaviour, so we don't want to make extra requests here 

193 

194 def initialize(self): 

195 """ 

196 Acts as a "second" init function to skip model accesses during test data generation 

197 """ 

198 self.collection = get_or_create_collection(self.collection_id) 

199 self.source = self.get_or_create_source() 

200 self.user = User.objects.get(username=self.username) 

201 BaseCollectionCrawler.session = get_session() 

202 BaseCollectionCrawler.session.verify = self.verify 

203 self.session.delay = self.requests_interval 

204 retries = Retry( 

205 total=0, 

206 ) 

207 self.session.mount("https://", HTTPAdapter(max_retries=retries)) 

208 self.session.mount("http://", HTTPAdapter(max_retries=retries)) 

209 

210 @classmethod 

211 def can_crawl(cls, pid: str) -> bool: 

212 return True 

213 

214 def parse_collection_content(self, content: str) -> list[IssueData]: 

215 """ 

216 Parse the HTML content with BeautifulSoup 

217 returns a list of xissues. 

218 Override this function in a derived class 

219 """ 

220 return [] 

221 

222 def parse_issue_content(self, content: str, xissue: IssueData): 

223 """ 

224 Parse the HTML content with BeautifulSoup 

225 Fills the xissue.articles 

226 Override this function in a derived class. 

227 

228 Caveat: you are supposed to create the articles here. Please assign a PID to each article. 

229 The PID can be `a + article_index`, e.g. `a0`, `a21` 

230 """ 

231 
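A sketch of a typical override, with made-up selectors; create_articledata is assumed to be available from ptf.model_data (it is not imported in this file):

    soup = BeautifulSoup(content, "html.parser")
    for index, link in enumerate(soup.select("a.article")):
        xarticle = create_articledata()  # assumed ptf.model_data factory
        xarticle.pid = f"a{index}"       # a0, a1, a2, ...
        xarticle.url = link["href"]
        xissue.articles.append(xarticle)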

232 def parse_article_content( 

233 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str 

234 ) -> ArticleData | None: 

235 """ 

236 Parse the HTML content with BeautifulSoup 

237 returns the xarticle. 

238 Override this function in a derived class. 

239 The xissue is passed to the function in case the article page has issue information (ex: publisher) 

240 The article url is also passed as a parameter 

241 

242 Caveat: you are supposed to assign the article PIDs again here 

243 """ 

244 return xarticle 

245 

246 @tracer.start_as_current_span("crawl_collection") 

247 def crawl_collection(self): 

248 # TODO: Comments, filter 

249 """ 

250 Crawl an entire collection. ptf.models.Container objects are created. 

251 - get the HTML content of the collection_url 

252 - parse the HTML content with beautifulsoup to extract the list of issues 

253 - merge the xissues (some Source can have multiple pages for 1 volume/issue. We create only 1 container) 

254 - crawl each issue if col_only is False 

255 - Returns the list of merged issues. 

256 It is a dict {pid: xissue} 

257 The key is the pid of the merged issues. 

258 Ex: The source may have Volume 6 (2000) and Volume 6 (1999); 

259 the pid is then made with 1999-2000__6_ 

260 """ 

261 

262 if self.source is None: 

263 raise RuntimeError("ERROR: the source is not set") 

264 

265 content = self.download_file(self.collection_url) 

266 if content: 

267 xissues = self.parse_collection_content(content) 

268 else: 

269 # download_file returns None (404) 

270 return None 

271 

272 """ 

273 Some collections split the same volumes in different pages 

274 Ex: Volume 6 (2000) and Volume 6 (1999) 

275 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000) 

276 """ 

277 # merged_xissues = self.merge_xissues(xissues) 

278 

279 xissues_dict = {str(i.pid): i for i in xissues} 

280 

281 return xissues_dict 

282 

283 @tracer.start_as_current_span("crawl_issue") 

284 def crawl_issue(self, xissue: IssueData): 

285 """ 

286 Crawl one web page of an issue. 

287 - get the HTML content of the issue 

288 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata 

289 - crawl each article 

290 """ 

291 

292 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents. 

293 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute 

294 issue_url = xissue.url 

295 if issue_url is not None: 

296 if issue_url.endswith(".pdf"): 

297 add_pdf_link_to_xarticle(xissue, issue_url) 

298 xissue.url = None 

299 else: 

300 content = self.download_file(issue_url) 

301 with self.tracer.start_as_current_span("parse_issue_content"): 

302 self.parse_issue_content(content, xissue) 

303 

304 xarticles = xissue.articles 

305 

306 parsed_xarticles = [] 

307 

308 for xarticle in xarticles: 

309 parsed_xarticle = self.crawl_article(xarticle, xissue) 

310 if parsed_xarticle is not None: 

311 parsed_xarticles.append(parsed_xarticle) 

312 

313 xissue.articles = parsed_xarticles 

314 

315 issue_has_pdf = self.article_has_pdf(xissue) 

316 

317 if self.ignore_missing_pdf: 

318 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)] 

319 if self.dry: 

320 return 

321 if len(xissue.articles) == 0 and not issue_has_pdf: 

322 return 

323 self.process_resource_metadata(xissue, resource_type="issue") 

324 

325 self.add_xissue_into_database(xissue) 

326 

327 @staticmethod 

328 def article_has_source(art: ArticleData | IssueData): 

329 return ( 

330 next( 

331 (e_link for e_link in art.ext_links if e_link["rel"] == "source"), 

332 None, 

333 ) 

334 is not None 

335 ) 

336 

337 @staticmethod 

338 def article_has_pdf(art: ArticleData | IssueData): 

339 return ( 

340 next( 

341 (link for link in art.ext_links if link["rel"] in ["article-pdf", "article-html"]), 

342 None, 

343 ) 

344 is not None 

345 ) 

346 

347 def crawl_article(self, xarticle: ArticleData, xissue: IssueData): 

348 # ARTICLE URL as an ExtLink (to display the link in the article page) 

349 if xarticle.url is None: 

350 if not self.article_has_source(xarticle): 350 ↛ 360: line 350 didn't jump to line 360 because the condition on line 350 was always true

351 if xissue.url: 

352 article_source = xissue.url 

353 else: 

354 article_source = self.collection_url 

355 ext_link = create_extlink() 

356 ext_link["rel"] = "source" 

357 ext_link["location"] = article_source 

358 ext_link["metadata"] = self.source_domain 

359 xarticle.ext_links.append(ext_link) 

360 return self.process_article_metadata(xarticle) 

361 

362 content = self.download_file(xarticle.url) 

363 xarticle.pid = f"{xissue.pid}_{xarticle.pid}" 

364 

365 try: 

366 with self.tracer.start_as_current_span("parse_article_content"): 

367 parsed_xarticle = self.parse_article_content( 

368 content, xissue, xarticle, xarticle.url 

369 ) 

370 except ValueError as e: 

371 self.logger.warning(e) 

372 self.logger.warning("Retrying in 5 mins while invalidating cache") 

373 time.sleep(5 * 60) 

374 content = self.download_file(xarticle.url, force_refresh=True) 

375 with self.tracer.start_as_current_span("parse_article_content"): 

376 parsed_xarticle = self.parse_article_content( 

377 content, xissue, xarticle, xarticle.url 

378 ) 

379 

380 if parsed_xarticle is None: 380 ↛ 381: line 380 didn't jump to line 381 because the condition on line 380 was never true

381 return None 

382 

383 if parsed_xarticle.doi: 

384 parsed_xarticle.pid = ( 

385 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

386 ) 

387 

388 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url: 

389 ext_link = create_extlink() 

390 ext_link["rel"] = "source" 

391 ext_link["location"] = parsed_xarticle.url 

392 ext_link["metadata"] = self.source_domain 

393 parsed_xarticle.ext_links.append(ext_link) 

394 

395 # The article title may have formulas surrounded with '$' 

396 return self.process_article_metadata(parsed_xarticle) 

397 

398 def process_resource_metadata(self, xresource: ResourceData, resource_type="article"): 

399 tag = "article-title" if resource_type == "article" else "issue-title" 

400 

401 # Process title tex 

402 ckeditor_data = build_jats_data_from_html_field( 

403 xresource.title_tex, 

404 tag=tag, 

405 text_lang=xresource.lang, 

406 delimiter_inline=self.delimiter_inline_formula, 

407 delimiter_disp=self.delimiter_disp_formula, 

408 ) 

409 

410 xresource.title_html = ckeditor_data["value_html"] 

411 # xresource.title_tex = ckeditor_data["value_tex"] 

412 xresource.title_xml = ckeditor_data["value_xml"] 

413 

414 # Process trans_title tex 

415 if xresource.trans_title_tex: 415 ↛ 416: line 415 didn't jump to line 416 because the condition on line 415 was never true

416 self.logger.warning( 

417 "Deprecation Notice : prefer using xresource.title directly instead of xresource.trans_title_tex" 

418 ) 

419 trans_title = self.create_trans_title( 

420 xresource_lang=xresource.lang, 

421 resource_type=resource_type, 

422 title_tex=xresource.trans_title_tex, 

423 lang=xresource.trans_lang, 

424 ) 

425 xresource.titles.append(trans_title) 

426 

427 abstracts_to_parse = [ 

428 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract" 

429 ] 

430 # abstract may have formulas surrounded with '$' 

431 if len(abstracts_to_parse) > 0: 

432 for xabstract in abstracts_to_parse: 

433 ckeditor_data = build_jats_data_from_html_field( 

434 xabstract["value_tex"], 

435 tag="abstract", 

436 text_lang=xabstract["lang"], 

437 resource_lang=xresource.lang, 

438 field_type="abstract", 

439 delimiter_inline=self.delimiter_inline_formula, 

440 delimiter_disp=self.delimiter_disp_formula, 

441 ) 

442 

443 xabstract["value_html"] = ckeditor_data["value_html"] 

444 # xabstract["value_tex"] = ckeditor_data["value_tex"] 

445 xabstract["value_xml"] = ckeditor_data["value_xml"] 

446 

447 return xresource 

448 

449 def process_article_metadata(self, xarticle: ArticleData): 

450 self.process_resource_metadata(xarticle) 

451 for bibitem in xarticle.bibitems: 

452 bibitem.type = "unknown" 

453 update_data_for_jats(xarticle, with_label=False) 

454 

455 return xarticle 

456 

457 def download_file(self, url: str, force_refresh=False, headers={}): 

458 """ 

459 Downloads a page and returns its content (decoded string). 

460 This function handles retries and decoding 

461 """ 

462 current_exception: Exception | None = None 

463 for attempt in range(3): 

464 try: 

465 kwargs = { 

466 "url": url, 

467 "headers": {**self.headers, **headers}, 

468 "timeout": self.requests_timeout, 

469 } 

470 if attempt > 0 and isinstance(self.session, CachedSession): 

471 kwargs["force_refresh"] = True 

472 response = self.session.get(**kwargs) 

473 

474 content = self.decode_response(response) 

475 if content == "" or not content: 

476 raise requests.exceptions.HTTPError(response) 

477 

478 return content 

479 except ( 

480 requests.ConnectionError, 

481 requests.ConnectTimeout, 

482 requests.exceptions.HTTPError, 

483 ) as e: 

484 current_exception = e 

485 self.logger.debug(f"Caught error : {e}", extra={"url": url}) 

486 # 0 min, 15 mins, 30 mins (delay = attempt * 15) 

487 delay_minutes = attempt * 15 

488 self.logger.debug( 

489 f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})", 

490 extra={"url": url}, 

491 ) 

492 time.sleep(delay_minutes * 60) 

493 

494 raise current_exception 

495 

496 def decode_response(self, response: requests.Response, encoding: str | None = None): 

497 """Override this if the content-type headers from the sources are advertising something else than the actual content 

498 SASA needs this""" 

499 # Force the encoding provided by the caller 

500 if encoding: 

501 response.encoding = encoding 

502 return response.text 

503 

504 # Attempt to get encoding using HTTP headers 

505 content_type_tag = response.headers.get("Content-Type", None) 

506 

507 if content_type_tag: 507 ↛ 514: line 507 didn't jump to line 514 because the condition on line 507 was always true

508 charset = self.parse_content_type_charset(content_type_tag) 

509 if charset: 509 ↛ 510: line 509 didn't jump to line 510 because the condition on line 509 was never true

510 response.encoding = charset 

511 return response.text 

512 

513 # Attempt to get encoding using HTML meta charset tag 

514 soup = BeautifulSoup(response.text, "html5lib") 

515 charset = soup.select_one("meta[charset]") 

516 if charset: 

517 htmlencoding = charset.get("charset") 

518 if isinstance(htmlencoding, str): 518 ↛ 523: line 518 didn't jump to line 523 because the condition on line 518 was always true

519 response.encoding = htmlencoding 

520 return response.text 

521 

522 # Attempt to get encoding using HTML meta content type tag 

523 content_type_tag = soup.select_one( 

524 'meta[http-equiv="Content-Type"],meta[http-equiv="content-type"]' 

525 ) 

526 if content_type_tag: 

527 content_type = content_type_tag.get("content") 

528 if isinstance(content_type, str): 528 ↛ 534: line 528 didn't jump to line 534 because the condition on line 528 was always true

529 charset = self.parse_content_type_charset(content_type) 

530 if charset: 530 ↛ 534: line 530 didn't jump to line 534 because the condition on line 530 was always true

531 response.encoding = charset 

532 return response.text 

533 

534 return response.text 

535 

536 @staticmethod 

537 def parse_content_type_charset(content_type: str): 

538 header = EmailPolicy.header_factory("content-type", content_type) 

539 if "charset" in header.params: 

540 return header.params.get("charset") 

541 

542 @tracer.start_as_current_span("add_xissue_to_database") 

543 def add_xissue_into_database(self, xissue: IssueData) -> IssueData: 

544 xissue.journal = self.collection 

545 xissue.source = self.source_domain 

546 

547 if xissue.year == "": 

548 raise ValueError("Failsafe : Cannot insert issue without a year") 

549 

550 xpub = create_publisherdata() 

551 xpub.name = self.publisher 

552 xissue.publisher = xpub 

553 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat() 

554 

555 attempt = 1 

556 success = False 

557 

558 while not success and attempt < 4: 

559 try: 

560 params = {"xissue": xissue, "use_body": False} 

561 cmd = addOrUpdateGDMLIssueXmlCmd(params) 

562 cmd.do() 

563 success = True 

564 self.logger.debug(f"Issue {xissue.pid} inserted in database") 

565 return xissue 

566 except SolrError: 

567 self.logger.warning( 

568 f"Encoutered SolrError while inserting issue {xissue.pid} in database" 

569 ) 

570 attempt += 1 

571 self.logger.debug(f"Attempt {attempt}. sleeping 10 seconds.") 

572 time.sleep(10) 

573 except Exception as e: 

574 self.logger.error( 

575 f"Got exception while attempting to insert {xissue.pid} in database : {e}" 

576 ) 

577 raise e 

578 

579 if success is False: 

580 raise ConnectionRefusedError("Cannot connect to SolR") 

581 

582 assert False, "Unreachable" 

583 

584 def get_metadata_using_citation_meta( 

585 self, 

586 xarticle: ArticleData, 

587 xissue: IssueData, 

588 soup: BeautifulSoup, 

589 what: list[CitationLiteral] = [], 

590 ): 

591 """ 

592 :param xarticle: the xarticle that will collect the metadata 

593 :param xissue: the xissue that will collect the publisher 

594 :param soup: the BeautifulSoup object of the article page 

595 :param what: list of citation_* meta items to collect. 

596 :return: None. The given article is modified 

597 """ 

598 

599 if "title" in what: 

600 # TITLE 

601 citation_title_node = soup.select_one("meta[name='citation_title']") 

602 if citation_title_node: 602 ↛ 607: line 602 didn't jump to line 607 because the condition on line 602 was always true

603 title = citation_title_node.get("content") 

604 if isinstance(title, str): 604 ↛ 607: line 604 didn't jump to line 607 because the condition on line 604 was always true

605 xarticle.title_tex = title 

606 

607 if "author" in what: 607 ↛ 636line 607 didn't jump to line 636 because the condition on line 607 was always true

608 # AUTHORS 

609 citation_author_nodes = soup.select("meta[name^='citation_author']") 

610 current_author: ContributorDict | None = None 

611 for citation_author_node in citation_author_nodes: 

612 if citation_author_node.get("name") == "citation_author": 

613 text_author = citation_author_node.get("content") 

614 if not isinstance(text_author, str): 614 ↛ 615: line 614 didn't jump to line 615 because the condition on line 614 was never true

615 raise ValueError("Cannot parse author") 

616 if text_author == "": 616 ↛ 617: line 616 didn't jump to line 617 because the condition on line 616 was never true

617 current_author = None 

618 continue 

619 current_author = create_contributor(role="author", string_name=text_author) 

620 xarticle.contributors.append(current_author) 

621 continue 

622 if current_author is None: 622 ↛ 623: line 622 didn't jump to line 623 because the condition on line 622 was never true

623 self.logger.warning("Couldn't parse citation author") 

624 continue 

625 if citation_author_node.get("name") == "citation_author_institution": 

626 text_institution = citation_author_node.get("content") 

627 if not isinstance(text_institution, str): 627 ↛ 628line 627 didn't jump to line 628 because the condition on line 627 was never true

628 continue 

629 current_author["addresses"].append(text_institution) 

630 if citation_author_node.get("name") == "citation_author_ocrid": 630 ↛ 631: line 630 didn't jump to line 631 because the condition on line 630 was never true

631 text_orcid = citation_author_node.get("content") 

632 if not isinstance(text_orcid, str): 

633 continue 

634 current_author["orcid"] = text_orcid 

635 

636 if "pdf" in what: 

637 # PDF 

638 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]') 

639 if citation_pdf_node: 

640 pdf_url = citation_pdf_node.get("content") 

641 if isinstance(pdf_url, str): 641 ↛ 644: line 641 didn't jump to line 644 because the condition on line 641 was always true

642 add_pdf_link_to_xarticle(xarticle, pdf_url) 

643 

644 if "lang" in what: 

645 # LANG 

646 citation_lang_node = soup.select_one("meta[name='citation_language']") 

647 if citation_lang_node: 647 ↛ 653: line 647 didn't jump to line 653 because the condition on line 647 was always true

648 # TODO: check other language code 

649 content_text = citation_lang_node.get("content") 

650 if isinstance(content_text, str): 650 ↛ 653: line 650 didn't jump to line 653 because the condition on line 650 was always true

651 xarticle.lang = standardize_tag(content_text) 

652 

653 if "abstract" in what: 

654 # ABSTRACT 

655 abstract_node = soup.select_one("meta[name='citation_abstract']") 

656 if abstract_node is not None: 

657 abstract = abstract_node.get("content") 

658 if not isinstance(abstract, str): 658 ↛ 659: line 658 didn't jump to line 659 because the condition on line 658 was never true

659 raise ValueError("Couldn't parse abstract from meta") 

660 abstract = BeautifulSoup(abstract, "html.parser").text 

661 lang = abstract_node.get("lang") 

662 if not isinstance(lang, str): 

663 lang = self.detect_language(abstract, xarticle) 

664 xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract)) 

665 

666 if "page" in what: 

667 # PAGES 

668 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']") 

669 if citation_fpage_node: 

670 page = citation_fpage_node.get("content") 

671 if isinstance(page, str): 671 ↛ 676: line 671 didn't jump to line 676 because the condition on line 671 was always true

672 page = page.split("(")[0] 

673 if len(page) < 32: 673 ↛ 676: line 673 didn't jump to line 676 because the condition on line 673 was always true

674 xarticle.fpage = page 

675 

676 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']") 

677 if citation_lpage_node: 

678 page = citation_lpage_node.get("content") 

679 if isinstance(page, str): 679 ↛ 684: line 679 didn't jump to line 684 because the condition on line 679 was always true

680 page = page.split("(")[0] 

681 if len(page) < 32: 681 ↛ 684: line 681 didn't jump to line 684 because the condition on line 681 was always true

682 xarticle.lpage = page 

683 

684 if "doi" in what: 

685 # DOI 

686 citation_doi_node = soup.select_one("meta[name='citation_doi']") 

687 if citation_doi_node: 

688 doi = citation_doi_node.get("content") 

689 if isinstance(doi, str): 689 ↛ 696: line 689 didn't jump to line 696 because the condition on line 689 was always true

690 doi = doi.strip() 

691 pos = doi.find("10.") 

692 if pos > 0: 

693 doi = doi[pos:] 

694 xarticle.doi = doi 

695 

696 if "mr" in what: 

697 # MR 

698 citation_mr_node = soup.select_one("meta[name='citation_mr']") 

699 if citation_mr_node: 

700 mr = citation_mr_node.get("content") 

701 if isinstance(mr, str): 701 ↛ 707: line 701 didn't jump to line 707 because the condition on line 701 was always true

702 mr = mr.strip() 

703 if mr.find("MR") == 0: 703 ↛ 707: line 703 didn't jump to line 707 because the condition on line 703 was always true

704 mr = mr[2:] 

705 xarticle.extids.append(("mr-item-id", mr)) 

706 

707 if "zbl" in what: 

708 # ZBL 

709 citation_zbl_node = soup.select_one("meta[name='citation_zbl']") 

710 if citation_zbl_node: 

711 zbl = citation_zbl_node.get("content") 

712 if isinstance(zbl, str): 712 ↛ 718: line 712 didn't jump to line 718 because the condition on line 712 was always true

713 zbl = zbl.strip() 

714 if zbl.find("Zbl") == 0: 714 ↛ 718: line 714 didn't jump to line 718 because the condition on line 714 was always true

715 zbl = zbl[3:].strip() 

716 xarticle.extids.append(("zbl-item-id", zbl)) 

717 

718 if "publisher" in what: 

719 # PUBLISHER 

720 citation_publisher_node = soup.select_one("meta[name='citation_publisher']") 

721 if citation_publisher_node: 

722 pub = citation_publisher_node.get("content") 

723 if isinstance(pub, str): 723 ↛ 730: line 723 didn't jump to line 730 because the condition on line 723 was always true

724 pub = pub.strip() 

725 if pub != "": 725 ↛ 730: line 725 didn't jump to line 730 because the condition on line 725 was always true

726 xpub = create_publisherdata() 

727 xpub.name = pub 

728 xissue.publisher = xpub 

729 

730 if "keywords" in what: 

731 # KEYWORDS 

732 citation_kwd_nodes = soup.select("meta[name='citation_keywords']") 

733 for kwd_node in citation_kwd_nodes: 

734 kwds = kwd_node.get("content") 

735 if isinstance(kwds, str): 735 ↛ 733: line 735 didn't jump to line 733 because the condition on line 735 was always true

736 kwds = kwds.split(",") 

737 for kwd in kwds: 

738 if kwd == "": 

739 continue 

740 kwd = kwd.strip() 

741 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd}) 

742 

743 if "references" in what: 

744 citation_references = soup.select("meta[name='citation_reference']") 

745 for index, tag in enumerate(citation_references): 

746 content = tag.get("content") 

747 if not isinstance(content, str): 747 ↛ 748: line 747 didn't jump to line 748 because the condition on line 747 was never true

748 raise ValueError("Cannot parse citation_reference meta") 

749 label = str(index + 1) 

750 if regex.match(r"^\[\d+\].*", content): 750 ↛ 751: line 750 didn't jump to line 751 because the condition on line 750 was never true

751 label = None 

752 xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label)) 

753 
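A typical call from a parse_article_content override, as a sketch; the entries of `what` below are among the CitationLiteral values handled above:

    soup = BeautifulSoup(content, "html.parser")
    self.get_metadata_using_citation_meta(
        xarticle,
        xissue,
        soup,
        what=["title", "author", "pdf", "lang", "abstract", "page", "doi", "references"],
    )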

754 def get_metadata_using_dcterms( 

755 self, 

756 xarticle: ArticleData, 

757 soup: "Tag", 

758 what: "Iterable[Literal['abstract', 'keywords', 'date_published', 'article_type']]", 

759 ): 

760 if "abstract" in what: 760 ↛ 768line 760 didn't jump to line 768 because the condition on line 760 was always true

761 abstract_tag = soup.select_one("meta[name='DCTERMS.abstract']") 

762 if abstract_tag: 762 ↛ 768: line 762 didn't jump to line 768 because the condition on line 762 was always true

763 abstract_text = self.get_str_attr(abstract_tag, "content") 

764 xarticle.abstracts.append( 

765 create_abstract(lang="en", value_tex=cleanup_str(abstract_text)) 

766 ) 

767 

768 if "keywords" in what: 768 ↛ 777line 768 didn't jump to line 777 because the condition on line 768 was always true

769 keyword_tags = soup.select("meta[name='DC.subject']") 

770 for tag in keyword_tags: 

771 kwd_text = tag.get("content") 

772 if not isinstance(kwd_text, str) or len(kwd_text) == 0: 772 ↛ 773: line 772 didn't jump to line 773 because the condition on line 772 was never true

773 continue 

774 kwd = create_subj(value=kwd_text) 

775 xarticle.kwds.append(kwd) 

776 

777 if "date_published" in what: 777 ↛ 778line 777 didn't jump to line 778 because the condition on line 777 was never true

778 published_tag = soup.select_one("meta[name='DC.Date.created']") 

779 if published_tag: 

780 published_text = self.get_str_attr(published_tag, "content") 

781 xarticle.date_published = published_text 

782 

783 if "article_type" in what: 783 ↛ 784line 783 didn't jump to line 784 because the condition on line 783 was never true

784 type_tag = soup.select_one("meta[name='DC.Type.articleType']") 

785 if type_tag: 

786 type_text = self.get_str_attr(type_tag, "content") 

787 xarticle.atype = type_text 

788 

789 def create_xissue( 

790 self, 

791 url: str | None, 

792 year: str, 

793 volume_number: str | None, 

794 issue_number: str | None = None, 

795 vseries: str | None = None, 

796 ): 

797 if url is not None and url.endswith("/"): 

798 url = url[:-1] 

799 xissue = create_issuedata() 

800 xissue.url = url 

801 

802 xissue.pid = self.get_issue_pid( 

803 self.collection_id, year, volume_number, issue_number, vseries 

804 ) 

805 

806 xissue.year = year 

807 

808 if volume_number is not None: 

809 xissue.volume = regex.sub(r"[^\w-]+", "_", volume_number) 

810 

811 if issue_number is not None: 

812 xissue.number = issue_number.replace(",", "-") 

813 

814 if vseries is not None: 814 ↛ 815: line 814 didn't jump to line 815 because the condition on line 814 was never true

815 xissue.vseries = vseries 

816 return xissue 

817 

818 def detect_language(self, text: str, article: ArticleData | None = None): 

819 if article and article.lang is not None and article.lang != "und": 

820 return article.lang 

821 

822 language = self.language_detector.detect_language_of(text) 

823 

824 if not language: 824 ↛ 825: line 824 didn't jump to line 825 because the condition on line 824 was never true

825 return "und" 

826 return language.iso_code_639_1.name.lower() 

827 
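Usage sketch (the exact result depends on the lingua language models):

    lang = self.detect_language("Sur quelques propriétés des groupes finis", xarticle)
    # returns xarticle.lang when it is already set and not "und";
    # otherwise a lowercase ISO 639-1 code such as "fr", or "und" if
    # lingua cannot identify the language.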

828 def get_str_attr(self, tag: "Tag", attr: str): 

829 """Equivalent of `tag.get(attr)`, but ensures the return value is a string""" 

830 node_attr = tag.get(attr) 

831 if isinstance(node_attr, list): 831 ↛ 832: line 831 didn't jump to line 832 because the condition on line 831 was never true

832 raise ValueError( 

833 f"[{self.source_domain}] {self.collection_id} : html tag has multiple {attr} attributes." 

834 ) 

835 if node_attr is None: 835 ↛ 836: line 835 didn't jump to line 836 because the condition on line 835 was never true

836 raise ValueError( 

837 f"[{self.source_domain}] {self.collection_id} : html tag doesn't have any {attr} attributes" 

838 ) 

839 return node_attr 

840 

841 def create_trans_title( 

842 self, 

843 resource_type: str, 

844 title_tex: str, 

845 lang: str, 

846 xresource_lang: str, 

847 title_type: str = "main", 

848 ): 

849 tag = "trans-title" if resource_type == "article" else "issue-title" 

850 

851 ckeditor_data = build_jats_data_from_html_field( 

852 title_tex, 

853 tag=tag, 

854 text_lang=lang, 

855 resource_lang=xresource_lang, 

856 delimiter_inline=self.delimiter_inline_formula, 

857 delimiter_disp=self.delimiter_disp_formula, 

858 ) 

859 

860 titledata = create_titledata( 

861 lang=lang, 

862 type="main", 

863 title_html=ckeditor_data["value_html"], 

864 title_xml=ckeditor_data["value_xml"], 

865 ) 

866 

867 return titledata 

868 

869 references_mapping = { 

870 "citation_title": get_article_title_xml, 

871 "citation_journal_title": get_source_xml, 

872 "citation_publication_date": get_year_xml, 

873 "citation_firstpage": get_fpage_xml, 

874 "citation_lastpage": get_lpage_xml, 

875 } 

876 

877 @classmethod 

878 def __parse_meta_citation_reference(cls, content: str, label=None): 

879 categories = content.split(";") 

880 

881 if len(categories) == 1: 

882 return JatsBase.bake_ref(content, label=label) 

883 

884 citation_data = [c.split("=") for c in categories if "=" in c] 

885 del categories 

886 

887 xml_string = "" 

888 authors_parsed = False 

889 authors_strings = [] 

890 for data in citation_data: 

891 key = data[0].strip() 

892 citation_content = data[1] 

893 if key == "citation_author": 

894 authors_strings.append(get_author_xml(template_str=citation_content)) 

895 continue 

896 elif not authors_parsed: 

897 xml_string += ", ".join(authors_strings) 

898 authors_parsed = True 

899 

900 if key in cls.references_mapping: 

901 xml_string += " " + cls.references_mapping[key](citation_content) 

902 

903 return JatsBase.bake_ref(xml_string, label=label) 

904 
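For illustration, a citation_reference meta content with no ";" is passed to JatsBase.bake_ref() unchanged, while a Highwire-style key=value list (made-up values below) is rebuilt field by field through references_mapping:

    citation_author=Doe, J.; citation_author=Roe, R.; citation_title=On a conjecture;
    citation_journal_title=Ann. Example; citation_publication_date=1999;
    citation_firstpage=1; citation_lastpage=10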

905 @classmethod 

906 def get_or_create_source(cls): 

907 source, created = Source.objects.get_or_create( 

908 domain=cls.source_domain, 

909 defaults={ 

910 "name": cls.source_name, 

911 "website": cls.source_website, 

912 "view_id": cls.get_view_id(), 

913 }, 

914 ) 

915 if created: 915 ↛ 916: line 915 didn't jump to line 916 because the condition on line 915 was never true

916 source.save() 

917 return source 

918 

919 @staticmethod 

920 def get_issue_pid( 

921 collection_id: str, 

922 year: str, 

923 volume_number: str | None = None, 

924 issue_number: str | None = None, 

925 series: str | None = None, 

926 ): 

927 # Replace any non-word character with an underscore 

928 pid = f"{collection_id}_{year}" 

929 if series is not None: 929 ↛ 930: line 929 didn't jump to line 930 because the condition on line 929 was never true

930 pid += f"_{series}" 

931 if volume_number is not None: 

932 pid += f"_{volume_number}" 

933 if issue_number is not None: 

934 pid += f"_{issue_number}" 

935 pid = regex.sub(r"[^\w-]+", "_", cleanup_str(pid)) 

936 return pid 

937 
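For example (made-up values), the parts are joined with underscores and the result is sanitized by the regex substitution:

    BaseCollectionCrawler.get_issue_pid("AIF", "1999-2000", "6", "1")
    # -> "AIF_1999-2000_6_1"
    BaseCollectionCrawler.get_issue_pid("AIF", "2000", "Vol. 6")
    # -> "AIF_2000_Vol_6"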

938 @staticmethod 

939 def set_pages(article: ArticleData, pages: str, separator: str = "-"): 

940 pages_split = pages.split(separator) 

941 if len(pages_split) == 0: 941 ↛ 942: line 941 didn't jump to line 942 because the condition on line 941 was never true

942 article.page_range = pages 

943 if len(pages_split) > 0: 943 ↛ exit: line 943 didn't return from function 'set_pages' because the condition on line 943 was always true

944 if pages[0].isnumeric(): 944 ↛ exit: line 944 didn't return from function 'set_pages' because the condition on line 944 was always true

945 article.fpage = pages_split[0] 

946 if ( 946 ↛ 951: line 946 didn't jump to line 951 because the condition on line 946 was never true

947 len(pages_split) > 1 

948 and pages_split[0] != pages_split[1] 

949 and pages_split[1].isnumeric() 

950 ): 

951 article.lpage = pages_split[1] 

952 
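Usage sketch:

    BaseCollectionCrawler.set_pages(xarticle, "101-115")
    # xarticle.fpage == "101", xarticle.lpage == "115"
    BaseCollectionCrawler.set_pages(xarticle, "23–45", separator="–")
    # same with an en-dash separator
    BaseCollectionCrawler.set_pages(xarticle, "vii-xii")
    # Roman numerals: the first character is not numeric, so nothing is set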

953 @staticmethod 

954 def _process_pdf_header(chunk: str, response: requests.Response | aiohttp.ClientResponse): 

955 content_type = response.headers.get("Content-Type") 

956 if regex.match(rb"^%PDF-\d\.\d", chunk): 

957 if content_type and "application/pdf" in content_type: 

958 # The file is unmistakably a pdf 

959 return [ 

960 True, 

961 response, 

962 { 

963 "status": ExtlinkChecked.Status.OK, 

964 "message": "", 

965 }, 

966 ] 

967 # The file is a pdf, but the content type advertised by the server is wrong 

968 return [ 

969 True, 

970 response, 

971 { 

972 "status": ExtlinkChecked.Status.WARNING, 

973 "message": f"Content-Type header: {content_type}", 

974 }, 

975 ] 

976 

977 # Reaching here means we couldn't find the pdf. 

978 if not content_type or "application/pdf" not in content_type: 

979 return [ 

980 False, 

981 response, 

982 { 

983 "status": ExtlinkChecked.Status.ERROR, 

984 "message": f"Content-Type header: {content_type}; PDF Header not found: got {chunk}", 

985 }, 

986 ] 

987 

988 return [ 

989 False, 

990 response, 

991 { 

992 "status": ExtlinkChecked.Status.ERROR, 

993 "message": f"PDF Header not found: got {chunk}", 

994 }, 

995 ] 

996 
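For instance (sketch), the magic-number check above distinguishes three cases:

    chunk = b"%PDF-1.7"
    # Content-Type: application/pdf  -> ExtlinkChecked.Status.OK
    # Content-Type: text/html        -> ExtlinkChecked.Status.WARNING (real PDF, wrong header)
    chunk = b"<!DOCTYPE html>"
    # no %PDF header                 -> ExtlinkChecked.Status.ERROR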

997 @classmethod 

998 async def a_check_pdf_link_validity( 

999 cls, url: str, verify=True 

1000 ) -> tuple[bool, aiohttp.ClientResponse, dict]: 

1001 """ 

1002 Check the validity of the PDF links. 

1003 """ 

1004 CHUNK_SIZE = 10 # Number of characters to fetch 

1005 header = { 

1006 "Range": f"bytes=0-{CHUNK_SIZE}", 

1007 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0", 

1008 } 

1009 async with cls.async_session.get( 

1010 url, headers=header, allow_redirects=True, ssl=verify 

1011 ) as response: 

1012 try: 

1013 chunk = await response.content.read(CHUNK_SIZE) 

1014 return BaseCollectionCrawler._process_pdf_header(chunk, response) 

1015 except StopIteration: 

1016 return [ 

1017 False, 

1018 response, 

1019 { 

1020 "status": ExtlinkChecked.Status.ERROR, 

1021 "message": "Error reading PDF header", 

1022 }, 

1023 ] 

1024 

1025 @classmethod 

1026 def check_pdf_link_validity( 

1027 cls, url: str, verify=True 

1028 ) -> tuple[bool, requests.Response | None, dict]: 

1029 """ 

1030 Check the validity of the PDF links. 

1031 """ 

1032 CHUNK_SIZE = 10 # Number of characters to fetch 

1033 header = { 

1034 "Range": f"bytes=0-{CHUNK_SIZE}", 

1035 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0", 

1036 } 

1037 with cls.session.get( 

1038 url, headers=header, allow_redirects=True, verify=verify, stream=True 

1039 ) as response: 

1040 try: 

1041 chunk = next(response.iter_content(CHUNK_SIZE)) 

1042 return BaseCollectionCrawler._process_pdf_header(chunk, response) 

1043 except StopIteration: 

1044 return [ 

1045 False, 

1046 response, 

1047 { 

1048 "status": ExtlinkChecked.Status.ERROR, 

1049 "message": "Error reading PDF header", 

1050 }, 

1051 ] 

1052 

1053 @classmethod 

1054 async def check_extlink_validity(cls, extlink: "ExtLink"): 

1055 """ 

1056 Method used by rot_monitoring to check if links have expired 

1057 """ 

1058 defaults: dict = {"date": datetime.now(), "status": ExtlinkChecked.Status.OK} 

1059 header = { 

1060 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0" 

1061 } 

1062 verify = True 

1063 if not cls.verify: 

1064 verify = False 

1065 try: 

1066 if extlink.rel == "article-pdf": 

1067 isok, response, message = await cls.a_check_pdf_link_validity( 

1068 extlink.location, verify 

1069 ) 

1070 defaults.update(message) 

1071 defaults["http_status"] = response.status 

1072 else: 

1073 async with cls.async_session.get( 

1074 url=extlink.location, 

1075 headers=header, 

1076 allow_redirects=True, 

1077 ssl=verify, 

1078 ) as response: 

1079 defaults["http_status"] = response.status 

1080 if response.status not in (200, 206): 

1081 defaults["status"] = ExtlinkChecked.Status.ERROR 

1082 

1083 await ExtlinkChecked.objects.aupdate_or_create(extlink=extlink, defaults=defaults) 

1084 cls.logger.info("DB Update, source: %s, url: %s", cls.source_domain, extlink.location) 

1085 

1086 except aiohttp.ClientSSLError: 

1087 cls.logger.error("SSL error for the url: %s", extlink.location) 

1088 except aiohttp.ClientConnectionError: 

1089 cls.logger.error("Connection error for the url: %s", extlink.location)