Coverage for src/crawler/base_crawler.py: 65%

588 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-02-17 12:56 +0000

1import asyncio 

2import logging 

3import time 

4from datetime import datetime, timedelta 

5from email.policy import EmailPolicy 

6from typing import TYPE_CHECKING, Iterable, Literal 

7 

8import aiohttp 

9import regex 

10import requests 

11from bs4 import BeautifulSoup 

12from django.conf import settings 

13from django.contrib.auth.models import User 

14from django.db.utils import IntegrityError 

15from django.utils import timezone 

16from langcodes import standardize_tag 

17from lingua import LanguageDetector, LanguageDetectorBuilder 

18from opentelemetry import trace 

19from ptf.cmds.xml.ckeditor.utils import ( 

20 build_jats_data_from_html_field, 

21) 

22from ptf.cmds.xml.jats.builder.references import ( 

23 get_article_title_xml, 

24 get_author_xml, 

25 get_fpage_xml, 

26 get_lpage_xml, 

27 get_source_xml, 

28 get_year_xml, 

29) 

30from ptf.cmds.xml.jats.jats_parser import JatsBase 

31from ptf.model_data import ( 

32 ArticleData, 

33 ContributorDict, 

34 IssueData, 

35 ResourceData, 

36 TitleDict, 

37 create_abstract, 

38 create_contributor, 

39 create_extlink, 

40 create_issuedata, 

41 create_publisherdata, 

42 create_subj, 

43 create_titledata, 

44) 

45from ptf.model_data_converter import update_data_for_jats 

46from ptf.models import ExtLink 

47from pylatexenc.latex2text import LatexNodes2Text 

48from pysolr import SolrError 

49from requests.adapters import HTTPAdapter 

50from requests_cache import CachedSession 

51from urllib3 import Retry 

52 

53from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd 

54from crawler.models import Source 

55from crawler.models.extlink_checked import ExtlinkChecked 

56from crawler.types import CitationLiteral 

57from crawler.utils import ( 

58 add_pdf_link_to_xarticle, 

59 cleanup_str, 

60 get_all_cols, 

61 get_or_create_collection, 

62 get_session, 

63) 

64 

65if TYPE_CHECKING: 

66 from bs4 import Tag 

67 

68 

class CrawlerTitleDict(TitleDict):
    """TitleDict extended with the raw TeX form of the title."""

    # TeX source of the title, or None when no TeX form is available.
    title_tex: str | None

71 

72 

class BaseCollectionCrawler:
    """
    Base collection for the crawlers.
    To create a crawler:
    1) derive a class from BaseCollectionCrawler and name it XXXCrawler
    2) override the functions parse_collection_content, parse_issue_content and parse_article_content
    3) update factory.py so that crawler_factory can return your new crawler
    """

    logger = logging.getLogger(__name__)
    tracer = trace.get_tracer(__name__)

    # Identity of the upstream source this crawler handles (set by subclasses).
    source_name = ""
    source_domain = ""
    source_website = ""

    issue_href = ""

    # Filled in by initialize(); None until then.
    collection = None
    source = None
    user = None
    # Class-level HTTP sessions, shared by all crawler instances (see __init__).
    session: requests.Session | CachedSession
    async_session: aiohttp.ClientSession
    is_checkable = True
    # Passed to the session's TLS certificate verification setting.
    verify = True
    # Default HTTP headers sent with every request.
    headers = {
        "accept_encoding": "utf-8",
        "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
        "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
    }

    # seconds to wait between two http requests
    requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
    # seconds to wait before aborting the connection (if no bytes are received)
    requests_timeout = 60

    latext_parser = LatexNodes2Text()

    # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
    # do not use the "$" to surround tex formulas
    delimiter_inline_formula = "$"
    delimiter_disp_formula = "$"

    # HACK : Workaround for tests (monkeypatching)
    # We store the class here, so we can monkeypatch it when running tests
    # subCrawlers = {
    #     LofplCrawler: None
    # }
    subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}

    # Lazily built by the language_detector property to save memory.
    _language_detector: LanguageDetector | None = None
    _language_detector_builder = LanguageDetectorBuilder.from_all_languages()

    force_refresh = False

    # Whether to include headers in the requests cache key
    match_headers = False
    orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"

    # Set this to False on a Crawler-basis to allow inserting articles without PDFs
    ignore_missing_pdf = True

    @classmethod
    def get_view_id(cls):
        """Identifier used for this crawler's view: its source domain."""
        return cls.source_domain

138 

139 @property 

140 def language_detector(self): 

141 """Crawler Instance singleton for language builder. 

142 Late init of LanguageDetector to save on memory""" 

143 if not self._language_detector: 

144 self._language_detector = self._language_detector_builder.build() 

145 return self._language_detector 

146 

    def __init__(
        self,
        *args,
        username: str,
        collection_id: str,
        dry: bool = False,
        publisher: str = "",
        force_refresh=False,
        collection_url: str | None = None,
    ):
        """Set up a crawler for one collection.

        :param username: Django username that owns the crawl
        :param collection_id: pid of the collection to crawl
        :param dry: when True, crawl but never write to the database
        :param publisher: default publisher name for inserted issues
        :param force_refresh: bypass the HTTP cache when downloading
        :param collection_url: source URL; looked up in the collection
            registry when not provided
        :raises ValueError: when the source has no URL for this collection
        """
        # Resolve the collection URL from the registry when not given explicitly.
        if not collection_url:
            all_cols = get_all_cols()
            col = all_cols[collection_id]

            collection_url = col["sources"].get(self.source_domain, None)
            if collection_url is None:
                raise ValueError(
                    f"Source {self.source_domain} not found for collection {collection_id}"
                )
        self.collection_url = collection_url
        # Instantiate declared sub-crawlers with the same settings
        # (see the subCrawlers monkeypatching HACK on the class).
        for CrawlerClass in self.subCrawlers:
            self.subCrawlers[CrawlerClass] = CrawlerClass(
                *args,
                username=username,
                collection_id=collection_id,
                dry=dry,
                publisher=publisher,
                collection_url=collection_url,
            )
        # Per-source child logger so log lines identify the crawler.
        self.logger = logging.getLogger(__name__ + "." + self.source_domain)
        # self.logger = logging.getLogger(__name__)

        self.username = username

        self.collection_id = collection_id

        self.dry = dry
        self.publisher = publisher

        # Classproperty : We sometimes want to use the session without initializing the class (rot monitoring)
        BaseCollectionCrawler.session: requests.Session

        # Skipped when running tests
        self.initialize()

        self.force_refresh = force_refresh

193 

194 # We implemented custom retry behaviour, so we don't want to make extra requests here 

195 

196 def initialize(self): 

197 """ 

198 Acts as a "second" init function to skip model accesses during test data generation 

199 """ 

200 self.collection = get_or_create_collection(self.collection_id) 

201 self.source = self.get_or_create_source() 

202 self.user = User.objects.get(username=self.username) 

203 BaseCollectionCrawler.session = get_session() 

204 BaseCollectionCrawler.session.verify = self.verify 

205 self.session.delay = self.requests_interval 

206 retries = Retry( 

207 total=0, 

208 ) 

209 self.session.mount("https://", HTTPAdapter(max_retries=retries)) 

210 self.session.mount("http://", HTTPAdapter(max_retries=retries)) 

211 

    @classmethod
    def can_crawl(cls, pid: str) -> bool:
        """Hook for subclasses: return False to skip crawling the given pid.

        The base implementation accepts every pid.
        """
        return True

215 

    def parse_collection_content(self, content: str) -> list[IssueData]:
        """
        Parse the HTML content with BeautifulSoup
        returns a list of xissue.
        Override this function in a derived class.

        The base implementation returns an empty list.
        """
        return []

223 

    def parse_issue_content(self, content: str, xissue: IssueData):
        """
        Parse the HTML content with BeautifulSoup
        Fills the xissue.articles
        Override this function in a derived class.
        The base implementation is a no-op.

        CAV : You are supposed to create articles there. Please assign a PID to each article.
        The PID can be `a + article_index`, like this : `a0` `a21`
        """

233 

    def parse_article_content(
        self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
    ) -> ArticleData | None:
        """
        Parse the HTML content with BeautifulSoup
        returns the xarticle (or None to reject the article).
        Override this function in a derived class.
        The xissue is passed to the function in case the article page has issue information (ex: publisher)
        The article url is also passed as a parameter

        CAV : You are supposed to assign articles pid again here

        The base implementation returns the article unchanged.
        """
        return xarticle

247 

248 @tracer.start_as_current_span("crawl_collection") 

249 def crawl_collection(self): 

250 # TODO: Comments, filter 

251 """ 

252 Crawl an entire collection. ptf.models.Container objects are created. 

253 - get the HTML content of the collection_url 

254 - parse the HTML content with beautifulsoup to extract the list of issues 

255 - merge the xissues (some Source can have multiple pages for 1 volume/issue. We create only 1 container) 

256 - crawl each issue if col_only is False 

257 - Returns the list of merged issues. 

258 It is an OrderedDict {pid: {"issues": xissues}} 

259 The key is the pid of the merged issues. 

260 Ex: The source may have Ex: Volume 6 (2000) and Volume 6 (1999) 

261 the pid is then made with 1999-2000__6_ 

262 """ 

263 

264 if self.source is None: 

265 raise RuntimeError("ERROR: the source is not set") 

266 

267 content = self.download_file(self.collection_url) 

268 if content: 

269 xissues = self.parse_collection_content(content) 

270 else: 

271 # download_file returns None (404) 

272 return None 

273 

274 """ 

275 Some collections split the same volumes in different pages 

276 Ex: Volume 6 (2000) and Volume 6 (1999) 

277 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000) 

278 """ 

279 # merged_xissues = self.merge_xissues(xissues) 

280 

281 xissues_dict = {str(i.pid): i for i in xissues} 

282 

283 return xissues_dict 

284 

    @tracer.start_as_current_span("crawl_issue")
    def crawl_issue(self, xissue: IssueData):
        """
        Crawl 1 web page of an issue.
        - get the HTML content of the issue
        - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata
        - crawl each article
        - insert the issue into the database (unless self.dry)
        """

        # Some source, like EuDML do not have a separate HTML pages for an issue's table of content.
        # The list of articles directly come from the collection HTML page: the xissue has no url attribute
        issue_url = xissue.url
        if issue_url is not None:
            if issue_url.endswith(".pdf"):
                # The "issue" page is itself a PDF: record it as the issue's PDF link.
                add_pdf_link_to_xarticle(xissue, issue_url)
                xissue.url = None
            else:
                content = self.download_file(issue_url)
                with self.tracer.start_as_current_span("parse_issue_content"):
                    self.parse_issue_content(content, xissue)

        xarticles = xissue.articles

        parsed_xarticles = []

        for xarticle in xarticles:
            parsed_xarticle = self.crawl_article(xarticle, xissue)
            if parsed_xarticle is not None:
                parsed_xarticles.append(parsed_xarticle)

        xissue.articles = parsed_xarticles

        # Remember whether the issue itself has a PDF link before filtering articles.
        issue_has_pdf = self.article_has_pdf(xissue)

        if self.ignore_missing_pdf:
            # Drop articles that have no full-text (PDF/HTML) link.
            xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)]
        if self.dry:
            # Dry run: stop before any database write.
            return
        if len(xissue.articles) == 0 and not issue_has_pdf:
            # Nothing worth inserting: no articles left and no issue-level PDF.
            return
        self.process_resource_metadata(xissue, resource_type="issue")

        self.add_xissue_into_database(xissue)

328 

329 @staticmethod 

330 def article_has_source(art: ArticleData | IssueData): 

331 return ( 

332 next( 

333 (e_link for e_link in art.ext_links if e_link["rel"] == "source"), 

334 None, 

335 ) 

336 is not None 

337 ) 

338 

339 @staticmethod 

340 def article_has_pdf(art: ArticleData | IssueData): 

341 return ( 

342 next( 

343 (link for link in art.ext_links if link["rel"] in ["article-pdf", "article-html"]), 

344 None, 

345 ) 

346 is not None 

347 ) 

348 

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
        """Download and parse one article page, then post-process its metadata.

        Returns the processed ArticleData, or None when parsing rejected it.
        On a parsing ValueError, waits 5 minutes and retries once with a
        cache-bypassing download.
        """
        # ARTICLE URL as an ExtLink (to display the link in the article page)
        if xarticle.url is None:
            # No article page to download: record where the metadata came from
            # (the issue page, or failing that the collection page).
            if not self.article_has_source(xarticle):
                if xissue.url:
                    article_source = xissue.url
                else:
                    article_source = self.collection_url
                ext_link = create_extlink()
                ext_link["rel"] = "source"
                ext_link["location"] = article_source
                ext_link["metadata"] = self.source_domain
                xarticle.ext_links.append(ext_link)
            return self.process_article_metadata(xarticle)

        content = self.download_file(xarticle.url)
        # Prefix the article pid with the issue pid (e.g. "<issue_pid>_a0").
        xarticle.pid = f"{xissue.pid}_{xarticle.pid}"

        try:
            with self.tracer.start_as_current_span("parse_article_content"):
                parsed_xarticle = self.parse_article_content(
                    content, xissue, xarticle, xarticle.url
                )
        except ValueError as e:
            # Parsing failed: back off, then retry once with a fresh download.
            self.logger.warning(e)
            self.logger.warning("Retrying in 5 mins while invalidating cache")
            time.sleep(5 * 60)
            content = self.download_file(xarticle.url, force_refresh=True)
            with self.tracer.start_as_current_span("parse_article_content"):
                parsed_xarticle = self.parse_article_content(
                    content, xissue, xarticle, xarticle.url
                )

        if parsed_xarticle is None:
            return None

        if parsed_xarticle.doi:
            # Derive a pid-safe identifier from the DOI.
            parsed_xarticle.pid = (
                parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
            )

        if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
            ext_link = create_extlink()
            ext_link["rel"] = "source"
            ext_link["location"] = parsed_xarticle.url
            ext_link["metadata"] = self.source_domain
            parsed_xarticle.ext_links.append(ext_link)

        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle)

399 

400 def process_resource_metadata(self, xresource: ResourceData, resource_type="article"): 

401 tag = "article-title" if resource_type == "article" else "issue-title" 

402 

403 # Process title tex 

404 ckeditor_data = build_jats_data_from_html_field( 

405 xresource.title_tex, 

406 tag=tag, 

407 text_lang=xresource.lang, 

408 delimiter_inline=self.delimiter_inline_formula, 

409 delimiter_disp=self.delimiter_disp_formula, 

410 ) 

411 

412 xresource.title_html = ckeditor_data["value_html"] 

413 # xresource.title_tex = ckeditor_data["value_tex"] 

414 xresource.title_xml = ckeditor_data["value_xml"] 

415 

416 # Process trans_title tex 

417 if xresource.trans_title_tex: 417 ↛ 418line 417 didn't jump to line 418 because the condition on line 417 was never true

418 self.logger.warning( 

419 "Deprecation Notice : prefer using xresource.title directly instead of xresource.trans_title_tex" 

420 ) 

421 trans_title = self.create_trans_title( 

422 xresource_lang=xresource.lang, 

423 resource_type=resource_type, 

424 title_tex=xresource.trans_title_tex, 

425 lang=xresource.trans_lang, 

426 ) 

427 xresource.titles.append(trans_title) 

428 

429 abstracts_to_parse = [ 

430 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract" 

431 ] 

432 # abstract may have formulas surrounded with '$' 

433 if len(abstracts_to_parse) > 0: 

434 for xabstract in abstracts_to_parse: 

435 ckeditor_data = build_jats_data_from_html_field( 

436 xabstract["value_tex"], 

437 tag="abstract", 

438 text_lang=xabstract["lang"], 

439 resource_lang=xresource.lang, 

440 field_type="abstract", 

441 delimiter_inline=self.delimiter_inline_formula, 

442 delimiter_disp=self.delimiter_disp_formula, 

443 ) 

444 

445 xabstract["value_html"] = ckeditor_data["value_html"] 

446 # xabstract["value_tex"] = ckeditor_data["value_tex"] 

447 xabstract["value_xml"] = ckeditor_data["value_xml"] 

448 

449 return xresource 

450 

451 def process_article_metadata(self, xarticle: ArticleData): 

452 self.process_resource_metadata(xarticle) 

453 for bibitem in xarticle.bibitems: 

454 bibitem.type = "unknown" 

455 update_data_for_jats(xarticle, with_label=False) 

456 

457 return xarticle 

458 

459 def download_file(self, url: str, force_refresh=False, headers={}): 

460 """ 

461 Downloads a page and returns its content (decoded string). 

462 This function handles retries and decoding 

463 """ 

464 current_exception: Exception | None = None 

465 for attempt in range(3): 

466 try: 

467 kwargs = { 

468 "url": url, 

469 "headers": {**self.headers, **headers}, 

470 "timeout": self.requests_timeout, 

471 } 

472 if attempt > 0 and isinstance(self.session, CachedSession): 

473 kwargs["force_refresh"] = True 

474 response = self.session.get(**kwargs) 

475 

476 content = self.decode_response(response) 

477 if content == "" or not content: 

478 raise requests.exceptions.HTTPError(response) 

479 

480 return content 

481 except ( 

482 requests.ConnectionError, 

483 requests.ConnectTimeout, 

484 requests.exceptions.HTTPError, 

485 ) as e: 

486 current_exception = e 

487 self.logger.debug(f"Caught error : {e}", extra={"url": url}) 

488 # 15 mins, 30 mins, 45 mins 

489 delay_minutes = attempt * 15 

490 self.logger.debug( 

491 f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})", 

492 extra={"url": url}, 

493 ) 

494 time.sleep(delay_minutes * 60) 

495 

496 raise current_exception 

497 

498 def decode_response(self, response: requests.Response, encoding: str | None = None): 

499 """Override this if the content-type headers from the sources are advertising something else than the actual content 

500 SASA needs this""" 

501 # Force 

502 if encoding: 

503 response.encoding = encoding 

504 return response.text 

505 

506 # Attempt to get encoding using HTTP headers 

507 content_type_tag = response.headers.get("Content-Type", None) 

508 

509 if content_type_tag: 509 ↛ 516line 509 didn't jump to line 516 because the condition on line 509 was always true

510 charset = self.parse_content_type_charset(content_type_tag) 

511 if charset: 511 ↛ 512line 511 didn't jump to line 512 because the condition on line 511 was never true

512 response.encoding = charset 

513 return response.text 

514 

515 # Attempt to get encoding using HTML meta charset tag 

516 soup = BeautifulSoup(response.text, "html5lib") 

517 charset = soup.select_one("meta[charset]") 

518 if charset: 

519 htmlencoding = charset.get("charset") 

520 if isinstance(htmlencoding, str): 520 ↛ 525line 520 didn't jump to line 525 because the condition on line 520 was always true

521 response.encoding = htmlencoding 

522 return response.text 

523 

524 # Attempt to get encoding using HTML meta content type tag 

525 content_type_tag = soup.select_one( 

526 'meta[http-equiv="Content-Type"],meta[http-equiv="content-type"]' 

527 ) 

528 if content_type_tag: 

529 content_type = content_type_tag.get("content") 

530 if isinstance(content_type, str): 530 ↛ 536line 530 didn't jump to line 536 because the condition on line 530 was always true

531 charset = self.parse_content_type_charset(content_type) 

532 if charset: 532 ↛ 536line 532 didn't jump to line 536 because the condition on line 532 was always true

533 response.encoding = charset 

534 return response.text 

535 

536 return response.text 

537 

538 @staticmethod 

539 def parse_content_type_charset(content_type: str): 

540 header = EmailPolicy.header_factory("content-type", content_type) 

541 if "charset" in header.params: 

542 return header.params.get("charset") 

543 

    @tracer.start_as_current_span("add_xissue_to_database")
    def add_xissue_into_database(self, xissue: IssueData) -> IssueData:
        """Insert (or update) the issue in the database, retrying on Solr errors.

        :raises ValueError: when the issue has no year (failsafe)
        :raises ConnectionRefusedError: after 3 failed Solr attempts
        """
        xissue.journal = self.collection
        xissue.source = self.source_domain

        if xissue.year == "":
            raise ValueError("Failsafe : Cannot insert issue without a year")

        xpub = create_publisherdata()
        xpub.name = self.publisher
        xissue.publisher = xpub
        xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()

        attempt = 1
        success = False

        # Retry the insertion up to 3 times when Solr is unreachable.
        while not success and attempt < 4:
            try:
                params = {"xissue": xissue, "use_body": False}
                cmd = addOrUpdateGDMLIssueXmlCmd(params)
                cmd.do()
                success = True
                self.logger.debug(f"Issue {xissue.pid} inserted in database")
                return xissue
            except SolrError:
                self.logger.warning(
                    f"Encoutered SolrError while inserting issue {xissue.pid} in database"
                )
                attempt += 1
                self.logger.debug(f"Attempt {attempt}. sleeping 10 seconds.")
                time.sleep(10)
            except Exception as e:
                # Any non-Solr error is fatal: log and propagate.
                self.logger.error(
                    f"Got exception while attempting to insert {xissue.pid} in database : {e}"
                )
                raise e

        if success is False:
            raise ConnectionRefusedError("Cannot connect to SolR")

        assert False, "Unreachable"

585 

    def get_metadata_using_citation_meta(
        self,
        xarticle: ArticleData,
        xissue: IssueData,
        soup: BeautifulSoup,
        what: list[CitationLiteral] = [],
    ):
        """
        Harvest Google Scholar-style `citation_*` meta tags from an article page.

        :param xarticle: the xarticle that will collect the metadata
        :param xissue: the xissue that will collect the publisher
        :param soup: the BeautifulSoup object of the article page
        :param what: list of citation_ items to collect.
        :return: None. The given article is modified

        NOTE(review): `what` is a mutable default argument; it is only read
        here, but callers must not mutate the default list.
        """

        if "title" in what:
            # TITLE
            citation_title_node = soup.select_one("meta[name='citation_title']")
            if citation_title_node:
                title = citation_title_node.get("content")
                if isinstance(title, str):
                    xarticle.title_tex = title

        if "author" in what:
            # AUTHORS
            # citation_author tags may be followed by citation_author_institution /
            # citation_author_ocrid tags that apply to the most recent author.
            citation_author_nodes = soup.select("meta[name^='citation_author']")
            current_author: ContributorDict | None = None
            for citation_author_node in citation_author_nodes:
                if citation_author_node.get("name") == "citation_author":
                    text_author = citation_author_node.get("content")
                    if not isinstance(text_author, str):
                        raise ValueError("Cannot parse author")
                    if text_author == "":
                        # An empty author resets the "current author" context.
                        current_author = None
                        continue
                    current_author = create_contributor(role="author", string_name=text_author)
                    xarticle.contributors.append(current_author)
                    continue
                if current_author is None:
                    self.logger.warning("Couldn't parse citation author")
                    continue
                if citation_author_node.get("name") == "citation_author_institution":
                    text_institution = citation_author_node.get("content")
                    if not isinstance(text_institution, str):
                        continue
                    current_author["addresses"].append(text_institution)
                # NOTE(review): "ocrid" looks like a typo for "orcid" — confirm
                # against the sources that emit this tag before changing it.
                if citation_author_node.get("name") == "citation_author_ocrid":
                    text_orcid = citation_author_node.get("content")
                    if not isinstance(text_orcid, str):
                        continue
                    current_author["orcid"] = text_orcid

        if "pdf" in what:
            # PDF
            citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
            if citation_pdf_node:
                pdf_url = citation_pdf_node.get("content")
                if isinstance(pdf_url, str):
                    add_pdf_link_to_xarticle(xarticle, pdf_url)

        if "lang" in what:
            # LANG
            citation_lang_node = soup.select_one("meta[name='citation_language']")
            if citation_lang_node:
                # TODO: check other language code
                content_text = citation_lang_node.get("content")
                if isinstance(content_text, str):
                    xarticle.lang = standardize_tag(content_text)

        if "abstract" in what:
            # ABSTRACT
            abstract_node = soup.select_one("meta[name='citation_abstract']")
            if abstract_node is not None:
                abstract = abstract_node.get("content")
                if not isinstance(abstract, str):
                    raise ValueError("Couldn't parse abstract from meta")
                # Strip any HTML markup embedded in the meta content.
                abstract = BeautifulSoup(abstract, "html.parser").text
                lang = abstract_node.get("lang")
                if not isinstance(lang, str):
                    lang = self.detect_language(abstract, xarticle)
                xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))

        if "page" in what:
            # PAGES
            citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
            if citation_fpage_node:
                page = citation_fpage_node.get("content")
                if isinstance(page, str):
                    # Drop a trailing "(...)" annotation; skip absurdly long values.
                    page = page.split("(")[0]
                    if len(page) < 32:
                        xarticle.fpage = page

            citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
            if citation_lpage_node:
                page = citation_lpage_node.get("content")
                if isinstance(page, str):
                    page = page.split("(")[0]
                    if len(page) < 32:
                        xarticle.lpage = page

        if "doi" in what:
            # DOI
            citation_doi_node = soup.select_one("meta[name='citation_doi']")
            if citation_doi_node:
                doi = citation_doi_node.get("content")
                if isinstance(doi, str):
                    doi = doi.strip()
                    # Strip any prefix (e.g. "doi:") before the "10." registrant part.
                    pos = doi.find("10.")
                    if pos > 0:
                        doi = doi[pos:]
                    xarticle.doi = doi

        if "mr" in what:
            # MR — Mathematical Reviews id, stored without the "MR" prefix.
            citation_mr_node = soup.select_one("meta[name='citation_mr']")
            if citation_mr_node:
                mr = citation_mr_node.get("content")
                if isinstance(mr, str):
                    mr = mr.strip()
                    if mr.find("MR") == 0:
                        mr = mr[2:]
                        xarticle.extids.append(("mr-item-id", mr))

        if "zbl" in what:
            # ZBL — zbMATH id, stored without the "Zbl" prefix.
            citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
            if citation_zbl_node:
                zbl = citation_zbl_node.get("content")
                if isinstance(zbl, str):
                    zbl = zbl.strip()
                    if zbl.find("Zbl") == 0:
                        zbl = zbl[3:].strip()
                        xarticle.extids.append(("zbl-item-id", zbl))

        if "publisher" in what:
            # PUBLISHER — stored on the issue, not on the article.
            citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
            if citation_publisher_node:
                pub = citation_publisher_node.get("content")
                if isinstance(pub, str):
                    pub = pub.strip()
                    if pub != "":
                        xpub = create_publisherdata()
                        xpub.name = pub
                        xissue.publisher = xpub

        if "keywords" in what:
            # KEYWORDS — comma-separated inside each meta tag.
            citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
            for kwd_node in citation_kwd_nodes:
                kwds = kwd_node.get("content")
                if isinstance(kwds, str):
                    kwds = kwds.split(",")
                    for kwd in kwds:
                        if kwd == "":
                            continue
                        kwd = kwd.strip()
                        xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})

        if "references" in what:
            citation_references = soup.select("meta[name='citation_reference']")
            for index, tag in enumerate(citation_references):
                content = tag.get("content")
                if not isinstance(content, str):
                    raise ValueError("Cannot parse citation_reference meta")
                # Sequential label, unless the reference already embeds "[n]".
                label = str(index + 1)
                if regex.match(r"^\[\d+\].*", content):
                    label = None
                xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label))

755 

756 def get_metadata_using_dcterms( 

757 self, 

758 xarticle: ArticleData, 

759 soup: "Tag", 

760 what: "Iterable[Literal['abstract', 'keywords', 'date_published', 'article_type']]", 

761 ): 

762 if "abstract" in what: 762 ↛ 770line 762 didn't jump to line 770 because the condition on line 762 was always true

763 abstract_tag = soup.select_one("meta[name='DCTERMS.abstract']") 

764 if abstract_tag: 764 ↛ 770line 764 didn't jump to line 770 because the condition on line 764 was always true

765 abstract_text = self.get_str_attr(abstract_tag, "content") 

766 xarticle.abstracts.append( 

767 create_abstract(lang="en", value_tex=cleanup_str(abstract_text)) 

768 ) 

769 

770 if "keywords" in what: 770 ↛ 779line 770 didn't jump to line 779 because the condition on line 770 was always true

771 keyword_tags = soup.select("meta[name='DC.subject']") 

772 for tag in keyword_tags: 

773 kwd_text = tag.get("content") 

774 if not isinstance(kwd_text, str) or len(kwd_text) == 0: 774 ↛ 775line 774 didn't jump to line 775 because the condition on line 774 was never true

775 continue 

776 kwd = create_subj(value=kwd_text) 

777 xarticle.kwds.append(kwd) 

778 

779 if "date_published" in what: 779 ↛ 780line 779 didn't jump to line 780 because the condition on line 779 was never true

780 published_tag = soup.select_one("meta[name='DC.Date.created']") 

781 if published_tag: 

782 published_text = self.get_str_attr(published_tag, "content") 

783 xarticle.date_published = published_text 

784 

785 if "article_type" in what: 785 ↛ 786line 785 didn't jump to line 786 because the condition on line 785 was never true

786 type_tag = soup.select_one("meta[name='DC.Type.articleType']") 

787 if type_tag: 

788 type_text = self.get_str_attr(type_tag, "content") 

789 xarticle.atype = type_text 

790 

791 def create_xissue( 

792 self, 

793 url: str | None, 

794 year: str, 

795 volume_number: str | None, 

796 issue_number: str | None = None, 

797 vseries: str | None = None, 

798 ): 

799 if url is not None and url.endswith("/"): 

800 url = url[:-1] 

801 xissue = create_issuedata() 

802 xissue.url = url 

803 

804 xissue.pid = self.get_issue_pid( 

805 self.collection_id, year, volume_number, issue_number, vseries 

806 ) 

807 

808 xissue.year = year 

809 

810 if volume_number is not None: 

811 xissue.volume = regex.sub(r"[^\w-]+", "_", volume_number) 

812 

813 if issue_number is not None: 

814 xissue.number = issue_number.replace(",", "-") 

815 

816 if vseries is not None: 816 ↛ 817line 816 didn't jump to line 817 because the condition on line 816 was never true

817 xissue.vseries = vseries 

818 return xissue 

819 

820 def detect_language(self, text: str, article: ArticleData | None = None): 

821 if article and article.lang is not None and article.lang != "und": 

822 return article.lang 

823 

824 language = self.language_detector.detect_language_of(text) 

825 

826 if not language: 826 ↛ 827line 826 didn't jump to line 827 because the condition on line 826 was never true

827 return "und" 

828 return language.iso_code_639_1.name.lower() 

829 

830 def get_str_attr(self, tag: "Tag", attr: str): 

831 """Equivalent of `tag.get(attr)`, but ensures the return value is a string""" 

832 node_attr = tag.get(attr) 

833 if isinstance(node_attr, list): 833 ↛ 834line 833 didn't jump to line 834 because the condition on line 833 was never true

834 raise ValueError( 

835 f"[{self.source_domain}] {self.collection_id} : html tag has multiple {attr} attributes." 

836 ) 

837 if node_attr is None: 837 ↛ 838line 837 didn't jump to line 838 because the condition on line 837 was never true

838 raise ValueError( 

839 f"[{self.source_domain}] {self.collection_id} : html tag doesn't have any {attr} attributes" 

840 ) 

841 return node_attr 

842 

843 def create_trans_title( 

844 self, 

845 resource_type: str, 

846 title_tex: str, 

847 lang: str, 

848 xresource_lang: str, 

849 title_type: str = "main", 

850 ): 

851 tag = "trans-title" if resource_type == "article" else "issue-title" 

852 

853 ckeditor_data = build_jats_data_from_html_field( 

854 title_tex, 

855 tag=tag, 

856 text_lang=lang, 

857 resource_lang=xresource_lang, 

858 delimiter_inline=self.delimiter_inline_formula, 

859 delimiter_disp=self.delimiter_disp_formula, 

860 ) 

861 

862 titledata = create_titledata( 

863 lang=lang, 

864 type="main", 

865 title_html=ckeditor_data["value_html"], 

866 title_xml=ckeditor_data["value_xml"], 

867 ) 

868 

869 return titledata 

870 

    # Maps a citation_* meta key (from `citation_reference` content) to the
    # JATS XML builder used to render that field in a <ref> element.
    # citation_author is handled separately in __parse_meta_citation_reference.
    references_mapping = {
        "citation_title": get_article_title_xml,
        "citation_journal_title": get_source_xml,
        "citation_publication_date": get_year_xml,
        "citation_firstpage": get_fpage_xml,
        "citation_lastpage": get_lpage_xml,
    }

878 

879 @classmethod 

880 def __parse_meta_citation_reference(cls, content: str, label=None): 

881 categories = content.split(";") 

882 

883 if len(categories) == 1: 

884 return JatsBase.bake_ref(content, label=label) 

885 

886 citation_data = [c.split("=") for c in categories if "=" in c] 

887 del categories 

888 

889 xml_string = "" 

890 authors_parsed = False 

891 authors_strings = [] 

892 for data in citation_data: 

893 key = data[0].strip() 

894 citation_content = data[1] 

895 if key == "citation_author": 

896 authors_strings.append(get_author_xml(template_str=citation_content)) 

897 continue 

898 elif not authors_parsed: 

899 xml_string += ", ".join(authors_strings) 

900 authors_parsed = True 

901 

902 if key in cls.references_mapping: 

903 xml_string += " " + cls.references_mapping[key](citation_content) 

904 

905 return JatsBase.bake_ref(xml_string, label=label) 

906 

907 @classmethod 

908 def get_or_create_source(cls): 

909 source, created = Source.objects.get_or_create( 

910 domain=cls.source_domain, 

911 defaults={ 

912 "name": cls.source_name, 

913 "website": cls.source_website, 

914 "view_id": cls.get_view_id(), 

915 }, 

916 ) 

917 if created: 917 ↛ 918line 917 didn't jump to line 918 because the condition on line 917 was never true

918 source.save() 

919 return source 

920 

921 @staticmethod 

922 def get_issue_pid( 

923 collection_id: str, 

924 year: str, 

925 volume_number: str | None = None, 

926 issue_number: str | None = None, 

927 series: str | None = None, 

928 ): 

929 # Replace any non-word character with an underscore 

930 pid = f"{collection_id}_{year}" 

931 if series is not None: 931 ↛ 932line 931 didn't jump to line 932 because the condition on line 931 was never true

932 pid += f"_{series}" 

933 if volume_number is not None: 

934 pid += f"_{volume_number}" 

935 if issue_number is not None: 

936 pid += f"_{issue_number}" 

937 pid = regex.sub(r"[^\w-]+", "_", cleanup_str(pid)) 

938 return pid 

939 

940 @staticmethod 

941 def set_pages(article: ArticleData, pages: str, separator: str = "-"): 

942 pages_split = pages.split(separator) 

943 if len(pages_split) == 0: 943 ↛ 944line 943 didn't jump to line 944 because the condition on line 943 was never true

944 article.page_range = pages 

945 if len(pages_split) > 0: 945 ↛ exitline 945 didn't return from function 'set_pages' because the condition on line 945 was always true

946 if pages[0].isnumeric(): 946 ↛ exitline 946 didn't return from function 'set_pages' because the condition on line 946 was always true

947 article.fpage = pages_split[0] 

948 if ( 948 ↛ 953line 948 didn't jump to line 953 because the condition on line 948 was never true

949 len(pages_split) > 1 

950 and pages_split[0] != pages_split[1] 

951 and pages_split[1].isnumeric() 

952 ): 

953 article.lpage = pages_split[1] 

954 

955 @staticmethod 

956 def _process_pdf_header(chunk: str, response: requests.Response | aiohttp.ClientResponse): 

957 content_type = response.headers.get("Content-Type") 

958 if regex.match(rb"^%PDF-\d\.\d", chunk): 

959 if content_type and "application/pdf" in content_type: 

960 # The file is unmistakably a pdf 

961 return [ 

962 True, 

963 response, 

964 { 

965 "status": ExtlinkChecked.Status.OK, 

966 "message": "", 

967 }, 

968 ] 

969 # The file is a pdf, but the content type advertised by the server is wrong 

970 return [ 

971 True, 

972 response, 

973 { 

974 "status": ExtlinkChecked.Status.WARNING, 

975 "message": f"Content-Type header: {content_type}", 

976 }, 

977 ] 

978 

979 # Reaching here means we couldn't find the pdf. 

980 if not content_type or "application/pdf" not in content_type: 

981 return [ 

982 False, 

983 response, 

984 { 

985 "status": ExtlinkChecked.Status.ERROR, 

986 "message": f"Content-Type header: {content_type}; PDF Header not found: got {chunk}", 

987 }, 

988 ] 

989 

990 return [ 

991 False, 

992 response, 

993 { 

994 "status": ExtlinkChecked.Status.ERROR, 

995 "message": f"PDF Header not found: got {chunk}", 

996 }, 

997 ] 

998 

999 @classmethod 

1000 async def a_check_pdf_link_validity( 

1001 cls, url: str, verify=True 

1002 ) -> tuple[bool, aiohttp.ClientResponse, dict]: 

1003 """ 

1004 Check the validity of the PDF links. 

1005 """ 

1006 CHUNK_SIZE = 10 # Nombre de caractères à récupérer 

1007 header = { 

1008 "Range": f"bytes=0-{CHUNK_SIZE}", 

1009 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0", 

1010 } 

1011 async with cls.async_session.get( 

1012 url, headers=header, allow_redirects=True, ssl=verify 

1013 ) as response: 

1014 try: 

1015 chunk = await response.content.read(CHUNK_SIZE) 

1016 return BaseCollectionCrawler._process_pdf_header(chunk, response) 

1017 except StopIteration: 

1018 return [ 

1019 False, 

1020 response, 

1021 { 

1022 "status": ExtlinkChecked.Status.ERROR, 

1023 "message": "Error reading PDF header", 

1024 }, 

1025 ] 

1026 

1027 @classmethod 

1028 def check_pdf_link_validity( 

1029 cls, url: str, verify=True 

1030 ) -> tuple[bool, requests.Response | None, dict]: 

1031 """ 

1032 Check the validity of the PDF links. 

1033 """ 

1034 CHUNK_SIZE = 10 # Nombre de caractères à récupérer 

1035 header = { 

1036 "Range": f"bytes=0-{CHUNK_SIZE}", 

1037 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0", 

1038 } 

1039 with cls.session.get( 

1040 url, headers=header, allow_redirects=True, verify=verify, stream=True 

1041 ) as response: 

1042 try: 

1043 chunk = next(response.iter_content(CHUNK_SIZE)) 

1044 return BaseCollectionCrawler._process_pdf_header(chunk, response) 

1045 except StopIteration: 

1046 return [ 

1047 False, 

1048 response, 

1049 { 

1050 "status": ExtlinkChecked.Status.ERROR, 

1051 "message": "Error reading PDF header", 

1052 }, 

1053 ] 

1054 

    @classmethod
    async def check_extlink_validity(cls, extlink: "ExtLink"):
        """
        Method used by rot_monitoring to check if links have expired
        """
        # Start optimistic; any failure below downgrades status/message.
        # NOTE(review): datetime.now() is naive — if Django USE_TZ is enabled,
        # timezone.now() may be expected here; confirm against the model field.
        defaults: dict = {"date": datetime.now(), "status": ExtlinkChecked.Status.OK}
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0"
        }
        verify = True
        if not cls.verify:
            verify = False
        try:
            if extlink.rel == "article-pdf":
                # PDF links get a content sniff (magic-bytes check), not just
                # an HTTP status check.
                isok, response, message = await cls.a_check_pdf_link_validity(
                    extlink.location, verify
                )
                defaults.update(message)
                defaults["http_status"] = response.status
            else:
                async with cls.async_session.get(
                    url=extlink.location,
                    headers=header,
                    allow_redirects=True,
                    ssl=verify,
                ) as response:
                    defaults["http_status"] = response.status
                    # 206 Partial Content is accepted alongside 200 (some
                    # servers answer ranged/streaming probes with 206).
                    if response.status not in (200, 206):
                        defaults["status"] = ExtlinkChecked.Status.ERROR

        except aiohttp.ClientSSLError:
            cls.logger.error("SSL error for the url: %s", extlink.location)
            defaults["status"] = ExtlinkChecked.Status.ERROR
            defaults["message"] = "SSL error"
        except aiohttp.ClientConnectionError:
            cls.logger.error("Connection error for the url: %s", extlink.location)
            defaults["status"] = ExtlinkChecked.Status.ERROR
            defaults["message"] = "Connection error"
        except asyncio.TimeoutError:
            cls.logger.error("Timeout error for the url: %s", extlink.location)
            defaults["status"] = ExtlinkChecked.Status.ERROR
            defaults["message"] = "Timeout error"
        finally:
            # Always persist the check result, even when the request failed.
            try:
                await ExtlinkChecked.objects.aupdate_or_create(extlink=extlink, defaults=defaults)
                cls.logger.info(
                    "DB Update, source: %s, url: %s", cls.source_domain, extlink.location
                )
            except IntegrityError:
                # The ExtLink row vanished between the check and the write.
                cls.logger.error(
                    "Extlink was deleted, source: %s, url: %s", cls.source_domain, extlink.location
                )