Coverage for src / crawler / abstract_crawlers / base_crawler.py: 65%

590 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-05-21 12:58 +0000

1import logging 

2import time 

3from collections.abc import Iterable 

4from datetime import datetime, timedelta 

5from email.policy import EmailPolicy 

6from typing import TYPE_CHECKING, Literal 

7 

8import aiohttp 

9import regex 

10import requests 

11from bs4 import BeautifulSoup 

12from django.conf import settings 

13from django.contrib.auth.models import User 

14from django.db.utils import IntegrityError 

15from django.utils import timezone 

16from langcodes import standardize_tag 

17from lingua import LanguageDetector, LanguageDetectorBuilder 

18from opentelemetry import trace 

19from ptf.cmds.xml.ckeditor.utils import ( 

20 build_jats_data_from_html_field, 

21) 

22from ptf.cmds.xml.jats.builder.references import ( 

23 get_article_title_xml, 

24 get_author_xml, 

25 get_fpage_xml, 

26 get_lpage_xml, 

27 get_source_xml, 

28 get_year_xml, 

29) 

30from ptf.cmds.xml.jats.jats_parser import JatsBase 

31from ptf.model_data import ( 

32 ArticleData, 

33 ContributorDict, 

34 IssueData, 

35 ResourceData, 

36 TitleDict, 

37 create_abstract, 

38 create_contributor, 

39 create_extlink, 

40 create_issuedata, 

41 create_publisherdata, 

42 create_subj, 

43 create_titledata, 

44) 

45from ptf.model_data_converter import update_data_for_jats 

46from ptf.models import ExtLink 

47from pylatexenc.latex2text import LatexNodes2Text 

48from pysolr import SolrError 

49from requests.adapters import HTTPAdapter 

50from requests_cache import CachedSession 

51from urllib3 import Retry 

52 

53from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd 

54from crawler.models import Source 

55from crawler.models.extlink_checked import ExtlinkChecked 

56from crawler.types import CitationLiteral 

57from crawler.utils import ( 

58 add_pdf_link_to_xarticle, 

59 cleanup_str, 

60 get_all_cols, 

61 get_or_create_collection, 

62 get_session, 

63) 

64 

65if TYPE_CHECKING: 

66 from bs4 import Tag 

67 

68 

class CrawlerTitleDict(TitleDict):
    """``TitleDict`` extended with the raw TeX form of the title."""

    # TeX source of the title, or None.
    # NOTE(review): presumably None means no TeX version is available — confirm.
    title_tex: str | None

71 

72 

73class BaseCollectionCrawler: 

74 """ 

75 Base collection for the crawlers. 

76 To create a crawler: 

77 1) derive a class from BaseCollectionCrawler and name it XXXCrawler 

78 2) override the functions parse_collection_content, parse_issue_content and parse_article_content 

79 3) update factory.py so that crawler_factory can return your new crawler 

80 """ 

81 

# Class-level logger; __init__ replaces it on the instance with a per-source child logger.
logger = logging.getLogger(__name__)
tracer = trace.get_tracer(__name__)

# Identity of the crawled source; empty here, to be set by concrete crawlers.
source_name = ""
source_domain = ""
source_website = ""

issue_href = ""

# Filled in by initialize().
collection = None
source = None
user = None
# Shared on the class: __init__/initialize assign BaseCollectionCrawler.session.
session: requests.Session | CachedSession
async_session: aiohttp.ClientSession
is_checkable = True
# Copied onto session.verify by initialize().
verify = True
# Default headers merged into every request made by download_file().
# NOTE(review): "accept_encoding" is not the standard "Accept-Encoding" header name,
# and "utf-8" is a charset rather than a content coding — confirm intent.
headers = {
    "accept_encoding": "utf-8",
    "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
    "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
}

# Seconds to wait between two HTTP requests (copied onto session.delay by initialize()).
requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
# Seconds to wait before aborting the connection (if no bytes are received).
requests_timeout = 60

latext_parser = LatexNodes2Text()

# Override the values in your concrete crawler if the formulas in text (titles, abstracts)
# do not use the "$" to surround tex formulas
delimiter_inline_formula = "$"
delimiter_disp_formula = "$"

# HACK : Workaround for tests (monkeypatching)
# We store the class here, so we can monkeypatch it when running tests
# subCrawlers = {
#   LofplCrawler: None
# }
# __init__ replaces each value with an instance of the key class.
subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}

# Built lazily by the language_detector property.
_language_detector: LanguageDetector | None = None
_language_detector_builder = LanguageDetectorBuilder.from_all_languages()

force_refresh = False

# Whether to include headers in the requests cache key.
match_headers = False
orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"

# Set this to False on a Crawler-basis to allow inserting articles without PDFs
ignore_missing_pdf = True

pid_year_restrictions: dict[str, int] = {}  # pid -> excluded years count

136 

137 @classmethod 

def get_view_id(cls):
    """Return the view identifier of this crawler, which is its ``source_domain``."""
    view_id = cls.source_domain
    return view_id

140 

141 @property 

def language_detector(self):
    """Return the language detector, building it on first access.

    The detector is created lazily (to save memory) from
    ``self._language_detector_builder`` and stored on
    ``self._language_detector`` for later calls.
    """
    detector = self._language_detector
    if not detector:
        detector = self._language_detector_builder.build()
        self._language_detector = detector
    return detector

148 

def __init__(
    self,
    *args,
    username: str,
    collection_id: str,
    dry: bool = False,
    publisher: str = "",
    force_refresh=False,
    collection_url: str | None = None,
    backend=None,
):
    """
    Set up a crawler for one collection.

    :param username: name of the Django user, resolved by ``initialize``
    :param collection_id: identifier of the collection to crawl
    :param dry: when True, ``crawl_issue`` stops before touching the database
    :param publisher: publisher name stored on inserted issues
    :param force_refresh: stored on the instance as ``self.force_refresh``
    :param collection_url: URL of the collection; when empty it is looked up
        in ``get_all_cols()`` using ``collection_id`` and ``source_domain``
    :param backend: stored on the instance as ``self.backend``
    :raises ValueError: if no URL is registered for this source and collection
    """
    if not collection_url:
        registered_sources = get_all_cols()[collection_id]["sources"]
        collection_url = registered_sources.get(self.source_domain, None)
        if collection_url is None:
            raise ValueError(
                f"Source {self.source_domain} not found for collection {collection_id}"
            )
    self.collection_url = collection_url

    # Every registered sub-crawler class gets its own instance built from the
    # same arguments (force_refresh and backend are not forwarded).
    sub_crawler_kwargs = {
        "username": username,
        "collection_id": collection_id,
        "dry": dry,
        "publisher": publisher,
        "collection_url": collection_url,
    }
    for CrawlerClass in self.subCrawlers:
        self.subCrawlers[CrawlerClass] = CrawlerClass(*args, **sub_crawler_kwargs)

    # Per-source child logger.
    self.logger = logging.getLogger(".".join((__name__, self.source_domain)))

    self.username = username
    self.collection_id = collection_id
    self.dry = dry
    self.publisher = publisher

    # Class property: we sometimes want to use the session without
    # initializing the class (rot monitoring).
    BaseCollectionCrawler.session = requests.Session()

    # Skipped when running tests
    self.initialize()

    self.force_refresh = force_refresh
    self.backend = backend

197 

198 # We implemented custom retry behaviour, so we don't want to make extra requests here 

199 

def initialize(self):
    """
    Second-stage initialisation holding every model access, so that it can be
    skipped during test data generation.

    Resolves the collection, source and user, then installs the shared HTTP
    session on ``BaseCollectionCrawler`` with this crawler's ``verify`` flag and
    request interval.
    """
    self.collection = get_or_create_collection(self.collection_id)
    self.source = self.get_or_create_source()
    self.user = User.objects.get(username=self.username)

    BaseCollectionCrawler.session = get_session()
    BaseCollectionCrawler.session.verify = self.verify
    self.session.delay = self.requests_interval

    # Disable transport-level retries; download_file() does its own retrying.
    no_retry = Retry(total=0)
    for scheme in ("https://", "http://"):
        self.session.mount(scheme, HTTPAdapter(max_retries=no_retry))

215 

216 @classmethod 

def can_crawl(cls, pid: str) -> bool:
    """Tell whether this crawler can crawl ``pid``; the base class accepts every pid."""
    accepted = True
    return accepted

219 

def parse_collection_content(self, content: str) -> list[IssueData]:
    """
    Extract the issues listed on the collection page.

    Meant to be overridden in a derived class, which parses the HTML ``content``
    (e.g. with BeautifulSoup) and returns a list of xissues.
    The base implementation finds nothing.
    """
    no_issues: list[IssueData] = []
    return no_issues

227 

def parse_issue_content(self, content: str, xissue: IssueData):
    """
    Fill ``xissue.articles`` from the HTML ``content`` of an issue page.

    Meant to be overridden in a derived class; the base implementation does
    nothing.

    CAVEAT: the override is expected to create the articles and to assign a PID
    to each of them. The PID can be ``a + article_index``, e.g. ``a0``, ``a21``.
    """
    return None

237 

def parse_article_content(
    self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
) -> ArticleData | None:
    """
    Complete ``xarticle`` from the HTML ``content`` of its page and return it.

    Meant to be overridden in a derived class. ``xissue`` is provided because
    the article page may carry issue-level information (e.g. the publisher),
    and ``url`` is the address of the article page.
    The base implementation returns ``xarticle`` untouched.

    CAVEAT: the override is expected to assign the article PID again here.
    """
    unchanged = xarticle
    return unchanged

251 

252 @tracer.start_as_current_span("crawl_collection") 

def crawl_collection(self):
    """
    Download the collection page and list its issues.

    - fetch the HTML content of ``self.collection_url``
    - hand it to ``parse_collection_content`` to get the xissues
    - index the xissues by their pid (converted to ``str``)

    Issues are not merged and not crawled here.

    :return: a dict ``{pid: xissue}``, or None when the download yields no content
    :raises RuntimeError: if ``self.source`` is not set
    """
    if self.source is None:
        raise RuntimeError("ERROR: the source is not set")

    page = self.download_file(self.collection_url)
    if not page:
        # No content for the collection page: nothing to list.
        return None

    issues_by_pid = {}
    for xissue in self.parse_collection_content(page):
        issues_by_pid[str(xissue.pid)] = xissue
    return issues_by_pid

288 

289 @tracer.start_as_current_span("crawl_issue") 

def crawl_issue(self, xissue: IssueData):
    """
    Crawl one web page of an issue.

    - fetch and parse the issue page, or register it as a PDF link when the
      issue URL points to a ``.pdf``
    - crawl each article, dropping those for which ``crawl_article`` returns None
    - unless ``self.dry`` is set, insert the issue into the database

    Some sources (e.g. EuDML) have no separate table-of-contents page per issue:
    the articles come straight from the collection page and ``xissue.url`` is None.
    """
    page_url = xissue.url
    if page_url is not None and page_url.endswith(".pdf"):
        # The issue itself is a PDF: keep it as a link, there is no page to parse.
        add_pdf_link_to_xarticle(xissue, page_url)
        xissue.url = None
    elif page_url is not None:
        html = self.download_file(page_url)
        with self.tracer.start_as_current_span("parse_issue_content"):
            self.parse_issue_content(html, xissue)

    crawled_articles = []
    for candidate in xissue.articles:
        crawled = self.crawl_article(candidate, xissue)
        if crawled is None:
            continue
        crawled_articles.append(crawled)
    xissue.articles = crawled_articles

    issue_has_pdf = self.article_has_pdf(xissue)

    if self.ignore_missing_pdf:
        xissue.articles = list(filter(self.article_has_pdf, xissue.articles))

    if self.dry:
        return
    # Nothing worth storing: no article left and no PDF on the issue itself.
    if not issue_has_pdf and len(xissue.articles) == 0:
        return

    self.process_resource_metadata(xissue, resource_type="issue")
    self.add_xissue_into_database(xissue)

331 

332 @staticmethod 

def article_has_source(art: ArticleData | IssueData):
    """Tell whether ``art`` already has an ext_link whose ``rel`` is ``"source"``."""
    return any(e_link["rel"] == "source" for e_link in art.ext_links)

341 

342 @staticmethod 

def article_has_pdf(art: ArticleData | IssueData):
    """
    Tell whether ``art`` has a full-text link.

    Despite the name, both ``"article-pdf"`` and ``"article-html"`` ext_links count.
    """
    full_text_rels = ("article-pdf", "article-html")
    return any(link["rel"] in full_text_rels for link in art.ext_links)

351 

def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
    """
    Crawl one article and return it with its metadata processed.

    Articles without a URL are not downloaded: they only get a "source" link
    (issue URL, else collection URL) when they have none. Otherwise the article
    page is downloaded and parsed; a ``ValueError`` while parsing triggers a
    single retry after 5 minutes with the cache bypassed.

    :return: the processed article, or None when ``parse_article_content``
        returns None
    """

    def build_source_link(location):
        # "source" ext_link, used to display the link in the article page
        link = create_extlink()
        link["rel"] = "source"
        link["location"] = location
        link["metadata"] = self.source_domain
        return link

    def parse(page):
        with self.tracer.start_as_current_span("parse_article_content"):
            return self.parse_article_content(page, xissue, xarticle, xarticle.url)

    if xarticle.url is None:
        if not self.article_has_source(xarticle):
            fallback_location = xissue.url if xissue.url else self.collection_url
            xarticle.ext_links.append(build_source_link(fallback_location))
        return self.process_article_metadata(xarticle)

    page = self.download_file(xarticle.url)
    xarticle.pid = f"{xissue.pid}_{xarticle.pid}"

    try:
        parsed = parse(page)
    except ValueError as e:
        self.logger.warning(e)
        self.logger.warning("Retrying in 5 mins while invalidating cache")
        time.sleep(5 * 60)
        page = self.download_file(xarticle.url, force_refresh=True)
        parsed = parse(page)

    if parsed is None:
        return None

    if parsed.doi:
        # The pid is derived from the DOI, with "/", "." and "-" turned into "_".
        parsed.pid = parsed.doi.translate(str.maketrans("/.-", "___"))

    if not self.article_has_source(parsed) and parsed.url:
        parsed.ext_links.append(build_source_link(parsed.url))

    # The article title may have formulas surrounded with '$'
    return self.process_article_metadata(parsed)

402 

def process_resource_metadata(self, xresource: ResourceData, resource_type="article"):
    """
    Derive the HTML and XML forms of the title and abstracts of ``xresource``.

    The TeX title and each abstract tagged ``"abstract"`` are converted with
    ``build_jats_data_from_html_field`` using this crawler's formula delimiters.
    The TeX values themselves are left untouched.

    :param resource_type: ``"article"`` selects the ``article-title`` tag, any
        other value selects ``issue-title``
    :return: the same ``xresource``, updated in place
    """
    title_tag = "article-title" if resource_type == "article" else "issue-title"

    # Title (may contain formulas)
    title_data = build_jats_data_from_html_field(
        xresource.title_tex,
        tag=title_tag,
        text_lang=xresource.lang,
        delimiter_inline=self.delimiter_inline_formula,
        delimiter_disp=self.delimiter_disp_formula,
    )
    xresource.title_html = title_data["value_html"]
    xresource.title_xml = title_data["value_xml"]

    # Abstracts (may contain formulas surrounded with '$')
    plain_abstracts = [entry for entry in xresource.abstracts if entry["tag"] == "abstract"]
    for xabstract in plain_abstracts:
        abstract_data = build_jats_data_from_html_field(
            xabstract["value_tex"],
            tag="abstract",
            text_lang=xabstract["lang"],
            resource_lang=xresource.lang,
            field_type="abstract",
            delimiter_inline=self.delimiter_inline_formula,
            delimiter_disp=self.delimiter_disp_formula,
        )
        xabstract["value_html"] = abstract_data["value_html"]
        xabstract["value_xml"] = abstract_data["value_xml"]

    return xresource

440 

def process_article_metadata(self, xarticle: ArticleData):
    """
    Finalize the metadata of an article and return it.

    Converts title and abstracts via ``process_resource_metadata``, marks every
    bibliography item as type ``"unknown"``, then calls ``update_data_for_jats``
    on the article.
    """
    self.process_resource_metadata(xarticle)

    for reference in xarticle.bibitems:
        reference.type = "unknown"

    update_data_for_jats(xarticle, with_label=False)
    return xarticle

448 

def download_file(self, url: str, force_refresh=False, headers=None):
    """
    Download a page and return its content as a decoded string.

    Up to 3 attempts are made. A failed attempt is retried immediately the first
    time and after 15 minutes the second time; retries always bypass the cache
    when the session is a ``CachedSession``.

    :param url: address of the page to download
    :param force_refresh: when True, bypass the cache on the first attempt too
        (only meaningful when the session is a ``CachedSession``)
    :param headers: extra headers, merged over ``self.headers``
    :return: the decoded, non-empty content of the page
    :raises requests.ConnectionError: connection failure on every attempt
    :raises requests.exceptions.HTTPError: empty content on every attempt
    """
    max_attempts = 3
    extra_headers = headers or {}
    current_exception: Exception | None = None

    for attempt in range(max_attempts):
        try:
            kwargs = {
                "url": url,
                "headers": {**self.headers, **extra_headers},
                "timeout": self.requests_timeout,
            }
            # Honour the caller's force_refresh, and never retry from the cache.
            if (force_refresh or attempt > 0) and isinstance(self.session, CachedSession):
                kwargs["force_refresh"] = True
            response = self.session.get(**kwargs)

            content = self.decode_response(response)
            if not content:
                # An empty body is treated as a failed download.
                raise requests.exceptions.HTTPError(response)

            return content
        except (
            requests.ConnectionError,
            requests.ConnectTimeout,
            requests.exceptions.HTTPError,
        ) as e:
            current_exception = e
            self.logger.debug(f"Caught error : {e}", extra={"url": url})
            if attempt == max_attempts - 1:
                # Last attempt: waiting would only delay the error.
                break
            # 0 min before the second attempt, 15 mins before the third
            delay_minutes = attempt * 15
            self.logger.debug(
                f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})",
                extra={"url": url},
            )
            time.sleep(delay_minutes * 60)

    raise current_exception

487 

def decode_response(self, response: requests.Response, encoding: str | None = None):
    """
    Return the text of ``response``, decoded with the best known encoding.

    The encoding is taken from the first of these that provides one:
    the ``encoding`` argument, the HTTP ``Content-Type`` header, a
    ``<meta charset>`` tag, a ``<meta http-equiv="Content-Type">`` tag.
    When none does, the encoding already set on the response is used.

    Override this if the Content-Type headers of a source advertise something
    other than the actual content (SASA needs this).
    """
    # Forced encoding
    if encoding:
        response.encoding = encoding
        return response.text

    # Encoding advertised in the HTTP headers
    http_content_type = response.headers.get("Content-Type", None)
    if http_content_type:
        header_charset = self.parse_content_type_charset(http_content_type)
        if header_charset:
            response.encoding = header_charset
            return response.text

    soup = BeautifulSoup(response.text, "html5lib")

    # Encoding advertised by the HTML meta charset tag
    meta_charset = soup.select_one("meta[charset]")
    if meta_charset:
        declared = meta_charset.get("charset")
        if isinstance(declared, str):
            response.encoding = declared
            return response.text

    # Encoding advertised by the HTML meta content-type tag
    meta_content_type = soup.select_one(
        'meta[http-equiv="Content-Type"],meta[http-equiv="content-type"]'
    )
    if meta_content_type:
        declared = meta_content_type.get("content")
        if isinstance(declared, str):
            meta_charset_value = self.parse_content_type_charset(declared)
            if meta_charset_value:
                response.encoding = meta_charset_value

    return response.text

527 

528 @staticmethod 

def parse_content_type_charset(content_type: str):
    """
    Extract the ``charset`` parameter from a Content-Type value.

    :param content_type: e.g. ``"text/html; charset=utf-8"``
    :return: the charset, or None when the value declares none
    """
    parsed = EmailPolicy.header_factory("content-type", content_type)
    return parsed.params.get("charset")

533 

534 @tracer.start_as_current_span("add_xissue_to_database") 

def add_xissue_into_database(self, xissue: IssueData) -> IssueData:
    """
    Insert (or update) ``xissue`` in the database and return it.

    The issue is first stamped with the collection, source domain, publisher
    and modification date. The insertion is attempted up to 3 times, waiting
    10 seconds after each ``SolrError``.

    :raises ValueError: if the issue has an empty year
    :raises ConnectionRefusedError: if every attempt ended with a ``SolrError``
    """
    xissue.journal = self.collection
    xissue.source = self.source_domain

    if xissue.year == "":
        raise ValueError("Failsafe : Cannot insert issue without a year")

    publisher_data = create_publisherdata()
    publisher_data.name = self.publisher
    xissue.publisher = publisher_data
    xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()

    # next_attempt is the number logged after a failure: 2, 3, then 4.
    for next_attempt in range(2, 5):
        try:
            addOrUpdateGDMLIssueXmlCmd({"xissue": xissue, "use_body": False}).do()
            self.logger.debug(f"Issue {xissue.pid} inserted in database")
            return xissue
        except SolrError:
            self.logger.warning(
                f"Encoutered SolrError while inserting issue {xissue.pid} in database"
            )
            self.logger.debug(f"Attempt {next_attempt}. sleeping 10 seconds.")
            time.sleep(10)
        except Exception as e:
            self.logger.error(
                f"Got exception while attempting to insert {xissue.pid} in database : {e}"
            )
            raise e

    raise ConnectionRefusedError("Cannot connect to SolR")

575 

def get_metadata_using_citation_meta(
    self,
    xarticle: ArticleData,
    xissue: IssueData,
    soup: BeautifulSoup,
    what: Iterable[CitationLiteral] = (),
):
    """
    Collect article metadata from the ``citation_*`` meta tags of a page.

    :param xarticle: the xarticle that will collect the metadata
    :param xissue: the xissue that will collect the publisher
    :param soup: the BeautifulSoup object of the article page
    :param what: citation_ items to collect, among "title", "author", "pdf",
        "lang", "abstract", "page", "doi", "mr", "zbl", "publisher", "keywords"
        and "references"
    :return: None. The given article (and issue, for the publisher) is modified
    :raises ValueError: if an author, abstract or reference tag has no usable content
    """
    requested = frozenset(what)

    if "title" in requested:
        # TITLE
        citation_title_node = soup.select_one("meta[name='citation_title']")
        if citation_title_node:
            title = citation_title_node.get("content")
            if isinstance(title, str):
                xarticle.title_tex = title

    if "author" in requested:
        # AUTHORS
        # Institution and ORCID tags apply to the closest preceding citation_author tag.
        citation_author_nodes = soup.select("meta[name^='citation_author']")
        current_author: ContributorDict | None = None
        for citation_author_node in citation_author_nodes:
            meta_name = citation_author_node.get("name")
            if meta_name == "citation_author":
                text_author = citation_author_node.get("content")
                if not isinstance(text_author, str):
                    raise ValueError("Cannot parse author")
                if text_author == "":
                    current_author = None
                    continue
                current_author = create_contributor(role="author", string_name=text_author)
                xarticle.contributors.append(current_author)
                continue
            if current_author is None:
                self.logger.warning("Couldn't parse citation author")
                continue
            if meta_name == "citation_author_institution":
                text_institution = citation_author_node.get("content")
                if not isinstance(text_institution, str):
                    continue
                current_author["addresses"].append(text_institution)
            elif meta_name in ("citation_author_orcid", "citation_author_ocrid"):
                # The misspelled "ocrid" name is still accepted for backward compatibility.
                text_orcid = citation_author_node.get("content")
                if not isinstance(text_orcid, str):
                    continue
                current_author["orcid"] = text_orcid

    if "pdf" in requested:
        # PDF
        citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
        if citation_pdf_node:
            pdf_url = citation_pdf_node.get("content")
            if isinstance(pdf_url, str):
                add_pdf_link_to_xarticle(xarticle, pdf_url)

    if "lang" in requested:
        # LANG
        citation_lang_node = soup.select_one("meta[name='citation_language']")
        if citation_lang_node:
            # TODO: check other language code
            content_text = citation_lang_node.get("content")
            if isinstance(content_text, str):
                xarticle.lang = standardize_tag(content_text)

    if "abstract" in requested:
        # ABSTRACT
        abstract_node = soup.select_one("meta[name='citation_abstract']")
        if abstract_node is not None:
            abstract = abstract_node.get("content")
            if not isinstance(abstract, str):
                raise ValueError("Couldn't parse abstract from meta")
            # Keep only the text of the abstract, without its HTML markup.
            abstract = BeautifulSoup(abstract, "html.parser").text
            lang = abstract_node.get("lang")
            if not isinstance(lang, str):
                lang = self.detect_language(abstract, xarticle)
            xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))

    if "page" in requested:
        # PAGES
        citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
        if citation_fpage_node:
            page = citation_fpage_node.get("content")
            if isinstance(page, str):
                # Drop anything from the first "(" on, and ignore overlong values.
                page = page.split("(")[0]
                if len(page) < 32:
                    xarticle.fpage = page

        citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
        if citation_lpage_node:
            page = citation_lpage_node.get("content")
            if isinstance(page, str):
                page = page.split("(")[0]
                if len(page) < 32:
                    xarticle.lpage = page

    if "doi" in requested:
        # DOI
        citation_doi_node = soup.select_one("meta[name='citation_doi']")
        if citation_doi_node:
            doi = citation_doi_node.get("content")
            if isinstance(doi, str):
                doi = doi.strip()
                # Drop any prefix (e.g. "doi:" or a resolver URL) before the "10." part.
                pos = doi.find("10.")
                if pos > 0:
                    doi = doi[pos:]
                xarticle.doi = doi

    if "mr" in requested:
        # MR
        citation_mr_node = soup.select_one("meta[name='citation_mr']")
        if citation_mr_node:
            mr = citation_mr_node.get("content")
            if isinstance(mr, str):
                mr = mr.strip()
                if mr.find("MR") == 0:
                    mr = mr[2:]
                    xarticle.extids.append(("mr-item-id", mr))

    if "zbl" in requested:
        # ZBL
        citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
        if citation_zbl_node:
            zbl = citation_zbl_node.get("content")
            if isinstance(zbl, str):
                zbl = zbl.strip()
                if zbl.find("Zbl") == 0:
                    zbl = zbl[3:].strip()
                    xarticle.extids.append(("zbl-item-id", zbl))

    if "publisher" in requested:
        # PUBLISHER
        citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
        if citation_publisher_node:
            pub = citation_publisher_node.get("content")
            if isinstance(pub, str):
                pub = pub.strip()
                if pub != "":
                    xpub = create_publisherdata()
                    xpub.name = pub
                    xissue.publisher = xpub

    if "keywords" in requested:
        # KEYWORDS
        citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
        for kwd_node in citation_kwd_nodes:
            kwds = kwd_node.get("content")
            if isinstance(kwds, str):
                for kwd in kwds.split(","):
                    # Strip first, so that whitespace-only entries are skipped too.
                    kwd = kwd.strip()
                    if kwd == "":
                        continue
                    xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})

    if "references" in requested:
        citation_references = soup.select("meta[name='citation_reference']")
        for index, tag in enumerate(citation_references):
            content = tag.get("content")
            if not isinstance(content, str):
                raise ValueError("Cannot parse citation_reference meta")
            # No generated label when the reference text already starts with "[n]".
            label = str(index + 1)
            if regex.match(r"^\[\d+\].*", content):
                label = None
            xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label))

745 

def get_metadata_using_dcterms(
    self,
    xarticle: ArticleData,
    soup: "Tag",
    what: "Iterable[Literal['abstract', 'keywords', 'date_published', 'article_type']]",
):
    """
    Collect article metadata from the Dublin Core (``DC.*`` / ``DCTERMS.*``) meta tags.

    :param xarticle: the xarticle that will collect the metadata
    :param soup: the parsed page (or a part of it) holding the meta tags
    :param what: items to collect
    :return: None. The given article is modified

    The abstract is always recorded with the language ``"en"``.
    """
    if "abstract" in what and (abstract_meta := soup.select_one("meta[name='DCTERMS.abstract']")):
        raw_abstract = self.get_str_attr(abstract_meta, "content")
        xarticle.abstracts.append(create_abstract(lang="en", value_tex=cleanup_str(raw_abstract)))

    if "keywords" in what:
        for subject_meta in soup.select("meta[name='DC.subject']"):
            subject = subject_meta.get("content")
            # Skip tags without a usable, non-empty content
            if isinstance(subject, str) and len(subject) != 0:
                xarticle.kwds.append(create_subj(value=subject))

    if "date_published" in what and (date_meta := soup.select_one("meta[name='DC.Date.created']")):
        xarticle.date_published = self.get_str_attr(date_meta, "content")

    if "article_type" in what and (type_meta := soup.select_one("meta[name='DC.Type.articleType']")):
        xarticle.atype = self.get_str_attr(type_meta, "content")

780 

def create_xissue(
    self,
    url: str | None,
    year: str,
    volume_number: str | None,
    issue_number: str | None = None,
    vseries: str | None = None,
):
    """
    Build a new xissue for this collection.

    :param url: address of the issue page; one trailing "/" is removed
    :param year: publication year of the issue
    :param volume_number: volume; runs of characters other than word characters
        and "-" are replaced by "_"
    :param issue_number: issue number; "," is replaced by "-"
    :param vseries: volume series
    :return: the new xissue, with its pid computed by ``get_issue_pid``
    """
    if url is not None:
        url = url.removesuffix("/")

    xissue = create_issuedata()
    xissue.url = url
    # The pid is computed from the raw values, before they are normalized below.
    xissue.pid = self.get_issue_pid(
        self.collection_id, year, volume_number, issue_number, vseries
    )
    xissue.year = year

    if volume_number is not None:
        xissue.volume = regex.sub(r"[^\w-]+", "_", volume_number)
    if issue_number is not None:
        xissue.number = issue_number.replace(",", "-")
    if vseries is not None:
        xissue.vseries = vseries

    return xissue

809 

def detect_language(self, text: str, article: ArticleData | None = None):
    """
    Return the language code to use for ``text``.

    The language of ``article`` wins when it is set and is not ``"und"``.
    Otherwise ``text`` is given to the language detector.

    :return: the article language, the lowercased ISO 639-1 code of the detected
        language, or ``"und"`` when nothing is detected
    """
    known_lang = article.lang if article else None
    if known_lang is not None and known_lang != "und":
        return known_lang

    detected = self.language_detector.detect_language_of(text)
    if detected:
        return detected.iso_code_639_1.name.lower()
    return "und"

819 

def get_str_attr(self, tag: "Tag", attr: str):
    """Return `tag.get(attr)`, refusing list and missing values.

    Raises:
        ValueError: if the attribute value is a list, or if it is None.
    """
    value = tag.get(attr)

    # Happy path first: anything that is neither missing nor a list is returned.
    if value is not None and not isinstance(value, list):
        return value

    if value is None:
        raise ValueError(
            f"[{self.source_domain}] {self.collection_id} : html tag doesn't have any {attr} attributes"
        )
    raise ValueError(
        f"[{self.source_domain}] {self.collection_id} : html tag has multiple {attr} attributes."
    )

832 

def create_trans_title(
    self,
    resource_type: str,
    title_str: str,
    lang: str,
    xresource_lang: str,
    title_type: str = "main",
):
    """Build the title data for a translated title.

    Args:
        resource_type: "article" selects the "trans-title" tag; any other
            value selects "issue-title".
        title_str: Title text handed to `build_jats_data_from_html_field`.
        lang: Language of the title text.
        xresource_lang: Language of the resource the title belongs to.
        title_type: Value stored as the title's `type`. Defaults to "main".

    Returns:
        The object produced by `create_titledata`, carrying the HTML and XML
        renderings of the title.
    """
    tag = "trans-title" if resource_type == "article" else "issue-title"

    ckeditor_data = build_jats_data_from_html_field(
        title_str,
        tag=tag,
        text_lang=lang,
        resource_lang=xresource_lang,
        delimiter_inline=self.delimiter_inline_formula,
        delimiter_disp=self.delimiter_disp_formula,
    )

    # Honour the caller's title_type; it used to be ignored in favour of a
    # hard-coded "main" (which is still the default).
    return create_titledata(
        lang=lang,
        type=title_type,
        title_html=ckeditor_data["value_html"],
        title_xml=ckeditor_data["value_xml"],
    )

860 

# Maps a `citation_*` key found in a citation reference string to the JATS
# builder that renders its value. `citation_author` is deliberately absent:
# the reference parser handles authors separately.
references_mapping = {
    "citation_title": get_article_title_xml,
    "citation_journal_title": get_source_xml,
    "citation_publication_date": get_year_xml,
    "citation_firstpage": get_fpage_xml,
    "citation_lastpage": get_lpage_xml,
}

868 

@classmethod
def __parse_meta_citation_reference(cls, content: str, label=None):
    """Turn a `key=value; key=value; ...` citation string into a baked reference.

    A string without any ";" is treated as free text and baked as is.
    Otherwise each `key=value` pair is rendered: `citation_author` values go
    through `get_author_xml` and are emitted first, joined by ", "; every
    other key listed in `references_mapping` is rendered by its builder and
    appended, preceded by a space, in the order encountered. Pairs without
    "=" and unknown keys are ignored.

    Args:
        content: The raw citation string.
        label: Optional label forwarded to `JatsBase.bake_ref`.

    Returns:
        Whatever `JatsBase.bake_ref` returns for the assembled XML string.
    """
    categories = content.split(";")
    if len(categories) == 1:
        return JatsBase.bake_ref(content, label=label)

    authors = []
    other_fields = []
    for category in categories:
        # Split on the FIRST "=" only, so that values containing "=" (e.g. a
        # formula in a title) are kept whole instead of being truncated.
        key, separator, value = category.partition("=")
        if not separator:
            continue
        key = key.strip()

        if key == "citation_author":
            authors.append(get_author_xml(template_str=value))
        elif key in cls.references_mapping:
            other_fields.append(" " + cls.references_mapping[key](value))

    # Authors are always emitted, even when no other field follows them or
    # when some of them appear after another field.
    xml_string = ", ".join(authors) + "".join(other_fields)
    return JatsBase.bake_ref(xml_string, label=label)

896 

@classmethod
def get_or_create_source(cls):
    """Return the `Source` row for this crawler's domain, creating it if absent.

    The row is looked up by `cls.source_domain`; name, website and view id
    are only used when a new row has to be created.
    """
    domain = cls.source_domain
    creation_defaults = {
        "name": cls.source_name,
        "website": cls.source_website,
        "view_id": cls.get_view_id(),
    }
    source, was_created = Source.objects.get_or_create(
        domain=domain,
        defaults=creation_defaults,
    )
    if not was_created:
        return source

    source.save()
    return source

910 

@staticmethod
def get_issue_pid(
    collection_id: str,
    year: str,
    volume_number: str | None = None,
    issue_number: str | None = None,
    series: str | None = None,
):
    """Compose an issue pid from its identifying parts.

    The parts are joined with "_" in the order collection id, year, series,
    volume, issue; parts that are None are left out. The result goes through
    `cleanup_str`, then every run of characters other than word characters
    and hyphens is replaced by a single underscore.
    """
    optional_parts = (series, volume_number, issue_number)
    parts = [collection_id, year]
    parts.extend(part for part in optional_parts if part is not None)

    raw_pid = "_".join(f"{part}" for part in parts)
    return regex.sub(r"[^\w-]+", "_", cleanup_str(raw_pid))

929 

930 @staticmethod 

931 def set_pages(article: ArticleData, pages: str, separator: str = "-"): 

932 pages_split = pages.split(separator) 

933 if len(pages_split) == 0: 933 ↛ 934line 933 didn't jump to line 934 because the condition on line 933 was never true

934 article.page_range = pages 

935 if len(pages_split) > 0: 935 ↛ exitline 935 didn't return from function 'set_pages' because the condition on line 935 was always true

936 if pages[0].isnumeric(): 936 ↛ exitline 936 didn't return from function 'set_pages' because the condition on line 936 was always true

937 article.fpage = pages_split[0] 

938 if ( 938 ↛ 943line 938 didn't jump to line 943 because the condition on line 938 was never true

939 len(pages_split) > 1 

940 and pages_split[0] != pages_split[1] 

941 and pages_split[1].isnumeric() 

942 ): 

943 article.lpage = pages_split[1] 

944 

@staticmethod
def _process_pdf_header(chunk: bytes, response: requests.Response | aiohttp.ClientResponse):
    """Classify a download from its first bytes and its Content-Type header.

    Args:
        chunk: First bytes of the body (matched against a bytes pattern, so
            it must be bytes-like).
        response: The HTTP response the chunk came from; only its headers
            are read here.

    Returns:
        A three-item list `[is_pdf, response, details]` where `details` is a
        dict with a "status" (an `ExtlinkChecked.Status` member) and a
        "message":
          * PDF signature and PDF content type: OK, empty message.
          * PDF signature but another content type: WARNING.
          * No PDF signature: ERROR, mentioning the content type when it
            does not advertise a PDF either.
    """
    content_type = response.headers.get("Content-Type")
    declares_pdf = bool(content_type) and "application/pdf" in content_type
    starts_like_pdf = regex.match(rb"^%PDF-\d\.\d", chunk) is not None

    if starts_like_pdf and declares_pdf:
        # The file is unmistakably a pdf
        details = {
            "status": ExtlinkChecked.Status.OK,
            "message": "",
        }
    elif starts_like_pdf:
        # The file is a pdf, but the content type advertised by the server is wrong
        details = {
            "status": ExtlinkChecked.Status.WARNING,
            "message": f"Content-Type header: {content_type}",
        }
    elif not declares_pdf:
        # Neither the body nor the headers look like a pdf
        details = {
            "status": ExtlinkChecked.Status.ERROR,
            "message": f"Content-Type header: {content_type}; PDF Header not found: got {chunk}",
        }
    else:
        # The server claims a pdf but the body does not start like one
        details = {
            "status": ExtlinkChecked.Status.ERROR,
            "message": f"PDF Header not found: got {chunk}",
        }

    return [starts_like_pdf, response, details]

988 

@classmethod
async def a_check_pdf_link_validity(
    cls, url: str, verify=True
) -> tuple[bool, aiohttp.ClientResponse, dict]:
    """Asynchronously check that `url` serves a PDF.

    Requests only the beginning of the resource through `cls.async_session`
    and hands the first bytes to `_process_pdf_header`.

    Args:
        url: Address of the supposed PDF.
        verify: Forwarded as the `ssl` argument of the request.

    Returns:
        `[is_pdf, response, details]` as produced by `_process_pdf_header`,
        or an ERROR entry with the message "Error reading PDF header" when
        the body is empty.
    """
    CHUNK_SIZE = 10  # Number of bytes to fetch
    header = {
        "Range": f"bytes=0-{CHUNK_SIZE}",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
    }
    async with cls.async_session.get(
        url, headers=header, allow_redirects=True, ssl=verify
    ) as response:
        chunk = await response.content.read(CHUNK_SIZE)

        # An exhausted stream yields an empty chunk here rather than raising
        # StopIteration (as the synchronous iterator does), so the "nothing
        # to read" case is detected explicitly.
        if not chunk:
            return [
                False,
                response,
                {
                    "status": ExtlinkChecked.Status.ERROR,
                    "message": "Error reading PDF header",
                },
            ]
        return BaseCollectionCrawler._process_pdf_header(chunk, response)

1016 

@classmethod
def check_pdf_link_validity(
    cls, url: str, verify=True
) -> tuple[bool, requests.Response | None, dict]:
    """Synchronously check that `url` serves a PDF.

    Streams the response through `cls.session`, takes the first chunk of
    the body and hands it to `_process_pdf_header`. When the body yields no
    chunk at all, an ERROR entry with the message "Error reading PDF header"
    is returned instead.
    """
    CHUNK_SIZE = 10  # Number of bytes to fetch
    request_headers = {
        "Range": f"bytes=0-{CHUNK_SIZE}",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
    }
    with cls.session.get(
        url, headers=request_headers, allow_redirects=True, verify=verify, stream=True
    ) as response:
        # None marks an exhausted iterator, i.e. an empty body.
        first_chunk = next(response.iter_content(CHUNK_SIZE), None)
        if first_chunk is not None:
            return BaseCollectionCrawler._process_pdf_header(first_chunk, response)

        return [
            False,
            response,
            {
                "status": ExtlinkChecked.Status.ERROR,
                "message": "Error reading PDF header",
            },
        ]

1044 

@classmethod
async def check_extlink_validity(cls, extlink: "ExtLink"):
    """
    Method used by rot_monitoring to check if links have expired

    The outcome (date, status, optional message and HTTP status) is always
    written to `ExtlinkChecked` for `extlink`, whether the check succeeded,
    failed with a handled network error, or raised something unexpected.
    Unexpected exceptions are recorded as ERROR and then re-raised.
    """
    # NOTE(review): datetime.now() is naive; confirm whether the project
    # expects timezone-aware datetimes here.
    defaults: dict = {"date": datetime.now(), "status": ExtlinkChecked.Status.OK}
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0"
    }
    verify = bool(cls.verify)
    try:
        # For the GDZ links, we just check if the http response is 200 or 206
        if (
            extlink.rel == "article-pdf"
            and "gdz.sub.uni-goettingen.de" not in extlink.location
        ):
            _, response, message = await cls.a_check_pdf_link_validity(
                extlink.location, verify
            )
            defaults.update(message)
            defaults["http_status"] = response.status
        else:
            async with cls.async_session.get(
                url=extlink.location,
                headers=header,
                allow_redirects=True,
                ssl=verify,
            ) as response:
                defaults["http_status"] = response.status
                if response.status not in (200, 206):
                    defaults["status"] = ExtlinkChecked.Status.ERROR

    # Handler order matters: the SSL error is tested before the more general
    # connection error.
    except aiohttp.ClientSSLError:
        cls.logger.error("SSL error for the url: %s", extlink.location)
        defaults["status"] = ExtlinkChecked.Status.ERROR
        defaults["message"] = "SSL error"
    except aiohttp.ClientConnectionError:
        cls.logger.error("Connection error for the url: %s", extlink.location)
        defaults["status"] = ExtlinkChecked.Status.ERROR
        defaults["message"] = "Connection error"
    except TimeoutError:
        cls.logger.error("Timeout error for the url: %s", extlink.location)
        defaults["status"] = ExtlinkChecked.Status.ERROR
        defaults["message"] = "Timeout error"
    except Exception as exc:
        # Anything else still propagates to the caller, but must not be
        # stored with the initial OK status by the `finally` block below.
        cls.logger.error("Unexpected error for the url: %s", extlink.location)
        defaults["status"] = ExtlinkChecked.Status.ERROR
        defaults["message"] = f"Unexpected error: {type(exc).__name__}"
        raise
    finally:
        try:
            await ExtlinkChecked.objects.aupdate_or_create(extlink=extlink, defaults=defaults)
            cls.logger.info(
                "DB Update, source: %s, url: %s", cls.source_domain, extlink.location
            )
        except IntegrityError:
            cls.logger.error(
                "Extlink was deleted, source: %s, url: %s", cls.source_domain, extlink.location
            )

1101 

def resolve_year_end(self, pid: str, default: int) -> int:
    """Return the last year to consider for `pid`.

    When `pid` has an entry in `self.pid_year_restrictions`, the result is
    the current year minus that entry; otherwise `default` is returned.
    """
    if pid not in self.pid_year_restrictions:
        return default

    years_back = self.pid_year_restrictions[pid]
    return datetime.now().year - years_back