Coverage for src / crawler / abstract_crawlers / base_crawler.py: 65%

613 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-06-19 13:33 +0000

1import logging 

2import time 

3from collections.abc import Iterable 

4from datetime import datetime, timedelta 

5from email.policy import EmailPolicy 

6from typing import TYPE_CHECKING, Literal 

7 

8import aiohttp 

9import regex 

10import requests 

11from bs4 import BeautifulSoup 

12from django.conf import settings 

13from django.contrib.auth.models import User 

14from django.db.utils import IntegrityError 

15from django.utils import timezone 

16from langcodes import standardize_tag 

17from lingua import LanguageDetector, LanguageDetectorBuilder 

18from opentelemetry import trace 

19from ptf.cmds.xml.ckeditor.utils import ( 

20 build_jats_data_from_html_field, 

21) 

22from ptf.cmds.xml.jats.builder.references import ( 

23 get_article_title_xml, 

24 get_author_xml, 

25 get_fpage_xml, 

26 get_lpage_xml, 

27 get_source_xml, 

28 get_year_xml, 

29) 

30from ptf.cmds.xml.jats.jats_parser import JatsBase 

31from ptf.model_data import ( 

32 ArticleData, 

33 ContributorDict, 

34 IssueData, 

35 ResourceData, 

36 TitleDict, 

37 create_abstract, 

38 create_contributor, 

39 create_extlink, 

40 create_issuedata, 

41 create_publisherdata, 

42 create_subj, 

43 create_titledata, 

44) 

45from ptf.model_data_converter import update_data_for_jats 

46from ptf.models import ExtLink 

47from pylatexenc.latex2text import LatexNodes2Text 

48from pysolr import SolrError 

49from requests.adapters import HTTPAdapter 

50from requests_cache import CachedSession 

51from urllib3 import Retry 

52 

53from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd 

54from crawler.models import Source 

55from crawler.models.extlink_checked import ExtlinkChecked 

56from crawler.types import CitationLiteral 

57from crawler.utils import ( 

58 add_pdf_link_to_xarticle, 

59 cleanup_str, 

60 get_all_cols, 

61 get_or_create_collection, 

62 get_session, 

63) 

64 

65if TYPE_CHECKING: 

66 from typing import Callable 

67 

68 from bs4 import Tag 

69 

70 

class CrawlerTitleDict(TitleDict):
    """``TitleDict`` extended with a ``title_tex`` entry."""

    # presumably the TeX source of the title, None when unavailable — TODO confirm
    title_tex: str | None

73 

74 

class BaseCollectionCrawler:
    """
    Base class for the collection crawlers.

    To create a crawler:
    1) derive a class from BaseCollectionCrawler and name it XXXCrawler
    2) override the functions parse_collection_content, parse_issue_content and parse_article_content
    3) update factory.py so that crawler_factory can return your new crawler
    """

    # Class-level logger; __init__ replaces it on the instance with a per-source logger.
    logger = logging.getLogger(__name__)
    # OpenTelemetry tracer used to wrap the crawl steps in spans.
    tracer = trace.get_tracer(__name__)

    # Identity of the crawled source; concrete crawlers are expected to override these.
    source_name = ""
    source_domain = ""
    source_website = ""

    issue_href = ""

    # Filled by initialize() (collection, source, user).
    collection = None
    source = None
    user = None
    # The session is stored on the class itself (see __init__ / initialize), so it is shared.
    session: requests.Session | CachedSession
    async_session: aiohttp.ClientSession
    is_checkable = True
    # Copied to session.verify in initialize().
    verify = True
    # Default HTTP headers merged into every request made through get().
    # NOTE(review): "accept_encoding" is not the standard "Accept-Encoding" header name — confirm intent.
    headers = {
        "accept_encoding": "utf-8",
        "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
        "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
    }

    requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
    "seconds to wait between two http requests"
    requests_timeout = 60
    "seconds to wait before aborting the connection (if no bytes are recieved)"

    latext_parser = LatexNodes2Text()

    # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
    # do not use the "$" to surround tex formulas
    delimiter_inline_formula = "$"
    delimiter_disp_formula = "$"

    # HACK : Workaround for tests (monkeypatching)
    # We store the class here, so we can monkeypatch it when running tests
    # subCrawlers = {
    #     LofplCrawler: None
    # }
    # __init__ replaces each value with an instance of the key class.
    subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}

    # Built lazily by the language_detector property.
    _language_detector: LanguageDetector | None = None
    _language_detector_builder = LanguageDetectorBuilder.from_all_languages()

    force_refresh = False

    match_headers = False
    "Whereas to include headers in requests cache key"
    # Pattern capturing the 16-digit identifier of an https://orcid.org/ URL.
    orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"

    # When True, crawl_issue() drops the articles that have no PDF/HTML link.
    ignore_missing_pdf = True
    "Set this to False on a Crawler-basis to allow inserting articles without PDFs"
    pid_year_restrictions: dict[str, int] = {}
    "pid -> excluded years count"

    # Called with a number of seconds whenever the crawler has to wait.
    pause_function: "Callable[[int], None]"
    "Overridable the pause function (used in celery tasks to speedup aborting)"

141 

@classmethod
def get_view_id(cls):
    """Return the view identifier of this crawler, i.e. its ``source_domain``."""
    view_id = cls.source_domain
    return view_id

145 

@property
def language_detector(self):
    """Language detector of this crawler instance, built on first access.

    The build is deferred until the detector is actually needed to save memory;
    the result is then stored on the instance and reused.
    """
    detector = self._language_detector
    if not detector:
        detector = self._language_detector_builder.build()
        self._language_detector = detector
    return detector

153 

def __init__(
    self,
    *args,
    username: str,
    collection_id: str,
    dry: bool = False,
    publisher: str = "",
    force_refresh=False,
    collection_url: str | None = None,
    backend=None,
    pause_function=staticmethod(time.sleep),
):
    """
    Set up the crawler for one collection.

    :param username: name of the user looked up by initialize()
    :param collection_id: id of the collection to crawl
    :param dry: when True, crawl_issue() returns before touching the database
    :param publisher: publisher name stored on the inserted issues
    :param force_refresh: stored on the instance as-is
    :param collection_url: url of the collection; when empty, it is looked up in
        get_all_cols() under the crawler's source_domain
    :param backend: stored on the instance as-is
    :param pause_function: callable used whenever the crawler has to wait
    :raises ValueError: when no url is known for this source and collection
    """
    if not collection_url:
        all_cols = get_all_cols()
        col = all_cols[collection_id]

        collection_url = col["sources"].get(self.source_domain, None)
        if collection_url is None:
            raise ValueError(
                f"Source {self.source_domain} not found for collection {collection_id}"
            )
    self.collection_url = collection_url
    # Instantiate every registered sub-crawler with the same collection settings.
    # NOTE(review): force_refresh, backend and pause_function are not forwarded — confirm intent.
    for CrawlerClass in self.subCrawlers:
        self.subCrawlers[CrawlerClass] = CrawlerClass(
            *args,
            username=username,
            collection_id=collection_id,
            dry=dry,
            publisher=publisher,
            collection_url=collection_url,
        )
    # One logger per source domain.
    self.logger = logging.getLogger(__name__ + "." + self.source_domain)
    # self.logger = logging.getLogger(__name__)

    self.username = username

    self.collection_id = collection_id

    self.dry = dry
    self.publisher = publisher

    # Classproperty : We sometimes want to use the session without initializing the class (rot monitoring)
    BaseCollectionCrawler.session = requests.Session()

    # Must be set before initialize(), which copies it onto the session.
    self.pause_function = pause_function

    # Skipped when running tests
    self.initialize()

    self.force_refresh = force_refresh
    self.backend = backend

205 

def initialize(self):
    """
    Second-stage initialisation, kept apart from ``__init__`` so that the model
    accesses can be skipped during test data generation.

    Loads the collection, source and user, then installs the shared session
    (stored on ``BaseCollectionCrawler``) with retries disabled.
    """
    self.collection = get_or_create_collection(self.collection_id)
    self.source = self.get_or_create_source()
    self.user = User.objects.get(username=self.username)

    BaseCollectionCrawler.session = get_session()
    BaseCollectionCrawler.session.verify = self.verify
    self.session.pause_function = self.pause_function
    self.session.delay = self.requests_interval

    # total=0 : retries are handled by the crawler itself (see get / download_file)
    no_retry = Retry(total=0)
    for scheme in ("https://", "http://"):
        self.session.mount(scheme, HTTPAdapter(max_retries=no_retry))

222 

@classmethod
def can_crawl(cls, pid: str) -> bool:
    """Tell whether this crawler accepts ``pid``; the base implementation accepts every pid."""
    accepted = True
    return accepted

226 

def parse_collection_content(self, content: str) -> list[IssueData]:
    """
    Extract the issues (xissues) from the HTML of the collection page.

    Meant to be overridden in a derived class; the base implementation
    finds no issue and returns an empty list.
    """
    no_issues: list[IssueData] = []
    return no_issues

234 

def parse_issue_content(self, content: str, xissue: IssueData):
    """
    Parse the HTML of an issue page and fill ``xissue.articles``.

    Meant to be overridden in a derived class; the base implementation does nothing.

    Caveat: the override is expected to create the articles and to give each one
    a PID, for instance ``a`` + the article index (``a0``, ``a21``).
    """
    return None

244 

def parse_article_content(
    self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
) -> ArticleData | None:
    """
    Parse the HTML of an article page and return the xarticle.

    Meant to be overridden in a derived class; the base implementation returns
    ``xarticle`` untouched. ``xissue`` is provided because the article page may
    carry issue information (ex: publisher), and ``url`` is the article url.

    Caveat: the override is expected to assign the article pid again here.
    """
    unchanged = xarticle
    return unchanged

258 

@tracer.start_as_current_span("crawl_collection")
def crawl_collection(self):
    # TODO: Comments, filter
    """
    Download the collection page and list its issues.

    - get the HTML content of the collection_url
    - parse it with parse_collection_content to extract the list of issues

    Returns a dict mapping ``str(xissue.pid)`` to the xissue, or None when the
    download produced no content.

    :raises RuntimeError: when the source is not set
    """
    if self.source is None:
        raise RuntimeError("ERROR: the source is not set")

    page = self.download_file(self.collection_url)
    if not page:
        # download_file returns None (404)
        return None

    # Some collections split the same volume over several pages
    # (ex: Volume 6 (2000) and Volume 6 (1999)). Merging them into a single
    # xissue (Volume 6 (1999-2000)) is currently disabled:
    # merged_xissues = self.merge_xissues(xissues)
    issues_by_pid = {}
    for issue in self.parse_collection_content(page):
        issues_by_pid[str(issue.pid)] = issue
    return issues_by_pid

295 

def start_process_issue(self, xissue: IssueData):
    """
    Fetch and parse the page of an issue, when it has one.

    Some sources, like EuDML, have no separate HTML page for the table of
    contents of an issue: the articles come straight from the collection page
    and the xissue has no url. Nothing is done in that case.
    When the url points to a PDF, it is attached to the issue as a PDF link
    and the url is cleared.
    """
    issue_url = xissue.url
    if issue_url is None:
        return

    if issue_url.endswith(".pdf"):
        add_pdf_link_to_xarticle(xissue, issue_url)
        xissue.url = None
        return

    page = self.download_file(issue_url)
    with self.tracer.start_as_current_span("parse_issue_content"):
        self.parse_issue_content(page, xissue)

308 

@tracer.start_as_current_span("crawl_issue")
def crawl_issue(self, xissue: IssueData):
    """
    Crawl one issue.

    - get and parse the issue page (list of articles and/or issue metadata)
    - crawl each article, dropping those for which crawl_article returns None
    - when ignore_missing_pdf is set, drop the articles without PDF/HTML link
    - unless running dry, insert the issue in the database, provided it still
      has articles or carries a PDF/HTML link itself
    """
    self.start_process_issue(xissue)

    crawled = (self.crawl_article(article, xissue) for article in xissue.articles)
    xissue.articles = [article for article in crawled if article is not None]

    issue_has_pdf = self.article_has_pdf(xissue)

    if self.ignore_missing_pdf:
        xissue.articles = list(filter(self.article_has_pdf, xissue.articles))

    if self.dry:
        return
    nothing_to_insert = not xissue.articles and not issue_has_pdf
    if nothing_to_insert:
        return

    self.process_resource_metadata(xissue, resource_type="issue")
    self.add_xissue_into_database(xissue)

342 

@staticmethod
def article_has_source(art: ArticleData | IssueData):
    """Tell whether one of the ext_links of ``art`` has the ``source`` rel."""
    return any(link["rel"] == "source" for link in art.ext_links)

352 

@staticmethod
def article_has_pdf(art: ArticleData | IssueData):
    """Tell whether one of the ext_links of ``art`` is an ``article-pdf`` or ``article-html`` link."""
    full_text_rels = ["article-pdf", "article-html"]
    return any(link["rel"] in full_text_rels for link in art.ext_links)

362 

def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
    """
    Crawl one article and return it with its metadata processed.

    - an article without url is only given a ``source`` link (issue url, or
      collection url when the issue has none) before its metadata is processed
    - otherwise, when the crawler overrides parse_article_content, the article
      page is downloaded and parsed; a ValueError raised by the parser triggers
      one retry, 5 minutes later, with a forced refresh of the download
    - an article with a DOI gets a pid derived from that DOI

    Returns None when parse_article_content returns None.
    """

    def source_link(location):
        # ExtLink displaying the origin of the article on the article page
        link = create_extlink()
        link["rel"] = "source"
        link["location"] = location
        link["metadata"] = self.source_domain
        return link

    if xarticle.url is None:
        if not self.article_has_source(xarticle):
            fallback_url = xissue.url if xissue.url else self.collection_url
            xarticle.ext_links.append(source_link(fallback_url))
        return self.process_article_metadata(xarticle)

    parsed = xarticle
    parser_is_overridden = (
        self.parse_article_content.__func__ != BaseCollectionCrawler.parse_article_content
    )
    if parser_is_overridden:
        page = self.download_file(xarticle.url)
        xarticle.pid = f"{xissue.pid}_{xarticle.pid}"

        try:
            with self.tracer.start_as_current_span("parse_article_content"):
                parsed = self.parse_article_content(page, xissue, xarticle, xarticle.url)
        except ValueError as e:
            self.logger.warning(e)
            self.logger.warning("Retrying in 5 mins while invalidating cache")
            self.pause_function(5 * 60)
            page = self.download_file(xarticle.url, force_refresh=True)
            with self.tracer.start_as_current_span("parse_article_content"):
                parsed = self.parse_article_content(page, xissue, xarticle, xarticle.url)

    if parsed is None:
        return None

    if parsed.doi:
        pid_from_doi = parsed.doi
        for separator in ("/", ".", "-"):
            pid_from_doi = pid_from_doi.replace(separator, "_")
        parsed.pid = pid_from_doi

    if not self.article_has_source(parsed) and parsed.url:
        parsed.ext_links.append(source_link(parsed.url))

    # The article title may have formulas surrounded with '$'
    return self.process_article_metadata(parsed)

415 

def process_resource_metadata(self, xresource: ResourceData, resource_type="article"):
    """
    Compute the HTML and XML forms of the title and abstracts of a resource.

    The TeX title (``title_tex``) and the ``value_tex`` of every abstract
    tagged ``abstract`` are converted with build_jats_data_from_html_field,
    using the formula delimiters of the crawler. The TeX values are left
    untouched. Returns the resource.
    """
    if resource_type == "article":
        title_tag = "article-title"
    else:
        title_tag = "issue-title"

    # Title
    title_data = build_jats_data_from_html_field(
        xresource.title_tex,
        tag=title_tag,
        text_lang=xresource.lang,
        delimiter_inline=self.delimiter_inline_formula,
        delimiter_disp=self.delimiter_disp_formula,
    )
    xresource.title_html = title_data["value_html"]
    xresource.title_xml = title_data["value_xml"]

    # Abstracts: they may have formulas surrounded with '$'
    plain_abstracts = [item for item in xresource.abstracts if item["tag"] == "abstract"]
    for item in plain_abstracts:
        abstract_data = build_jats_data_from_html_field(
            item["value_tex"],
            tag="abstract",
            text_lang=item["lang"],
            resource_lang=xresource.lang,
            field_type="abstract",
            delimiter_inline=self.delimiter_inline_formula,
            delimiter_disp=self.delimiter_disp_formula,
        )
        item["value_html"] = abstract_data["value_html"]
        item["value_xml"] = abstract_data["value_xml"]

    return xresource

453 

def process_article_metadata(self, xarticle: ArticleData):
    """
    Finalise the metadata of an article and return it.

    Converts title and abstracts (process_resource_metadata), marks every
    bibitem as of ``unknown`` type, then calls update_data_for_jats without labels.
    """
    self.process_resource_metadata(xarticle)

    for reference in xarticle.bibitems:
        reference.type = "unknown"

    update_data_for_jats(xarticle, with_label=False)
    return xarticle

461 

def download_file(self, url: str, force_refresh=False, headers=None):
    """
    Download a page and return its content (decoded string).

    The page is requested up to 3 times for as long as the decoded content is
    empty. The crawler pauses 0 then 15 minutes between those attempts.

    :param url: url of the page to download
    :param force_refresh: forwarded to get()
    :param headers: extra HTTP headers, merged by get() over the class headers
    :return: the decoded, non-empty content of the page
    :raises ValueError: when every attempt produced an empty content
    """
    # A fresh dict per call instead of a shared mutable default argument
    request_headers = {} if headers is None else headers
    max_attempts = 3

    for attempt in range(max_attempts):
        response = self.get(
            url,
            force_refresh=force_refresh,
            headers=request_headers,
            pause_function=self.pause_function,
        )

        content = self.decode_response(response)
        if content:
            return content

        self.logger.debug("Got empty content while fetching ! ")
        if attempt == max_attempts - 1:
            # No attempt left: waiting again would only delay the error below
            break

        # 0 min, then 15 mins
        delay_minutes = attempt * 15
        self.logger.debug(
            f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})",
            extra={"url": url},
        )
        self.pause_function(delay_minutes * 60)

    raise ValueError(f"Could not decode content at {url}")

488 

@classmethod
def get(cls, url, *args, headers=None, force_refresh=False, pause_function=time.sleep, **kwargs):
    """
    GET ``url`` with the shared session, retrying on connection/HTTP errors.

    Up to 3 attempts are made, with a pause of 0 then 15 minutes between them.
    From the second attempt on, a CachedSession is asked to bypass its cache.

    :param url: url to fetch
    :param headers: extra HTTP headers, merged over the class ``headers``
    :param force_refresh: forwarded to the session
    :param pause_function: callable receiving the number of seconds to wait
    :param kwargs: extra arguments for ``session.get``; they override the defaults
    :return: the response returned by the session
    :raises requests.RequestException: the last error caught, once every attempt failed
    """
    # NOTE(review): "force_refresh" is always forwarded to session.get; this
    # presumably requires a session class that accepts it — confirm for plain requests.Session.
    max_attempts = 3
    request_kwargs = {
        "url": url,
        # A fresh dict per call instead of a shared mutable default argument
        "headers": {**cls.headers, **(headers or {})},
        "timeout": cls.requests_timeout,
        "force_refresh": force_refresh,
        **kwargs,
    }
    current_exception = Exception(f"Could not fetch url {url}")

    for attempt in range(max_attempts):
        if attempt > 0 and isinstance(cls.session, CachedSession):
            request_kwargs["force_refresh"] = True
        try:
            return cls.session.get(*args, **request_kwargs)
        except (
            requests.ConnectionError,
            requests.ConnectTimeout,
            requests.exceptions.HTTPError,
        ) as e:
            current_exception = e
            cls.logger.debug(f"Caught error : {e}", extra={"url": url})
            if attempt == max_attempts - 1:
                # No attempt left: waiting again would only delay the error below
                break
            # 0 min, then 15 mins
            delay_minutes = attempt * 15
            cls.logger.debug(
                f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})",
                extra={"url": url},
            )
            pause_function(delay_minutes * 60)

    raise current_exception

521 

def decode_response(self, response: requests.Response, encoding: str | None = None):
    """
    Return the text of ``response`` after picking the encoding to decode it with.

    The first available information wins:
    1) the ``encoding`` argument
    2) the charset of the Content-Type HTTP header
    3) the ``<meta charset>`` tag of the page
    4) the charset of the ``<meta http-equiv="Content-Type">`` tag of the page
    When none is found, the encoding of the response is left as it is.

    Override this if the content-type headers from the source advertise
    something else than the actual content (SASA needs this).
    """
    # Forced by the caller
    if encoding:
        response.encoding = encoding
        return response.text

    # HTTP headers
    http_content_type = response.headers.get("Content-Type", None)
    if http_content_type:
        http_charset = self.parse_content_type_charset(http_content_type)
        if http_charset:
            response.encoding = http_charset
            return response.text

    soup = BeautifulSoup(response.text, "html5lib")

    # HTML meta charset tag
    charset_meta = soup.select_one("meta[charset]")
    declared_charset = charset_meta.get("charset") if charset_meta else None
    if isinstance(declared_charset, str):
        response.encoding = declared_charset
        return response.text

    # HTML meta content type tag
    content_type_meta = soup.select_one(
        'meta[http-equiv="Content-Type"],meta[http-equiv="content-type"]'
    )
    meta_content_type = content_type_meta.get("content") if content_type_meta else None
    if isinstance(meta_content_type, str):
        meta_charset = self.parse_content_type_charset(meta_content_type)
        if meta_charset:
            response.encoding = meta_charset

    return response.text

561 

@staticmethod
def parse_content_type_charset(content_type: str):
    """Return the ``charset`` parameter of a Content-Type value, or None when it has none."""
    parsed_header = EmailPolicy.header_factory("content-type", content_type)
    return parsed_header.params.get("charset")

567 

@tracer.start_as_current_span("add_xissue_to_database")
def add_xissue_into_database(self, xissue: IssueData) -> IssueData:
    """
    Insert (or update) the issue in the database and return it.

    The issue is first given its journal, source, publisher and modification
    date. The insertion is attempted up to 3 times when Solr fails, with a
    10 seconds pause after each failure.

    :raises ValueError: when the issue has no year
    :raises ConnectionRefusedError: when every attempt ended with a SolrError
    """
    xissue.journal = self.collection
    xissue.source = self.source_domain

    if xissue.year == "":
        raise ValueError("Failsafe : Cannot insert issue without a year")

    publisher = create_publisherdata()
    publisher.name = self.publisher
    xissue.publisher = publisher
    xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()

    # upcoming_attempt is the number of the attempt that follows a failure
    for upcoming_attempt in range(2, 5):
        try:
            cmd = addOrUpdateGDMLIssueXmlCmd({"xissue": xissue, "use_body": False})
            cmd.do()
            self.logger.debug(f"Issue {xissue.pid} inserted in database")
            return xissue
        except SolrError:
            self.logger.warning(
                f"Encoutered SolrError while inserting issue {xissue.pid} in database"
            )
            self.logger.debug(f"Attempt {upcoming_attempt}. sleeping 10 seconds.")
            self.pause_function(10)
        except Exception as e:
            self.logger.error(
                f"Got exception while attempting to insert {xissue.pid} in database : {e}"
            )
            raise e

    raise ConnectionRefusedError("Cannot connect to SolR")

609 

def get_metadata_using_citation_meta(
    self,
    xarticle: ArticleData,
    xissue: IssueData,
    soup: BeautifulSoup,
    what: list[CitationLiteral] | None = None,
):
    """
    Fill ``xarticle`` (and the publisher of ``xissue``) from the ``citation_*``
    meta tags of an article page.

    :param xarticle: the xarticle that will collect the metadata
    :param xissue: the xissue that will collect the publisher
    :param soup: the BeautifulSoup object of the article page
    :param what: list of citation_ items to collect. Nothing is collected when omitted.
    :return: None. The given article is modified
    :raises ValueError: when an author, abstract or reference meta tag has no string content
    """
    # A fresh list per call instead of a shared mutable default argument
    if what is None:
        what = []

    if "title" in what:
        # TITLE
        citation_title_node = soup.select_one("meta[name='citation_title']")
        if citation_title_node:
            title = citation_title_node.get("content")
            if isinstance(title, str):
                xarticle.title_tex = title

    if "author" in what:
        # AUTHORS
        # Institution and ORCID tags apply to the last citation_author tag seen
        citation_author_nodes = soup.select("meta[name^='citation_author']")
        current_author: ContributorDict | None = None
        for citation_author_node in citation_author_nodes:
            node_name = citation_author_node.get("name")
            if node_name == "citation_author":
                text_author = citation_author_node.get("content")
                if not isinstance(text_author, str):
                    raise ValueError("Cannot parse author")
                if text_author == "":
                    current_author = None
                    continue
                current_author = create_contributor(role="author", string_name=text_author)
                xarticle.contributors.append(current_author)
                continue
            if current_author is None:
                self.logger.warning("Couldn't parse citation author")
                continue
            if node_name == "citation_author_institution":
                text_institution = citation_author_node.get("content")
                if not isinstance(text_institution, str):
                    continue
                current_author["addresses"].append(text_institution)
            # "citation_author_ocrid" is the misspelling this code used to look for;
            # it is still accepted in case a source really emits it.
            if node_name in ("citation_author_orcid", "citation_author_ocrid"):
                text_orcid = citation_author_node.get("content")
                if not isinstance(text_orcid, str):
                    continue
                current_author["orcid"] = text_orcid

    if "pdf" in what:
        # PDF
        citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
        if citation_pdf_node:
            pdf_url = citation_pdf_node.get("content")
            if isinstance(pdf_url, str):
                add_pdf_link_to_xarticle(xarticle, pdf_url)

    if "lang" in what:
        # LANG
        citation_lang_node = soup.select_one("meta[name='citation_language']")
        if citation_lang_node:
            # TODO: check other language code
            content_text = citation_lang_node.get("content")
            if isinstance(content_text, str):
                xarticle.lang = standardize_tag(content_text)

    if "abstract" in what:
        # ABSTRACT
        abstract_node = soup.select_one("meta[name='citation_abstract']")
        if abstract_node is not None:
            abstract = abstract_node.get("content")
            if not isinstance(abstract, str):
                raise ValueError("Couldn't parse abstract from meta")
            # Keep the text only: the meta content may hold HTML markup
            abstract = BeautifulSoup(abstract, "html.parser").text
            lang = abstract_node.get("lang")
            if not isinstance(lang, str):
                lang = self.detect_language(abstract, xarticle)
            xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))

    if "page" in what:
        # PAGES
        # Anything from the first "(" is dropped; values of 32 characters or more are ignored
        citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
        if citation_fpage_node:
            page = citation_fpage_node.get("content")
            if isinstance(page, str):
                page = page.split("(")[0]
                if len(page) < 32:
                    xarticle.fpage = page

        citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
        if citation_lpage_node:
            page = citation_lpage_node.get("content")
            if isinstance(page, str):
                page = page.split("(")[0]
                if len(page) < 32:
                    xarticle.lpage = page

    if "doi" in what:
        # DOI
        citation_doi_node = soup.select_one("meta[name='citation_doi']")
        if citation_doi_node:
            doi = citation_doi_node.get("content")
            if isinstance(doi, str):
                doi = doi.strip()
                # Drop whatever precedes the "10." prefix (ex: "doi:")
                pos = doi.find("10.")
                if pos > 0:
                    doi = doi[pos:]
                xarticle.doi = doi

    if "mr" in what:
        # MR
        citation_mr_node = soup.select_one("meta[name='citation_mr']")
        if citation_mr_node:
            mr = citation_mr_node.get("content")
            if isinstance(mr, str):
                mr = mr.strip()
                if mr.find("MR") == 0:
                    mr = mr[2:]
                    xarticle.extids.append(("mr-item-id", mr))

    if "zbl" in what:
        # ZBL
        citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
        if citation_zbl_node:
            zbl = citation_zbl_node.get("content")
            if isinstance(zbl, str):
                zbl = zbl.strip()
                if zbl.find("Zbl") == 0:
                    zbl = zbl[3:].strip()
                    xarticle.extids.append(("zbl-item-id", zbl))

    if "publisher" in what:
        # PUBLISHER
        citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
        if citation_publisher_node:
            pub = citation_publisher_node.get("content")
            if isinstance(pub, str):
                pub = pub.strip()
                if pub != "":
                    xpub = create_publisherdata()
                    xpub.name = pub
                    xissue.publisher = xpub

    if "keywords" in what:
        # KEYWORDS
        citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
        for kwd_node in citation_kwd_nodes:
            kwds = kwd_node.get("content")
            if isinstance(kwds, str):
                for kwd in kwds.split(","):
                    # Strip first, so that whitespace-only entries are skipped too
                    kwd = kwd.strip()
                    if kwd == "":
                        continue
                    xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})

    if "references" in what:
        citation_references = soup.select("meta[name='citation_reference']")
        for index, tag in enumerate(citation_references):
            content = tag.get("content")
            if not isinstance(content, str):
                raise ValueError("Cannot parse citation_reference meta")
            # No generated label when the reference text already starts with "[n]"
            label = str(index + 1)
            if regex.match(r"^\[\d+\].*", content):
                label = None
            xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label))

779 

def get_metadata_using_dcterms(
    self,
    xarticle: ArticleData,
    soup: "Tag",
    what: "Iterable[Literal['abstract', 'keywords', 'date_published', 'article_type']]",
):
    """
    Fill ``xarticle`` from the Dublin Core meta tags of an article page.

    :param xarticle: the xarticle that will collect the metadata
    :param soup: the parsed article page
    :param what: items to collect:
        - ``abstract``: ``DCTERMS.abstract``, stored as an English abstract
        - ``keywords``: every non-empty ``DC.subject``
        - ``date_published``: ``DC.Date.created``
        - ``article_type``: ``DC.Type.articleType``
    :return: None. The given article is modified
    """
    if "abstract" in what:
        abstract_meta = soup.select_one("meta[name='DCTERMS.abstract']")
        if abstract_meta:
            raw_abstract = self.get_str_attr(abstract_meta, "content")
            xarticle.abstracts.append(
                create_abstract(lang="en", value_tex=cleanup_str(raw_abstract))
            )

    if "keywords" in what:
        for subject_meta in soup.select("meta[name='DC.subject']"):
            subject = subject_meta.get("content")
            if isinstance(subject, str) and len(subject) != 0:
                xarticle.kwds.append(create_subj(value=subject))

    if "date_published" in what:
        date_meta = soup.select_one("meta[name='DC.Date.created']")
        if date_meta:
            xarticle.date_published = self.get_str_attr(date_meta, "content")

    if "article_type" in what:
        type_meta = soup.select_one("meta[name='DC.Type.articleType']")
        if type_meta:
            xarticle.atype = self.get_str_attr(type_meta, "content")

814 

def create_xissue(
    self,
    url: str | None,
    year: str,
    volume_number: str | None,
    issue_number: str | None = None,
    vseries: str | None = None,
):
    """
    Build an issue data object for this crawler's collection.

    One trailing "/" is removed from `url`. The pid comes from `get_issue_pid`.
    In the volume, each run of characters other than word characters and "-"
    becomes "_"; in the issue number, commas become "-".
    """
    xissue = create_issuedata()
    xissue.url = url.removesuffix("/") if url is not None else None
    xissue.pid = self.get_issue_pid(
        self.collection_id, year, volume_number, issue_number, vseries
    )
    xissue.year = year

    # Optional fields are only assigned when a value was provided
    if volume_number is not None:
        xissue.volume = regex.sub(r"[^\w-]+", "_", volume_number)
    if issue_number is not None:
        xissue.number = issue_number.replace(",", "-")
    if vseries is not None:
        xissue.vseries = vseries

    return xissue

843 

def detect_language(self, text: str, article: ArticleData | None = None):
    """
    Return a language code for `text`.

    When `article` already has a language other than "und", that language is
    returned and no detection runs. Otherwise `self.language_detector` is used;
    "und" is returned when it finds nothing.
    """
    known_lang = article.lang if article else None
    if known_lang is not None and known_lang != "und":
        return known_lang

    detected = self.language_detector.detect_language_of(text)
    return detected.iso_code_639_1.name.lower() if detected else "und"

853 

def get_str_attr(self, tag: "Tag", attr: str):
    """
    Same as `tag.get(attr)`, except that a missing or list-valued attribute
    raises a ValueError instead of being returned.
    """
    value = tag.get(attr)
    if value is None:
        raise ValueError(
            f"[{self.source_domain}] {self.collection_id} : html tag doesn't have any {attr} attributes"
        )
    if isinstance(value, list):
        raise ValueError(
            f"[{self.source_domain}] {self.collection_id} : html tag has multiple {attr} attributes."
        )
    return value

866 

def create_trans_title(
    self,
    resource_type: str,
    title_str: str,
    lang: str,
    xresource_lang: str,
    title_type: str = "main",
):
    """
    Build the title data for a translated title.

    `title_str` is converted with `build_jats_data_from_html_field`, using the
    crawler's inline/display formula delimiters. The JATS tag is "trans-title"
    when `resource_type` is "article" and "issue-title" otherwise.

    `title_type` is forwarded to `create_titledata`. It used to be ignored
    ("main" was hard-coded); the default keeps the previous result for every
    caller that does not pass it.
    """
    tag = "trans-title" if resource_type == "article" else "issue-title"

    ckeditor_data = build_jats_data_from_html_field(
        title_str,
        tag=tag,
        text_lang=lang,
        resource_lang=xresource_lang,
        delimiter_inline=self.delimiter_inline_formula,
        delimiter_disp=self.delimiter_disp_formula,
    )

    return create_titledata(
        lang=lang,
        type=title_type,
        title_html=ckeditor_data["value_html"],
        title_xml=ckeditor_data["value_xml"],
    )

894 

# Maps a `citation_*` key found in a citation_reference meta tag to the
# builder that produces the matching JATS fragment.
references_mapping = dict(
    citation_title=get_article_title_xml,
    citation_journal_title=get_source_xml,
    citation_publication_date=get_year_xml,
    citation_firstpage=get_fpage_xml,
    citation_lastpage=get_lpage_xml,
)

902 

@classmethod
def __parse_meta_citation_reference(cls, content: str, label=None):
    """
    Turn the content of a citation_reference meta tag into a baked reference.

    `content` is either free text (no ";"), which is baked as is, or a list
    of `key=value` pairs separated by ";". In the second form every
    `citation_author` is rendered first, joined with ", ", followed by the
    fields known to `references_mapping`, each preceded by a space. Pairs
    without "=" and unknown keys are ignored.
    """
    fields = content.split(";")
    if len(fields) == 1:
        return JatsBase.bake_ref(content, label=label)

    authors_xml = []
    other_xml = []
    for field in fields:
        # Split on the first "=" only, so a value containing "=" stays whole
        key, separator, value = field.partition("=")
        if not separator:
            continue
        key = key.strip()
        if key == "citation_author":
            authors_xml.append(get_author_xml(template_str=value))
        elif key in cls.references_mapping:
            other_xml.append(cls.references_mapping[key](value))

    # Authors are emitted even when no other field follows them, and
    # wherever they appear in the list
    xml_string = ", ".join(authors_xml) + "".join(f" {part}" for part in other_xml)
    return JatsBase.bake_ref(xml_string, label=label)

930 

@classmethod
def get_or_create_source(cls):
    """
    Return the `Source` whose domain is `cls.source_domain`, creating it from
    the crawler's name, website and view id when it does not exist yet.
    """
    lookup = {"domain": cls.source_domain}
    creation_values = {
        "name": cls.source_name,
        "website": cls.source_website,
        "view_id": cls.get_view_id(),
    }
    source, is_new = Source.objects.get_or_create(**lookup, defaults=creation_values)
    if is_new:
        source.save()
    return source

944 

@staticmethod
def get_issue_pid(
    collection_id: str,
    year: str,
    volume_number: str | None = None,
    issue_number: str | None = None,
    series: str | None = None,
):
    """
    Build an issue pid of the form `<collection>_<year>[_<series>][_<volume>][_<issue>]`.

    The assembled string goes through `cleanup_str`, then each run of
    characters other than word characters and "-" is replaced with "_".
    """
    optional_parts = (series, volume_number, issue_number)
    raw_pid = "_".join(
        [
            f"{collection_id}_{year}",
            *(f"{part}" for part in optional_parts if part is not None),
        ]
    )
    return regex.sub(r"[^\w-]+", "_", cleanup_str(raw_pid))

963 

964 @staticmethod 

965 def set_pages(article: ArticleData, pages: str, separator: str = "-"): 

966 pages_split = pages.split(separator) 

967 if len(pages_split) == 0: 967 ↛ 968line 967 didn't jump to line 968 because the condition on line 967 was never true

968 article.page_range = pages 

969 if len(pages_split) > 0: 969 ↛ exitline 969 didn't return from function 'set_pages' because the condition on line 969 was always true

970 if pages[0].isnumeric(): 970 ↛ exitline 970 didn't return from function 'set_pages' because the condition on line 970 was always true

971 article.fpage = pages_split[0] 

972 if ( 972 ↛ 977line 972 didn't jump to line 977 because the condition on line 972 was never true

973 len(pages_split) > 1 

974 and pages_split[0] != pages_split[1] 

975 and pages_split[1].isnumeric() 

976 ): 

977 article.lpage = pages_split[1] 

978 

@staticmethod
def _process_pdf_header(chunk: bytes, response: requests.Response | aiohttp.ClientResponse):
    """
    Decide whether a downloaded chunk is the beginning of a PDF file.

    `chunk` holds the first bytes of the response body. It is matched against
    a bytes pattern, so it must be `bytes` (the previous `str` annotation was
    wrong).

    Returns `[is_pdf, response, details]`, where `details` has a "status" and
    a "message":
    - PDF header found and Content-Type mentions application/pdf: OK
    - PDF header found, other or missing Content-Type: WARNING
    - no PDF header: ERROR
    """
    content_type = response.headers.get("Content-Type")
    advertises_pdf = bool(content_type) and "application/pdf" in content_type

    if regex.match(rb"^%PDF-\d\.\d", chunk):
        if advertises_pdf:
            # The file is unmistakably a pdf
            details = {"status": ExtlinkChecked.Status.OK, "message": ""}
        else:
            # The file is a pdf, but the content type advertised by the server is wrong
            details = {
                "status": ExtlinkChecked.Status.WARNING,
                "message": f"Content-Type header: {content_type}",
            }
        return [True, response, details]

    # Reaching here means we couldn't find the pdf.
    if advertises_pdf:
        message = f"PDF Header not found: got {chunk}"
    else:
        message = f"Content-Type header: {content_type}; PDF Header not found: got {chunk}"
    return [
        False,
        response,
        {"status": ExtlinkChecked.Status.ERROR, "message": message},
    ]

1022 

@classmethod
async def a_check_pdf_link_validity(
    cls, url: str, verify=True
) -> list[bool | aiohttp.ClientResponse | dict]:
    """
    Check the validity of a PDF link (async version).

    Downloads the first bytes of `url` and returns the
    `[is_pdf, response, details]` list built by `_process_pdf_header`.
    An empty body yields an ERROR entry with "Error reading PDF header".
    """
    CHUNK_SIZE = 10  # Number of bytes to read
    header = {
        "Range": f"bytes=0-{CHUNK_SIZE}",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
    }
    async with cls.async_session.get(
        url, headers=header, allow_redirects=True, ssl=verify
    ) as response:
        chunk = await response.content.read(CHUNK_SIZE)
        # aiohttp returns b"" at end of stream and never raises StopIteration,
        # so the empty body has to be detected explicitly.
        if not chunk:
            return [
                False,
                response,
                {
                    "status": ExtlinkChecked.Status.ERROR,
                    "message": "Error reading PDF header",
                },
            ]
        return BaseCollectionCrawler._process_pdf_header(chunk, response)

1050 

@classmethod
def check_pdf_link_validity(
    cls, url: str, verify=True
) -> list[bool | requests.Response | None | dict]:
    """
    Check the validity of a PDF link (blocking version).

    Streams the first chunk of `url` and returns the
    `[is_pdf, response, details]` list built by `_process_pdf_header`, or an
    ERROR entry when the body yields no chunk at all.
    """
    CHUNK_SIZE = 10  # Number of characters to fetch
    request_headers = {
        "Range": f"bytes=0-{CHUNK_SIZE}",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
    }
    with cls.get(
        url, headers=request_headers, allow_redirects=True, verify=verify, stream=True
    ) as response:
        body_chunks = response.iter_content(CHUNK_SIZE)
        try:
            first_chunk = next(body_chunks)
            return BaseCollectionCrawler._process_pdf_header(first_chunk, response)
        except StopIteration:
            # The body was empty: there is no header to inspect
            read_failure = {
                "status": ExtlinkChecked.Status.ERROR,
                "message": "Error reading PDF header",
            }
            return [False, response, read_failure]

1078 

@classmethod
async def check_extlink_validity(cls, extlink: "ExtLink"):
    """
    Method used by rot_monitoring to check if links have expired.

    "article-pdf" links, except those hosted on gdz.sub.uni-goettingen.de, are
    checked with `a_check_pdf_link_validity`. Any other link is considered
    valid when the HTTP status is 200 or 206.

    The outcome is always stored through `ExtlinkChecked.objects.aupdate_or_create`.
    SSL, connection and timeout errors are logged and stored as ERROR. Any
    other exception is also stored as ERROR (it used to be stored with the
    initial OK status) and is then re-raised, as before.
    """
    defaults: dict = {"date": datetime.now(), "status": ExtlinkChecked.Status.OK}
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0"
    }
    verify = bool(cls.verify)
    try:
        # For the GDZ links, we just check if the http response is 200 or 206
        if (
            extlink.rel == "article-pdf"
            and "gdz.sub.uni-goettingen.de" not in extlink.location
        ):
            _, response, message = await cls.a_check_pdf_link_validity(
                extlink.location, verify
            )
            defaults.update(message)
            defaults["http_status"] = response.status
        else:
            async with cls.async_session.get(
                url=extlink.location,
                headers=header,
                allow_redirects=True,
                ssl=verify,
            ) as response:
                defaults["http_status"] = response.status
                if response.status not in (200, 206):
                    defaults["status"] = ExtlinkChecked.Status.ERROR

    except aiohttp.ClientSSLError:
        cls.logger.error("SSL error for the url: %s", extlink.location)
        defaults["status"] = ExtlinkChecked.Status.ERROR
        defaults["message"] = "SSL error"
    except aiohttp.ClientConnectionError:
        cls.logger.error("Connection error for the url: %s", extlink.location)
        defaults["status"] = ExtlinkChecked.Status.ERROR
        defaults["message"] = "Connection error"
    except TimeoutError:
        cls.logger.error("Timeout error for the url: %s", extlink.location)
        defaults["status"] = ExtlinkChecked.Status.ERROR
        defaults["message"] = "Timeout error"
    except Exception as err:
        # The check did not complete: never persist the initial OK status.
        # The exception still propagates once `finally` has stored the result.
        cls.logger.error(
            "Unexpected %s for the url: %s", type(err).__name__, extlink.location
        )
        defaults["status"] = ExtlinkChecked.Status.ERROR
        defaults["message"] = f"Unexpected error: {type(err).__name__}"
        raise
    finally:
        try:
            await ExtlinkChecked.objects.aupdate_or_create(extlink=extlink, defaults=defaults)
            cls.logger.info(
                "DB Update, source: %s, url: %s", cls.source_domain, extlink.location
            )
        except IntegrityError:
            cls.logger.error(
                "Extlink was deleted, source: %s, url: %s", cls.source_domain, extlink.location
            )

1135 

def resolve_year_end(self, pid: str, default: int) -> int:
    """
    Return the last year to consider for `pid`.

    When `pid` has an entry in `self.pid_year_restrictions`, the result is the
    current year minus that entry; otherwise `default` is returned.
    """
    restrictions = self.pid_year_restrictions
    if pid not in restrictions:
        return default
    return datetime.now().year - restrictions[pid]