Coverage for src/crawler/base_crawler.py: 68%

516 statements  

coverage.py v7.9.0, created at 2025-10-29 14:25 +0000

1import logging 

2import time 

3from concurrent.futures import ( 

4 Executor, 

5 ThreadPoolExecutor, 

6) 

7from datetime import datetime, timedelta 

8from email.policy import EmailPolicy 

9 

10import regex 

11import requests 

12from bs4 import BeautifulSoup 

13from django.conf import settings 

14from django.contrib.auth.models import User 

15from django.utils import timezone 

16from langcodes import standardize_tag 

17from lingua import LanguageDetector, LanguageDetectorBuilder 

18from opentelemetry import trace 

19from ptf.cmds.xml.ckeditor.utils import ( 

20 build_jats_data_from_html_field, 

21) 

22from ptf.cmds.xml.jats.builder.references import ( 

23 get_article_title_xml, 

24 get_author_xml, 

25 get_fpage_xml, 

26 get_lpage_xml, 

27 get_source_xml, 

28 get_year_xml, 

29) 

30from ptf.cmds.xml.jats.jats_parser import JatsBase 

31from ptf.model_data import ( 

32 ArticleData, 

33 ContributorDict, 

34 IssueData, 

35 ResourceData, 

36 TitleDict, 

37 create_abstract, 

38 create_contributor, 

39 create_extlink, 

40 create_issuedata, 

41 create_publisherdata, 

42 create_titledata, 

43) 

44from ptf.model_data_converter import update_data_for_jats 

45from pylatexenc.latex2text import LatexNodes2Text 

46from pymongo.errors import DocumentTooLarge 

47from pysolr import SolrError 

48from requests.adapters import HTTPAdapter 

49from requests_cache import CachedSession, MongoCache 

50from urllib3 import Retry 

51 

52from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd 

53from crawler.models import Source 

54from crawler.types import CitationLiteral 

55from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection 

56 

57 

58class CrawlerTitleDict(TitleDict): 

59 title_tex: str | None 

60 

61 

62class BaseCollectionCrawler: 

63 """ 

64 Base class for the collection crawlers.

65 To create a crawler: 

66 1) derive a class from BaseCollectionCrawler and name it XXXCrawler 

67 2) override the functions parse_collection_content, parse_issue_content and parse_article_content 

68 3) update factory.py so that crawler_factory can return your new crawler 

69 """ 

70 

71 logger = logging.getLogger(__name__) 

72 tracer = trace.get_tracer(__name__) 

73 

74 source_name = "" 

75 source_domain = "" 

76 source_website = "" 

77 

78 issue_href = "" 

79 

80 collection = None 

81 source = None 

82 user = None 

83 session: requests.Session | CachedSession 

84 

85 verify = True 

86 headers = { 

87 "accept_encoding": "utf-8", 

88 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"), 

89 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"), 

90 } 

91 

92 next_allowed_request: float = time.time() 

93 

94 # seconds to wait between two http requests 

95 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90) 

96 # seconds to wait before aborting the connection (if no bytes are received)

97 requests_timeout = 60 

98 

99 latext_parser = LatexNodes2Text() 

100 

101 # Override the values in your concrete crawler if the formulas in text (titles, abstracts) 

102 # do not use the "$" to surround tex formulas 

103 delimiter_inline_formula = "$" 

104 delimiter_disp_formula = "$" 

105 

106 # HACK : Workaround for tests (monkeypatching) 

107 # We store the class here, so we can monkeypatch it when running tests 

108 # subCrawlers = { 

109 # LofplCrawler: None 

110 # } 

111 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {} 

112 

113 _language_detector: LanguageDetector | None = None 

114 _language_detector_builder = LanguageDetectorBuilder.from_all_languages() 

115 

116 force_refresh = False 

117 

118 # Whether to include headers in the requests cache key

119 match_headers = False 

120 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})" 

121 

122 # Set this to False on a per-crawler basis to allow inserting articles without PDFs

123 ignore_missing_pdf = True 

124 

125 database_executor: Executor 

126 

127 @classmethod 

128 def get_view_id(cls): 

129 return cls.source_domain 

130 

131 @property 

132 def language_detector(self): 

133 """Crawler Instance singleton for language builder. 

134 Late init of LanguageDetector to save on memory""" 

135 if not self._language_detector: 

136 self._language_detector = self._language_detector_builder.build() 

137 return self._language_detector 

138 

139 def __init__( 

140 self, 

141 *args, 

142 username: str, 

143 collection_id: str, 

144 collection_url: str, 

145 test_mode: bool = False, 

146 publisher: str = "mathdoc", 

147 force_refresh=False, 

148 ): 

149 for CrawlerClass in self.subCrawlers: 149 ↛ 150

150 self.subCrawlers[CrawlerClass] = CrawlerClass( 

151 *args, 

152 username=username, 

153 collection_id=collection_id, 

154 collection_url=collection_url, 

155 test_mode=test_mode, 

156 publisher=publisher, 

157 ) 

158 self.logger = logging.getLogger(__name__ + "." + self.source_domain) 

159 

160 self.username = username 

161 

162 self.collection_id = collection_id 

163 self.collection_url = ( 

164 collection_url # url of the collection. Ex: https://eudml.org/journal/10098 

165 ) 

166 

167 self.test_mode = test_mode 

168 self.publisher = publisher 

169 

170 self.session = requests.session() 

171 

172 # Skipped when running tests 

173 self.initialize() 

174 self.session.verify = self.verify 

175 self.force_refresh = force_refresh 

176 

177 # We implemented custom retry behaviour, so we don't want to make extra requests here 

178 retries = Retry( 

179 total=0, 

180 ) 

181 self.session.mount("https://", HTTPAdapter(max_retries=retries)) 

182 self.session.mount("http://", HTTPAdapter(max_retries=retries)) 

183 

184 self.database_executor = ThreadPoolExecutor( 

185 max_workers=2, thread_name_prefix="crawler_database_thread" 

186 ) 

187 

188 def initialize(self): 

189 """ 

190 Acts as a "second" init function to skip model accesses during test data generation 

191 """ 

192 self.collection = get_or_create_collection(self.collection_id) 

193 self.source = self.get_or_create_source() 

194 self.user = User.objects.get(username=self.username) 

195 self.session = CachedSession( 

196 match_headers=self.match_headers, 

197 headers=self.headers, 

198 backend=MongoCache( 

199 host=getattr(settings, "MONGO_HOSTNAME", "localhost"), 

200 ), 

201 expire_after=timedelta(days=30), 

202 ) 

203 

204 @classmethod 

205 def can_crawl(cls, pid: str) -> bool: 

206 return True 

207 

208 def parse_collection_content(self, content: str) -> list[IssueData]: 

209 """ 

210 Parse the HTML content with BeautifulSoup 

211 returns a list of xissues.

212 Override this function in a derived class 

213 """ 

214 return [] 

215 

216 def parse_issue_content(self, content: str, xissue: IssueData): 

217 """ 

218 Parse the HTML content with BeautifulSoup 

219 Fills the xissue.articles 

220 Override this function in a derived class. 

221 

222 Caveat: you are supposed to create the articles here. Please assign a PID to each article.

223 The PID can be `a` + article_index, e.g. `a0`, `a21` (see the sketch below).

224 """ 

225 

226 def parse_article_content( 

227 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str 

228 ) -> ArticleData | None: 

229 """ 

230 Parse the HTML content with BeautifulSoup 

231 returns the xarticle. 

232 Override this function in a derived class. 

233 The xissue is passed to the function in case the article page has issue information (ex: publisher) 

234 The article url is also passed as a parameter 

235 

236 Caveat: you are supposed to assign the article PIDs again here

237 """ 

238 return xarticle 

239 

240 @tracer.start_as_current_span("crawl_collection") 

241 def crawl_collection(self): 

242 # TODO: Comments, filter 

243 """ 

244 Crawl an entire collection. ptf.models.Container objects are created. 

245 - get the HTML content of the collection_url 

246 - parse the HTML content with beautifulsoup to extract the list of issues 

247 - merge the xissues (some Source can have multiple pages for 1 volume/issue. We create only 1 container) 

248 - crawl each issue if col_only is False 

249 - Returns the list of merged issues. 

250 It is a dict {pid: xissue}

251 The key is the pid of the merged issues. 

252 Ex: the source may have Volume 6 (2000) and Volume 6 (1999);

253 the pid is then made with 1999-2000__6_ 

254 """ 

255 

256 if self.source is None: 

257 raise RuntimeError("ERROR: the source is not set") 

258 

259 content = self.download_file(self.collection_url) 

260 if content: 

261 xissues = self.parse_collection_content(content) 

262 else: 

263 # download_file returns None (404) 

264 return None 

265 

266 """ 

267 Some collections split the same volumes in different pages 

268 Ex: Volume 6 (2000) and Volume 6 (1999) 

269 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000) 

270 """ 

271 # merged_xissues = self.merge_xissues(xissues) 

272 

273 xissues_dict = {str(i.pid): i for i in xissues} 

274 

275 return xissues_dict 

276 

277 @tracer.start_as_current_span("crawl_issue") 

278 def crawl_issue(self, xissue: IssueData): 

279 """ 

280 Crawl one web page of an issue.

281 - get the HTML content of the issue 

282 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata 

283 - crawl each article 

284 """ 

285 

286 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.

287 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute

288 

289 issue_url = xissue.url 

290 if issue_url is not None: 

291 if issue_url.endswith(".pdf"): 

292 add_pdf_link_to_xarticle(xissue, issue_url) 

293 xissue.url = None 

294 else: 

295 content = self.download_file(issue_url) 

296 with self.tracer.start_as_current_span("parse_issue_content"): 

297 self.parse_issue_content(content, xissue) 

298 

299 xarticles = xissue.articles 

300 

301 parsed_xarticles = [] 

302 

303 for xarticle in xarticles: 

304 parsed_xarticle = self.crawl_article(xarticle, xissue) 

305 if parsed_xarticle is not None: 

306 parsed_xarticles.append(parsed_xarticle) 

307 

308 xissue.articles = parsed_xarticles 

309 

310 article_has_pdf = self.article_has_pdf(xissue) 

311 

312 if self.ignore_missing_pdf: 

313 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)] 

314 

315 if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf): 

316 self.process_resource_metadata(xissue, resource_type="issue") 

317 self.database_executor.submit(self.add_xissue_into_database, xissue) 

318 

319 @staticmethod 

320 def article_has_source(art: ArticleData | IssueData): 

321 return ( 

322 next( 

323 (e_link for e_link in art.ext_links if e_link["rel"] == "source"), 

324 None, 

325 ) 

326 is not None 

327 ) 

328 

329 @staticmethod 

330 def article_has_pdf(art: ArticleData | IssueData): 

331 return ( 

332 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None) 

333 is not None 

334 ) 

335 

336 def crawl_article(self, xarticle: ArticleData, xissue: IssueData): 

337 # ARTICLE URL as an ExtLink (to display the link in the article page)

338 if xarticle.url is None: 

339 if not self.article_has_source(xarticle): 339 ↛ 349

340 if xissue.url: 

341 article_source = xissue.url 

342 else: 

343 article_source = self.collection_url 

344 ext_link = create_extlink() 

345 ext_link["rel"] = "source" 

346 ext_link["location"] = article_source 

347 ext_link["metadata"] = self.source_domain 

348 xarticle.ext_links.append(ext_link) 

349 return self.process_article_metadata(xarticle) 

350 

351 content = self.download_file(xarticle.url) 

352 if not content: 352 ↛ 353

353 return None 

354 xarticle.pid = f"{xissue.pid}_{xarticle.pid}" 

355 

356 try: 

357 with self.tracer.start_as_current_span("parse_article_content"): 

358 parsed_xarticle = self.parse_article_content( 

359 content, xissue, xarticle, xarticle.url 

360 ) 

361 except ValueError as e: 

362 self.logger.warning(e) 

363 self.logger.warning("Retrying in 5 mins while invalidating cache") 

364 time.sleep(5 * 60) 

365 content = self.download_file(xarticle.url, force_refresh=True) 

366 with self.tracer.start_as_current_span("parse_article_content"): 

367 parsed_xarticle = self.parse_article_content( 

368 content, xissue, xarticle, xarticle.url 

369 ) 

370 

371 if parsed_xarticle is None: 371 ↛ 372

372 return None 

373 

374 if parsed_xarticle.doi: 

375 parsed_xarticle.pid = ( 

376 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

377 ) 

378 

379 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url: 

380 ext_link = create_extlink() 

381 ext_link["rel"] = "source" 

382 ext_link["location"] = parsed_xarticle.url 

383 ext_link["metadata"] = self.source_domain 

384 parsed_xarticle.ext_links.append(ext_link) 

385 

386 # The article title may have formulas surrounded with '$' 

387 return self.process_article_metadata(parsed_xarticle) 

388 

389 def process_resource_metadata(self, xresource: ResourceData, resource_type="article"): 

390 tag = "article-title" if resource_type == "article" else "issue-title" 

391 

392 # Process title tex 

393 ckeditor_data = build_jats_data_from_html_field( 

394 xresource.title_tex, 

395 tag=tag, 

396 text_lang=xresource.lang, 

397 delimiter_inline=self.delimiter_inline_formula, 

398 delimiter_disp=self.delimiter_disp_formula, 

399 ) 

400 

401 xresource.title_html = ckeditor_data["value_html"] 

402 # xresource.title_tex = ckeditor_data["value_tex"] 

403 xresource.title_xml = ckeditor_data["value_xml"] 

404 

405 # Process trans_title tex 

406 if xresource.trans_title_tex: 406 ↛ 407

407 self.logger.warning( 

408 "Deprecation Notice : prefer using xresource.title directly instead of xresource.trans_title_tex" 

409 ) 

410 trans_title = self.create_trans_title( 

411 xresource_lang=xresource.lang, 

412 resource_type=resource_type, 

413 title_tex=xresource.trans_title_tex, 

414 lang=xresource.trans_lang, 

415 ) 

416 xresource.titles.append(trans_title) 

417 

418 abstracts_to_parse = [ 

419 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract" 

420 ] 

421 # abstract may have formulas surrounded with '$' 

422 if len(abstracts_to_parse) > 0: 

423 for xabstract in abstracts_to_parse: 

424 ckeditor_data = build_jats_data_from_html_field( 

425 xabstract["value_tex"], 

426 tag="abstract", 

427 text_lang=xabstract["lang"], 

428 resource_lang=xresource.lang, 

429 field_type="abstract", 

430 delimiter_inline=self.delimiter_inline_formula, 

431 delimiter_disp=self.delimiter_disp_formula, 

432 ) 

433 

434 xabstract["value_html"] = ckeditor_data["value_html"] 

435 # xabstract["value_tex"] = ckeditor_data["value_tex"] 

436 xabstract["value_xml"] = ckeditor_data["value_xml"] 

437 

438 return xresource 

439 

440 def process_article_metadata(self, xarticle: ArticleData): 

441 self.process_resource_metadata(xarticle) 

442 for bibitem in xarticle.bibitems: 

443 bibitem.type = "unknown" 

444 update_data_for_jats(xarticle, with_label=False) 

445 

446 return xarticle 

447 

448 def _wait_download_delay(self): 

449 delta = self.next_allowed_request - time.time() 

450 self.next_allowed_request = time.time() + self.requests_interval 

451 if delta > 0: 

452 self.logger.info(f"Waiting {int(delta)}s before making another request") 

453 time.sleep(delta) 

454 

455 def _get(self, url: str, force_refresh=False, headers={}) -> requests.Response: 

456 """ 

457 Wrapper around requests.get with delay based on the crawler class instance 

458 """ 

459 

460 kwargs = {} 

461 # self.session.cache.delete(urls=[url]) 

462 if isinstance(self.session, CachedSession): 

463 kwargs["force_refresh"] = force_refresh 

464 

465 try: 

466 response = self.session.get( 

467 url, 

468 headers={**self.headers, **headers}, 

469 timeout=self.requests_timeout, 

470 **kwargs, 

471 ) 

472 except DocumentTooLarge as e: 

473 self.logger.error(e) 

474 response = requests.get( 

475 url, headers={**self.headers, **headers}, timeout=self.requests_timeout 

476 ) 

477 

478 if not response.ok: 

479 raise requests.exceptions.HTTPError( 

480 f"Endpoint answered with code {response.status_code} : {url}", 

481 response=response, 

482 ) 

483 

484 if not getattr(response, "from_cache", False): 

485 self._wait_download_delay() 

486 return response 

487 

488 def download_file(self, url: str, force_refresh=False, headers={}): 

489 """ 

490 Downloads a page and returns its content (decoded string). 

491 This function handles retries and decoding 

492 """ 

493 attempts = 0 

494 while True: 

495 try: 

496 if attempts > 0: 

497 force_refresh = True 

498 response = self._get( 

499 url, force_refresh=force_refresh or self.force_refresh, headers=headers 

500 ) 

501 

502 if getattr(response, "from_cache", False): 

503 return response.text 

504 

505 content = self.decode_response(response) 

506 if content == "" or not content: 

507 raise requests.exceptions.HTTPError(response) 

508 

509 if isinstance(self.session, CachedSession): 

510 if "Expires" in response.headers: 

511 del response.headers["Expires"] 

512 response.headers.pop("Cache-Control", None)

513 try: 

514 self.session.cache.save_response(response) 

515 except DocumentTooLarge as e: 

516 self.logger.warning(e) 

517 return content 

518 except ( 

519 requests.ConnectionError, 

520 requests.ConnectTimeout, 

521 requests.exceptions.HTTPError, 

522 ) as e: 

523 if isinstance(e, requests.exceptions.HTTPError): 

524 # if Error 404 (resource not found) we skip 

525 status_code = e.response.status_code 

526 if status_code == 404: 

527 return None 

528 else: 

529 raise e 

530 if attempts > 3: 

531 raise e 

532 self.logger.debug(f"Caught error : {e}", extra={"url": url}) 

533 attempts += 1 

534 # 15 mins, 30 mins, 45 mins 

535 delay_minutes = attempts * 15 

536 self.logger.debug( 

537 f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})", 

538 extra={"url": url}, 

539 ) 

540 time.sleep(delay_minutes * 60) 

541 

542 def decode_response(self, response: requests.Response, encoding: str | None = None): 

543 """Override this if the content-type headers from the sources are advertising something else than the actual content 

544 SASA needs this""" 

545 # Force the encoding passed by the caller

546 if encoding: 

547 response.encoding = encoding 

548 return response.text 

549 

550 # Attempt to get encoding using HTTP headers 

551 content_type_tag = response.headers.get("Content-Type", None) 

552 

553 if content_type_tag: 553 ↛ 560

554 charset = self.parse_content_type_charset(content_type_tag) 

555 if charset: 555 ↛ 556

556 response.encoding = charset 

557 return response.text 

558 

559 # Attempt to get encoding using HTML meta charset tag 

560 soup = BeautifulSoup(response.text, "html5lib") 

561 charset = soup.select_one("meta[charset]") 

562 if charset: 

563 htmlencoding = charset.get("charset") 

564 if isinstance(htmlencoding, str): 564 ↛ 569

565 response.encoding = htmlencoding 

566 return response.text 

567 

568 # Attempt to get encoding using HTML meta content type tag 

569 content_type_tag = soup.select_one('meta[http-equiv="Content-Type"]') 

570 if content_type_tag: 

571 content_type = content_type_tag.get("content") 

572 if isinstance(content_type, str): 572 ↛ 578

573 charset = self.parse_content_type_charset(content_type) 

574 if charset: 574 ↛ 578

575 response.encoding = charset 

576 return response.text 

577 

578 return response.text 

579 

580 @staticmethod 

581 def parse_content_type_charset(content_type: str): 

582 header = EmailPolicy.header_factory("content-type", content_type) 

583 if "charset" in header.params: 

584 return header.params.get("charset") 

585 

586 @tracer.start_as_current_span("add_xissue_to_database") 

587 def add_xissue_into_database(self, xissue: IssueData): 

588 xissue.journal = self.collection 

589 xissue.source = self.source_domain 

590 

591 if xissue.year == "": 

592 raise ValueError("Failsafe : Cannot insert issue without a year") 

593 

594 xpub = create_publisherdata() 

595 xpub.name = self.publisher 

596 xissue.publisher = xpub 

597 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat() 

598 

599 attempt = 1 

600 success = False 

601 

602 while not success and attempt < 4: 

603 try: 

604 params = {"xissue": xissue, "use_body": False} 

605 cmd = addOrUpdateGDMLIssueXmlCmd(params) 

606 cmd.do() 

607 success = True 

608 self.logger.debug(f"Issue {xissue.pid} inserted in database") 

609 except SolrError: 

610 self.logger.warning( 

611 f"Encoutered SolrError while inserting issue {xissue.pid} in database" 

612 ) 

613 attempt += 1 

614 self.logger.debug(f"Attempt {attempt}. sleeping 10 seconds.") 

615 time.sleep(10) 

616 except Exception as e: 

617 self.logger.error( 

618 f"Got exception while attempting to insert {xissue.pid} in database : {e}" 

619 ) 

620 raise e 

621 

622 if success is False: 

623 raise ConnectionRefusedError("Cannot connect to SolR") 

624 

625 def get_metadata_using_citation_meta( 

626 self, 

627 xarticle: ArticleData, 

628 xissue: IssueData, 

629 soup: BeautifulSoup, 

630 what: list[CitationLiteral] = [], 

631 ): 

632 """ 

633 :param xarticle: the xarticle that will collect the metadata 

634 :param xissue: the xissue that will collect the publisher 

635 :param soup: the BeautifulSoup object of the article page

636 :param what: list of citation_* meta items to collect.

637 :return: None. The given xarticle is modified in place

638 """ 

639 

640 if "title" in what: 

641 # TITLE 

642 citation_title_node = soup.select_one("meta[name='citation_title']") 

643 if citation_title_node: 643 ↛ 648

644 title = citation_title_node.get("content") 

645 if isinstance(title, str): 645 ↛ 648

646 xarticle.title_tex = title 

647 

648 if "author" in what: 648 ↛ 677line 648 didn't jump to line 677 because the condition on line 648 was always true

649 # AUTHORS 

650 citation_author_nodes = soup.select("meta[name^='citation_author']") 

651 current_author: ContributorDict | None = None 

652 for citation_author_node in citation_author_nodes: 

653 if citation_author_node.get("name") == "citation_author": 

654 text_author = citation_author_node.get("content") 

655 if not isinstance(text_author, str): 655 ↛ 656

656 raise ValueError("Cannot parse author") 

657 if text_author == "": 657 ↛ 658

658 current_author = None 

659 continue 

660 current_author = create_contributor(role="author", string_name=text_author) 

661 xarticle.contributors.append(current_author) 

662 continue 

663 if current_author is None: 663 ↛ 664

664 self.logger.warning("Couldn't parse citation author") 

665 continue 

666 if citation_author_node.get("name") == "citation_author_institution": 

667 text_institution = citation_author_node.get("content") 

668 if not isinstance(text_institution, str): 668 ↛ 669

669 continue 

670 current_author["addresses"].append(text_institution) 

671 if citation_author_node.get("name") == "citation_author_ocrid": 671 ↛ 672

672 text_orcid = citation_author_node.get("content") 

673 if not isinstance(text_orcid, str): 

674 continue 

675 current_author["orcid"] = text_orcid 

676 

677 if "pdf" in what: 

678 # PDF 

679 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]') 

680 if citation_pdf_node: 

681 pdf_url = citation_pdf_node.get("content") 

682 if isinstance(pdf_url, str): 682 ↛ 685

683 add_pdf_link_to_xarticle(xarticle, pdf_url) 

684 

685 if "lang" in what: 

686 # LANG 

687 citation_lang_node = soup.select_one("meta[name='citation_language']") 

688 if citation_lang_node: 688 ↛ 694

689 # TODO: check other language code 

690 content_text = citation_lang_node.get("content") 

691 if isinstance(content_text, str): 691 ↛ 694

692 xarticle.lang = standardize_tag(content_text) 

693 

694 if "abstract" in what: 

695 # ABSTRACT 

696 abstract_node = soup.select_one("meta[name='citation_abstract']") 

697 if abstract_node is not None: 

698 abstract = abstract_node.get("content") 

699 if not isinstance(abstract, str): 699 ↛ 700

700 raise ValueError("Couldn't parse abstract from meta") 

701 abstract = BeautifulSoup(abstract, "html.parser").text 

702 lang = abstract_node.get("lang") 

703 if not isinstance(lang, str): 

704 lang = self.detect_language(abstract, xarticle) 

705 xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract)) 

706 

707 if "page" in what: 

708 # PAGES 

709 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']") 

710 if citation_fpage_node: 

711 page = citation_fpage_node.get("content") 

712 if isinstance(page, str): 712 ↛ 717

713 page = page.split("(")[0] 

714 if len(page) < 32: 714 ↛ 717

715 xarticle.fpage = page 

716 

717 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']") 

718 if citation_lpage_node: 

719 page = citation_lpage_node.get("content") 

720 if isinstance(page, str): 720 ↛ 725

721 page = page.split("(")[0] 

722 if len(page) < 32: 722 ↛ 725

723 xarticle.lpage = page 

724 

725 if "doi" in what: 

726 # DOI 

727 citation_doi_node = soup.select_one("meta[name='citation_doi']") 

728 if citation_doi_node: 

729 doi = citation_doi_node.get("content") 

730 if isinstance(doi, str): 730 ↛ 737

731 doi = doi.strip() 

732 pos = doi.find("10.") 

733 if pos > 0: 

734 doi = doi[pos:] 

735 xarticle.doi = doi 

736 

737 if "mr" in what: 

738 # MR 

739 citation_mr_node = soup.select_one("meta[name='citation_mr']") 

740 if citation_mr_node: 

741 mr = citation_mr_node.get("content") 

742 if isinstance(mr, str): 742 ↛ 748

743 mr = mr.strip() 

744 if mr.find("MR") == 0: 744 ↛ 748

745 mr = mr[2:] 

746 xarticle.extids.append(("mr-item-id", mr)) 

747 

748 if "zbl" in what: 

749 # ZBL 

750 citation_zbl_node = soup.select_one("meta[name='citation_zbl']") 

751 if citation_zbl_node: 

752 zbl = citation_zbl_node.get("content") 

753 if isinstance(zbl, str): 753 ↛ 759

754 zbl = zbl.strip() 

755 if zbl.find("Zbl") == 0: 755 ↛ 759

756 zbl = zbl[3:].strip() 

757 xarticle.extids.append(("zbl-item-id", zbl)) 

758 

759 if "publisher" in what: 

760 # PUBLISHER 

761 citation_publisher_node = soup.select_one("meta[name='citation_publisher']") 

762 if citation_publisher_node: 

763 pub = citation_publisher_node.get("content") 

764 if isinstance(pub, str): 764 ↛ 771

765 pub = pub.strip() 

766 if pub != "": 766 ↛ 771

767 xpub = create_publisherdata() 

768 xpub.name = pub 

769 xissue.publisher = xpub 

770 

771 if "keywords" in what: 

772 # KEYWORDS 

773 citation_kwd_nodes = soup.select("meta[name='citation_keywords']") 

774 for kwd_node in citation_kwd_nodes: 

775 kwds = kwd_node.get("content") 

776 if isinstance(kwds, str): 776 ↛ 774

777 kwds = kwds.split(",") 

778 for kwd in kwds: 

779 if kwd == "": 

780 continue 

781 kwd = kwd.strip() 

782 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd}) 

783 

784 if "references" in what: 

785 citation_references = soup.select("meta[name='citation_reference']") 

786 for index, tag in enumerate(citation_references): 

787 content = tag.get("content") 

788 if not isinstance(content, str): 788 ↛ 789

789 raise ValueError("Cannot parse citation_reference meta") 

790 label = str(index + 1) 

791 if regex.match(r"^\[\d+\].*", content): 791 ↛ 792

792 label = None 

793 xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label)) 

794 

795 def create_xissue( 

796 self, 

797 url: str | None, 

798 year: str, 

799 volume_number: str | None, 

800 issue_number: str | None = "1", 

801 vseries: str | None = None, 

802 ): 

803 if url is not None and url.endswith("/"): 

804 url = url[:-1] 

805 xissue = create_issuedata() 

806 xissue.url = url 

807 

808 xissue.pid = self.get_issue_pid( 

809 self.collection_id, year, volume_number, issue_number, vseries 

810 ) 

811 

812 xissue.year = year 

813 

814 if volume_number is not None: 

815 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number) 

816 

817 if issue_number is not None: 

818 xissue.number = issue_number.replace(",", "-") 

819 

820 if vseries is not None: 820 ↛ 821

821 xissue.vseries = vseries 

822 return xissue 

823 

824 def detect_language(self, text: str, article: ArticleData | None = None): 

825 if article and article.lang is not None and article.lang != "und": 

826 return article.lang 

827 

828 language = self.language_detector.detect_language_of(text) 

829 

830 if not language: 830 ↛ 831

831 return "und" 

832 return language.iso_code_639_1.name.lower() 

833 

834 def create_trans_title( 

835 self, 

836 resource_type: str, 

837 title_tex: str, 

838 lang: str, 

839 xresource_lang: str, 

840 title_type: str = "main", 

841 ): 

842 tag = "trans-article" if resource_type == "article" else "issue-title" 

843 

844 ckeditor_data = build_jats_data_from_html_field( 

845 title_tex, 

846 tag=tag, 

847 text_lang=lang, 

848 resource_lang=xresource_lang, 

849 delimiter_inline=self.delimiter_inline_formula, 

850 delimiter_disp=self.delimiter_disp_formula, 

851 ) 

852 

853 titledata = create_titledata( 

854 lang=lang, 

855 type="main", 

856 title_html=ckeditor_data["value_html"], 

857 title_xml=ckeditor_data["value_xml"], 

858 ) 

859 

860 return titledata 

861 

862 references_mapping = { 

863 "citation_title": get_article_title_xml, 

864 "citation_journal_title": get_source_xml, 

865 "citation_publication_date": get_year_xml, 

866 "citation_firstpage": get_fpage_xml, 

867 "citation_lastpage": get_lpage_xml, 

868 } 

869 

870 @classmethod 

871 def __parse_meta_citation_reference(cls, content: str, label=None): 

872 categories = content.split(";") 
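# A citation_reference meta content typically looks like (illustrative values):
#   "citation_author=Doe, J.; citation_title=On widgets; citation_firstpage=1; citation_lastpage=10"
# A content string without ";" is passed to JatsBase.bake_ref() unchanged.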

873 

874 if len(categories) == 1: 

875 return JatsBase.bake_ref(content, label=label) 

876 

877 citation_data = [c.split("=") for c in categories if "=" in c] 

878 del categories 

879 

880 xml_string = "" 

881 authors_parsed = False 

882 authors_strings = [] 

883 for data in citation_data: 

884 key = data[0].strip() 

885 citation_content = data[1] 

886 if key == "citation_author": 

887 authors_strings.append(get_author_xml(template_str=citation_content)) 

888 continue 

889 elif not authors_parsed: 

890 xml_string += ", ".join(authors_strings) 

891 authors_parsed = True 

892 

893 if key in cls.references_mapping: 

894 xml_string += " " + cls.references_mapping[key](citation_content) 

895 

896 return JatsBase.bake_ref(xml_string, label=label) 

897 

898 @classmethod 

899 def get_or_create_source(cls): 

900 source, created = Source.objects.get_or_create( 

901 domain=cls.source_domain, 

902 defaults={ 

903 "name": cls.source_name, 

904 "website": cls.source_website, 

905 }, 

906 ) 

907 if created: 907 ↛ 908

908 source.save() 

909 return source 

910 

911 @staticmethod 

912 def get_issue_pid( 

913 collection_id: str, 

914 year: str, 

915 volume_number: str | None = None, 

916 issue_number: str | None = None, 

917 series: str | None = None, 

918 ): 

919 # Replace any non-word character with an underscore 

920 pid = f"{collection_id}_{year}" 

921 if series is not None: 921 ↛ 922

922 pid += f"_{series}" 

923 if volume_number is not None: 

924 pid += f"_{volume_number}" 

925 if issue_number is not None: 

926 pid += f"_{issue_number}" 

927 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid)) 

928 return pid 
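# For example (illustrative): get_issue_pid("XXX", "2000", "6", "1") -> "XXX_2000_6_1",
# and get_issue_pid("XXX", "1999-2000", "6") -> "XXX_1999-2000_6".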

929 

930 @staticmethod 

931 def set_pages(article: ArticleData, pages: str, separator: str = "-"): 

932 pages_split = pages.split(separator) 

933 if len(pages_split) == 0: 933 ↛ 934

934 article.page_range = pages 

935 if len(pages_split) > 0: 935 ↛ exit

936 if pages[0].isnumeric(): 936 ↛ exit

937 article.fpage = pages_split[0] 

938 if ( 

939 len(pages_split) > 1 

940 and pages_split[0] != pages_split[1] 

941 and pages_split[1].isnumeric() 

942 ): 

943 article.lpage = pages_split[1]
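# For example (illustrative): set_pages(xarticle, "12-34") sets fpage="12" and lpage="34";
# set_pages(xarticle, "123") sets only fpage="123".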