Coverage for src/crawler/base_crawler.py: 70%

493 statements  

coverage.py v7.9.0, created at 2025-09-16 12:41 +0000

1import logging 

2import time 

3from concurrent.futures import ( 

4 Executor, 

5 ThreadPoolExecutor, 

6) 

7from datetime import datetime, timedelta 

8from email.policy import EmailPolicy 

9 

10import regex 

11import requests 

12from bs4 import BeautifulSoup 

13from django.conf import settings 

14from django.contrib.auth.models import User 

15from django.utils import timezone 

16from langcodes import standardize_tag 

17from lingua import LanguageDetectorBuilder 

18from opentelemetry import trace 

19from ptf.cmds.xml.ckeditor.utils import ( 

20 build_jats_data_from_html_field, 

21) 

22from ptf.cmds.xml.jats.builder.references import ( 

23 get_article_title_xml, 

24 get_author_xml, 

25 get_fpage_xml, 

26 get_lpage_xml, 

27 get_source_xml, 

28 get_year_xml, 

29) 

30from ptf.cmds.xml.jats.jats_parser import JatsBase 

31from ptf.model_data import ( 

32 ArticleData, 

33 ContributorDict, 

34 IssueData, 

35 ResourceData, 

36 TitleDict, 

37 create_abstract, 

38 create_contributor, 

39 create_extlink, 

40 create_issuedata, 

41 create_publisherdata, 

42 create_titledata, 

43) 

44from ptf.model_data_converter import update_data_for_jats 

45from pylatexenc.latex2text import LatexNodes2Text 

46from pymongo.errors import DocumentTooLarge 

47from pysolr import SolrError 

48from requests.adapters import HTTPAdapter 

49from requests_cache import CachedSession, MongoCache 

50from urllib3 import Retry 

51 

52from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd 

53from crawler.models import Source 

54from crawler.types import CitationLiteral 

55from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection 

56 

57# TODO: pass a class factory instead of a dependency to a site 


59 

60 

61class CrawlerTitleDict(TitleDict): 

62 title_tex: str | None 

63 

64 

65class BaseCollectionCrawler: 

66 """ 

67 Base collection for the crawlers. 

68 To create a crawler: 

69 1) derive a class from BaseCollectionCrawler and name it XXXCrawler 

70 2) override the functions parse_collection_content, parse_issue_content and parse_article_content 

71 3) update factory.py so that crawler_factory can return your new crawler 

72 """ 

73 

74 logger = logging.getLogger(__name__) 

75 tracer = trace.get_tracer(__name__) 

76 

77 source_name = "" 

78 source_domain = "" 

79 source_website = "" 

80 

81 issue_href = "" 

82 

83 collection = None 

84 source = None 

85 user = None 

86 session: requests.Session | CachedSession 

87 

88 verify = True 

89 headers = { 

90 "accept_encoding": "utf-8", 

91 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"), 

92 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"), 

93 } 

94 

95 next_allowed_request: float = time.time() 

96 

97 # seconds to wait between two http requests 

98 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90) 

99 # seconds to wait before aborting the connection (if no bytes are received)

100 requests_timeout = 10 

101 

102 latext_parser = LatexNodes2Text() 

103 

104 # Override the values in your concrete crawler if the formulas in text (titles, abstracts) 

105 # do not use the "$" to surround tex formulas 

106 delimiter_inline_formula = "$" 

107 delimiter_disp_formula = "$" 

108 

109 # HACK : Workaround for tests (monkeypatching) 

110 # We store the class here, so we can monkeypatch it when running tests 

111 # subCrawlers = { 

112 # LofplCrawler: None 

113 # } 

114 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {} 

115 

116 language_detector = LanguageDetectorBuilder.from_all_languages().build() 

117 

118 force_refresh = False 

119 

120 # Whether to include headers in the requests cache key

121 match_headers = False 

122 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})" 

123 

124 # Set this to False on a per-crawler basis to allow inserting articles without PDFs

125 ignore_missing_pdf = True 

126 

127 database_executor: Executor 

128 

129 @classmethod 

130 def get_view_id(cls): 

131 return cls.source_domain 

132 

133 def __init__( 

134 self, 

135 *args, 

136 username: str, 

137 collection_id: str, 

138 collection_url: str, 

139 test_mode: bool = False, 

140 publisher: str = "mathdoc", 

141 force_refresh=False, 

142 ): 

143 for CrawlerClass in self.subCrawlers:  143 ↛ 144: line 143 didn't jump to line 144 because the loop on line 143 never started

144 self.subCrawlers[CrawlerClass] = CrawlerClass( 

145 *args, 

146 username=username, 

147 collection_id=collection_id, 

148 collection_url=collection_url, 

149 test_mode=test_mode, 

150 publisher=publisher, 

151 ) 

152 self.logger = logging.getLogger(__name__ + "." + self.source_domain) 

153 

154 self.username = username 

155 

156 self.collection_id = collection_id 

157 self.collection_url = ( 

158 collection_url # url of the collection. Ex: https://eudml.org/journal/10098 

159 ) 

160 

161 self.test_mode = test_mode 

162 self.publisher = publisher 

163 

164 self.session = requests.session() 

165 

166 # Skipped when running tests 

167 self.initialize() 

168 self.session.verify = self.verify 

169 self.force_refresh = force_refresh 

170 

171 # We implemented custom retry behaviour, so we don't want to make extra requests here 

172 retries = Retry( 

173 total=0, 

174 ) 

175 self.session.mount("https://", HTTPAdapter(max_retries=retries)) 

176 self.session.mount("http://", HTTPAdapter(max_retries=retries)) 

177 

178 self.database_executor = ThreadPoolExecutor( 

179 max_workers=2, thread_name_prefix="crawler_database_thread" 

180 ) 

181 

182 def initialize(self): 

183 """ 

184 Acts as a "second" init function to skip model accesses during test data generation 

185 """ 

186 self.collection = get_or_create_collection(self.collection_id) 

187 self.source = self.get_or_create_source() 

188 self.user = User.objects.get(username=self.username) 

189 self.session = CachedSession( 

190 match_headers=self.match_headers, 

191 headers=self.headers, 

192 backend=MongoCache( 

193 host=getattr(settings, "MONGO_HOSTNAME", "localhost"), 

194 ), 

195 expire_after=timedelta(days=30), 

196 ) 

197 

198 @classmethod 

199 def can_crawl(cls, pid: str) -> bool: 

200 return True 

201 

202 def parse_collection_content(self, content: str) -> list[IssueData]: 

203 """ 

204 Parse the HTML content with BeautifulSoup 

205 returns a list of xissues.

206 Override this function in a derived class 

207 """ 

208 return [] 
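# A sketch of a possible override, assuming the collection page lists one link per issue;
# the CSS selector and the parsed fields are illustrative:
#
#     def parse_collection_content(self, content):
#         soup = BeautifulSoup(content, "html.parser")
#         xissues = []
#         for link in soup.select("a.issue-link"):
#             year = cleanup_str(link.text)  # e.g. "1999"
#             xissues.append(self.create_xissue(link.get("href"), year, None, None))
#         return xissues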

209 

210 def parse_issue_content(self, content: str, xissue: IssueData): 

211 """ 

212 Parse the HTML content with BeautifulSoup 

213 Fills the xissue.articles 

214 Override this function in a derived class. 

215 

216 CAVEAT: You are supposed to create the articles here. Please assign a PID to each article.

217 The PID can be `a` + article_index, e.g. `a0`, `a21`

218 """ 

219 

220 def parse_article_content( 

221 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str 

222 ) -> ArticleData | None: 

223 """ 

224 Parse the HTML content with BeautifulSoup 

225 returns the xarticle. 

226 Override this function in a derived class. 

227 The xissue is passed to the function in case the article page has issue information (ex: publisher) 

228 The article url is also passed as a parameter 

229 

230 CAVEAT: You are supposed to assign the article PID again here

231 """ 

232 return xarticle 

233 

234 @tracer.start_as_current_span("crawl_collection") 

235 def crawl_collection(self): 

236 # TODO: Comments, filter 

237 """ 

238 Crawl an entire collection. ptf.models.Container objects are created. 

239 - get the HTML content of the collection_url 

240 - parse the HTML content with beautifulsoup to extract the list of issues 

241 - merge the xissues (some Source can have multiple pages for 1 volume/issue. We create only 1 container) 

242 - crawl each issue if col_only is False 

243 - Returns the list of merged issues. 

244 It is a dict {pid: xissue}.

245 The key is the pid of the merged issues.

246 Ex: the source may have Volume 6 (2000) and Volume 6 (1999);

247 the pid is then made with 1999-2000__6_

248 """ 

249 

250 if self.source is None: 

251 raise RuntimeError("ERROR: the source is not set") 

252 

253 content = self.download_file(self.collection_url) 

254 xissues = self.parse_collection_content(content) 

255 

256 """ 

257 Some collections split the same volumes in different pages 

258 Ex: Volume 6 (2000) and Volume 6 (1999) 

259 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000) 

260 """ 

261 # merged_xissues = self.merge_xissues(xissues) 

262 

263 xissues_dict = {str(i.pid): i for i in xissues} 

264 

265 return xissues_dict 

266 

267 @tracer.start_as_current_span("crawl_issue") 

268 def crawl_issue(self, xissue: IssueData): 

269 """ 

270 Crawl one web page of an issue.

271 - get the HTML content of the issue 

272 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata 

273 - crawl each article 

274 """ 

275 

276 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.

277 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute

278 

279 issue_url = xissue.url 

280 if issue_url is not None: 

281 if issue_url.endswith(".pdf"): 

282 add_pdf_link_to_xarticle(xissue, issue_url) 

283 xissue.url = None 

284 else: 

285 content = self.download_file(issue_url) 

286 with self.tracer.start_as_current_span("parse_issue_content"): 

287 self.parse_issue_content(content, xissue) 

288 

289 xarticles = xissue.articles 

290 

291 parsed_xarticles = [] 

292 

293 for xarticle in xarticles: 

294 parsed_xarticle = self.crawl_article(xarticle, xissue) 

295 if parsed_xarticle is not None: 

296 parsed_xarticles.append(parsed_xarticle) 

297 

298 xissue.articles = parsed_xarticles 

299 

300 article_has_pdf = self.article_has_pdf(xissue) 

301 

302 if self.ignore_missing_pdf: 

303 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)] 

304 

305 if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf): 

306 self.process_resource_metadata(xissue, resource_type="issue") 

307 self.database_executor.submit(self.add_xissue_into_database, xissue) 

308 

309 @staticmethod 

310 def article_has_source(art: ArticleData | IssueData): 

311 return ( 

312 next( 

313 (e_link for e_link in art.ext_links if e_link["rel"] == "source"), 

314 None, 

315 ) 

316 is not None 

317 ) 

318 

319 @staticmethod 

320 def article_has_pdf(art: ArticleData | IssueData): 

321 return ( 

322 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None) 

323 is not None 

324 ) 

325 

326 def crawl_article(self, xarticle: ArticleData, xissue: IssueData): 

327 # ARTICLE URL as an ExtLink (to display the link in the article page)

328 if xarticle.url is None: 

329 if not self.article_has_source(xarticle):  329 ↛ 339: line 329 didn't jump to line 339 because the condition on line 329 was always true

330 if xissue.url: 

331 article_source = xissue.url 

332 else: 

333 article_source = self.collection_url 

334 ext_link = create_extlink() 

335 ext_link["rel"] = "source" 

336 ext_link["location"] = article_source 

337 ext_link["metadata"] = self.source_domain 

338 xarticle.ext_links.append(ext_link) 

339 return self.process_article_metadata(xarticle) 

340 

341 content = self.download_file(xarticle.url) 

342 xarticle.pid = f"{xissue.pid}_{xarticle.pid}" 

343 

344 try: 

345 with self.tracer.start_as_current_span("parse_article_content"): 

346 parsed_xarticle = self.parse_article_content( 

347 content, xissue, xarticle, xarticle.url 

348 ) 

349 except ValueError as e: 

350 self.logger.warning(e) 

351 self.logger.warning("Retrying while invalidating cache") 

352 content = self.download_file(xarticle.url, force_refresh=True) 

353 with self.tracer.start_as_current_span("parse_article_content"): 

354 parsed_xarticle = self.parse_article_content( 

355 content, xissue, xarticle, xarticle.url 

356 ) 

357 

358 if parsed_xarticle is None:  358 ↛ 359: line 358 didn't jump to line 359 because the condition on line 358 was never true

359 return None 

360 

361 if parsed_xarticle.doi: 

362 parsed_xarticle.pid = ( 

363 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

364 ) 

365 

366 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url: 

367 ext_link = create_extlink() 

368 ext_link["rel"] = "source" 

369 ext_link["location"] = parsed_xarticle.url 

370 ext_link["metadata"] = self.source_domain 

371 parsed_xarticle.ext_links.append(ext_link) 

372 

373 # The article title may have formulas surrounded with '$' 

374 return self.process_article_metadata(parsed_xarticle) 

375 

376 def process_resource_metadata(self, xresource: ResourceData, resource_type="article"): 

377 tag = "article-title" if resource_type == "article" else "issue-title" 

378 

379 # Process title tex 

380 ckeditor_data = build_jats_data_from_html_field( 

381 xresource.title_tex, 

382 tag=tag, 

383 text_lang=xresource.lang, 

384 delimiter_inline=self.delimiter_inline_formula, 

385 delimiter_disp=self.delimiter_disp_formula, 

386 ) 

387 

388 xresource.title_html = ckeditor_data["value_html"] 

389 # xresource.title_tex = ckeditor_data["value_tex"] 

390 xresource.title_xml = ckeditor_data["value_xml"] 

391 

392 # Process trans_title tex 

393 if xresource.trans_title_tex:  393 ↛ 394: line 393 didn't jump to line 394 because the condition on line 393 was never true

394 self.logger.warning( 

395 "Deprecation Notice : prefer using xresource.title directly instead of xresource.trans_title_tex" 

396 ) 

397 trans_title = self.create_trans_title( 

398 xresource_lang=xresource.lang, 

399 resource_type=resource_type, 

400 title_tex=xresource.trans_title_tex, 

401 lang=xresource.trans_lang, 

402 ) 

403 xresource.titles.append(trans_title) 

404 

405 abstracts_to_parse = [ 

406 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract" 

407 ] 

408 # abstract may have formulas surrounded with '$' 

409 if len(abstracts_to_parse) > 0: 

410 for xabstract in abstracts_to_parse: 

411 ckeditor_data = build_jats_data_from_html_field( 

412 xabstract["value_tex"], 

413 tag="abstract", 

414 text_lang=xabstract["lang"], 

415 resource_lang=xresource.lang, 

416 field_type="abstract", 

417 delimiter_inline=self.delimiter_inline_formula, 

418 delimiter_disp=self.delimiter_disp_formula, 

419 ) 

420 

421 xabstract["value_html"] = ckeditor_data["value_html"] 

422 # xabstract["value_tex"] = ckeditor_data["value_tex"] 

423 xabstract["value_xml"] = ckeditor_data["value_xml"] 

424 

425 return xresource 

426 

427 def process_article_metadata(self, xarticle: ArticleData): 

428 self.process_resource_metadata(xarticle) 

429 for bibitem in xarticle.bibitems: 

430 bibitem.type = "unknown" 

431 update_data_for_jats(xarticle, with_label=False) 

432 

433 return xarticle 

434 

435 def _wait_download_delay(self, url: str, force_refresh=False): 

436 # If we already have a key, we can skip the timeout 

437 if isinstance(self.session, CachedSession): 

438 if self.session.cache.contains(url=url) and not force_refresh: 

439 return 

440 

441 delta = self.next_allowed_request - time.time() 

442 if delta > 0: 

443 self.logger.info(f"Waiting {int(delta)}s before making another request") 

444 time.sleep(delta) 

445 self.next_allowed_request = time.time() + self.requests_interval 

446 

447 def _get(self, url: str, force_refresh=False, headers={}) -> requests.Response: 

448 """ 

449 Wrapper around requests.get with delay based on the crawler class instance 

450 """ 

451 

452 self._wait_download_delay(url, force_refresh) 

453 

454 kwargs = {} 

455 # self.session.cache.delete(urls=[url]) 

456 if isinstance(self.session, CachedSession): 

457 kwargs["force_refresh"] = force_refresh 

458 

459 try: 

460 response = self.session.get( 

461 url, 

462 headers={**self.headers, **headers}, 

463 timeout=self.requests_timeout, 

464 **kwargs, 

465 ) 

466 except DocumentTooLarge as e: 

467 self.logger.error(e) 

468 response = requests.get( 

469 url, headers={**self.headers, **headers}, timeout=self.requests_timeout 

470 ) 

471 

472 if not response.ok: 

473 raise requests.exceptions.HTTPError( 

474 f"Endpoint answered with code {response.status_code} : {url}", 

475 response=response, 

476 ) 

477 

478 return response 

479 

480 def download_file(self, url: str, force_refresh=False, headers={}): 

481 """ 

482 Downloads a page and returns its content (decoded string). 

483 This function handles retries and decoding 

484 """ 

485 attempts = 0 

486 while True: 

487 try: 

488 if attempts > 0: 

489 force_refresh = True 

490 response = self._get( 

491 url, force_refresh=force_refresh or self.force_refresh, headers=headers 

492 ) 

493 content = self.decode_response(response) 

494 if content == "" or not content: 

495 raise requests.exceptions.HTTPError(response) 

496 if isinstance(self.session, CachedSession): 

497 if "Expires" in response.headers: 

498 del response.headers["Expires"] 

499 del response.headers["Cache-Control"] 

500 try: 

501 self.session.cache.save_response(response) 

502 except DocumentTooLarge as e: 

503 self.logger.warning(e) 

504 return content 

505 except ( 

506 requests.ConnectionError, 

507 requests.ConnectTimeout, 

508 requests.exceptions.HTTPError, 

509 ) as e: 

510 if attempts > 3: 

511 raise e 

512 self.logger.debug(f"Caught error : {e}", extra={"url": url}) 

513 attempts += 1 

514 # 15 mins, 30 mins, 45 mins 

515 delay_minutes = attempts * 15 

516 self.logger.debug( 

517 f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})", 

518 extra={"url": url}, 

519 ) 

520 time.sleep(delay_minutes * 60) 

521 

522 def decode_response(self, response: requests.Response, encoding: str | None = None): 

523 """Override this if the content-type headers from the sources are advertising something else than the actual content 

524 SASA needs this""" 

525 # Force the given encoding if one is provided

526 if encoding: 

527 response.encoding = encoding 

528 return response.text 

529 

530 # Attempt to get encoding using HTTP headers 

531 content_type_tag = response.headers.get("Content-Type", None) 

532 

533 if content_type_tag:  533 ↛ 540: line 533 didn't jump to line 540 because the condition on line 533 was always true

534 charset = self.parse_content_type_charset(content_type_tag) 

535 if charset: 

536 response.encoding = charset 

537 return response.text 

538 

539 # Attempt to get encoding using HTML meta charset tag 

540 soup = BeautifulSoup(response.text, "html5lib") 

541 charset = soup.select_one("meta[charset]") 

542 if charset: 

543 htmlencoding = charset.get("charset") 

544 if isinstance(htmlencoding, str):  544 ↛ 549: line 544 didn't jump to line 549 because the condition on line 544 was always true

545 response.encoding = htmlencoding 

546 return response.text 

547 

548 # Attempt to get encoding using HTML meta content type tag 

549 content_type_tag = soup.select_one('meta[http-equiv="Content-Type"]') 

550 if content_type_tag: 

551 content_type = content_type_tag.get("content") 

552 if isinstance(content_type, str):  552 ↛ 558: line 552 didn't jump to line 558 because the condition on line 552 was always true

553 charset = self.parse_content_type_charset(content_type) 

554 if charset:  554 ↛ 558: line 554 didn't jump to line 558 because the condition on line 554 was always true

555 response.encoding = charset 

556 return response.text 

557 

558 return response.text 

559 

560 @staticmethod 

561 def parse_content_type_charset(content_type: str): 

562 header = EmailPolicy.header_factory("content-type", content_type) 

563 if "charset" in header.params: 

564 return header.params.get("charset") 
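# Usage sketch (header values are illustrative):
#
#     BaseCollectionCrawler.parse_content_type_charset("text/html; charset=windows-1250")
#     # -> "windows-1250"
#     BaseCollectionCrawler.parse_content_type_charset("text/html")
#     # -> None (no charset parameter)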

565 

566 @tracer.start_as_current_span("add_xissue_to_database") 

567 def add_xissue_into_database(self, xissue: IssueData): 

568 xissue.journal = self.collection 

569 xissue.source = self.source_domain 

570 

571 if xissue.year == "": 

572 raise ValueError("Failsafe : Cannot insert issue without a year") 

573 

574 xpub = create_publisherdata() 

575 xpub.name = self.publisher 

576 xissue.publisher = xpub 

577 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat() 

578 

579 attempt = 1 

580 success = False 

581 

582 while not success and attempt < 4: 

583 try: 

584 params = {"xissue": xissue, "use_body": False} 

585 cmd = addOrUpdateGDMLIssueXmlCmd(params) 

586 cmd.do() 

587 success = True 

588 self.logger.debug(f"Issue {xissue.pid} inserted in database") 

589 except SolrError: 

590 self.logger.warning( 

591 f"Encoutered SolrError while inserting issue {xissue.pid} in database" 

592 ) 

593 attempt += 1 

594 self.logger.debug(f"Attempt {attempt}. sleeping 10 seconds.") 

595 time.sleep(10) 

596 

597 if success is False: 

598 raise ConnectionRefusedError("Cannot connect to SolR") 

599 

600 def get_metadata_using_citation_meta( 

601 self, 

602 xarticle: ArticleData, 

603 xissue: IssueData, 

604 soup: BeautifulSoup, 

605 what: list[CitationLiteral] = [], 

606 ): 

607 """ 

608 :param xarticle: the xarticle that will collect the metadata 

609 :param xissue: the xissue that will collect the publisher 

610 :param soup: the BeautifulSoup object of the article page

611 :param what: list of citation_* items to collect.

612 :return: None. The given article is modified 

613 """ 

614 

615 if "title" in what: 

616 # TITLE 

617 citation_title_node = soup.select_one("meta[name='citation_title']") 

618 if citation_title_node:  618 ↛ 623: line 618 didn't jump to line 623 because the condition on line 618 was always true

619 title = citation_title_node.get("content") 

620 if isinstance(title, str):  620 ↛ 623: line 620 didn't jump to line 623 because the condition on line 620 was always true

621 xarticle.title_tex = title 

622 

623 if "author" in what: 623 ↛ 652line 623 didn't jump to line 652 because the condition on line 623 was always true

624 # AUTHORS 

625 citation_author_nodes = soup.select("meta[name^='citation_author']") 

626 current_author: ContributorDict | None = None 

627 for citation_author_node in citation_author_nodes: 

628 if citation_author_node.get("name") == "citation_author": 

629 text_author = citation_author_node.get("content") 

630 if not isinstance(text_author, str):  630 ↛ 631: line 630 didn't jump to line 631 because the condition on line 630 was never true

631 raise ValueError("Cannot parse author") 

632 if text_author == "": 632 ↛ 633line 632 didn't jump to line 633 because the condition on line 632 was never true

633 current_author = None 

634 continue 

635 current_author = create_contributor(role="author", string_name=text_author) 

636 xarticle.contributors.append(current_author) 

637 continue 

638 if current_author is None:  638 ↛ 639: line 638 didn't jump to line 639 because the condition on line 638 was never true

639 self.logger.warning("Couldn't parse citation author") 

640 continue 

641 if citation_author_node.get("name") == "citation_author_institution": 

642 text_institution = citation_author_node.get("content") 

643 if not isinstance(text_institution, str):  643 ↛ 644: line 643 didn't jump to line 644 because the condition on line 643 was never true

644 continue 

645 current_author["addresses"].append(text_institution) 

646 if citation_author_node.get("name") == "citation_author_orcid":  646 ↛ 647: line 646 didn't jump to line 647 because the condition on line 646 was never true

647 text_orcid = citation_author_node.get("content") 

648 if not isinstance(text_orcid, str): 

649 continue 

650 current_author["orcid"] = text_orcid 

651 

652 if "pdf" in what: 

653 # PDF 

654 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]') 

655 if citation_pdf_node: 

656 pdf_url = citation_pdf_node.get("content") 

657 if isinstance(pdf_url, str):  657 ↛ 660: line 657 didn't jump to line 660 because the condition on line 657 was always true

658 add_pdf_link_to_xarticle(xarticle, pdf_url) 

659 

660 if "lang" in what: 

661 # LANG 

662 citation_lang_node = soup.select_one("meta[name='citation_language']") 

663 if citation_lang_node:  663 ↛ 669: line 663 didn't jump to line 669 because the condition on line 663 was always true

664 # TODO: check other language code 

665 content_text = citation_lang_node.get("content") 

666 if isinstance(content_text, str):  666 ↛ 669: line 666 didn't jump to line 669 because the condition on line 666 was always true

667 xarticle.lang = standardize_tag(content_text) 

668 

669 if "abstract" in what: 

670 # ABSTRACT 

671 abstract_node = soup.select_one("meta[name='citation_abstract']") 

672 if abstract_node is not None: 

673 abstract = abstract_node.get("content") 

674 if not isinstance(abstract, str):  674 ↛ 675: line 674 didn't jump to line 675 because the condition on line 674 was never true

675 raise ValueError("Couldn't parse abstract from meta") 

676 abstract = BeautifulSoup(abstract, "html.parser").text 

677 lang = abstract_node.get("lang") 

678 if not isinstance(lang, str): 

679 lang = self.detect_language(abstract, xarticle) 

680 xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract)) 

681 

682 if "page" in what: 

683 # PAGES 

684 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']") 

685 if citation_fpage_node: 

686 page = citation_fpage_node.get("content") 

687 if isinstance(page, str):  687 ↛ 692: line 687 didn't jump to line 692 because the condition on line 687 was always true

688 page = page.split("(")[0] 

689 if len(page) < 32:  689 ↛ 692: line 689 didn't jump to line 692 because the condition on line 689 was always true

690 xarticle.fpage = page 

691 

692 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']") 

693 if citation_lpage_node: 

694 page = citation_lpage_node.get("content") 

695 if isinstance(page, str):  695 ↛ 700: line 695 didn't jump to line 700 because the condition on line 695 was always true

696 page = page.split("(")[0] 

697 if len(page) < 32:  697 ↛ 700: line 697 didn't jump to line 700 because the condition on line 697 was always true

698 xarticle.lpage = page 

699 

700 if "doi" in what: 

701 # DOI 

702 citation_doi_node = soup.select_one("meta[name='citation_doi']") 

703 if citation_doi_node: 

704 doi = citation_doi_node.get("content") 

705 if isinstance(doi, str):  705 ↛ 712: line 705 didn't jump to line 712 because the condition on line 705 was always true

706 doi = doi.strip() 

707 pos = doi.find("10.") 

708 if pos > 0: 

709 doi = doi[pos:] 

710 xarticle.doi = doi 

711 

712 if "mr" in what: 

713 # MR 

714 citation_mr_node = soup.select_one("meta[name='citation_mr']") 

715 if citation_mr_node: 

716 mr = citation_mr_node.get("content") 

717 if isinstance(mr, str):  717 ↛ 723: line 717 didn't jump to line 723 because the condition on line 717 was always true

718 mr = mr.strip() 

719 if mr.find("MR") == 0: 719 ↛ 723line 719 didn't jump to line 723 because the condition on line 719 was always true

720 mr = mr[2:] 

721 xarticle.extids.append(("mr-item-id", mr)) 

722 

723 if "zbl" in what: 

724 # ZBL 

725 citation_zbl_node = soup.select_one("meta[name='citation_zbl']") 

726 if citation_zbl_node: 

727 zbl = citation_zbl_node.get("content") 

728 if isinstance(zbl, str):  728 ↛ 734: line 728 didn't jump to line 734 because the condition on line 728 was always true

729 zbl = zbl.strip() 

730 if zbl.find("Zbl") == 0: 730 ↛ 734line 730 didn't jump to line 734 because the condition on line 730 was always true

731 zbl = zbl[3:].strip() 

732 xarticle.extids.append(("zbl-item-id", zbl)) 

733 

734 if "publisher" in what: 

735 # PUBLISHER 

736 citation_publisher_node = soup.select_one("meta[name='citation_publisher']") 

737 if citation_publisher_node: 

738 pub = citation_publisher_node.get("content") 

739 if isinstance(pub, str):  739 ↛ 746: line 739 didn't jump to line 746 because the condition on line 739 was always true

740 pub = pub.strip() 

741 if pub != "": 741 ↛ 746line 741 didn't jump to line 746 because the condition on line 741 was always true

742 xpub = create_publisherdata() 

743 xpub.name = pub 

744 xissue.publisher = xpub 

745 

746 if "keywords" in what: 

747 # KEYWORDS 

748 citation_kwd_nodes = soup.select("meta[name='citation_keywords']") 

749 for kwd_node in citation_kwd_nodes: 

750 kwds = kwd_node.get("content") 

751 if isinstance(kwds, str):  751 ↛ 749: line 751 didn't jump to line 749 because the condition on line 751 was always true

752 kwds = kwds.split(",") 

753 for kwd in kwds: 

754 if kwd == "": 

755 continue 

756 kwd = kwd.strip() 

757 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd}) 

758 

759 if "references" in what: 

760 citation_references = soup.select("meta[name='citation_reference']") 

761 for index, tag in enumerate(citation_references): 

762 content = tag.get("content") 

763 if not isinstance(content, str):  763 ↛ 764: line 763 didn't jump to line 764 because the condition on line 763 was never true

764 raise ValueError("Cannot parse citation_reference meta") 

765 label = str(index + 1) 

766 if regex.match(r"^\[\d+\].*", content):  766 ↛ 767: line 766 didn't jump to line 767 because the condition on line 766 was never true

767 label = None 

768 xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label)) 

769 

770 def create_xissue( 

771 self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1" 

772 ): 

773 if url is not None and url.endswith("/"): 

774 url = url[:-1] 

775 xissue = create_issuedata() 

776 xissue.url = url 

777 

778 xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number) 

779 

780 xissue.year = year 

781 

782 if volume_number is not None: 

783 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number) 

784 

785 if issue_number is not None: 

786 xissue.number = issue_number.replace(",", "-") 

787 return xissue 

788 

789 def detect_language(self, text: str, article: ArticleData | None = None): 

790 if article and article.lang is not None and article.lang != "und": 

791 return article.lang 

792 

793 language = self.language_detector.detect_language_of(text) 

794 

795 if not language:  795 ↛ 796: line 795 didn't jump to line 796 because the condition on line 795 was never true

796 return "und" 

797 return language.iso_code_639_1.name.lower() 
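# Usage sketch (the detected code depends on lingua's guess for the given text):
#
#     self.detect_language("Sur les groupes de Lie")   # likely "fr"
#     self.detect_language("some text", xarticle)      # xarticle.lang if already set and not "und"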

798 

799 def create_trans_title( 

800 self, 

801 resource_type: str, 

802 title_tex: str, 

803 lang: str, 

804 xresource_lang: str, 

805 title_type: str = "main", 

806 ): 

807 tag = "trans-article" if resource_type == "article" else "issue-title" 

808 

809 ckeditor_data = build_jats_data_from_html_field( 

810 title_tex, 

811 tag=tag, 

812 text_lang=lang, 

813 resource_lang=xresource_lang, 

814 delimiter_inline=self.delimiter_inline_formula, 

815 delimiter_disp=self.delimiter_disp_formula, 

816 ) 

817 

818 titledata = create_titledata( 

819 lang=lang, 

820 type="main", 

821 title_html=ckeditor_data["value_html"], 

822 title_xml=ckeditor_data["value_xml"], 

823 ) 

824 

825 return titledata 

826 

827 references_mapping = { 

828 "citation_title": get_article_title_xml, 

829 "citation_journal_title": get_source_xml, 

830 "citation_publication_date": get_year_xml, 

831 "citation_firstpage": get_fpage_xml, 

832 "citation_lastpage": get_lpage_xml, 

833 } 

834 

835 @classmethod 

836 def __parse_meta_citation_reference(cls, content: str, label=None): 

837 categories = content.split(";") 

838 

839 if len(categories) == 1: 

840 return JatsBase.bake_ref(content, label=label) 

841 

842 citation_data = [c.split("=") for c in categories if "=" in c] 

843 del categories 

844 

845 xml_string = "" 

846 authors_parsed = False 

847 authors_strings = [] 

848 for data in citation_data: 

849 key = data[0].strip() 

850 citation_content = data[1] 

851 if key == "citation_author": 

852 authors_strings.append(get_author_xml(template_str=citation_content)) 

853 continue 

854 elif not authors_parsed: 

855 xml_string += ", ".join(authors_strings) 

856 authors_parsed = True 

857 

858 if key in cls.references_mapping: 

859 xml_string += " " + cls.references_mapping[key](citation_content) 

860 

861 return JatsBase.bake_ref(xml_string, label=label) 
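# A citation_reference meta content is either a free-text reference (passed to
# JatsBase.bake_ref unchanged) or a ";"-separated list of key=value pairs, e.g.
# (values are illustrative):
#
#     citation_author=Doe, J.; citation_title=On examples; citation_journal_title=Ex. J. Math.;
#     citation_publication_date=1999; citation_firstpage=1; citation_lastpage=10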

862 

863 @classmethod 

864 def get_or_create_source(cls): 

865 source, created = Source.objects.get_or_create( 

866 domain=cls.source_domain, 

867 defaults={ 

868 "name": cls.source_name, 

869 "website": cls.source_website, 

870 }, 

871 ) 

872 if created:  872 ↛ 873: line 872 didn't jump to line 873 because the condition on line 872 was never true

873 source.save() 

874 return source 

875 

876 @staticmethod 

877 def get_issue_pid( 

878 collection_id: str, 

879 year: str, 

880 volume_number: str | None = None, 

881 issue_number: str | None = None, 

882 ): 

883 # Replace any non-word character with an underscore 

884 pid = f"{collection_id}_{year}" 

885 if volume_number is not None: 

886 pid += f"_{volume_number}" 

887 if issue_number is not None: 

888 pid += f"_{issue_number}" 

889 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid)) 

890 return pid 
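# For example (assuming cleanup_str only normalizes whitespace here):
#
#     BaseCollectionCrawler.get_issue_pid("AMBP", "1999", "6", "1-2")    # -> "AMBP_1999_6_1-2"
#     BaseCollectionCrawler.get_issue_pid("AMBP", "1999/2000", "6")      # -> "AMBP_1999_2000_6"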

891 

892 @staticmethod 

893 def set_pages(article: ArticleData, pages: str, separator: str = "-"): 

894 pages_split = pages.split(separator) 

895 if len(pages_split) == 0:  895 ↛ 896: line 895 didn't jump to line 896 because the condition on line 895 was never true

896 article.page_range = pages 

897 if len(pages_split) > 0:  897 ↛ exit: line 897 didn't return from function 'set_pages' because the condition on line 897 was always true

898 if pages[0].isnumeric():  898 ↛ exit: line 898 didn't return from function 'set_pages' because the condition on line 898 was always true

899 article.fpage = pages_split[0] 

900 if ( 

901 len(pages_split) > 1 

902 and pages_split[0] != pages_split[1] 

903 and pages_split[1].isnumeric() 

904 ): 

905 article.lpage = pages_split[1]
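# Usage sketch:
#
#     BaseCollectionCrawler.set_pages(xarticle, "123-145")   # fpage="123", lpage="145"
#     BaseCollectionCrawler.set_pages(xarticle, "27")        # fpage="27" only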