Coverage for src/crawler/base_crawler.py: 73%

430 statements  

coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

1import logging 

2import time 

3from datetime import datetime, timedelta 

4 

5import regex 

6import requests 

7from bs4 import BeautifulSoup 

8from django.conf import settings 

9from django.contrib.auth.models import User 

10from django.utils import timezone 

11from langcodes import standardize_tag 

12from lingua import LanguageDetectorBuilder 

13from opentelemetry import trace 

14from ptf.cmds.xml.ckeditor.utils import ( 

15 build_jats_data_from_html_field, 

16) 

17from ptf.cmds.xml.jats.builder.citation import ( 

18 get_article_title_xml, 

19 get_author_xml, 

20 get_fpage_xml, 

21 get_lpage_xml, 

22 get_source_xml, 

23 get_year_xml, 

24) 

25from ptf.cmds.xml.jats.jats_parser import JatsBase 

26from ptf.model_data import ( 

27 ArticleData, 

28 ContributorDict, 

29 IssueData, 

30 ResourceData, 

31 TitleDict, 

32 create_contributor, 

33 create_extlink, 

34 create_issuedata, 

35 create_publisherdata, 

36 create_titledata, 

37) 

38from ptf.model_data_converter import update_data_for_jats 

39from pylatexenc.latex2text import LatexNodes2Text 

40from pysolr import SolrError 

41from requests_cache import CachedSession, MongoCache 

42 

43from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd 

44from crawler.models import Source 

45from crawler.types import CitationLiteral 

46from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection 

47 

48# TODO: pass a class factory instead of a dependency to a site 

49

50 

51 

52class CrawlerTitleDict(TitleDict): 

53 title_tex: str | None 

54 

55 

56class BaseCollectionCrawler: 

57 """ 

58 Base class for the collection crawlers. 

59 To create a crawler: 

60 1) derive a class from BaseCollectionCrawler and name it XXXCrawler 

61 2) override the functions parse_collection_content, parse_issue_content and parse_article_content 

62 3) update factory.py so that crawler_factory can return your new crawler 

63 """ 

64 

65 logger = logging.getLogger(__name__) 

66 tracer = trace.get_tracer(__name__) 

67 

68 source_name = "" 

69 source_domain = "" 

70 source_website = "" 

71 

72 issue_href = "" 

73 

74 collection = None 

75 source = None 

76 user = None 

77 session: requests.Session | CachedSession 

78 

79 verify = True 

80 headers = { 

81 "accept_encoding": "utf-8", 

82 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"), 

83 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"), 

84 } 

85 

86 next_allowed_request: float = time.time() 

87 

88 # seconds to wait between two http requests 

89 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90) 

90 

91 latext_parser = LatexNodes2Text() 

92 

93 # Override the values in your concrete crawler if the formulas in text (titles, abstracts) 

94 # do not use the "$" to surround tex formulas 

95 delimiter_inline_formula = "$" 

96 delimiter_disp_formula = "$" 

97 

98 # HACK : Workaround for tests (monkeypatching) 

99 # We store the class here, so we can monkeypatch it when running tests 

100 # subCrawlers = { 

101 # LofplCrawler: None 

102 # } 

103 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {} 

104 

105 language_detector = LanguageDetectorBuilder.from_all_languages().build() 

106 

107 force_refresh = False 

108 

109 # Whether to include headers in the requests cache key 

110 match_headers = False 

111 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})" 

112 

113 # Set this to False on a per-crawler basis to allow inserting articles without PDFs 

114 ignore_missing_pdf = True 

115 

116 def __init__( 

117 self, 

118 *args, 

119 username: str, 

120 collection_id: str, 

121 collection_url: str, 

122 test_mode: bool = False, 

123 publisher: str = "mathdoc", 

124 force_refresh=False, 

125 ): 

126 for CrawlerClass in self.subCrawlers: 126 ↛ 127: line 126 didn't jump to line 127 because the loop on line 126 never started

127 self.subCrawlers[CrawlerClass] = CrawlerClass( 

128 *args, 

129 username=username, 

130 collection_id=collection_id, 

131 collection_url=collection_url, 

132 test_mode=test_mode, 

133 publisher=publisher, 

134 ) 

135 self.logger = logging.getLogger(__name__ + "." + self.source_domain) 

136 

137 self.username = username 

138 

139 self.collection_id = collection_id 

140 self.collection_url = ( 

141 collection_url # url of the collection. Ex: https://eudml.org/journal/10098 

142 ) 

143 

144 self.test_mode = test_mode 

145 self.publisher = publisher 

146 

147 self.session = requests.session() 

148 

149 # Skipped when running tests 

150 self.initialize() 

151 

152 self.force_refresh = force_refresh 

153 

154 def initialize(self): 

155 """ 

156 Acts as a "second" init function to skip model accesses during test data generation 

157 """ 

158 self.collection = get_or_create_collection(self.collection_id) 

159 self.source = self.get_or_create_source() 

160 self.user = User.objects.get(username=self.username) 

161 self.session = CachedSession( 

162 match_headers=self.match_headers, 

163 headers=self.headers, 

164 backend=MongoCache( 

165 host=getattr(settings, "MONGO_HOSTNAME", "localhost"), 

166 ), 

167 expire_after=timedelta(days=30), 

168 ) 

169 

170 @classmethod 

171 def can_crawl(cls, pid: str) -> bool: 

172 return True 

173 

174 def parse_collection_content(self, content: str) -> list[IssueData]: 

175 """ 

176 Parse the HTML content with BeautifulSoup 

177 and returns a list of xissues. 

178 Override this function in a derived class 

179 """ 

180 return [] 

181 

182 def parse_issue_content(self, content: str, xissue: IssueData): 

183 """ 

184 Parse the HTML content with BeautifulSoup 

185 Fills the xissue.articles 

186 Override this function in a derived class. 

187 

188 Caveat: you are supposed to create the articles here. Please assign a PID to each article. 

189 The PID can be `a` + article_index, e.g. `a0`, `a21` 

190 """ 

191 

192 def parse_article_content( 

193 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str 

194 ) -> ArticleData | None: 

195 """ 

196 Parse the HTML content with BeautifulSoup 

197 returns the xarticle. 

198 Override this function in a derived class. 

199 The xissue is passed to the function in case the article page has issue information (ex: publisher) 

200 The article url is also passed as a parameter 

201 

202 Caveat: you are supposed to assign the article PID again here 

203 """ 

204 return xarticle 

205 

206 @tracer.start_as_current_span("crawl_collection") 

207 def crawl_collection(self): 

208 # TODO: Comments, filter 

209 """ 

210 Crawl an entire collection. ptf.models.Container objects are created. 

211 - get the HTML content of the collection_url 

212 - parse the HTML content with beautifulsoup to extract the list of issues 

213 - merge the xissues (some Sources can have multiple pages for 1 volume/issue; we create only 1 container) 

214 - crawl each issue if col_only is False 

215 - Returns the list of merged issues. 

216 It is a dict {pid: xissue}. 

217 The key is the pid of the merged issues. 

218 Ex: the source may have Volume 6 (2000) and Volume 6 (1999); 

219 the pid is then made with 1999-2000__6_ 

220 """ 

221 

222 if self.source is None: 

223 raise RuntimeError("ERROR: the source is not set") 

224 

225 content = self.download_file(self.collection_url) 

226 xissues = self.parse_collection_content(content) 

227 

228 """ 

229 Some collections split the same volumes in different pages 

230 Ex: Volume 6 (2000) and Volume 6 (1999) 

231 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000) 

232 """ 

233 # merged_xissues = self.merge_xissues(xissues) 

234 

235 xissues_dict = {str(i.pid): i for i in xissues} 

236 

237 return xissues_dict 

238 
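
# Illustrative driver sketch: how a concrete crawler is typically exercised.
# The username, collection_id and collection_url values are hypothetical;
# production code normally obtains the crawler through crawler_factory.
#
# crawler = XXXCrawler(
#     username="admin",
#     collection_id="EXAMPLE",
#     collection_url="https://example.org/journal/1",
# )
# for pid, xissue in crawler.crawl_collection().items():
#     crawler.crawl_issue(xissue)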

239 @tracer.start_as_current_span("crawl_issue") 

240 def crawl_issue(self, xissue: IssueData): 

241 """ 

242 Crawl 1 web page of an issue. 

243 - get the HTML content of the issue 

244 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata 

245 - crawl each article 

246 """ 

247 

248 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents. 

249 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute 

250 

251 issue_url = xissue.url 

252 if issue_url is not None: 

253 if issue_url.endswith(".pdf"): 

254 add_pdf_link_to_xarticle(xissue, issue_url) 

255 xissue.url = None 

256 else: 

257 content = self.download_file(issue_url) 

258 with self.tracer.start_as_current_span("parse_issue_content"): 

259 self.parse_issue_content(content, xissue) 

260 

261 xarticles = xissue.articles 

262 

263 parsed_xarticles = [] 

264 

265 for xarticle in xarticles: 

266 parsed_xarticle = self.crawl_article(xarticle, xissue) 

267 if parsed_xarticle is not None: 

268 parsed_xarticles.append(parsed_xarticle) 

269 

270 xissue.articles = parsed_xarticles 

271 

272 article_has_pdf = self.article_has_pdf(xissue) 

273 

274 if self.ignore_missing_pdf: 

275 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)] 

276 

277 if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf): 

278 self.process_resource_metadata(xissue, resource_type="issue") 

279 self.add_xissue_into_database(xissue) 

280 

281 @staticmethod 

282 def article_has_source(art: ArticleData | IssueData): 

283 return ( 

284 next( 

285 (e_link for e_link in art.ext_links if e_link["rel"] == "source"), 

286 None, 

287 ) 

288 is not None 

289 ) 

290 

291 @staticmethod 

292 def article_has_pdf(art: ArticleData | IssueData): 

293 return ( 

294 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None) 

295 is not None 

296 ) 

297 

298 def crawl_article(self, xarticle: ArticleData, xissue: IssueData): 

299 # ARTICLE URL as an ExtLink (to display the link in the article page) 

300 if xarticle.url is None: 

301 if not self.article_has_source(xarticle): 301 ↛ 311: line 301 didn't jump to line 311 because the condition on line 301 was always true

302 if xissue.url: 

303 article_source = xissue.url 

304 else: 

305 article_source = self.collection_url 

306 ext_link = create_extlink() 

307 ext_link["rel"] = "source" 

308 ext_link["location"] = article_source 

309 ext_link["metadata"] = self.source_domain 

310 xarticle.ext_links.append(ext_link) 

311 return self.process_article_metadata(xarticle) 

312 

313 content = self.download_file(xarticle.url) 

314 

315 xarticle.pid = f"{xissue.pid}_{xarticle.pid}" 

316 

317 with self.tracer.start_as_current_span("parse_article_content"): 

318 parsed_xarticle = self.parse_article_content(content, xissue, xarticle, xarticle.url) 

319 if parsed_xarticle is None: 319 ↛ 320: line 319 didn't jump to line 320 because the condition on line 319 was never true

320 return None 

321 

322 if parsed_xarticle.doi: 

323 parsed_xarticle.pid = ( 

324 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

325 ) 

326 

327 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url: 

328 ext_link = create_extlink() 

329 ext_link["rel"] = "source" 

330 ext_link["location"] = parsed_xarticle.url 

331 ext_link["metadata"] = self.source_domain 

332 parsed_xarticle.ext_links.append(ext_link) 

333 

334 # The article title may have formulas surrounded with '$' 

335 return self.process_article_metadata(parsed_xarticle) 

336 

337 def process_resource_metadata(self, xresource: ResourceData, resource_type="article"): 

338 tag = "article-title" if resource_type == "article" else "issue-title" 

339 

340 # Process title tex 

341 ckeditor_data = build_jats_data_from_html_field( 

342 xresource.title_tex, 

343 tag=tag, 

344 text_lang=xresource.lang, 

345 delimiter_inline=self.delimiter_inline_formula, 

346 delimiter_disp=self.delimiter_disp_formula, 

347 ) 

348 

349 xresource.title_html = ckeditor_data["value_html"] 

350 # xresource.title_tex = ckeditor_data["value_tex"] 

351 xresource.title_xml = ckeditor_data["value_xml"] 

352 

353 # Process trans_title tex 

354 if xresource.trans_title_tex: 354 ↛ 355: line 354 didn't jump to line 355 because the condition on line 354 was never true

355 self.logger.warning( 

356 "Deprecation Notice : prefer using xresource.title directly instead of xresource.trans_title_tex" 

357 ) 

358 trans_title = self.create_trans_title( 

359 xresource_lang=xresource.lang, 

360 resource_type=resource_type, 

361 title_tex=xresource.trans_title_tex, 

362 lang=xresource.trans_lang, 

363 ) 

364 xresource.titles.append(trans_title) 

365 

366 abstracts_to_parse = [ 

367 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract" 

368 ] 

369 # abstract may have formulas surrounded with '$' 

370 if len(abstracts_to_parse) > 0: 

371 for xabstract in abstracts_to_parse: 

372 ckeditor_data = build_jats_data_from_html_field( 

373 xabstract["value_tex"], 

374 tag="abstract", 

375 text_lang=xabstract["lang"], 

376 resource_lang=xresource.lang, 

377 field_type="abstract", 

378 delimiter_inline=self.delimiter_inline_formula, 

379 delimiter_disp=self.delimiter_disp_formula, 

380 ) 

381 

382 xabstract["value_html"] = ckeditor_data["value_html"] 

383 # xabstract["value_tex"] = ckeditor_data["value_tex"] 

384 xabstract["value_xml"] = ckeditor_data["value_xml"] 

385 

386 return xresource 

387 

388 def process_article_metadata(self, xresource: ResourceData): 

389 self.process_resource_metadata(xresource) 

390 update_data_for_jats(xresource) 

391 

392 return xresource 

393 

394 def get(self, url: str, force_refresh=False, headers={}): 

395 attempt = 0 

396 response = None 

397 

398 while attempt < 3: 398 ↛ 432: line 398 didn't jump to line 432 because the condition on line 398 was always true

399 # If we already have a key, we can skip the timeout 

400 if isinstance(self.session, CachedSession): 400 ↛ 401: line 400 didn't jump to line 401 because the condition on line 400 was never true

401 if not self.session.cache.contains(url=url) or force_refresh: 

402 delta = self.next_allowed_request - time.time() 

403 if delta > 0: 

404 self.logger.debug(f"Waiting {int(delta)}s before making another request") 

405 time.sleep(delta) 

406 self.next_allowed_request = time.time() + self.requests_interval 

407 try: 

408 # self.session.cache.delete(urls=[url]) 

409 if isinstance(self.session, CachedSession): 409 ↛ 410: line 409 didn't jump to line 410 because the condition on line 409 was never true

410 response = self.session.get( 

411 url, 

412 headers={**self.headers, **headers}, 

413 verify=self.verify, 

414 force_refresh=force_refresh, 

415 ) 

416 else: 

417 response = self.session.get( 

418 url, headers={**self.headers, **headers}, verify=self.verify 

419 ) 

420 if not response.ok: 420 ↛ 421: line 420 didn't jump to line 421 because the condition on line 420 was never true

421 raise requests.exceptions.HTTPError( 

422 f"Endpoint answered with code {response.status_code} : {url}", 

423 response=response, 

424 ) 

425 return response 

426 except ( 

427 requests.ConnectionError, 

428 requests.ConnectTimeout, 

429 requests.exceptions.HTTPError, 

430 ): 

431 attempt += 1 

432 raise requests.exceptions.HTTPError(f"Unable to download {url}") 

433 
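
# Illustrative timing note: when the CachedSession is in use, uncached requests
# are spaced at least requests_interval apart (REQUESTS_INTERVAL, 90 s by default),
# since each network hit pushes next_allowed_request forward by that interval.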

434 def download_file(self, url: str, force_refresh=False, headers={}): 

435 """ 

436 Downloads a URL and returns its decoded content. 

437 """ 

438 attempts = 0 

439 while True: 

440 try: 

441 response = self.get( 

442 url, force_refresh=force_refresh or self.force_refresh, headers=headers 

443 ) 

444 content = self.decode_response(response) 

445 if content == "" or not content: 445 ↛ 446: line 445 didn't jump to line 446 because the condition on line 445 was never true

446 raise requests.exceptions.HTTPError(response) 

447 return content 

448 except requests.exceptions.HTTPError as e: 

449 self.logger.debug(f"Caught error : {e}", extra={"url": url}) 

450 attempts += 1 

451 # 15 mins, 30 mins, 45 mins 

452 delay_minutes = attempts * 15 

453 self.logger.debug( 

454 f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})", 

455 extra={"url": url}, 

456 ) 

457 time.sleep(delay_minutes * 60) 

458 if attempts > 3: 

459 raise e 

460 

461 def decode_response(self, response: requests.Response, encoding: str = "utf-8"): 

462 """Override this if the content-type headers from the sources are advertising something else than the actual content 

463 SASA needs this""" 

464 response.encoding = encoding 

465 return response.text 

466 

467 @tracer.start_as_current_span("add_xissue_to_database") 

468 def add_xissue_into_database(self, xissue: IssueData): 

469 xissue.journal = self.collection 

470 xissue.source = self.source_domain 

471 

472 if xissue.year == "": 

473 raise ValueError("Failsafe : Cannot insert issue without a year") 

474 

475 xpub = create_publisherdata() 

476 xpub.name = self.publisher 

477 xissue.publisher = xpub 

478 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat() 

479 

480 attempt = 1 

481 success = False 

482 

483 while not success and attempt < 4: 

484 try: 

485 params = {"xissue": xissue, "use_body": False} 

486 cmd = addOrUpdateGDMLIssueXmlCmd(params) 

487 cmd.do() 

488 success = True 

489 self.logger.debug(f"Issue {xissue.pid} inserted in database") 

490 except SolrError: 

491 self.logger.warning( 

492 f"Encountered SolrError while inserting issue {xissue.pid} in database" 

493 ) 

494 attempt += 1 

495 self.logger.debug(f"Attempt {attempt}. sleeping 10 seconds.") 

496 time.sleep(10) 

497 

498 if success is False: 

499 raise ConnectionRefusedError("Cannot connect to SolR") 

500 

501 def get_metadata_using_citation_meta( 

502 self, 

503 xarticle: ArticleData, 

504 xissue: IssueData, 

505 soup: BeautifulSoup, 

506 what: list[CitationLiteral] = [], 

507 ): 

508 """ 

509 If parsing references using this method, use `xarticle.abstracts.append(JatsBase.compile_refs(xarticle.bibitems))` afterwards to append the references to the article 

510 

511 

512 :param xarticle: the xarticle that will collect the metadata 

513 :param xissue: the xissue that will collect the publisher 

514 :param soup: the BeautifulSoup object of the article page 

515 :param what: list of citation_* meta items to collect. 

516 :return: None. The given article is modified 

517 """ 

518 

519 if "title" in what: 

520 # TITLE 

521 citation_title_node = soup.select_one("meta[name='citation_title']") 

522 if citation_title_node: 522 ↛ 527: line 522 didn't jump to line 527 because the condition on line 522 was always true

523 title = citation_title_node.get("content") 

524 if isinstance(title, str): 524 ↛ 527: line 524 didn't jump to line 527 because the condition on line 524 was always true

525 xarticle.title_tex = title 

526 

527 if "author" in what: 527 ↛ 556line 527 didn't jump to line 556 because the condition on line 527 was always true

528 # AUTHORS 

529 citation_author_nodes = soup.select("meta[name^='citation_author']") 

530 current_author: ContributorDict | None = None 

531 for citation_author_node in citation_author_nodes: 

532 if citation_author_node.get("name") == "citation_author": 

533 text_author = citation_author_node.get("content") 

534 if not isinstance(text_author, str): 534 ↛ 535: line 534 didn't jump to line 535 because the condition on line 534 was never true

535 raise ValueError("Cannot parse author") 

536 if text_author == "": 536 ↛ 537: line 536 didn't jump to line 537 because the condition on line 536 was never true

537 current_author = None 

538 continue 

539 current_author = create_contributor(role="author", string_name=text_author) 

540 xarticle.contributors.append(current_author) 

541 continue 

542 if current_author is None: 542 ↛ 543: line 542 didn't jump to line 543 because the condition on line 542 was never true

543 self.logger.warning("Couldn't parse citation author") 

544 continue 

545 if citation_author_node.get("name") == "citation_author_institution": 

546 text_institution = citation_author_node.get("content") 

547 if not isinstance(text_institution, str): 547 ↛ 548: line 547 didn't jump to line 548 because the condition on line 547 was never true

548 continue 

549 current_author["addresses"].append(text_institution) 

550 if citation_author_node.get("name") == "citation_author_ocrid": 550 ↛ 551: line 550 didn't jump to line 551 because the condition on line 550 was never true

551 text_orcid = citation_author_node.get("content") 

552 if not isinstance(text_orcid, str): 

553 continue 

554 current_author["orcid"] = text_orcid 

555 

556 if "pdf" in what: 

557 # PDF 

558 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]') 

559 if citation_pdf_node: 

560 pdf_url = citation_pdf_node.get("content") 

561 if isinstance(pdf_url, str): 561 ↛ 564: line 561 didn't jump to line 564 because the condition on line 561 was always true

562 add_pdf_link_to_xarticle(xarticle, pdf_url) 

563 

564 if "lang" in what: 

565 # LANG 

566 citation_lang_node = soup.select_one("meta[name='citation_language']") 

567 if citation_lang_node: 567 ↛ 573: line 567 didn't jump to line 573 because the condition on line 567 was always true

568 # TODO: check other language code 

569 content_text = citation_lang_node.get("content") 

570 if isinstance(content_text, str): 570 ↛ 573: line 570 didn't jump to line 573 because the condition on line 570 was always true

571 xarticle.lang = standardize_tag(content_text) 

572 

573 if "abstract" in what: 

574 # ABSTRACT 

575 abstract_node = soup.select_one("meta[name='citation_abstract']") 

576 if abstract_node is not None: 

577 abstract = abstract_node.get("content") 

578 if not isinstance(abstract, str): 578 ↛ 579: line 578 didn't jump to line 579 because the condition on line 578 was never true

579 raise ValueError("Couldn't parse abstract from meta") 

580 abstract = BeautifulSoup(abstract, "html.parser").text 

581 lang = abstract_node.get("lang") 

582 if not isinstance(lang, str): 

583 lang = self.detect_language(abstract, xarticle) 

584 xarticle.abstracts.append( 

585 { 

586 "tag": "abstract", 

587 "value_html": "", 

588 "value_tex": abstract, 

589 "value_xml": "", 

590 "lang": lang, 

591 } 

592 ) 

593 

594 if "page" in what: 

595 # PAGES 

596 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']") 

597 if citation_fpage_node: 

598 page = citation_fpage_node.get("content") 

599 if isinstance(page, str): 599 ↛ 604: line 599 didn't jump to line 604 because the condition on line 599 was always true

600 page = page.split("(")[0] 

601 if len(page) < 32: 601 ↛ 604: line 601 didn't jump to line 604 because the condition on line 601 was always true

602 xarticle.fpage = page 

603 

604 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']") 

605 if citation_lpage_node: 

606 page = citation_lpage_node.get("content") 

607 if isinstance(page, str): 607 ↛ 612: line 607 didn't jump to line 612 because the condition on line 607 was always true

608 page = page.split("(")[0] 

609 if len(page) < 32: 609 ↛ 612: line 609 didn't jump to line 612 because the condition on line 609 was always true

610 xarticle.lpage = page 

611 

612 if "doi" in what: 

613 # DOI 

614 citation_doi_node = soup.select_one("meta[name='citation_doi']") 

615 if citation_doi_node: 

616 doi = citation_doi_node.get("content") 

617 if isinstance(doi, str): 617 ↛ 624: line 617 didn't jump to line 624 because the condition on line 617 was always true

618 doi = doi.strip() 

619 pos = doi.find("10.") 

620 if pos > 0: 

621 doi = doi[pos:] 

622 xarticle.doi = doi 

623 

624 if "mr" in what: 

625 # MR 

626 citation_mr_node = soup.select_one("meta[name='citation_mr']") 

627 if citation_mr_node: 

628 mr = citation_mr_node.get("content") 

629 if isinstance(mr, str): 629 ↛ 635: line 629 didn't jump to line 635 because the condition on line 629 was always true

630 mr = mr.strip() 

631 if mr.find("MR") == 0: 631 ↛ 635: line 631 didn't jump to line 635 because the condition on line 631 was always true

632 mr = mr[2:] 

633 xarticle.extids.append(("mr-item-id", mr)) 

634 

635 if "zbl" in what: 

636 # ZBL 

637 citation_zbl_node = soup.select_one("meta[name='citation_zbl']") 

638 if citation_zbl_node: 

639 zbl = citation_zbl_node.get("content") 

640 if isinstance(zbl, str): 640 ↛ 646: line 640 didn't jump to line 646 because the condition on line 640 was always true

641 zbl = zbl.strip() 

642 if zbl.find("Zbl") == 0: 642 ↛ 646: line 642 didn't jump to line 646 because the condition on line 642 was always true

643 zbl = zbl[3:].strip() 

644 xarticle.extids.append(("zbl-item-id", zbl)) 

645 

646 if "publisher" in what: 

647 # PUBLISHER 

648 citation_publisher_node = soup.select_one("meta[name='citation_publisher']") 

649 if citation_publisher_node: 

650 pub = citation_publisher_node.get("content") 

651 if isinstance(pub, str): 651 ↛ 658: line 651 didn't jump to line 658 because the condition on line 651 was always true

652 pub = pub.strip() 

653 if pub != "": 653 ↛ 658: line 653 didn't jump to line 658 because the condition on line 653 was always true

654 xpub = create_publisherdata() 

655 xpub.name = pub 

656 xissue.publisher = xpub 

657 

658 if "keywords" in what: 

659 # KEYWORDS 

660 citation_kwd_nodes = soup.select("meta[name='citation_keywords']") 

661 for kwd_node in citation_kwd_nodes: 

662 kwds = kwd_node.get("content") 

663 if isinstance(kwds, str): 663 ↛ 661: line 663 didn't jump to line 661 because the condition on line 663 was always true

664 kwds = kwds.split(",") 

665 for kwd in kwds: 

666 if kwd == "": 

667 continue 

668 kwd = kwd.strip() 

669 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd}) 

670 

671 if "references" in what: 

672 citation_references = soup.select("meta[name='citation_reference']") 

673 for index, tag in enumerate(citation_references): 

674 content = tag.get("content") 

675 if not isinstance(content, str): 675 ↛ 676: line 675 didn't jump to line 676 because the condition on line 675 was never true

676 raise ValueError("Cannot parse citation_reference meta") 

677 label = str(index + 1) 

678 if regex.match(r"^\[\d+\].*", content): 678 ↛ 679: line 678 didn't jump to line 679 because the condition on line 678 was never true

679 label = None 

680 xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label)) 

681 
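
# Illustrative usage sketch inside a parse_article_content override; the flow below
# only relies on the citation_* meta tags, with no site-specific selectors assumed.
#
# def parse_article_content(self, content, xissue, xarticle, url):
#     soup = BeautifulSoup(content, "html.parser")
#     self.get_metadata_using_citation_meta(
#         xarticle, xissue, soup,
#         ["title", "author", "pdf", "lang", "abstract", "references"],
#     )
#     # As noted in the docstring, compile the collected references afterwards:
#     xarticle.abstracts.append(JatsBase.compile_refs(xarticle.bibitems))
#     return xarticle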

682 def create_xissue( 

683 self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1" 

684 ): 

685 if url is not None and url.endswith("/"): 

686 url = url[:-1] 

687 xissue = create_issuedata() 

688 xissue.url = url 

689 

690 xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number) 

691 

692 xissue.year = year 

693 

694 if volume_number is not None: 

695 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number) 

696 

697 if issue_number is not None: 

698 xissue.number = issue_number.replace(",", "-") 

699 return xissue 

700 

701 def detect_language(self, text: str, article: ArticleData | None = None): 

702 if article and article.lang is not None and article.lang != "und": 

703 return article.lang 

704 

705 language = self.language_detector.detect_language_of(text) 

706 

707 if not language: 707 ↛ 708: line 707 didn't jump to line 708 because the condition on line 707 was never true

708 return "und" 

709 return language.iso_code_639_1.name.lower() 

710 

711 def create_trans_title( 

712 self, 

713 resource_type: str, 

714 title_tex: str, 

715 lang: str, 

716 xresource_lang: str, 

717 title_type: str = "main", 

718 ): 

719 tag = "trans-article" if resource_type == "article" else "issue-title" 

720 

721 ckeditor_data = build_jats_data_from_html_field( 

722 title_tex, 

723 tag=tag, 

724 text_lang=lang, 

725 resource_lang=xresource_lang, 

726 delimiter_inline=self.delimiter_inline_formula, 

727 delimiter_disp=self.delimiter_disp_formula, 

728 ) 

729 

730 titledata = create_titledata( 

731 lang=lang, 

732 type="main", 

733 title_html=ckeditor_data["value_html"], 

734 title_xml=ckeditor_data["value_xml"], 

735 ) 

736 

737 return titledata 

738 

739 references_mapping = { 

740 "citation_title": get_article_title_xml, 

741 "citation_journal_title": get_source_xml, 

742 "citation_publication_date": get_year_xml, 

743 "citation_firstpage": get_fpage_xml, 

744 "citation_lastpage": get_lpage_xml, 

745 } 

746 

747 @classmethod 

748 def __parse_meta_citation_reference(cls, content: str, label=None): 

749 categories = content.split(";") 

750 

751 if len(categories) == 1: 

752 return JatsBase.bake_ref(content, label=label) 

753 

754 citation_data = [c.split("=") for c in categories if "=" in c] 

755 del categories 

756 

757 xml_string = "" 

758 authors_parsed = False 

759 authors_strings = [] 

760 for data in citation_data: 

761 key = data[0].strip() 

762 citation_content = data[1] 

763 if key == "citation_author": 

764 authors_strings.append(get_author_xml(template_str=citation_content)) 

765 continue 

766 elif not authors_parsed: 

767 xml_string += ", ".join(authors_strings) 

768 authors_parsed = True 

769 

770 if key in cls.references_mapping: 

771 xml_string += " " + cls.references_mapping[key](citation_content) 

772 

773 return JatsBase.bake_ref(xml_string, label=label) 

774 
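
# Sketch of the citation_reference meta content handled above (values hypothetical):
#   "citation_author=Doe, J.; citation_title=On examples; citation_journal_title=Ex. J.;
#    citation_publication_date=2001; citation_firstpage=1; citation_lastpage=10"
# Entries are split on ";" then "="; authors are emitted first, the remaining keys are
# rendered through references_mapping, and content without any ";" is passed to
# JatsBase.bake_ref as-is.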

775 @classmethod 

776 def get_or_create_source(cls): 

777 source, created = Source.objects.get_or_create( 

778 domain=cls.source_domain, 

779 defaults={ 

780 "name": cls.source_name, 

781 "website": cls.source_website, 

782 }, 

783 ) 

784 if created: 784 ↛ 785: line 784 didn't jump to line 785 because the condition on line 784 was never true

785 source.save() 

786 return source 

787 

788 @staticmethod 

789 def get_issue_pid( 

790 collection_id: str, 

791 year: str, 

792 volume_number: str | None = None, 

793 issue_number: str | None = None, 

794 ): 

795 # Replace any character other than letters, digits and "-" with an underscore 

796 pid = f"{collection_id}_{year}" 

797 if volume_number is not None: 

798 pid += f"_{volume_number}" 

799 if issue_number is not None: 

800 pid += f"_{issue_number}" 

801 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid)) 

802 return pid 

803 
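
# Illustrative result (hypothetical collection id), assuming cleanup_str leaves
# plain ASCII untouched:
# get_issue_pid("EXAMPLE", "1999-2000", "6", None)  ->  "EXAMPLE_1999-2000_6"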

804 @staticmethod 

805 def set_pages(article: ArticleData, pages: str, separator: str = "-"): 

806 pages_split = pages.split(separator) 

807 if len(pages_split) == 0: 807 ↛ 808: line 807 didn't jump to line 808 because the condition on line 807 was never true

808 article.page_range = pages 

809 if len(pages_split) > 0: 809 ↛ exit: line 809 didn't return from function 'set_pages' because the condition on line 809 was always true

810 if pages[0].isnumeric(): 810 ↛ exit: line 810 didn't return from function 'set_pages' because the condition on line 810 was always true

811 article.fpage = pages_split[0] 

812 if ( 

813 len(pages_split) > 1 

814 and pages_split[0] != pages_split[1] 

815 and pages_split[1].isnumeric() 

816 ): 

817 article.lpage = pages_split[1]
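
# Illustrative set_pages behaviour, assuming `art` is an ArticleData instance:
# BaseCollectionCrawler.set_pages(art, "123-145")  # art.fpage == "123", art.lpage == "145"
# BaseCollectionCrawler.set_pages(art, "17")       # only art.fpage is set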