Coverage for src/crawler/abstract_crawlers/base

1import logging

2import time

3from collections.abc import Iterable

4from datetime import datetime, timedelta

5from email.policy import EmailPolicy

6from typing import TYPE_CHECKING, Literal

8import aiohttp

9import regex

10import requests

11from bs4 import BeautifulSoup

12from django.conf import settings

13from django.contrib.auth.models import User

14from django.db.utils import IntegrityError

15from django.utils import timezone

16from langcodes import standardize_tag

17from lingua import LanguageDetector, LanguageDetectorBuilder

18from opentelemetry import trace

19from ptf.cmds.xml.ckeditor.utils import (

20 build_jats_data_from_html_field,

21)

22from ptf.cmds.xml.jats.builder.references import (

23 get_article_title_xml,

24 get_author_xml,

25 get_fpage_xml,

26 get_lpage_xml,

27 get_source_xml,

28 get_year_xml,

29)

30from ptf.cmds.xml.jats.jats_parser import JatsBase

31from ptf.model_data import (

32 ArticleData,

33 ContributorDict,

34 IssueData,

35 ResourceData,

36 TitleDict,

37 create_abstract,

38 create_contributor,

39 create_extlink,

40 create_issuedata,

41 create_publisherdata,

42 create_subj,

43 create_titledata,

44)

45from ptf.model_data_converter import update_data_for_jats

46from ptf.models import ExtLink

47from pylatexenc.latex2text import LatexNodes2Text

48from pysolr import SolrError

49from requests.adapters import HTTPAdapter

50from requests_cache import CachedSession

51from urllib3 import Retry

53from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd

54from crawler.models import Source

55from crawler.models.extlink_checked import ExtlinkChecked

56from crawler.types import CitationLiteral

57from crawler.utils import (

58 add_pdf_link_to_xarticle,

59 cleanup_str,

60 get_all_cols,

61 get_or_create_collection,

62 get_session,

63)

65if TYPE_CHECKING:

66 from bs4 import Tag

class CrawlerTitleDict(TitleDict):
    """TitleDict variant that additionally declares a ``title_tex`` entry."""

    # Either a string or None
    title_tex: str | None

73class BaseCollectionCrawler:

74 """

75 Base collection for the crawlers.

76 To create a crawler:

77 1) derive a class from BaseCollectionCrawler and name it XXXCrawler

78 2) override the functions parse_collection_content, parse_issue_content and parse_article_content

79 3) update factory.py so that crawler_factory can return your new crawler

80 """

82 logger = logging.getLogger(__name__)

83 tracer = trace.get_tracer(__name__)

85 source_name = ""

86 source_domain = ""

87 source_website = ""

89 issue_href = ""

91 collection = None

92 source = None

93 user = None

94 session: requests.Session | CachedSession

95 async_session: aiohttp.ClientSession

96 is_checkable = True

97 verify = True

98 headers = {

99 "accept_encoding": "utf-8",

100 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),

101 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),

102 }

103

104 # seconds to wait between two http requests

105 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)

106 # seconds to wait before aborting the connection (if no bytes are received)

107 requests_timeout = 60

108

109 latext_parser = LatexNodes2Text()

110

111 # Override the values in your concrete crawler if the formulas in text (titles, abstracts)

112 # do not use the "$" to surround tex formulas

113 delimiter_inline_formula = "$"

114 delimiter_disp_formula = "$"

115

116 # HACK : Workaround for tests (monkeypatching)

117 # We store the class here, so we can monkeypatch it when running tests

118 # subCrawlers = {

119 # LofplCrawler: None

120 # }

121 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}

122

123 _language_detector: LanguageDetector | None = None

124 _language_detector_builder = LanguageDetectorBuilder.from_all_languages()

125

126 force_refresh = False

127

128 # Whether to include headers in requests cache key

129 match_headers = False

130 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"

131

132 # Set this to False on a Crawler-basis to allow inserting articles without PDFs

133 ignore_missing_pdf = True

134

@classmethod
def get_view_id(cls):
    """Return the identifier of this crawler, which is its ``source_domain``."""
    view_id = cls.source_domain
    return view_id

138

@property
def language_detector(self):
    """Lazily built language detector, created once per crawler instance.

    The detector is only built on first access (to save memory) and is then
    stored on the instance for later calls.
    """
    detector = self._language_detector
    if not detector:
        detector = self._language_detector_builder.build()
        self._language_detector = detector
    return detector

146

def __init__(
    self,
    *args,
    username: str,
    collection_id: str,
    dry: bool = False,
    publisher: str = "",
    force_refresh=False,
    collection_url: str | None = None,
):
    """
    Set up a crawler for one collection.

    :param username: username looked up later by ``initialize``
    :param collection_id: id of the collection to crawl
    :param dry: stored on the instance; ``crawl_issue`` stops before the database step when set
    :param publisher: publisher name stored on the instance
    :param force_refresh: stored on the instance as ``self.force_refresh``
    :param collection_url: url of the collection; when empty, it is looked up
        in ``get_all_cols()`` under this crawler's ``source_domain``
    :raises ValueError: if no url is registered for ``source_domain`` in the collection
    """
    if not collection_url:
        known_sources = get_all_cols()[collection_id]["sources"]
        collection_url = known_sources.get(self.source_domain)
        if collection_url is None:
            raise ValueError(
                f"Source {self.source_domain} not found for collection {collection_id}"
            )
    self.collection_url = collection_url

    # Every registered sub-crawler class gets an instance built with the same settings
    sub_crawler_kwargs = {
        "username": username,
        "collection_id": collection_id,
        "dry": dry,
        "publisher": publisher,
        "collection_url": collection_url,
    }
    for sub_crawler_cls in self.subCrawlers:
        self.subCrawlers[sub_crawler_cls] = sub_crawler_cls(*args, **sub_crawler_kwargs)

    self.logger = logging.getLogger(".".join((__name__, self.source_domain)))

    self.username = username
    self.collection_id = collection_id
    self.dry = dry
    self.publisher = publisher

    # Class attribute: the session is sometimes used without instantiating the class (rot monitoring)
    BaseCollectionCrawler.session = requests.Session()

    # Skipped when running tests
    self.initialize()

    self.force_refresh = force_refresh

193

194 # We implemented custom retry behaviour, so we don't want to make extra requests here

195

def initialize(self):
    """
    Second-stage initialisation, kept apart from ``__init__`` so that model
    accesses can be skipped during test data generation.

    Loads the collection, source and user, then installs the shared session
    on ``BaseCollectionCrawler``.
    """
    self.collection = get_or_create_collection(self.collection_id)
    self.source = self.get_or_create_source()
    self.user = User.objects.get(username=self.username)

    shared_session = get_session()
    BaseCollectionCrawler.session = shared_session
    shared_session.verify = self.verify
    self.session.delay = self.requests_interval

    # download_file has its own retry loop, so transport-level retries are disabled
    no_retry = Retry(total=0)
    for scheme in ("https://", "http://"):
        self.session.mount(scheme, HTTPAdapter(max_retries=no_retry))

211

@classmethod
def can_crawl(cls, pid: str) -> bool:
    """Tell whether this crawler accepts ``pid``; the base implementation accepts everything."""
    accepted = True
    return accepted

215

def parse_collection_content(self, content: str) -> list[IssueData]:
    """
    Extract the issues listed on the collection page.

    Meant to be overridden in a derived class, which typically parses the
    HTML ``content`` with BeautifulSoup and returns a list of xissues.
    The base implementation finds nothing.
    """
    no_issues: list[IssueData] = []
    return no_issues

223

def parse_issue_content(self, content: str, xissue: IssueData):
    """
    Fill ``xissue.articles`` from the HTML ``content`` of an issue page.

    Meant to be overridden in a derived class (typically with BeautifulSoup).
    The base implementation does nothing.

    CAV : articles are expected to be created here, each with its own PID.
    The PID can be `a + article_index`, like this : `a0` `a21`
    """
    return None

233

def parse_article_content(
    self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
) -> ArticleData | None:
    """
    Complete ``xarticle`` from the HTML ``content`` of the article page.

    Meant to be overridden in a derived class (typically with BeautifulSoup).
    ``xissue`` is provided in case the article page carries issue information
    (ex: publisher); ``url`` is the address of the article page.
    The base implementation hands ``xarticle`` back untouched.

    CAV : the article pid is expected to be assigned again here
    """
    unchanged = xarticle
    return unchanged

247

@tracer.start_as_current_span("crawl_collection")
def crawl_collection(self):
    # TODO: Comments, filter
    """
    Crawl the collection page and list its issues.

    - download the content of ``collection_url``
    - let ``parse_collection_content`` extract the list of xissues

    Returns a dict mapping ``str(xissue.pid)`` to the xissue, or None when the
    download produced no content. The issues themselves are not crawled here.

    :raises RuntimeError: if ``self.source`` is not set
    """
    if self.source is None:
        raise RuntimeError("ERROR: the source is not set")

    page = self.download_file(self.collection_url)
    if not page:
        return None

    # Some collections split one volume over several pages
    # (Ex: Volume 6 (2000) and Volume 6 (1999)); issues sharing a pid
    # collapse into a single dict entry, the last one winning.
    return {str(xissue.pid): xissue for xissue in self.parse_collection_content(page)}

284

@tracer.start_as_current_span("crawl_issue")
def crawl_issue(self, xissue: IssueData):
    """
    Crawl 1 web page of an issue.
    - get the HTML content of the issue
    - parse it to extract the list of articles and/or the issue metadata
    - crawl each article
    - unless running dry, process the issue metadata and insert the issue in the database
    """
    # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.
    # The list of articles comes directly from the collection page: the xissue has no url attribute
    source_url = xissue.url
    if source_url is not None and source_url.endswith(".pdf"):
        # The issue url is a PDF: keep it as a link, there is no page to parse
        add_pdf_link_to_xarticle(xissue, source_url)
        xissue.url = None
    elif source_url is not None:
        page = self.download_file(source_url)
        with self.tracer.start_as_current_span("parse_issue_content"):
            self.parse_issue_content(page, xissue)

    # Articles for which crawl_article returns None are dropped
    xissue.articles = [
        parsed
        for candidate in xissue.articles
        if (parsed := self.crawl_article(candidate, xissue)) is not None
    ]

    issue_has_pdf = self.article_has_pdf(xissue)

    if self.ignore_missing_pdf:
        xissue.articles = list(filter(self.article_has_pdf, xissue.articles))

    if self.dry:
        return
    if not issue_has_pdf and len(xissue.articles) == 0:
        # Nothing worth storing: no article left and no document on the issue itself
        return

    self.process_resource_metadata(xissue, resource_type="issue")
    self.add_xissue_into_database(xissue)

327

@staticmethod
def article_has_source(art: ArticleData | IssueData):
    """Return True when one of the resource's ext_links has ``rel`` equal to "source"."""
    return any(link["rel"] == "source" for link in art.ext_links)

337

@staticmethod
def article_has_pdf(art: ArticleData | IssueData):
    """Return True when one of the resource's ext_links is an "article-pdf" or "article-html" link."""
    document_rels = ("article-pdf", "article-html")
    return any(link["rel"] in document_rels for link in art.ext_links)

347

def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
    """
    Crawl one article and return it with its metadata processed.

    When the article has no url of its own, it is only given a "source" link
    (issue url, or collection url as a fallback) before metadata processing.
    Otherwise its page is downloaded and handed to ``parse_article_content``;
    if parsing raises ValueError, the page is fetched again after 5 minutes
    with ``force_refresh=True`` and parsed once more.

    Returns None when ``parse_article_content`` returns None.
    """

    def attach_source_link(target, location):
        # ARTICLE URL as an ExtLink (to display the link in the article page)
        link = create_extlink()
        link["rel"] = "source"
        link["location"] = location
        link["metadata"] = self.source_domain
        target.ext_links.append(link)

    def parse(page):
        with self.tracer.start_as_current_span("parse_article_content"):
            return self.parse_article_content(page, xissue, xarticle, xarticle.url)

    if xarticle.url is None:
        if not self.article_has_source(xarticle):
            attach_source_link(xarticle, xissue.url if xissue.url else self.collection_url)
        return self.process_article_metadata(xarticle)

    page = self.download_file(xarticle.url)
    xarticle.pid = f"{xissue.pid}_{xarticle.pid}"

    try:
        parsed = parse(page)
    except ValueError as e:
        self.logger.warning(e)
        self.logger.warning("Retrying in 5 mins while invalidating cache")
        time.sleep(5 * 60)
        page = self.download_file(xarticle.url, force_refresh=True)
        parsed = parse(page)

    if parsed is None:
        return None

    if parsed.doi:
        # A DOI, when available, replaces the pid ("/", "." and "-" become "_")
        parsed.pid = parsed.doi.translate(str.maketrans("/.-", "___"))

    if not self.article_has_source(parsed) and parsed.url:
        attach_source_link(parsed, parsed.url)

    # The article title may have formulas surrounded with '$'
    return self.process_article_metadata(parsed)

398

def process_resource_metadata(self, xresource: ResourceData, resource_type="article"):
    """
    Derive the HTML and XML forms of a resource's title and abstracts.

    The title (``title_tex``) and every abstract whose ``tag`` is "abstract"
    (``value_tex``) go through ``build_jats_data_from_html_field`` with this
    crawler's formula delimiters; the html/xml results are written back in place.

    :param resource_type: "article" selects the "article-title" tag, anything else "issue-title"
    :return: the same ``xresource``
    """
    delimiters = {
        "delimiter_inline": self.delimiter_inline_formula,
        "delimiter_disp": self.delimiter_disp_formula,
    }

    # Title: may contain formulas surrounded with the delimiters
    title_data = build_jats_data_from_html_field(
        xresource.title_tex,
        tag="issue-title" if resource_type != "article" else "article-title",
        text_lang=xresource.lang,
        **delimiters,
    )
    xresource.title_html = title_data["value_html"]
    xresource.title_xml = title_data["value_xml"]

    # Abstracts: same treatment, the tex value is left as it is
    for xabstract in [a for a in xresource.abstracts if a["tag"] == "abstract"]:
        abstract_data = build_jats_data_from_html_field(
            xabstract["value_tex"],
            tag="abstract",
            text_lang=xabstract["lang"],
            resource_lang=xresource.lang,
            field_type="abstract",
            **delimiters,
        )
        xabstract["value_html"] = abstract_data["value_html"]
        xabstract["value_xml"] = abstract_data["value_xml"]

    return xresource

436

def process_article_metadata(self, xarticle: ArticleData):
    """
    Finalise an article: process its title/abstracts, mark every bibitem as
    type "unknown", then run ``update_data_for_jats`` (without labels).

    :return: the same ``xarticle``
    """
    self.process_resource_metadata(xarticle)

    for reference in xarticle.bibitems:
        setattr(reference, "type", "unknown")

    update_data_for_jats(xarticle, with_label=False)
    return xarticle

444

def download_file(self, url: str, force_refresh=False, headers=None):
    """
    Downloads a page and returns its content (decoded string).
    This function handles retries and decoding.

    Up to 3 attempts are made. The first retry is immediate, the second one
    happens after 15 minutes. Retries always bypass the cache when the session
    is a CachedSession.

    :param url: address of the page to download
    :param force_refresh: bypass the cache from the very first attempt
        (only meaningful when the session is a CachedSession)
    :param headers: extra headers, merged over the crawler's default ``headers``
    :return: the decoded, non-empty content of the response
    :raises requests.ConnectionError, requests.exceptions.HTTPError: the error
        of the last attempt, once every attempt has failed. An empty body is
        reported as an HTTPError.
    """
    max_attempts = 3
    request_headers = {**self.headers, **(headers or {})}
    last_error: Exception | None = None

    for attempt in range(max_attempts):
        try:
            kwargs = {
                "url": url,
                "headers": request_headers,
                "timeout": self.requests_timeout,
            }
            # Honour the caller's force_refresh, and never retry from the cache
            if (force_refresh or attempt > 0) and isinstance(self.session, CachedSession):
                kwargs["force_refresh"] = True
            response = self.session.get(**kwargs)

            content = self.decode_response(response)
            if not content:
                raise requests.exceptions.HTTPError(response)

            return content
        except (
            requests.ConnectionError,
            requests.ConnectTimeout,
            requests.exceptions.HTTPError,
        ) as e:
            last_error = e
            self.logger.debug(f"Caught error : {e}", extra={"url": url})
            if attempt == max_attempts - 1:
                # No attempt left: waiting would only delay the failure
                break
            # 0 min before the first retry, 15 mins before the second one
            delay_minutes = attempt * 15
            self.logger.debug(
                f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})",
                extra={"url": url},
            )
            time.sleep(delay_minutes * 60)

    raise last_error

483

def decode_response(self, response: requests.Response, encoding: str | None = None):
    """Return the text of ``response`` after picking an encoding for it.

    Override this if the content-type headers from the sources are advertising
    something else than the actual content (SASA needs this).

    The encoding is taken from the first of these that provides one:
    the ``encoding`` argument, the HTTP Content-Type header, an HTML
    ``<meta charset>`` tag, an HTML ``<meta http-equiv="Content-Type">`` tag.
    When none does, the response text is returned with its encoding untouched.
    """

    def decoded_with(charset):
        response.encoding = charset
        return response.text

    # Forced by the caller
    if encoding:
        return decoded_with(encoding)

    # HTTP headers
    http_content_type = response.headers.get("Content-Type", None)
    if http_content_type:
        header_charset = self.parse_content_type_charset(http_content_type)
        if header_charset:
            return decoded_with(header_charset)

    soup = BeautifulSoup(response.text, "html5lib")

    # HTML meta charset tag
    meta_charset_tag = soup.select_one("meta[charset]")
    if meta_charset_tag:
        declared_charset = meta_charset_tag.get("charset")
        if isinstance(declared_charset, str):
            return decoded_with(declared_charset)

    # HTML meta content type tag
    meta_content_type_tag = soup.select_one(
        'meta[http-equiv="Content-Type"],meta[http-equiv="content-type"]'
    )
    if meta_content_type_tag:
        meta_content_type = meta_content_type_tag.get("content")
        if isinstance(meta_content_type, str):
            meta_declared = self.parse_content_type_charset(meta_content_type)
            if meta_declared:
                return decoded_with(meta_declared)

    return response.text

523

@staticmethod
def parse_content_type_charset(content_type: str):
    """Return the ``charset`` parameter of a Content-Type value, or None when it has none."""
    parsed_header = EmailPolicy.header_factory("content-type", content_type)
    return parsed_header.params.get("charset")

529

@tracer.start_as_current_span("add_xissue_to_database")
def add_xissue_into_database(self, xissue: IssueData) -> IssueData:
    """
    Insert (or update) the issue in the database.

    The issue is first stamped with the collection, the source domain, the
    crawler's publisher and the current time. The insertion is attempted up
    to 3 times, waiting 10 seconds after a SolrError before trying again.

    :return: the same ``xissue`` once inserted
    :raises ValueError: if the issue has an empty year
    :raises ConnectionRefusedError: if every attempt ended with a SolrError
    """
    xissue.journal = self.collection
    xissue.source = self.source_domain

    if xissue.year == "":
        raise ValueError("Failsafe : Cannot insert issue without a year")

    xpub = create_publisherdata()
    xpub.name = self.publisher
    xissue.publisher = xpub
    xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            cmd = addOrUpdateGDMLIssueXmlCmd({"xissue": xissue, "use_body": False})
            cmd.do()
        except SolrError as solr_error:
            self.logger.warning(
                f"Encoutered SolrError while inserting issue {xissue.pid} in database"
            )
            if attempt == max_attempts:
                # Out of attempts: fail right away, keeping the Solr error as the cause
                raise ConnectionRefusedError("Cannot connect to SolR") from solr_error
            self.logger.debug(f"Attempt {attempt + 1}. sleeping 10 seconds.")
            time.sleep(10)
        except Exception as e:
            self.logger.error(
                f"Got exception while attempting to insert {xissue.pid} in database : {e}"
            )
            raise
        else:
            self.logger.debug(f"Issue {xissue.pid} inserted in database")
            return xissue

    # Every iteration above either returns, raises or continues to the next attempt
    raise AssertionError("Unreachable")

571

def get_metadata_using_citation_meta(
    self,
    xarticle: ArticleData,
    xissue: IssueData,
    soup: BeautifulSoup,
    what: list[CitationLiteral] | None = None,
):
    """
    Collect metadata from the ``citation_*`` meta tags of an article page.

    :param xarticle: the xarticle that will collect the metadata
    :param xissue: the xissue that will collect the publisher
    :param soup: the BeautifulSoup object of the article page
    :param what: list of citation_ items to collect. Nothing is collected when omitted.
    :return: None. The given article (and issue, for the publisher) is modified
    :raises ValueError: if an author, abstract or reference tag has no usable content
    """
    if what is None:
        what = []

    def meta_content(name: str) -> str | None:
        # Content of the first <meta name=...> tag, or None if absent or not a string
        node = soup.select_one(f"meta[name='{name}']")
        if not node:
            return None
        value = node.get("content")
        return value if isinstance(value, str) else None

    if "title" in what:
        # TITLE
        title = meta_content("citation_title")
        if title is not None:
            xarticle.title_tex = title

    if "author" in what:
        # AUTHORS
        # Institution and orcid tags apply to the closest preceding citation_author tag
        current_author: ContributorDict | None = None
        for citation_author_node in soup.select("meta[name^='citation_author']"):
            meta_name = citation_author_node.get("name")
            if meta_name == "citation_author":
                text_author = citation_author_node.get("content")
                if not isinstance(text_author, str):
                    raise ValueError("Cannot parse author")
                if text_author == "":
                    current_author = None
                    continue
                current_author = create_contributor(role="author", string_name=text_author)
                xarticle.contributors.append(current_author)
                continue
            if current_author is None:
                self.logger.warning("Couldn't parse citation author")
                continue
            detail = citation_author_node.get("content")
            if not isinstance(detail, str):
                continue
            if meta_name == "citation_author_institution":
                current_author["addresses"].append(detail)
            elif meta_name == "citation_author_orcid":
                current_author["orcid"] = detail

    if "pdf" in what:
        # PDF
        pdf_url = meta_content("citation_pdf_url")
        if pdf_url is not None:
            add_pdf_link_to_xarticle(xarticle, pdf_url)

    if "lang" in what:
        # LANG
        # TODO: check other language code
        lang_text = meta_content("citation_language")
        if lang_text is not None:
            xarticle.lang = standardize_tag(lang_text)

    if "abstract" in what:
        # ABSTRACT
        abstract_node = soup.select_one("meta[name='citation_abstract']")
        if abstract_node is not None:
            abstract = abstract_node.get("content")
            if not isinstance(abstract, str):
                raise ValueError("Couldn't parse abstract from meta")
            # Keep only the text of the abstract, without markup
            abstract = BeautifulSoup(abstract, "html.parser").text
            lang = abstract_node.get("lang")
            if not isinstance(lang, str):
                lang = self.detect_language(abstract, xarticle)
            xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))

    if "page" in what:
        # PAGES
        for meta_name, page_attr in (
            ("citation_firstpage", "fpage"),
            ("citation_lastpage", "lpage"),
        ):
            page = meta_content(meta_name)
            if page is None:
                continue
            # Drop anything from the first "(" and ignore values of 32 characters or more
            page = page.split("(")[0]
            if len(page) < 32:
                setattr(xarticle, page_attr, page)

    if "doi" in what:
        # DOI
        doi = meta_content("citation_doi")
        if doi is not None:
            doi = doi.strip()
            # Strip whatever precedes the "10." of the DOI
            pos = doi.find("10.")
            if pos > 0:
                doi = doi[pos:]
            xarticle.doi = doi

    if "mr" in what:
        # MR
        mr = meta_content("citation_mr")
        if mr is not None:
            mr = mr.strip()
            if mr.startswith("MR"):
                xarticle.extids.append(("mr-item-id", mr[2:]))

    if "zbl" in what:
        # ZBL
        zbl = meta_content("citation_zbl")
        if zbl is not None:
            zbl = zbl.strip()
            if zbl.startswith("Zbl"):
                xarticle.extids.append(("zbl-item-id", zbl[3:].strip()))

    if "publisher" in what:
        # PUBLISHER
        pub = meta_content("citation_publisher")
        if pub is not None:
            pub = pub.strip()
            if pub != "":
                xpub = create_publisherdata()
                xpub.name = pub
                xissue.publisher = xpub

    if "keywords" in what:
        # KEYWORDS
        for kwd_node in soup.select("meta[name='citation_keywords']"):
            kwds = kwd_node.get("content")
            if not isinstance(kwds, str):
                continue
            for kwd in kwds.split(","):
                kwd = kwd.strip()
                if kwd == "":
                    # Skip empty and whitespace-only entries
                    continue
                xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})

    if "references" in what:
        citation_references = soup.select("meta[name='citation_reference']")
        for index, tag in enumerate(citation_references):
            content = tag.get("content")
            if not isinstance(content, str):
                raise ValueError("Cannot parse citation_reference meta")
            # No label is passed when the reference text starts with its own "[n]"
            label = str(index + 1)
            if regex.match(r"^\[\d+\].*", content):
                label = None
            xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label))

741

def get_metadata_using_dcterms(
    self,
    xarticle: ArticleData,
    soup: "Tag",
    what: "Iterable[Literal['abstract', 'keywords', 'date_published', 'article_type']]",
):
    """
    Collect metadata from the Dublin Core (``DC.*`` / ``DCTERMS.*``) meta tags of a page.

    :param xarticle: the xarticle that will collect the metadata
    :param soup: the parsed page (or any tag containing the meta tags)
    :param what: items to collect; any iterable is accepted, including a one-shot iterator
    :return: None. The given article is modified
    """
    # Several membership tests follow: materialise `what` so that an iterator
    # is not exhausted by the first one. A plain string is left as it is.
    wanted = what if isinstance(what, str) else frozenset(what)

    if "abstract" in wanted:
        abstract_tag = soup.select_one("meta[name='DCTERMS.abstract']")
        if abstract_tag:
            abstract_text = self.get_str_attr(abstract_tag, "content")
            # The abstract is always recorded as English here
            xarticle.abstracts.append(
                create_abstract(lang="en", value_tex=cleanup_str(abstract_text))
            )

    if "keywords" in wanted:
        for tag in soup.select("meta[name='DC.subject']"):
            kwd_text = tag.get("content")
            if not isinstance(kwd_text, str) or not kwd_text:
                continue
            xarticle.kwds.append(create_subj(value=kwd_text))

    if "date_published" in wanted:
        published_tag = soup.select_one("meta[name='DC.Date.created']")
        if published_tag:
            xarticle.date_published = self.get_str_attr(published_tag, "content")

    if "article_type" in wanted:
        type_tag = soup.select_one("meta[name='DC.Type.articleType']")
        if type_tag:
            xarticle.atype = self.get_str_attr(type_tag, "content")

776

def create_xissue(
    self,
    url: str | None,
    year: str,
    volume_number: str | None,
    issue_number: str | None = None,
    vseries: str | None = None,
):
    """
    Build a new xissue for this collection.

    :param url: address of the issue page, if any; one trailing "/" is removed
    :param year: year of the issue
    :param volume_number: volume; every run of characters other than word
        characters and "-" is replaced with "_" before being stored
    :param issue_number: issue number; "," is replaced with "-" before being stored
    :param vseries: series of the volume
    :return: the new xissue, with its pid computed by ``get_issue_pid``
    """
    if url is not None:
        url = url.removesuffix("/")

    xissue = create_issuedata()
    xissue.url = url
    xissue.year = year
    # The pid is built from the raw volume/issue values, not the normalised ones
    xissue.pid = self.get_issue_pid(
        self.collection_id, year, volume_number, issue_number, vseries
    )

    if volume_number is not None:
        xissue.volume = regex.sub(r"[^\w-]+", "_", volume_number)
    if issue_number is not None:
        xissue.number = issue_number.replace(",", "-")
    if vseries is not None:
        xissue.vseries = vseries

    return xissue

805

def detect_language(self, text: str, article: ArticleData | None = None):
    """
    Return the language code to use for ``text``.

    When ``article`` already has a language other than None or "und", that
    language is returned and ``text`` is not analysed. Otherwise the language
    detector is asked; "und" is returned when it finds nothing, else the
    lower-cased name of the detected ISO 639-1 code.
    """
    if article:
        declared = article.lang
        if declared is not None and declared != "und":
            return declared

    detected = self.language_detector.detect_language_of(text)
    return detected.iso_code_639_1.name.lower() if detected else "und"

815

def get_str_attr(self, tag: "Tag", attr: str):
    """
    Equivalent of `tag.get(attr)`, but never returns None or a list.

    :raises ValueError: if the attribute is missing, or if its value is a list
    """
    value = tag.get(attr)
    if value is None:
        raise ValueError(
            f"[{self.source_domain}] {self.collection_id} : html tag doesn't have any {attr} attributes"
        )
    if isinstance(value, list):
        raise ValueError(
            f"[{self.source_domain}] {self.collection_id} : html tag has multiple {attr} attributes."
        )
    return value

828

def create_trans_title(
    self,
    resource_type: str,
    title_str: str,
    lang: str,
    xresource_lang: str,
    title_type: str = "main",
):
    """
    Build the title data of a translated title.

    :param resource_type: "article" selects the "trans-title" tag, anything else "issue-title"
    :param title_str: the title, possibly holding formulas between this crawler's delimiters
    :param lang: language of the title
    :param xresource_lang: language of the resource the title belongs to
    :param title_type: type recorded in the title data ("main" by default)
    :return: the title data, filled with the html and xml forms of the title
    """
    tag = "trans-title" if resource_type == "article" else "issue-title"

    ckeditor_data = build_jats_data_from_html_field(
        title_str,
        tag=tag,
        text_lang=lang,
        resource_lang=xresource_lang,
        delimiter_inline=self.delimiter_inline_formula,
        delimiter_disp=self.delimiter_disp_formula,
    )

    # The requested title type is passed through
    return create_titledata(
        lang=lang,
        type=title_type,
        title_html=ckeditor_data["value_html"],
        title_xml=ckeditor_data["value_xml"],
    )

856

# Maps a ``citation_*`` key found in a citation reference string to the
# builder that renders its value as XML. ``citation_author`` is absent on
# purpose: the reference parser handles authors separately.
references_mapping = {
    "citation_title": get_article_title_xml,
    "citation_journal_title": get_source_xml,
    "citation_publication_date": get_year_xml,
    "citation_firstpage": get_fpage_xml,
    "citation_lastpage": get_lpage_xml,
}

864

@classmethod
def __parse_meta_citation_reference(cls, content: str, label=None):
    """Turn a citation reference string into a baked reference.

    ``content`` is either free text (no ``;``), which is baked as is, or a
    ``;``-separated list of ``key=value`` entries. ``citation_author`` entries
    are rendered with ``get_author_xml`` and joined with ``", "``; entries whose
    key is in ``references_mapping`` are rendered with the mapped builder and
    appended, each preceded by a space. Entries without ``=`` and entries with
    an unknown key are ignored.

    Args:
        content: raw citation reference string.
        label: forwarded to ``JatsBase.bake_ref``.

    Returns:
        Whatever ``JatsBase.bake_ref`` returns for the assembled string.
    """
    if ";" not in content:
        return JatsBase.bake_ref(content, label=label)

    authors = []
    fields = []
    for entry in content.split(";"):
        # Split on the first "=" only, so values that themselves contain "="
        # (e.g. titles with formulas) are kept whole.
        key, separator, value = entry.partition("=")
        if not separator:
            continue
        key = key.strip()
        if key == "citation_author":
            authors.append(get_author_xml(template_str=value))
        elif key in cls.references_mapping:
            fields.append(cls.references_mapping[key](value))

    # Authors are emitted unconditionally; previously they were dropped when no
    # other key followed them in the string.
    xml_string = ", ".join(authors) + "".join(f" {field}" for field in fields)
    return JatsBase.bake_ref(xml_string, label=label)

892

@classmethod
def get_or_create_source(cls):
    """Return the ``Source`` row whose domain is ``cls.source_domain``.

    When no such row exists it is created with the crawler's name, website and
    view id, then saved once more before being returned.
    """
    creation_values = {
        "name": cls.source_name,
        "website": cls.source_website,
        "view_id": cls.get_view_id(),
    }
    source, was_created = Source.objects.get_or_create(
        domain=cls.source_domain,
        defaults=creation_values,
    )
    if was_created:
        source.save()
    return source

906

@staticmethod
def get_issue_pid(
    collection_id: str,
    year: str,
    volume_number: str | None = None,
    issue_number: str | None = None,
    series: str | None = None,
):
    """Build an issue pid from its identifying parts.

    The parts are joined with ``_`` in the order collection id, year, series,
    volume, issue number; parts that are ``None`` are skipped. The result goes
    through ``cleanup_str`` and then has every run of characters other than
    word characters and hyphens replaced by a single ``_``.
    """
    pid = f"{collection_id}_{year}"
    for optional_part in (series, volume_number, issue_number):
        if optional_part is not None:
            pid += f"_{optional_part}"

    # Replace any non-word character (hyphens excepted) with an underscore
    return regex.sub(r"[^\w-]+", "_", cleanup_str(pid))

925

926 @staticmethod

927 def set_pages(article: ArticleData, pages: str, separator: str = "-"):

928 pages_split = pages.split(separator)

929 if len(pages_split) == 0: 929 ↛ 930line 929 didn't jump to line 930 because the condition on line 929 was never true

930 article.page_range = pages

931 if len(pages_split) > 0: 931 ↛ exitline 931 didn't return from function 'set_pages' because the condition on line 931 was always true

932 if pages[0].isnumeric(): 932 ↛ exitline 932 didn't return from function 'set_pages' because the condition on line 932 was always true

933 article.fpage = pages_split[0]

934 if ( 934 ↛ 939line 934 didn't jump to line 939 because the condition on line 934 was never true

935 len(pages_split) > 1

936 and pages_split[0] != pages_split[1]

937 and pages_split[1].isnumeric()

938 ):

939 article.lpage = pages_split[1]

940

@staticmethod
def _process_pdf_header(chunk: bytes, response: requests.Response | aiohttp.ClientResponse):
    """Classify a response as PDF or not from its first bytes and Content-Type.

    Args:
        chunk: first bytes of the response body; matched against the
            ``%PDF-<digit>.<digit>`` signature.
        response: the HTTP response the chunk was read from.

    Returns:
        A three-item list ``[is_pdf, response, report]`` where ``report`` is a
        dict with a ``status`` (``ExtlinkChecked.Status``) and a ``message``:

        * signature found and Content-Type mentions ``application/pdf``: OK;
        * signature found but Content-Type missing or different: WARNING;
        * signature not found: ERROR, with the Content-Type included in the
          message when it does not mention ``application/pdf``.
    """
    content_type = response.headers.get("Content-Type")
    advertised_as_pdf = bool(content_type) and "application/pdf" in content_type

    if regex.match(rb"^%PDF-\d\.\d", chunk):
        if advertised_as_pdf:
            # The file is unmistakably a pdf
            report = {"status": ExtlinkChecked.Status.OK, "message": ""}
        else:
            # The file is a pdf, but the content type advertised by the server is wrong
            report = {
                "status": ExtlinkChecked.Status.WARNING,
                "message": f"Content-Type header: {content_type}",
            }
        return [True, response, report]

    # Reaching here means we couldn't find the pdf signature.
    # ``!r`` yields the same text as implicit str() on bytes, without the
    # BytesWarning that implicit conversion triggers under ``python -b``.
    message = f"PDF Header not found: got {chunk!r}"
    if not advertised_as_pdf:
        message = f"Content-Type header: {content_type}; {message}"
    return [False, response, {"status": ExtlinkChecked.Status.ERROR, "message": message}]

984

@classmethod
async def a_check_pdf_link_validity(
    cls, url: str, verify=True
) -> tuple[bool, aiohttp.ClientResponse, dict]:
    """Check asynchronously that ``url`` serves a PDF.

    Only the first bytes of the body are requested and read; they are then
    classified by ``_process_pdf_header``.

    Args:
        url: location of the supposed PDF.
        verify: passed as the ``ssl`` argument of the request.

    Returns:
        ``[is_pdf, response, report]`` as produced by ``_process_pdf_header``,
        or an ERROR report when the body could not be read.
    """
    CHUNK_SIZE = 10  # Number of bytes to fetch
    header = {
        # HTTP byte ranges are inclusive on both ends: 0-9 is CHUNK_SIZE bytes.
        "Range": f"bytes=0-{CHUNK_SIZE - 1}",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
    }
    async with cls.async_session.get(
        url, headers=header, allow_redirects=True, ssl=verify
    ) as response:
        try:
            chunk = await response.content.read(CHUNK_SIZE)
        except aiohttp.ClientPayloadError:
            # A broken body (bad encoding, truncated payload) is what can go
            # wrong here; StopIteration is never raised by ``read``.
            return [
                False,
                response,
                {
                    "status": ExtlinkChecked.Status.ERROR,
                    "message": "Error reading PDF header",
                },
            ]
        return BaseCollectionCrawler._process_pdf_header(chunk, response)

1012

@classmethod
def check_pdf_link_validity(
    cls, url: str, verify=True
) -> tuple[bool, requests.Response | None, dict]:
    """Check that ``url`` serves a PDF (blocking counterpart of the async check).

    Only the first bytes of the body are requested and read; they are then
    classified by ``_process_pdf_header``.

    Args:
        url: location of the supposed PDF.
        verify: passed as the ``verify`` argument of the request.

    Returns:
        ``[is_pdf, response, report]`` as produced by ``_process_pdf_header``,
        or an ERROR report when the body yields no data at all.
    """
    CHUNK_SIZE = 10  # Number of bytes to fetch
    header = {
        # HTTP byte ranges are inclusive on both ends: 0-9 is CHUNK_SIZE bytes.
        "Range": f"bytes=0-{CHUNK_SIZE - 1}",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
    }
    with cls.session.get(
        url, headers=header, allow_redirects=True, verify=verify, stream=True
    ) as response:
        try:
            # Only this call can raise StopIteration (empty body).
            chunk = next(response.iter_content(CHUNK_SIZE))
        except StopIteration:
            return [
                False,
                response,
                {
                    "status": ExtlinkChecked.Status.ERROR,
                    "message": "Error reading PDF header",
                },
            ]
        return BaseCollectionCrawler._process_pdf_header(chunk, response)

1040

@classmethod
async def check_extlink_validity(cls, extlink: "ExtLink"):
    """Check one external link and record the outcome in ``ExtlinkChecked``.

    Method used by rot_monitoring to check if links have expired.

    Links whose ``rel`` is ``"article-pdf"`` go through the PDF check; any
    other link is fetched and counts as an error unless the HTTP status is 200
    or 206. SSL, connection and timeout errors are logged and recorded as
    errors. Any other exception is recorded as an error and re-raised. The
    result is always written with ``aupdate_or_create``; an ``IntegrityError``
    during that write is logged and swallowed.
    """
    # ``timezone.now()`` follows the project's USE_TZ setting, unlike a bare
    # ``datetime.now()`` which is always naive.
    defaults: dict = {"date": timezone.now(), "status": ExtlinkChecked.Status.OK}
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0"
    }
    verify = bool(cls.verify)
    try:
        if extlink.rel == "article-pdf":
            _, response, message = await cls.a_check_pdf_link_validity(
                extlink.location, verify
            )
            defaults.update(message)
            defaults["http_status"] = response.status
        else:
            async with cls.async_session.get(
                url=extlink.location,
                headers=header,
                allow_redirects=True,
                ssl=verify,
            ) as response:
                defaults["http_status"] = response.status
                if response.status not in (200, 206):
                    defaults["status"] = ExtlinkChecked.Status.ERROR

    except aiohttp.ClientSSLError:
        cls.logger.error("SSL error for the url: %s", extlink.location)
        defaults["status"] = ExtlinkChecked.Status.ERROR
        defaults["message"] = "SSL error"
    except aiohttp.ClientConnectionError:
        cls.logger.error("Connection error for the url: %s", extlink.location)
        defaults["status"] = ExtlinkChecked.Status.ERROR
        defaults["message"] = "Connection error"
    except TimeoutError:
        cls.logger.error("Timeout error for the url: %s", extlink.location)
        defaults["status"] = ExtlinkChecked.Status.ERROR
        defaults["message"] = "Timeout error"
    except Exception:
        # Still propagated to the caller, but the ``finally`` write below must
        # not record the initial OK status for a check that blew up.
        defaults["status"] = ExtlinkChecked.Status.ERROR
        defaults["message"] = "Unexpected error"
        raise
    finally:
        try:
            await ExtlinkChecked.objects.aupdate_or_create(extlink=extlink, defaults=defaults)
            cls.logger.info(
                "DB Update, source: %s, url: %s", cls.source_domain, extlink.location
            )
        except IntegrityError:
            cls.logger.error(
                "Extlink was deleted, source: %s, url: %s", cls.source_domain, extlink.location
            )

Coverage for src / crawler / abstract_crawlers / base_crawler.py: 66%

584 statements