Coverage for src/crawler/base_crawler.py: 65%

588 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-02-17 12:56 +0000

1import asyncio 

2import logging 

3import time 

4from datetime import datetime, timedelta 

5from email.policy import EmailPolicy 

6from typing import TYPE_CHECKING, Iterable, Literal 

7 

8import aiohttp 

9import regex 

10import requests 

11from bs4 import BeautifulSoup 

12from django.conf import settings 

13from django.contrib.auth.models import User 

14from django.db.utils import IntegrityError 

15from django.utils import timezone 

16from langcodes import standardize_tag 

17from lingua import LanguageDetector, LanguageDetectorBuilder 

18from opentelemetry import trace 

19from ptf.cmds.xml.ckeditor.utils import ( 

20 build_jats_data_from_html_field, 

21) 

22from ptf.cmds.xml.jats.builder.references import ( 

23 get_article_title_xml, 

24 get_author_xml, 

25 get_fpage_xml, 

26 get_lpage_xml, 

27 get_source_xml, 

28 get_year_xml, 

29) 

30from ptf.cmds.xml.jats.jats_parser import JatsBase 

31from ptf.model_data import ( 

32 ArticleData, 

33 ContributorDict, 

34 IssueData, 

35 ResourceData, 

36 TitleDict, 

37 create_abstract, 

38 create_contributor, 

39 create_extlink, 

40 create_issuedata, 

41 create_publisherdata, 

42 create_subj, 

43 create_titledata, 

44) 

45from ptf.model_data_converter import update_data_for_jats 

46from ptf.models import ExtLink 

47from pylatexenc.latex2text import LatexNodes2Text 

48from pysolr import SolrError 

49from requests.adapters import HTTPAdapter 

50from requests_cache import CachedSession 

51from urllib3 import Retry 

52 

53from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd 

54from crawler.models import Source 

55from crawler.models.extlink_checked import ExtlinkChecked 

56from crawler.types import CitationLiteral 

57from crawler.utils import ( 

58 add_pdf_link_to_xarticle, 

59 cleanup_str, 

60 get_all_cols, 

61 get_or_create_collection, 

62 get_session, 

63) 

64 

65if TYPE_CHECKING: 

66 from bs4 import Tag 

67 

68 

class CrawlerTitleDict(TitleDict):
    """TitleDict extended with the raw TeX form of the title."""

    # TeX source of the title, or None when no TeX form is available.
    title_tex: str | None

71 

72 

class BaseCollectionCrawler:
    """
    Base collection for the crawlers.
    To create a crawler:
    1) derive a class from BaseCollectionCrawler and name it XXXCrawler
    2) override the functions parse_collection_content, parse_issue_content and parse_article_content
    3) update factory.py so that crawler_factory can return your new crawler
    """

    logger = logging.getLogger(__name__)
    tracer = trace.get_tracer(__name__)

    # Identity of the upstream source this crawler handles (set by subclasses).
    source_name = ""
    source_domain = ""
    source_website = ""

    issue_href = ""

    # Filled in by initialize(); None until then.
    collection = None
    source = None
    user = None
    # Class-level HTTP sessions, shared by all crawler instances (see __init__).
    session: requests.Session | CachedSession
    async_session: aiohttp.ClientSession
    is_checkable = True
    # Passed to the session's TLS certificate verification setting.
    verify = True
    # Default HTTP headers sent with every request.
    headers = {
        "accept_encoding": "utf-8",
        "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
        "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
    }

    # seconds to wait between two http requests
    requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
    # seconds to wait before aborting the connection (if no bytes are received)
    requests_timeout = 60

    latext_parser = LatexNodes2Text()

    # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
    # do not use the "$" to surround tex formulas
    delimiter_inline_formula = "$"
    delimiter_disp_formula = "$"

    # HACK : Workaround for tests (monkeypatching)
    # We store the class here, so we can monkeypatch it when running tests
    # subCrawlers = {
    #     LofplCrawler: None
    # }
    subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}

    # Lazily built by the language_detector property to save memory.
    _language_detector: LanguageDetector | None = None
    _language_detector_builder = LanguageDetectorBuilder.from_all_languages()

    force_refresh = False

    # Whether to include headers in the requests cache key
    match_headers = False
    orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"

    # Set this to False on a Crawler-basis to allow inserting articles without PDFs
    ignore_missing_pdf = True

    @classmethod
    def get_view_id(cls):
        """Identifier used for this crawler's view: its source domain."""
        return cls.source_domain

138 

139 @property 

140 def language_detector(self): 

141 """Crawler Instance singleton for language builder. 

142 Late init of LanguageDetector to save on memory""" 

143 if not self._language_detector: 

144 self._language_detector = self._language_detector_builder.build() 

145 return self._language_detector 

146 

    def __init__(
        self,
        *args,
        username: str,
        collection_id: str,
        dry: bool = False,
        publisher: str = "",
        force_refresh=False,
        collection_url: str | None = None,
    ):
        """Set up a crawler for one collection.

        :param username: Django username that owns the crawl
        :param collection_id: pid of the collection to crawl
        :param dry: when True, crawl but never write to the database
        :param publisher: default publisher name for inserted issues
        :param force_refresh: bypass the HTTP cache when downloading
        :param collection_url: source URL; looked up in the collection
            registry when not provided
        :raises ValueError: when the source has no URL for this collection
        """
        # Resolve the collection URL from the registry when not given explicitly.
        if not collection_url:
            all_cols = get_all_cols()
            col = all_cols[collection_id]

            collection_url = col["sources"].get(self.source_domain, None)
            if collection_url is None:
                raise ValueError(
                    f"Source {self.source_domain} not found for collection {collection_id}"
                )
        self.collection_url = collection_url
        # Instantiate declared sub-crawlers with the same settings
        # (see the subCrawlers monkeypatching HACK on the class).
        for CrawlerClass in self.subCrawlers:
            self.subCrawlers[CrawlerClass] = CrawlerClass(
                *args,
                username=username,
                collection_id=collection_id,
                dry=dry,
                publisher=publisher,
                collection_url=collection_url,
            )
        # Per-source child logger so log lines identify the crawler.
        self.logger = logging.getLogger(__name__ + "." + self.source_domain)
        # self.logger = logging.getLogger(__name__)

        self.username = username

        self.collection_id = collection_id

        self.dry = dry
        self.publisher = publisher

        # Classproperty : We sometimes want to use the session without initializing the class (rot monitoring)
        BaseCollectionCrawler.session: requests.Session

        # Skipped when running tests
        self.initialize()

        self.force_refresh = force_refresh

193 

194 # We implemented custom retry behaviour, so we don't want to make extra requests here 

195 

196 def initialize(self): 

197 """ 

198 Acts as a "second" init function to skip model accesses during test data generation 

199 """ 

200 self.collection = get_or_create_collection(self.collection_id) 

201 self.source = self.get_or_create_source() 

202 self.user = User.objects.get(username=self.username) 

203 BaseCollectionCrawler.session = get_session() 

204 BaseCollectionCrawler.session.verify = self.verify 

205 self.session.delay = self.requests_interval 

206 retries = Retry( 

207 total=0, 

208 ) 

209 self.session.mount("https://", HTTPAdapter(max_retries=retries)) 

210 self.session.mount("http://", HTTPAdapter(max_retries=retries)) 

211 

    @classmethod
    def can_crawl(cls, pid: str) -> bool:
        """Hook for subclasses: return False to skip crawling the given pid.

        The base implementation accepts every pid.
        """
        return True

215 

    def parse_collection_content(self, content: str) -> list[IssueData]:
        """
        Parse the HTML content with BeautifulSoup
        returns a list of xissue.
        Override this function in a derived class.

        The base implementation returns an empty list.
        """
        return []

223 

    def parse_issue_content(self, content: str, xissue: IssueData):
        """
        Parse the HTML content with BeautifulSoup
        Fills the xissue.articles
        Override this function in a derived class.
        The base implementation is a no-op.

        CAV : You are supposed to create articles there. Please assign a PID to each article.
        The PID can be `a + article_index`, like this : `a0` `a21`
        """

233 

    def parse_article_content(
        self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
    ) -> ArticleData | None:
        """
        Parse the HTML content with BeautifulSoup
        returns the xarticle (or None to reject the article).
        Override this function in a derived class.
        The xissue is passed to the function in case the article page has issue information (ex: publisher)
        The article url is also passed as a parameter

        CAV : You are supposed to assign articles pid again here

        The base implementation returns the article unchanged.
        """
        return xarticle

247 

248 @tracer.start_as_current_span("crawl_collection") 

249 def crawl_collection(self): 

250 # TODO: Comments, filter 

251 """ 

252 Crawl an entire collection. ptf.models.Container objects are created. 

253 - get the HTML content of the collection_url 

254 - parse the HTML content with beautifulsoup to extract the list of issues 

255 - merge the xissues (some Source can have multiple pages for 1 volume/issue. We create only 1 container) 

256 - crawl each issue if col_only is False 

257 - Returns the list of merged issues. 

258 It is an OrderedDict {pid: {"issues": xissues}} 

259 The key is the pid of the merged issues. 

260 Ex: The source may have Ex: Volume 6 (2000) and Volume 6 (1999) 

261 the pid is then made with 1999-2000__6_ 

262 """ 

263 

264 if self.source is None: 

265 raise RuntimeError("ERROR: the source is not set") 

266 

267 content = self.download_file(self.collection_url) 

268 if content: 

269 xissues = self.parse_collection_content(content) 

270 else: 

271 # download_file returns None (404) 

272 return None 

273 

274 """ 

275 Some collections split the same volumes in different pages 

276 Ex: Volume 6 (2000) and Volume 6 (1999) 

277 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000) 

278 """ 

279 # merged_xissues = self.merge_xissues(xissues) 

280 

281 xissues_dict = {str(i.pid): i for i in xissues} 

282 

283 return xissues_dict 

284 

    @tracer.start_as_current_span("crawl_issue")
    def crawl_issue(self, xissue: IssueData):
        """
        Crawl 1 web page of an issue.
        - get the HTML content of the issue
        - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata
        - crawl each article
        - insert the issue into the database (unless self.dry)
        """

        # Some source, like EuDML do not have a separate HTML pages for an issue's table of content.
        # The list of articles directly come from the collection HTML page: the xissue has no url attribute
        issue_url = xissue.url
        if issue_url is not None:
            if issue_url.endswith(".pdf"):
                # The "issue" page is itself a PDF: record it as the issue's PDF link.
                add_pdf_link_to_xarticle(xissue, issue_url)
                xissue.url = None
            else:
                content = self.download_file(issue_url)
                with self.tracer.start_as_current_span("parse_issue_content"):
                    self.parse_issue_content(content, xissue)

        xarticles = xissue.articles

        parsed_xarticles = []

        for xarticle in xarticles:
            parsed_xarticle = self.crawl_article(xarticle, xissue)
            if parsed_xarticle is not None:
                parsed_xarticles.append(parsed_xarticle)

        xissue.articles = parsed_xarticles

        # Remember whether the issue itself has a PDF link before filtering articles.
        issue_has_pdf = self.article_has_pdf(xissue)

        if self.ignore_missing_pdf:
            # Drop articles that have no full-text (PDF/HTML) link.
            xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)]
        if self.dry:
            # Dry run: stop before any database write.
            return
        if len(xissue.articles) == 0 and not issue_has_pdf:
            # Nothing worth inserting: no articles left and no issue-level PDF.
            return
        self.process_resource_metadata(xissue, resource_type="issue")

        self.add_xissue_into_database(xissue)

328 

329 @staticmethod 

330 def article_has_source(art: ArticleData | IssueData): 

331 return ( 

332 next( 

333 (e_link for e_link in art.ext_links if e_link["rel"] == "source"), 

334 None, 

335 ) 

336 is not None 

337 ) 

338 

339 @staticmethod 

340 def article_has_pdf(art: ArticleData | IssueData): 

341 return ( 

342 next( 

343 (link for link in art.ext_links if link["rel"] in ["article-pdf", "article-html"]), 

344 None, 

345 ) 

346 is not None 

347 ) 

348 

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
        """Download and parse one article page, then post-process its metadata.

        Returns the processed ArticleData, or None when parsing rejected it.
        On a parsing ValueError, waits 5 minutes and retries once with a
        cache-bypassing download.
        """
        # ARTICLE URL as an ExtLink (to display the link in the article page)
        if xarticle.url is None:
            # No article page to download: record where the metadata came from
            # (the issue page, or failing that the collection page).
            if not self.article_has_source(xarticle):
                if xissue.url:
                    article_source = xissue.url
                else:
                    article_source = self.collection_url
                ext_link = create_extlink()
                ext_link["rel"] = "source"
                ext_link["location"] = article_source
                ext_link["metadata"] = self.source_domain
                xarticle.ext_links.append(ext_link)
            return self.process_article_metadata(xarticle)

        content = self.download_file(xarticle.url)
        # Prefix the article pid with the issue pid (e.g. "<issue_pid>_a0").
        xarticle.pid = f"{xissue.pid}_{xarticle.pid}"

        try:
            with self.tracer.start_as_current_span("parse_article_content"):
                parsed_xarticle = self.parse_article_content(
                    content, xissue, xarticle, xarticle.url
                )
        except ValueError as e:
            # Parsing failed: back off, then retry once with a fresh download.
            self.logger.warning(e)
            self.logger.warning("Retrying in 5 mins while invalidating cache")
            time.sleep(5 * 60)
            content = self.download_file(xarticle.url, force_refresh=True)
            with self.tracer.start_as_current_span("parse_article_content"):
                parsed_xarticle = self.parse_article_content(
                    content, xissue, xarticle, xarticle.url
                )

        if parsed_xarticle is None:
            return None

        if parsed_xarticle.doi:
            # Derive a pid-safe identifier from the DOI.
            parsed_xarticle.pid = (
                parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
            )

        if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
            ext_link = create_extlink()
            ext_link["rel"] = "source"
            ext_link["location"] = parsed_xarticle.url
            ext_link["metadata"] = self.source_domain
            parsed_xarticle.ext_links.append(ext_link)

        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle)

399 

400 def process_resource_metadata(self, xresource: ResourceData, resource_type="article"): 

401 tag = "article-title" if resource_type == "article" else "issue-title" 

402 

403 # Process title tex 

404 ckeditor_data = build_jats_data_from_html_field( 

405 xresource.title_tex, 

406 tag=tag, 

407 text_lang=xresource.lang, 

408 delimiter_inline=self.delimiter_inline_formula, 

409 delimiter_disp=self.delimiter_disp_formula, 

410 ) 

411 

412 xresource.title_html = ckeditor_data["value_html"] 

413 # xresource.title_tex = ckeditor_data["value_tex"] 

414 xresource.title_xml = ckeditor_data["value_xml"] 

415 

416 # Process trans_title tex 

417 if xresource.trans_title_tex: 417 ↛ 418line 417 didn't jump to line 418 because the condition on line 417 was never true

418 self.logger.warning( 

419 "Deprecation Notice : prefer using xresource.title directly instead of xresource.trans_title_tex" 

420 ) 

421 trans_title = self.create_trans_title( 

422 xresource_lang=xresource.lang, 

423 resource_type=resource_type, 

424 title_tex=xresource.trans_title_tex, 

425 lang=xresource.trans_lang, 

426 ) 

427 xresource.titles.append(trans_title) 

428 

429 abstracts_to_parse = [ 

430 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract" 

431 ] 

432 # abstract may have formulas surrounded with '$' 

433 if len(abstracts_to_parse) > 0: 

434 for xabstract in abstracts_to_parse: 

435 ckeditor_data = build_jats_data_from_html_field( 

436 xabstract["value_tex"], 

437 tag="abstract", 

438 text_lang=xabstract["lang"], 

439 resource_lang=xresource.lang, 

440 field_type="abstract", 

441 delimiter_inline=self.delimiter_inline_formula, 

442 delimiter_disp=self.delimiter_disp_formula, 

443 ) 

444 

445 xabstract["value_html"] = ckeditor_data["value_html"] 

446 # xabstract["value_tex"] = ckeditor_data["value_tex"] 

447 xabstract["value_xml"] = ckeditor_data["value_xml"] 

448 

449 return xresource 

450 

451 def process_article_metadata(self, xarticle: ArticleData): 

452 self.process_resource_metadata(xarticle) 

453 for bibitem in xarticle.bibitems: 

454 bibitem.type = "unknown" 

455 update_data_for_jats(xarticle, with_label=False) 

456 

457 return xarticle 

458 

459 def download_file(self, url: str, force_refresh=False, headers={}): 

460 """ 

461 Downloads a page and returns its content (decoded string). 

462 This function handles retries and decoding 

463 """ 

464 current_exception: Exception | None = None 

465 for attempt in range(3): 

466 try: 

467 kwargs = { 

468 "url": url, 

469 "headers": {**self.headers, **headers}, 

470 "timeout": self.requests_timeout, 

471 } 

472 if attempt > 0 and isinstance(self.session, CachedSession): 

473 kwargs["force_refresh"] = True 

474 response = self.session.get(**kwargs) 

475 

476 content = self.decode_response(response) 

477 if content == "" or not content: 

478 raise requests.exceptions.HTTPError(response) 

479 

480 return content 

481 except ( 

482 requests.ConnectionError, 

483 requests.ConnectTimeout, 

484 requests.exceptions.HTTPError, 

485 ) as e: 

486 current_exception = e 

487 self.logger.debug(f"Caught error : {e}", extra={"url": url}) 

488 # 15 mins, 30 mins, 45 mins 

489 delay_minutes = attempt * 15 

490 self.logger.debug( 

491 f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})", 

492 extra={"url": url}, 

493 ) 

494 time.sleep(delay_minutes * 60) 

495 

496 raise current_exception 

497 

498 def decode_response(self, response: requests.Response, encoding: str | None = None): 

499 """Override this if the content-type headers from the sources are advertising something else than the actual content 

500 SASA needs this""" 

501 # Force 

502 if encoding: 

503 response.encoding = encoding 

504 return response.text 

505 

506 # Attempt to get encoding using HTTP headers 

507 content_type_tag = response.headers.get("Content-Type", None) 

508 

509 if content_type_tag: 509 ↛ 516line 509 didn't jump to line 516 because the condition on line 509 was always true

510 charset = self.parse_content_type_charset(content_type_tag) 

511 if charset: 511 ↛ 512line 511 didn't jump to line 512 because the condition on line 511 was never true

512 response.encoding = charset 

513 return response.text 

514 

515 # Attempt to get encoding using HTML meta charset tag 

516 soup = BeautifulSoup(response.text, "html5lib") 

517 charset = soup.select_one("meta[charset]") 

518 if charset: 

519 htmlencoding = charset.get("charset") 

520 if isinstance(htmlencoding, str): 520 ↛ 525line 520 didn't jump to line 525 because the condition on line 520 was always true

521 response.encoding = htmlencoding 

522 return response.text 

523 

524 # Attempt to get encoding using HTML meta content type tag 

525 content_type_tag = soup.select_one( 

526 'meta[http-equiv="Content-Type"],meta[http-equiv="content-type"]' 

527 ) 

528 if content_type_tag: 

529 content_type = content_type_tag.get("content") 

530 if isinstance(content_type, str): 530 ↛ 536line 530 didn't jump to line 536 because the condition on line 530 was always true

531 charset = self.parse_content_type_charset(content_type) 

532 if charset: 532 ↛ 536line 532 didn't jump to line 536 because the condition on line 532 was always true

533 response.encoding = charset 

534 return response.text 

535 

536 return response.text 

537 

538 @staticmethod 

539 def parse_content_type_charset(content_type: str): 

540 header = EmailPolicy.header_factory("content-type", content_type) 

541 if "charset" in header.params: 

542 return header.params.get("charset") 

543 

    @tracer.start_as_current_span("add_xissue_to_database")
    def add_xissue_into_database(self, xissue: IssueData) -> IssueData:
        """Insert (or update) the issue in the database, retrying on Solr errors.

        :raises ValueError: when the issue has no year (failsafe)
        :raises ConnectionRefusedError: after 3 failed Solr attempts
        """
        xissue.journal = self.collection
        xissue.source = self.source_domain

        if xissue.year == "":
            raise ValueError("Failsafe : Cannot insert issue without a year")

        xpub = create_publisherdata()
        xpub.name = self.publisher
        xissue.publisher = xpub
        xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()

        attempt = 1
        success = False

        # Retry the insertion up to 3 times when Solr is unreachable.
        while not success and attempt < 4:
            try:
                params = {"xissue": xissue, "use_body": False}
                cmd = addOrUpdateGDMLIssueXmlCmd(params)
                cmd.do()
                success = True
                self.logger.debug(f"Issue {xissue.pid} inserted in database")
                return xissue
            except SolrError:
                self.logger.warning(
                    f"Encoutered SolrError while inserting issue {xissue.pid} in database"
                )
                attempt += 1
                self.logger.debug(f"Attempt {attempt}. sleeping 10 seconds.")
                time.sleep(10)
            except Exception as e:
                # Any non-Solr error is fatal: log and propagate.
                self.logger.error(
                    f"Got exception while attempting to insert {xissue.pid} in database : {e}"
                )
                raise e

        if success is False:
            raise ConnectionRefusedError("Cannot connect to SolR")

        assert False, "Unreachable"

585 

    def get_metadata_using_citation_meta(
        self,
        xarticle: ArticleData,
        xissue: IssueData,
        soup: BeautifulSoup,
        what: list[CitationLiteral] = [],
    ):
        """
        Harvest Google Scholar-style `citation_*` meta tags from an article page.

        :param xarticle: the xarticle that will collect the metadata
        :param xissue: the xissue that will collect the publisher
        :param soup: the BeautifulSoup object of the article page
        :param what: list of citation_ items to collect.
        :return: None. The given article is modified

        NOTE(review): `what` is a mutable default argument; it is only read
        here, but callers must not mutate the default list.
        """

        if "title" in what:
            # TITLE
            citation_title_node = soup.select_one("meta[name='citation_title']")
            if citation_title_node:
                title = citation_title_node.get("content")
                if isinstance(title, str):
                    xarticle.title_tex = title

        if "author" in what:
            # AUTHORS
            # citation_author tags may be followed by citation_author_institution /
            # citation_author_ocrid tags that apply to the most recent author.
            citation_author_nodes = soup.select("meta[name^='citation_author']")
            current_author: ContributorDict | None = None
            for citation_author_node in citation_author_nodes:
                if citation_author_node.get("name") == "citation_author":
                    text_author = citation_author_node.get("content")
                    if not isinstance(text_author, str):
                        raise ValueError("Cannot parse author")
                    if text_author == "":
                        # An empty author resets the "current author" context.
                        current_author = None
                        continue
                    current_author = create_contributor(role="author", string_name=text_author)
                    xarticle.contributors.append(current_author)
                    continue
                if current_author is None:
                    self.logger.warning("Couldn't parse citation author")
                    continue
                if citation_author_node.get("name") == "citation_author_institution":
                    text_institution = citation_author_node.get("content")
                    if not isinstance(text_institution, str):
                        continue
                    current_author["addresses"].append(text_institution)
                # NOTE(review): "ocrid" looks like a typo for "orcid" — confirm
                # against the sources that emit this tag before changing it.
                if citation_author_node.get("name") == "citation_author_ocrid":
                    text_orcid = citation_author_node.get("content")
                    if not isinstance(text_orcid, str):
                        continue
                    current_author["orcid"] = text_orcid

        if "pdf" in what:
            # PDF
            citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
            if citation_pdf_node:
                pdf_url = citation_pdf_node.get("content")
                if isinstance(pdf_url, str):
                    add_pdf_link_to_xarticle(xarticle, pdf_url)

        if "lang" in what:
            # LANG
            citation_lang_node = soup.select_one("meta[name='citation_language']")
            if citation_lang_node:
                # TODO: check other language code
                content_text = citation_lang_node.get("content")
                if isinstance(content_text, str):
                    xarticle.lang = standardize_tag(content_text)

        if "abstract" in what:
            # ABSTRACT
            abstract_node = soup.select_one("meta[name='citation_abstract']")
            if abstract_node is not None:
                abstract = abstract_node.get("content")
                if not isinstance(abstract, str):
                    raise ValueError("Couldn't parse abstract from meta")
                # Strip any HTML markup embedded in the meta content.
                abstract = BeautifulSoup(abstract, "html.parser").text
                lang = abstract_node.get("lang")
                if not isinstance(lang, str):
                    lang = self.detect_language(abstract, xarticle)
                xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))

        if "page" in what:
            # PAGES
            citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
            if citation_fpage_node:
                page = citation_fpage_node.get("content")
                if isinstance(page, str):
                    # Drop a trailing "(...)" annotation; skip absurdly long values.
                    page = page.split("(")[0]
                    if len(page) < 32:
                        xarticle.fpage = page

            citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
            if citation_lpage_node:
                page = citation_lpage_node.get("content")
                if isinstance(page, str):
                    page = page.split("(")[0]
                    if len(page) < 32:
                        xarticle.lpage = page

        if "doi" in what:
            # DOI
            citation_doi_node = soup.select_one("meta[name='citation_doi']")
            if citation_doi_node:
                doi = citation_doi_node.get("content")
                if isinstance(doi, str):
                    doi = doi.strip()
                    # Strip any prefix (e.g. "doi:") before the "10." registrant part.
                    pos = doi.find("10.")
                    if pos > 0:
                        doi = doi[pos:]
                    xarticle.doi = doi

        if "mr" in what:
            # MR — Mathematical Reviews id, stored without the "MR" prefix.
            citation_mr_node = soup.select_one("meta[name='citation_mr']")
            if citation_mr_node:
                mr = citation_mr_node.get("content")
                if isinstance(mr, str):
                    mr = mr.strip()
                    if mr.find("MR") == 0:
                        mr = mr[2:]
                        xarticle.extids.append(("mr-item-id", mr))

        if "zbl" in what:
            # ZBL — zbMATH id, stored without the "Zbl" prefix.
            citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
            if citation_zbl_node:
                zbl = citation_zbl_node.get("content")
                if isinstance(zbl, str):
                    zbl = zbl.strip()
                    if zbl.find("Zbl") == 0:
                        zbl = zbl[3:].strip()
                        xarticle.extids.append(("zbl-item-id", zbl))

        if "publisher" in what:
            # PUBLISHER — stored on the issue, not on the article.
            citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
            if citation_publisher_node:
                pub = citation_publisher_node.get("content")
                if isinstance(pub, str):
                    pub = pub.strip()
                    if pub != "":
                        xpub = create_publisherdata()
                        xpub.name = pub
                        xissue.publisher = xpub

        if "keywords" in what:
            # KEYWORDS — comma-separated inside each meta tag.
            citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
            for kwd_node in citation_kwd_nodes:
                kwds = kwd_node.get("content")
                if isinstance(kwds, str):
                    kwds = kwds.split(",")
                    for kwd in kwds:
                        if kwd == "":
                            continue
                        kwd = kwd.strip()
                        xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})

        if "references" in what:
            citation_references = soup.select("meta[name='citation_reference']")
            for index, tag in enumerate(citation_references):
                content = tag.get("content")
                if not isinstance(content, str):
                    raise ValueError("Cannot parse citation_reference meta")
                # Sequential label, unless the reference already embeds "[n]".
                label = str(index + 1)
                if regex.match(r"^\[\d+\].*", content):
                    label = None
                xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label))

755 

756 def get_metadata_using_dcterms( 

757 self, 

758 xarticle: ArticleData, 

759 soup: "Tag", 

760 what: "Iterable[Literal['abstract', 'keywords', 'date_published', 'article_type']]", 

761 ): 

762 if "abstract" in what: 762 ↛ 770line 762 didn't jump to line 770 because the condition on line 762 was always true

763 abstract_tag = soup.select_one("meta[name='DCTERMS.abstract']") 

764 if abstract_tag: 764 ↛ 770line 764 didn't jump to line 770 because the condition on line 764 was always true

765 abstract_text = self.get_str_attr(abstract_tag, "content") 

766 xarticle.abstracts.append( 

767 create_abstract(lang="en", value_tex=cleanup_str(abstract_text)) 

768 ) 

769 

770 if "keywords" in what: 770 ↛ 779line 770 didn't jump to line 779 because the condition on line 770 was always true

771 keyword_tags = soup.select("meta[name='DC.subject']") 

772 for tag in keyword_tags: 

773 kwd_text = tag.get("content") 

774 if not isinstance(kwd_text, str) or len(kwd_text) == 0: 774 ↛ 775line 774 didn't jump to line 775 because the condition on line 774 was never true

775 continue 

776 kwd = create_subj(value=kwd_text) 

777 xarticle.kwds.append(kwd) 

778 

779 if "date_published" in what: 779 ↛ 780line 779 didn't jump to line 780 because the condition on line 779 was never true

780 published_tag = soup.select_one("meta[name='DC.Date.created']") 

781 if published_tag: 

782 published_text = self.get_str_attr(published_tag, "content") 

783 xarticle.date_published = published_text 

784 

785 if "article_type" in what: 785 ↛ 786line 785 didn't jump to line 786 because the condition on line 785 was never true

786 type_tag = soup.select_one("meta[name='DC.Type.articleType']") 

787 if type_tag: 

788 type_text = self.get_str_attr(type_tag, "content") 

789 xarticle.atype = type_text 

790 

791 def create_xissue( 

792 self, 

793 url: str | None, 

794 year: str, 

795 volume_number: str | None, 

796 issue_number: str | None = None, 

797 vseries: str | None = None, 

798 ): 

799 if url is not None and url.endswith("/"): 

800 url = url[:-1] 

801 xissue = create_issuedata() 

802 xissue.url = url 

803 

804 xissue.pid = self.get_issue_pid( 

805 self.collection_id, year, volume_number, issue_number, vseries 

806 ) 

807 

808 xissue.year = year 

809 

810 if volume_number is not None: 

811 xissue.volume = regex.sub(r"[^\w-]+", "_", volume_number) 

812 

813 if issue_number is not None: 

814 xissue.number = issue_number.replace(",", "-") 

815 

816 if vseries is not None: 816 ↛ 817line 816 didn't jump to line 817 because the condition on line 816 was never true

817 xissue.vseries = vseries 

818 return xissue 

819 

820 def detect_language(self, text: str, article: ArticleData | None = None): 

821 if article and article.lang is not None and article.lang != "und": 

822 return article.lang 

823 

824 language = self.language_detector.detect_language_of(text) 

825 

826 if not language: 826 ↛ 827line 826 didn't jump to line 827 because the condition on line 826 was never true

827 return "und" 

828 return language.iso_code_639_1.name.lower() 

829 

830 def get_str_attr(self, tag: "Tag", attr: str): 

831 """Equivalent of `tag.get(attr)`, but ensures the return value is a string""" 

832 node_attr = tag.get(attr) 

833 if isinstance(node_attr, list): 833 ↛ 834line 833 didn't jump to line 834 because the condition on line 833 was never true

834 raise ValueError( 

835 f"[{self.source_domain}] {self.collection_id} : html tag has multiple {attr} attributes." 

836 ) 

837 if node_attr is None: 837 ↛ 838line 837 didn't jump to line 838 because the condition on line 837 was never true

838 raise ValueError( 

839 f"[{self.source_domain}] {self.collection_id} : html tag doesn't have any {attr} attributes" 

840 ) 

841 return node_attr 

842 

843 def create_trans_title( 

844 self, 

845 resource_type: str, 

846 title_tex: str, 

847 lang: str, 

848 xresource_lang: str, 

849 title_type: str = "main", 

850 ): 

851 tag = "trans-title" if resource_type == "article" else "issue-title" 

852 

853 ckeditor_data = build_jats_data_from_html_field( 

854 title_tex, 

855 tag=tag, 

856 text_lang=lang, 

857 resource_lang=xresource_lang, 

858 delimiter_inline=self.delimiter_inline_formula, 

859 delimiter_disp=self.delimiter_disp_formula, 

860 ) 

861 

862 titledata = create_titledata( 

863 lang=lang, 

864 type="main", 

865 title_html=ckeditor_data["value_html"], 

866 title_xml=ckeditor_data["value_xml"], 

867 ) 

868 

869 return titledata 

870 

    # Maps a citation_* meta key (from `citation_reference` content) to the
    # JATS XML builder used to render that field in a <ref> element.
    # citation_author is handled separately in __parse_meta_citation_reference.
    references_mapping = {
        "citation_title": get_article_title_xml,
        "citation_journal_title": get_source_xml,
        "citation_publication_date": get_year_xml,
        "citation_firstpage": get_fpage_xml,
        "citation_lastpage": get_lpage_xml,
    }

878 

879 @classmethod 

880 def __parse_meta_citation_reference(cls, content: str, label=None): 

881 categories = content.split(";") 

882 

883 if len(categories) == 1: 

884 return JatsBase.bake_ref(content, label=label) 

885 

886 citation_data = [c.split("=") for c in categories if "=" in c] 

887 del categories 

888 

889 xml_string = "" 

890 authors_parsed = False 

891 authors_strings = [] 

892 for data in citation_data: 

893 key = data[0].strip() 

894 citation_content = data[1] 

895 if key == "citation_author": 

896 authors_strings.append(get_author_xml(template_str=citation_content)) 

897 continue 

898 elif not authors_parsed: 

899 xml_string += ", ".join(authors_strings) 

900 authors_parsed = True 

901 

902 if key in cls.references_mapping: 

903 xml_string += " " + cls.references_mapping[key](citation_content) 

904 

905 return JatsBase.bake_ref(xml_string, label=label) 

906 

907 @classmethod 

908 def get_or_create_source(cls): 

909 source, created = Source.objects.get_or_create( 

910 domain=cls.source_domain, 

911 defaults={ 

912 "name": cls.source_name, 

913 "website": cls.source_website, 

914 "view_id": cls.get_view_id(), 

915 }, 

916 ) 

917 if created: 917 ↛ 918line 917 didn't jump to line 918 because the condition on line 917 was never true

918 source.save() 

919 return source 

920 

921 @staticmethod 

922 def get_issue_pid( 

923 collection_id: str, 

924 year: str, 

925 volume_number: str | None = None, 

926 issue_number: str | None = None, 

927 series: str | None = None, 

928 ): 

929 # Replace any non-word character with an underscore 

930 pid = f"{collection_id}_{year}" 

931 if series is not None: 931 ↛ 932line 931 didn't jump to line 932 because the condition on line 931 was never true

932 pid += f"_{series}" 

933 if volume_number is not None: 

934 pid += f"_{volume_number}" 

935 if issue_number is not None: 

936 pid += f"_{issue_number}" 

937 pid = regex.sub(r"[^\w-]+", "_", cleanup_str(pid)) 

938 return pid 

939 

940 @staticmethod 

941 def set_pages(article: ArticleData, pages: str, separator: str = "-"): 

942 pages_split = pages.split(separator) 

943 if len(pages_split) == 0: 943 ↛ 944line 943 didn't jump to line 944 because the condition on line 943 was never true

944 article.page_range = pages 

945 if len(pages_split) > 0: 945 ↛ exitline 945 didn't return from function 'set_pages' because the condition on line 945 was always true

946 if pages[0].isnumeric(): 946 ↛ exitline 946 didn't return from function 'set_pages' because the condition on line 946 was always true

947 article.fpage = pages_split[0] 

948 if ( 948 ↛ 953line 948 didn't jump to line 953 because the condition on line 948 was never true

949 len(pages_split) > 1 

950 and pages_split[0] != pages_split[1] 

951 and pages_split[1].isnumeric() 

952 ): 

953 article.lpage = pages_split[1] 

954 

955 @staticmethod 

956 def _process_pdf_header(chunk: str, response: requests.Response | aiohttp.ClientResponse): 

957 content_type = response.headers.get("Content-Type") 

958 if regex.match(rb"^%PDF-\d\.\d", chunk): 

959 if content_type and "application/pdf" in content_type: 

960 # The file is unmistakably a pdf 

961 return [ 

962 True, 

963 response, 

964 { 

965 "status": ExtlinkChecked.Status.OK, 

966 "message": "", 

967 }, 

968 ] 

969 # The file is a pdf, but the content type advertised by the server is wrong 

970 return [ 

971 True, 

972 response, 

973 { 

974 "status": ExtlinkChecked.Status.WARNING, 

975 "message": f"Content-Type header: {content_type}", 

976 }, 

977 ] 

978 

979 # Reaching here means we couldn't find the pdf. 

980 if not content_type or "application/pdf" not in content_type: 

981 return [ 

982 False, 

983 response, 

984 { 

985 "status": ExtlinkChecked.Status.ERROR, 

986 "message": f"Content-Type header: {content_type}; PDF Header not found: got {chunk}", 

987 }, 

988 ] 

989 

990 return [ 

991 False, 

992 response, 

993 { 

994 "status": ExtlinkChecked.Status.ERROR, 

995 "message": f"PDF Header not found: got {chunk}", 

996 }, 

997 ] 

998 

999 @classmethod 

1000 async def a_check_pdf_link_validity( 

1001 cls, url: str, verify=True 

1002 ) -> tuple[bool, aiohttp.ClientResponse, dict]: 

1003 """ 

1004 Check the validity of the PDF links. 

1005 """ 

1006 CHUNK_SIZE = 10 # Nombre de caractères à récupérer 

1007 header = { 

1008 "Range": f"bytes=0-{CHUNK_SIZE}", 

1009 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0", 

1010 } 

1011 async with cls.async_session.get( 

1012 url, headers=header, allow_redirects=True, ssl=verify 

1013 ) as response: 

1014 try: 

1015 chunk = await response.content.read(CHUNK_SIZE) 

1016 return BaseCollectionCrawler._process_pdf_header(chunk, response) 

1017 except StopIteration: 

1018 return [ 

1019 False, 

1020 response, 

1021 { 

1022 "status": ExtlinkChecked.Status.ERROR, 

1023 "message": "Error reading PDF header", 

1024 }, 

1025 ] 

1026 

1027 @classmethod 

1028 def check_pdf_link_validity( 

1029 cls, url: str, verify=True 

1030 ) -> tuple[bool, requests.Response | None, dict]: 

1031 """ 

1032 Check the validity of the PDF links. 

1033 """ 

1034 CHUNK_SIZE = 10 # Nombre de caractères à récupérer 

1035 header = { 

1036 "Range": f"bytes=0-{CHUNK_SIZE}", 

1037 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0", 

1038 } 

1039 with cls.session.get( 

1040 url, headers=header, allow_redirects=True, verify=verify, stream=True 

1041 ) as response: 

1042 try: 

1043 chunk = next(response.iter_content(CHUNK_SIZE)) 

1044 return BaseCollectionCrawler._process_pdf_header(chunk, response) 

1045 except StopIteration: 

1046 return [ 

1047 False, 

1048 response, 

1049 { 

1050 "status": ExtlinkChecked.Status.ERROR, 

1051 "message": "Error reading PDF header", 

1052 }, 

1053 ] 

1054 

    @classmethod
    async def check_extlink_validity(cls, extlink: "ExtLink"):
        """
        Method used by rot_monitoring to check if links have expired
        """
        # Start optimistic; any failure below downgrades status/message.
        # NOTE(review): datetime.now() is naive — if Django USE_TZ is enabled,
        # timezone.now() may be expected here; confirm against the model field.
        defaults: dict = {"date": datetime.now(), "status": ExtlinkChecked.Status.OK}
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0"
        }
        verify = True
        if not cls.verify:
            verify = False
        try:
            if extlink.rel == "article-pdf":
                # PDF links get a content sniff (magic-bytes check), not just
                # an HTTP status check.
                isok, response, message = await cls.a_check_pdf_link_validity(
                    extlink.location, verify
                )
                defaults.update(message)
                defaults["http_status"] = response.status
            else:
                async with cls.async_session.get(
                    url=extlink.location,
                    headers=header,
                    allow_redirects=True,
                    ssl=verify,
                ) as response:
                    defaults["http_status"] = response.status
                    # 206 Partial Content is accepted alongside 200 (some
                    # servers answer ranged/streaming probes with 206).
                    if response.status not in (200, 206):
                        defaults["status"] = ExtlinkChecked.Status.ERROR

        except aiohttp.ClientSSLError:
            cls.logger.error("SSL error for the url: %s", extlink.location)
            defaults["status"] = ExtlinkChecked.Status.ERROR
            defaults["message"] = "SSL error"
        except aiohttp.ClientConnectionError:
            cls.logger.error("Connection error for the url: %s", extlink.location)
            defaults["status"] = ExtlinkChecked.Status.ERROR
            defaults["message"] = "Connection error"
        except asyncio.TimeoutError:
            cls.logger.error("Timeout error for the url: %s", extlink.location)
            defaults["status"] = ExtlinkChecked.Status.ERROR
            defaults["message"] = "Timeout error"
        finally:
            # Always persist the check result, even when the request failed.
            try:
                await ExtlinkChecked.objects.aupdate_or_create(extlink=extlink, defaults=defaults)
                cls.logger.info(
                    "DB Update, source: %s, url: %s", cls.source_domain, extlink.location
                )
            except IntegrityError:
                # The ExtLink row vanished between the check and the write.
                cls.logger.error(
                    "Extlink was deleted, source: %s, url: %s", cls.source_domain, extlink.location
                )