Coverage for src/crawler/base_crawler.py: 49%
393 statements
coverage.py v7.6.4, created at 2024-11-20 09:03 +0000
1import time
2from collections import OrderedDict
3from collections.abc import Sequence
4from datetime import timedelta
6import requests
7from alive_progress import alive_bar
8from bs4 import BeautifulSoup
9from django.conf import settings
10from django.contrib.auth.models import User
11from django.utils import timezone
12from ptf.cmds import xml_cmds
13from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas
14from ptf.cmds.xml.jats.builder.issue import get_title_xml
15from ptf.cmds.xml.jats.jats_parser import check_bibitem_xml
16from ptf.display.resolver import extids_formats, resolve_id
17from ptf.model_data import (
18 AbstractDict,
19 ArticleData,
20 IssueData,
21 RefData,
22 create_articledata,
23 create_contributor,
24 create_extlink,
25 create_publisherdata,
26)
27from ptf.model_data_converter import update_data_for_jats
28from pylatexenc.latex2text import LatexNodes2Text
29from pysolr import SolrError
30from requests_cache import CachedSession, FileCache
32# TODO: pass a class factory instead of a dependency to a site
34from crawler.models import Periode, Source
35from crawler.utils import get_or_create_collection
37from .crawler_types import CitationLiteral
40class BaseCollectionCrawler:
41 """
42 Base class for the collection crawlers.
43 To create a crawler:
44 1) derive a class from BaseCollectionCrawler and name it XXXCrawler
45 2) override the functions parse_collection_content, parse_issue_content and parse_article_content
46 3) update factory.py so that crawler_factory can return your new crawler
47 """
49 source_name = ""
50 source_domain = ""
51 source_website = ""
53 periode_begin = None
54 periode_end = None
56 session: requests.Session | CachedSession
58 def __init__(self, *args, username: str, collection_id: str, collection_url: str, **kwargs):
59 self.username = username
60 self.user = User.objects.get(username=self.username)
62 self.collection_id = collection_id
63 self.collection_url = (
64 collection_url # url of the collection. Ex: https://eudml.org/journal/10098
65 )
66 self.collection = get_or_create_collection(self.collection_id)
68 self.source = None
70 self.issue_href = ""
71 self.test_mode = kwargs.get("test_mode", False)
72 self.publisher = kwargs.get("publisher", "mathdoc")
74 # progress_bar can be set externally, for example if you want to crawl all the collections of a given source.
75 self.progress_bar = kwargs.get("progress_bar", None)
77 # EuDML uses javascript to fill the journal page with the issues list.
78 # We need to use a headless browser (NodeJs/Puppeteer) that can handle such dynamic content.
79 # Set has_dynamic* to True if the Source uses dynamic content in its web pages.
80 self.has_dynamic_collection_pages = False
81 self.has_dynamic_issue_pages = False
82 self.has_dynamic_article_pages = False
84 # EUDML sets or creates the Periode based on the <meta name="citation_year"> found in the journal page
85 # AMP sets or creates the Periode during the __init__
86 # TODO: see with other sources when to create the Periode
87 self.periode = None
88 self.periode_first_issue = None
89 self.periode_last_issue = None
91 self.start_pid = kwargs.get("start_pid", None)
93 # Some sources have multiple pages for 1 issue. We need to merge the content.
95 self.latext_parser = LatexNodes2Text()
97 # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
98 # do not use the "$" to surround tex formulas
99 self.delimiter_inline_formula = "$"
100 self.delimiter_disp_formula = "$"
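# Ex (hypothetical): a source that wraps displayed formulas in "$$" could set,
# in its own __init__:
#   self.delimiter_disp_formula = "$$"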
102 self.session = CachedSession(
103 backend=FileCache(
104 getattr(settings, "HTML_ROOT_FOLDER", "/tmp/ptf_requests_cache"),
105 decode_content=False,
106 ),
107 headers={
108 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
109 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
110 },
111 expire_after=timedelta(days=30),
112 )
113 # self.session = requests.Session()
115 def parse_collection_content(self, content: str) -> list[IssueData]:
116 """
117 Parse the HTML content with BeautifulSoup
118 returns a list of xissues.
119 Override this function in a derived class
120 """
121 return []
123 def parse_issue_content(self, content: str, xissue: IssueData):
124 """
125 Parse the HTML content with BeautifulSoup
126 Fills the xissue.articles
127 Override this function in a derived class.
128 """
130 def parse_article_content(
131 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str, pid: str
132 ):
133 """
134 Parse the HTML content with BeautifulSoup
135 returns the xarticle.
136 Override this function in a derived class.
137 The xissue is passed to the function in case the article page has issue information (ex: publisher)
138 The article url is also passed as a parameter
139 """
140 return create_articledata()
142 def crawl_collection(self, col_only=False):
143 """
144 Crawl an entire collection. ptf.models.Container objects are created.
145 - get the HTML content of the collection_url
146 - parse the HTML content with beautifulsoup to extract the list of issues
147 - merge the xissues (some Sources can have multiple pages for 1 volume/issue; we create only 1 container)
148 - crawl each issue if col_only is False
149 - Returns the list of merged issues.
150 It is an OrderedDict {pid: {"issues": xissues}}
151 The key is the pid of the merged issues.
152 Ex: The source may have Volume 6 (2000) and Volume 6 (1999);
153 the pid is then made with 1999-2000__6_
154 """
156 if self.source is None:
157 raise RuntimeError("ERROR: the source is not set")
159 content = self.get_page_content(self.collection_url)
160 xissues = self.parse_collection_content(content)
162 """
163 Some collections split the same volume across different pages
164 Ex: Volume 6 (2000) and Volume 6 (1999)
165 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
166 """
167 merged_xissues = self.merge_xissues(xissues)
169 if col_only:
170 # TODO: update Celery tasks
171 return merged_xissues
173 filtered_xissues = OrderedDict()
175 # Filter the issues to crawl if start_pid was set in the constructor
176 start = False
177 for pid in merged_xissues:
178 if self.start_pid is None or start or pid == self.start_pid:
179 start = True
180 filtered_xissues[pid] = merged_xissues[pid]
182 def iterate_xissues(filtered_xissues, progress_bar_for_issues):
183 """
184 internal function to be used by alive_bar (see below)
185 to iterate on the issues to crawl.
186 crawl_issue calls addOrUpdateIssueXmlCmd but returns the xissue computed by the crawl
187 """
188 crawled_xissues = []
189 for pid in filtered_xissues:
190 if self.progress_bar is not None and progress_bar_for_issues:
191 self.progress_bar()
192 crawled_xissue = self.crawl_issue(pid, filtered_xissues)
193 crawled_xissues.append(crawled_xissue)
194 return crawled_xissues
196 if self.progress_bar is None:
197 with alive_bar(
198 len(filtered_xissues),
199 dual_line=True,
200 title=f"Crawl {self.collection_id} - {self.collection_url}",
201 stats="(eta {eta})",
202 force_tty=True,
203 ) as self.progress_bar:
204 crawled_xissues = iterate_xissues(filtered_xissues, progress_bar_for_issues=True)
205 else:
206 crawled_xissues = iterate_xissues(filtered_xissues, progress_bar_for_issues=False)
208 return crawled_xissues
210 def crawl_issue(self, pid, merged_xissues):
211 """
212 Wrapper around crawl_one_issue_url, to handle issues declared in multiple web pages.
213 If you want to crawl only 1 issue and not the entire collection,
214 you need to call crawl_collection(col_only=True) beforehand to get the merged_xissues
215 A ptf.models.Container object is created with its Articles.
216 Returns the full xissue (with its articles) used to call addOrUpdateIssueXmlCmd
217 """
219 if pid not in merged_xissues:
220 raise ValueError(f"Error {pid} is not found in the collection")
222 xissues_to_crawl = merged_xissues[pid]["issues"]
224 merged_xissue = self.crawl_one_issue_url(xissues_to_crawl[0])
226 if len(xissues_to_crawl) > 1:
227 do_append = merged_xissues[pid]["do_append"]
228 for index, raw_xissue in enumerate(xissues_to_crawl[1:]):
229 crawled_xissue = self.crawl_one_issue_url(raw_xissue)
231 if do_append:
232 merged_xissue.articles.extend(crawled_xissue.articles)
233 else:
234 merged_xissue.articles[:0] = crawled_xissue.articles
236 # Updates the article pid
237 for article_index, xarticle in enumerate(merged_xissue.articles):
238 if raw_xissue.pid in xarticle.pid:
239 xarticle.pid = f"{pid}_a{str(article_index)}"
241 # Now that the issue pages have been downloaded/read, we can set the merged pid
242 # The merged_year was set in self.merge_xissues
243 merged_xissue.pid = pid
244 merged_xissue.year = merged_xissue.merged_year
245 if self.test_mode is False or self.test_mode is None:
246 if len(merged_xissue.articles) > 0:
247 self.add_xissue_into_database(merged_xissue)
249 return merged_xissue
251 def crawl_one_issue_url(self, xissue: IssueData):
252 """
253 Crawl 1 web page of an issue.
254 - get the HTML content of the issue
255 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata
256 - crawl each article
257 """
259 # Some sources, like EuDML, do not have separate HTML pages for an issue's table of contents.
260 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute
261 if hasattr(xissue, "url") and xissue.url:
262 content = self.get_page_content(xissue.url)
263 self.parse_issue_content(content, xissue)
265 xarticles = xissue.articles
267 if self.progress_bar:
268 self.progress_bar.title = (
269 f"Crawl {self.collection_id} - {xissue.year} {xissue.volume} {xissue.number}"
270 )
272 parsed_xarticles = []
274 for xarticle in xarticles:
275 parsed_xarticle = self.crawl_article(xarticle, xissue)
276 if parsed_xarticle is not None:
277 parsed_xarticles.append(parsed_xarticle)
279 xissue.articles = parsed_xarticles
281 return xissue
283 def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
284 if hasattr(xarticle, "url") and xarticle.url:
285 if self.progress_bar:  285 ↛ 286 (condition never true)
286 self.progress_bar.text(f"{xarticle.pid} - {xarticle.url}")
288 url = xarticle.url
290 content = self.get_page_content(xarticle.url)
291 pid = f"{xissue.pid}_{xarticle.pid}"
292 xarticle = self.parse_article_content(content, xissue, xarticle, xarticle.url, pid)
293 if xarticle is None:  293 ↛ 294 (condition never true)
294 return None
296 # ARTICLE URL as an ExtLink (to display the link in the article page)
297 ext_link = create_extlink()
298 ext_link["rel"] = "source"
299 ext_link["location"] = url
300 ext_link["metadata"] = self.source_domain
301 xarticle.ext_links.append(ext_link)
303 # The article title may have formulas surrounded with '$'
304 return self.process_article_metadata(xarticle)
306 def process_article_metadata(self, xarticle: ArticleData):
307 html, xml = get_html_and_xml_from_text_with_formulas(
308 xarticle.title_tex,
309 delimiter_inline=self.delimiter_inline_formula,
310 delimiter_disp=self.delimiter_disp_formula,
311 )
312 xml = get_title_xml(xml, with_tex_values=False)
313 xarticle.title_html = html
314 xarticle.title_xml = xml
316 abstracts_to_parse = [
317 xabstract for xabstract in xarticle.abstracts if xabstract["tag"] == "abstract"
318 ]
319 # abstract may have formulas surrounded with '$'
320 if len(abstracts_to_parse) > 0:
321 for xabstract in abstracts_to_parse:
322 html, xml = get_html_and_xml_from_text_with_formulas(
323 xabstract["value_tex"],
324 delimiter_inline=self.delimiter_inline_formula,
325 delimiter_disp=self.delimiter_disp_formula,
326 )
327 xabstract["value_html"] = html
328 lang = xabstract["lang"]
329 if lang == xarticle.lang:
330 xabstract["value_xml"] = f'<abstract xml:lang="{lang}">{xml}</abstract>'
331 else:
332 xabstract[
333 "value_xml"
334 ] = f'<trans-abstract xml:lang="{lang}">{xml}</trans-abstract>'
336 update_data_for_jats(xarticle)
338 return xarticle
340 def download_file(self, url: str):
341 """
342 Downloads a URL and returns its content (the requests cache, if enabled, stores it on disk).
343 """
345 txt = f"Download {url}"
346 if settings.CRAWLER_LOG_FILE:  346 ↛ 347 (condition never true)
347 with open(settings.CRAWLER_LOG_FILE, "a") as f_:
348 f_.write(txt + "\n")
350 content = ""
351 attempt = 0
352 response = None
353 while not content and attempt < 3:
354 try:
355 headers = {"accept_encoding": "utf-8"}
356 # For SSL Errors, use verify=False kwarg
357 verify = True
358 if url.startswith("https://hdml.di.ionio.gr/"):  358 ↛ 359 (condition never true)
359 verify = False
360 # self.session.cache.delete(urls=[url])
361 response = self.session.get(url, headers=headers, verify=verify)
362 if not response.ok:
363 raise requests.exceptions.HTTPError(
364 f"Endpoint answered with code {response.status_code} : {url}",
365 response=response,
366 )
367 content = self.decode_response(response)
368 except (requests.ConnectionError, requests.ConnectTimeout):
369 attempt += 1
371 if not content:  371 ↛ 372 (condition never true)
372 raise requests.exceptions.HTTPError(f"Unable to download {url}")
374 return content
376 def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
377 """Override this if the content-type headers from the sources are advertising something else than the actual content
378 SASA needs this"""
379 return response.content.decode(encoding)
381 def get_page_content(self, url: str, force_download=False):
382 """
383 NOTE : is this needed ?
384 Get the HTML content of a given url.
385 A cache is used to back up the HTML content on disk. By default, the cache is used to read the HTML content.
386 """
387 content = ""
389 def set_progress_bar_title():
390 if not self.progress_bar:  390 ↛ 392 (condition always true)
391 return
392 if isinstance(self.session, CachedSession):
393 if self.session.cache.contains(
394 url=url,
395 ):
396 self.progress_bar.text(f"Get Cached {url}")
397 return
398 self.progress_bar.text(f"Download {url}")
400 set_progress_bar_title()
401 content = self.download_file(url)
403 return content
405 def get_or_create_source(self):
406 try:
407 source = Source.objects.get(name=self.source_name)
408 except Source.DoesNotExist:
409 source = Source(
410 name=self.source_name,
411 domain=self.source_domain,
412 website=self.source_website,
413 create_xissue=True,
414 periode_href="",
415 article_href="",
416 pdf_href="",
417 )
418 source.save()
420 return source
422 def get_or_create_periode(self):
423 if self.periode is not None:
424 return self.periode
426 if self.collection is None or self.source is None:
427 raise ValueError("You need to set a collection and a source before creating a periode")
429 qs = Periode.objects.filter(collection=self.collection, source=self.source)
430 if qs.exists():
431 periode = qs.first()
432 else:
433 periode = Periode(
434 collection=self.collection,
435 source=self.source,
436 title=self.collection.title_tex,
437 issue_href=self.issue_href,
438 collection_href=self.collection_url,
439 doi_href="",
440 published=False,
441 begin=self.periode_begin,
442 end=self.periode_end,
443 first_issue=self.periode_first_issue,
444 last_issue=self.periode_last_issue,
445 )
446 periode.save()
448 return periode
450 def merge_xissues(self, xissues: list[IssueData]):
451 """
452 Some collections split the same volume across different pages
453 Ex: Volume 6 (2000) and Volume 6 (1999)
454 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
455 """
457 merged_xissues = OrderedDict()
459 for xissue in xissues:
460 xissues_with_same_volume = [
461 item
462 for item in xissues
463 if xissue.volume == item.volume
464 and xissue.number == item.number
465 and xissue.vseries == item.vseries
466 and (item.volume or item.number)
467 ]
469 do_append = False
471 if len(xissues_with_same_volume) < 2:
472 if xissue.pid is None:
473 raise ValueError("Issue does not have a PID")
474 merged_xissues[xissue.pid] = {"issues": [xissue], "do_append": True}
475 first_issue = xissue
476 year = xissue.year
477 else:
478 first_issue = xissues_with_same_volume[0]
479 volume = xissues_with_same_volume[0].volume
480 number = xissues_with_same_volume[0].number
481 vseries = xissues_with_same_volume[0].vseries
483 # Compute the year based on all issues with the same volume/number
484 begin = end = year = xissues_with_same_volume[0].year
485 if not year:
486 raise ValueError("year is not defined")
488 if "-" in year:
489 parts = year.split("-")
490 begin = parts[0]
491 end = parts[1]
493 for xissue_with_same_volume in xissues_with_same_volume[1:]:
494 new_begin = new_end = xissue_with_same_volume.year
496 if not xissue_with_same_volume.year:
497 raise ValueError("xissue year is not defined")
499 if "-" in xissue_with_same_volume.year:
500 parts = xissue_with_same_volume.year.split("-")
501 new_begin = parts[0]
502 new_end = parts[1]
504 if begin is None or end is None or new_begin is None or new_end is None:
505 continue
506 begin_int = int(begin)
507 end_int = int(end)
508 new_begin_int = int(new_begin)
509 new_end_int = int(new_end)
511 if new_begin_int < begin_int:
512 begin = new_begin
513 if new_end_int > end_int:
514 end = new_end
515 do_append = True
517 if begin != end:
518 year = f"{begin}-{end}"
519 else:
520 year = begin
522 # We can now set the real pid
523 # Note: We cannot update the pid of each xissue of xissues_with_same_volume
524 # because the HTML cache relies on the original id
525 pid = f"{self.collection_id}_{year}_{vseries}_{volume}_{number}"
526 if pid not in merged_xissues:
527 merged_xissues[pid] = {
528 "issues": xissues_with_same_volume,
529 "do_append": do_append,
530 }
532 # We can set the year only for the first xissue because it is the one used to collect
533 # all the articles.
534 # See crawl_issue with merged_xissue = self.crawl_one_issue_url(xissues_to_crawl[0])
535 # But we need to use a separate variable (merged_year) because parse_article_content may rely on the year
536 first_issue.merged_year = year
538 return merged_xissues
540 def add_xissue_into_database(self, xissue: IssueData):
541 xissue.journal = self.collection
543 xpub = create_publisherdata()
544 xpub.name = self.publisher
545 xissue.publisher = xpub
546 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()
548 attempt = 1
549 success = False
551 while not success and attempt < 4:
552 try:
553 params = {"xissue": xissue, "use_body": False}
554 cmd = xml_cmds.addOrUpdateIssueXmlCmd(params)
555 cmd.do()
556 success = True
557 except SolrError:
558 attempt += 1
559 time.sleep(10)
561 def get_metadata_using_citation_meta(
562 self,
563 xarticle: ArticleData,
564 xissue: IssueData,
565 soup: BeautifulSoup,
566 what: list[CitationLiteral] = [],
567 ):
568 """
569 :param xarticle: the xarticle that will collect the metadata
570 :param xissue: the xissue that will collect the publisher
571 :param soup: the BeautifulSoup object of the article page
572 :param what: list of citation_ items to collect.
573 :return: None. The given article is modified
574 """
576 if "title" in what:
577 # TITLE
578 citation_title_node = soup.select_one("meta[name='citation_title']")
579 if citation_title_node:  579 ↛ 584 (condition always true)
580 title = citation_title_node.get("content")
581 if isinstance(title, str):  581 ↛ 584 (condition always true)
582 xarticle.title_tex = title
584 if "author" in what: 584 ↛ 596line 584 didn't jump to line 596 because the condition on line 584 was always true
585 # AUTHORS
586 citation_author_nodes = soup.find_all("meta", {"name": "citation_author"})
587 for citation_author_node in citation_author_nodes:
588 text_author = citation_author_node.get("content")
590 author = create_contributor()
591 author["role"] = "author"
592 author["string_name"] = text_author
594 xarticle.contributors.append(author)
596 if "pdf" in what: 596 ↛ 604line 596 didn't jump to line 604 because the condition on line 596 was always true
597 # PDF
598 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
599 if citation_pdf_node:
600 pdf_url = citation_pdf_node.get("content")
601 if isinstance(pdf_url, str):  601 ↛ 604 (condition always true)
602 add_pdf_link_to_xarticle(xarticle, pdf_url)
604 lang = "en"
605 if "lang" in what:
606 # LANG
607 citation_lang_node = soup.find("meta", {"name": "citation_language"})
608 if citation_lang_node:  608 ↛ 612 (condition always true)
609 # TODO: check other language code
610 lang = xarticle.lang = citation_lang_node.get("content").strip()[0:2]
612 if "abstract" in what:
613 # ABSTRACT
614 abstract_node = soup.find("div", {"class": "entry-content"})
615 if abstract_node is not None:
616 abstract_section_node = abstract_node.find("p")
617 if abstract_section_node:  617 ↛ 629 (condition always true)
618 abstract = str(abstract_section_node)
619 xarticle.abstracts.append(
620 {
621 "tag": "abstract",
622 "value_html": "",
623 "value_tex": abstract,
624 "value_xml": "",
625 "lang": lang,
626 }
627 )
629 if "page" in what: 629 ↛ 645line 629 didn't jump to line 645 because the condition on line 629 was always true
630 # PAGES
631 citation_fpage_node = soup.find("meta", {"name": "citation_firstpage"})
632 if citation_fpage_node:  632 ↛ 638 (condition always true)
633 page = citation_fpage_node.get("content")
634 page = page.split("(")[0]
635 if len(page) < 32:  635 ↛ 638 (condition always true)
636 xarticle.fpage = page
638 citation_lpage_node = soup.find("meta", {"name": "citation_lastpage"})
639 if citation_lpage_node:  639 ↛ 645 (condition always true)
640 page = citation_lpage_node.get("content")
641 page = page.split("(")[0]
642 if len(page) < 32:  642 ↛ 645 (condition always true)
643 xarticle.lpage = page
645 if "doi" in what:
646 # DOI
647 citation_doi_node = soup.find("meta", {"name": "citation_doi"})
648 if citation_doi_node:
649 doi = citation_doi_node.get("content").strip()
650 pos = doi.find("10.")
651 if pos > 0:
652 doi = doi[pos:]
653 xarticle.doi = doi
654 xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")
656 if "mr" in what:
657 # MR
658 citation_mr_node = soup.find("meta", {"name": "citation_mr"})
659 if citation_mr_node:  659 ↛ 660 (condition never true)
660 mr = citation_mr_node.get("content").strip()
661 if mr.find("MR") == 0:
662 mr = mr[2:]
663 xarticle.extids.append(("mr-item-id", mr))
665 if "zbl" in what:
666 # ZBL
667 citation_zbl_node = soup.find("meta", {"name": "citation_zbl"})
668 if citation_zbl_node:
669 zbl = citation_zbl_node.get("content").strip()
670 if zbl.find("Zbl") == 0: 670 ↛ 674line 670 didn't jump to line 674 because the condition on line 670 was always true
671 zbl = zbl[3:].strip()
672 xarticle.extids.append(("zbl-item-id", zbl))
674 if "publisher" in what and (not self.test_mode): 674 ↛ 676line 674 didn't jump to line 676 because the condition on line 674 was never true
675 # PUBLISHER
676 citation_publisher_node = soup.find("meta", {"name": "citation_publisher"})
677 if citation_publisher_node:
678 pub = citation_publisher_node.get("content").strip()
679 if pub != "":
680 xpub = create_publisherdata()
681 xpub.name = pub
682 xissue.publisher = xpub
684 if "keywords" in what:
685 # KEYWORDS
686 citation_kwd_node = soup.find("meta", {"name": "citation_keywords"})
687 if citation_kwd_node:
688 kwds = citation_kwd_node.get("content").split(",")
689 for kwd in kwds:
690 if kwd == "": 690 ↛ 691line 690 didn't jump to line 691 because the condition on line 690 was never true
691 continue
692 kwd = kwd.strip()
693 xarticle.kwds.append({"type": "", "lang": lang, "value": kwd})
695 def create_crawled_bibitem(self, value_xml: str):
696 xref = RefData(lang="en")
697 # xref.citation_tex = "".join([e["value_tex"] for e in elements])
699 value_xml = f'<mixed-citation xml:space="preserve">{value_xml}</mixed-citation>'
700 xref.citation_xml = value_xml
701 xref = check_bibitem_xml(xref)
703 # Bakes extlink badges into the bibliography html
704 # Maybe we should put this into another file (jats_parser ?)
705 for extid in xref.extids:
706 href = resolve_id(extid[0], extid[1])
707 if (not href) or (not xref.citation_html):  707 ↛ 708 (condition never true)
708 continue
709 str_format = extid[0]
710 if str_format in extids_formats:  710 ↛ 712 (condition always true)
711 str_format = extids_formats[str_format]
712 xref.citation_html += f" | <a href='{href}' class='badge bg-secondary rounded-pill ref-badge extid-badge'>{str_format}</a>"
714 return xref
716 def create_bibliography(self, bibitems: Sequence[RefData]):
717 xml_str = "<ref-list>\n"
718 html_str = "<div>\n"
720 for item in bibitems:
721 xml_str += f"\t{item.citation_xml}\n"
722 html_str += f"\t<p>{item.citation_html}</p>\n"
723 xml_str += "</ref-list>"
728 html_str += "</div>"
730 tex_str = "<div>\n"
731 for item in bibitems:
732 tex_str += f"\t<p>{item.citation_tex}</p>\n"
733 tex_str += "</div>"
735 biblio_dict: AbstractDict = {
736 "tag": "biblio",
737 "value_html": html_str,
738 "value_tex": tex_str,
739 "value_xml": xml_str,
740 "lang": "en",
741 }
743 return biblio_dict
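# Typical use from a concrete crawler (sketch; how the returned dict is attached to
# the article, e.g. appended to xarticle.abstracts, is up to the caller):
#   bibitems = [self.create_crawled_bibitem(xml_str) for xml_str in raw_citations]
#   biblio_dict = self.create_bibliography(bibitems)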
746def add_pdf_link_to_xarticle(xarticle: ArticleData, pdf_url: str):
747 data = {
748 "rel": "full-text",
749 "mimetype": "application/pdf",
750 "location": pdf_url,
751 "base": "",
752 "text": "Full Text",
753 }
754 xarticle.streams.append(data)
756 # The pdf url is already added as a stream (just above) but might be replaced by a file later on.
757 # Keep the pdf url as an Extlink if we want to propose both options:
758 # - direct download of a local PDF
759 # - URL to the remote PDF
760 ext_link = create_extlink(rel="article-pdf", location=pdf_url)
761 xarticle.ext_links.append(ext_link)