Coverage for src/crawler/base_crawler.py: 79%
430 statements
coverage.py v7.7.0, created at 2025-04-02 15:25 +0000
1import time
2from collections.abc import Sequence
3from datetime import timedelta
5import regex
6import requests
7from bs4 import BeautifulSoup
8from django.conf import settings
9from django.contrib.auth.models import User
10from django.utils import timezone
11from langcodes import standardize_tag
12from lingua import LanguageDetectorBuilder
13from ptf.cmds import xml_cmds
14from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas
15from ptf.cmds.xml.jats.builder.citation import (
16 get_article_title_xml,
17 get_author_xml,
18 get_fpage_xml,
19 get_lpage_xml,
20 get_source_xml,
21 get_year_xml,
22)
23from ptf.cmds.xml.jats.builder.issue import get_title_xml
24from ptf.cmds.xml.jats.jats_parser import JatsRef, check_bibitem_xml
25from ptf.display.resolver import extids_formats, resolve_id
26from ptf.model_data import (
27 ArticleData,
28 ContributorDict,
29 IssueData,
30 RefData,
31 ResourceData,
32 create_abstract,
33 create_contributor,
34 create_extlink,
35 create_issuedata,
36 create_publisherdata,
37)
38from ptf.model_data_converter import update_data_for_jats
39from pylatexenc.latex2text import LatexNodes2Text
40from pysolr import SolrError
41from requests_cache import CachedSession, FileCache
43from crawler.models import Source
44from crawler.models.container_source import ContainerSource
45from crawler.types import CitationLiteral
46from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection
48# TODO: pass a class factory instead of a dependency to a site
52class BaseCollectionCrawler:
53 """
54 Base collection for the crawlers.
55 To create a crawler:
56 1) derive a class from BaseCollectionCrawler and name it XXXCrawler
57 2) override the functions parse_collection_content, parse_issue_content and parse_article_content
58 3) update factory.py so that crawler_factory can return your new crawler
59 """
61 source_name = ""
62 source_domain = ""
63 source_website = ""
65 issue_href = ""
67 collection = None
68 source = None
69 user = None
70 session: requests.Session | CachedSession
71 # Updated in constructor with user agent from settings_local
72 headers = {"accept_encoding": "utf-8"}
74 next_allowed_request: float = time.time()
76 # seconds to wait between two http requests
77 requests_interval = 5
79 latext_parser = LatexNodes2Text()
81 # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
82 # do not use the "$" to surround tex formulas
83 delimiter_inline_formula = "$"
84 delimiter_disp_formula = "$"
86 # HACK : Workaround for tests (monkeypatching)
87 # We store the class here, so we can monkeypatch it when running tests
88 # subCrawlers = {
89 # LofplCrawler: None
90 # }
91 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}
93 language_detector = LanguageDetectorBuilder.from_all_languages().build()
95 force_refresh = False
97 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"
99 # Set this to False on a per-crawler basis to allow inserting articles without PDFs
100 ignore_missing_pdf = True
102 def __init__(
103 self,
104 *args,
105 username: str,
106 collection_id: str,
107 collection_url: str,
108 test_mode: bool = False,
109 publisher: str = "mathdoc",
110 force_refresh=False,
111 ):
112 for CrawlerClass in self.subCrawlers:
113 self.subCrawlers[CrawlerClass] = CrawlerClass(
114 *args,
115 username=username,
116 collection_id=collection_id,
117 collection_url=collection_url,
118 test_mode=test_mode,
119 publisher=publisher,
120 )
122 self.username = username
124 self.collection_id = collection_id
125 self.collection_url = (
126 collection_url # url of the collection. Ex: https://eudml.org/journal/10098
127 )
129 self.test_mode = test_mode
130 self.publisher = publisher
132 # Skipped when running tests
133 self.initialize()
135 self.session = CachedSession(
136 backend=FileCache(
137 getattr(settings, "REQUESTS_CACHE_LOCATION", "/tmp/ptf_requests_cache"),
138 decode_content=False,
139 ),
140 expire_after=timedelta(days=30),
141 )
142 self.headers.update(
143 {
144 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
145 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
146 }
147 )
149 self.force_refresh = force_refresh
151 def initialize(self):
152 """
153 Acts as a "second" init function to skip model accesses during test data generation
154 """
155 self.collection = get_or_create_collection(self.collection_id)
156 self.source = self.get_or_create_source()
157 self.user = User.objects.get(username=self.username)
159 @classmethod
160 def can_crawl(cls, pid: str) -> bool:
161 return True
163 def parse_collection_content(self, content: str) -> list[IssueData]:
164 """
165 Parse the HTML content with BeautifulSoup
166 returns a list of xissue.
167 Override this function in a derived class
168 """
169 return []
171 def parse_issue_content(self, content: str, xissue: IssueData):
172 """
173 Parse the HTML content with BeautifulSoup
174 Fills the xissue.articles
175 Override this function in a derived class.
177 Caveat: you are supposed to create the articles here. Please assign a PID to each article.
178 The PID can be `a` + the article index, e.g. `a0`, `a21`.
179 """
181 def parse_article_content(
182 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
183 ) -> ArticleData | None:
184 """
185 Parse the HTML content with BeautifulSoup
186 returns the xarticle.
187 Override this function in a derived class.
188 The xissue is passed to the function in case the article page has issue information (e.g. the publisher).
189 The article url is also passed as a parameter.
191 Caveat: you are supposed to assign the article PIDs again here.
192 """
193 return xarticle
195 def crawl_collection(self):
196 # TODO: Comments, filter
197 """
198 Crawl an entire collection. ptf.models.Container objects are created.
199 - get the HTML content of the collection_url
200 - parse the HTML content with beautifulsoup to extract the list of issues
201 - merge the xissues (some sources can have multiple pages for one volume/issue; we create only one container)
202 - crawl each issue if col_only is False
203 - Returns the list of merged issues.
204 It is an OrderedDict {pid: {"issues": xissues}}
205 The key is the pid of the merged issues.
206 Ex: the source may have Volume 6 (2000) and Volume 6 (1999);
207 the pid is then made with 1999-2000__6_
208 """
210 if self.source is None:
211 raise RuntimeError("ERROR: the source is not set")
213 content = self.download_file(self.collection_url)
214 xissues = self.parse_collection_content(content)
216 """
217 Some collections split the same volume across different pages
218 Ex: Volume 6 (2000) and Volume 6 (1999)
219 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
220 """
221 # merged_xissues = self.merge_xissues(xissues)
223 xissues_dict = {str(i.pid): i for i in xissues}
225 return xissues_dict
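# Usage sketch for the two crawl entry points (not part of this module). "MyCrawler" and
# the constructor values are placeholders; the keyword arguments match __init__ above.
#
#     crawler = MyCrawler(
#         username="crawler_user",
#         collection_id="EXAMPLE",
#         collection_url="https://eudml.org/journal/10098",
#     )
#     xissues = crawler.crawl_collection()   # {pid: IssueData}
#     for pid, xissue in xissues.items():
#         crawler.crawl_issue(xissue)        # parses the articles and stores the issue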
227 def crawl_issue(self, xissue: IssueData):
228 """
229 Crawl one web page of an issue.
230 - get the HTML content of the issue
231 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata
232 - crawl each article
233 """
235 # Some sources, like EuDML, do not have separate HTML pages for an issue's table of contents.
236 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute.
238 issue_url = xissue.url
239 if issue_url is not None:
240 if issue_url.endswith(".pdf"):
241 add_pdf_link_to_xarticle(xissue, issue_url)
242 xissue.url = None
243 else:
244 content = self.download_file(issue_url)
245 self.parse_issue_content(content, xissue)
247 xarticles = xissue.articles
249 parsed_xarticles = []
251 for xarticle in xarticles:
252 parsed_xarticle = self.crawl_article(xarticle, xissue)
253 if parsed_xarticle is not None:
254 parsed_xarticles.append(parsed_xarticle)
256 xissue.articles = parsed_xarticles
258 article_has_pdf = self.article_has_pdf(xissue)
260 if self.ignore_missing_pdf:
261 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)]
263 if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf):
264 self.process_resource_metadata(xissue)
265 self.add_xissue_into_database(xissue)
267 @staticmethod
268 def article_has_source(art: ArticleData | IssueData):
269 return (
270 next(
271 (e_link for e_link in art.ext_links if e_link["rel"] == "source"),
272 None,
273 )
274 is not None
275 )
277 @staticmethod
278 def article_has_pdf(art: ArticleData | IssueData):
279 return (
280 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None)
281 is not None
282 )
284 def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
285 # ARTICLE URL as an ExtLink (to display the link on the article page)
286 if xarticle.url is None:
287 if not self.article_has_source(xarticle):  # 287 ↛ 297: line 287 didn't jump to line 297 because the condition on line 287 was always true
288 if xissue.url:
289 article_source = xissue.url
290 else:
291 article_source = self.collection_url
292 ext_link = create_extlink()
293 ext_link["rel"] = "source"
294 ext_link["location"] = article_source
295 ext_link["metadata"] = self.source_domain
296 xarticle.ext_links.append(ext_link)
297 return self.process_resource_metadata(xarticle)
299 content = self.download_file(xarticle.url)
301 parsed_xarticle = self.parse_article_content(content, xissue, xarticle, xarticle.url)
302 if parsed_xarticle is None:  # 302 ↛ 303: line 302 didn't jump to line 303 because the condition on line 302 was never true
303 return None
305 if parsed_xarticle.doi:
306 parsed_xarticle.pid = (
307 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
308 )
309 else:
310 parsed_xarticle.pid = f"{xissue.pid}_{xarticle.pid}"
312 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
313 ext_link = create_extlink()
314 ext_link["rel"] = "source"
315 ext_link["location"] = parsed_xarticle.url
316 ext_link["metadata"] = self.source_domain
317 parsed_xarticle.ext_links.append(ext_link)
319 # The article title may have formulas surrounded with '$'
320 return self.process_resource_metadata(parsed_xarticle)
322 def process_resource_metadata(self, xresource: ResourceData):
323 html, xml = get_html_and_xml_from_text_with_formulas(
324 xresource.title_tex,
325 delimiter_inline=self.delimiter_inline_formula,
326 delimiter_disp=self.delimiter_disp_formula,
327 )
328 xml = get_title_xml(xml, with_tex_values=False)
329 xresource.title_html = html
330 xresource.title_xml = xml
332 abstracts_to_parse = [
333 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract"
334 ]
335 # abstract may have formulas surrounded with '$'
336 if len(abstracts_to_parse) > 0:
337 for xabstract in abstracts_to_parse:
338 html, xml = get_html_and_xml_from_text_with_formulas(
339 xabstract["value_tex"],
340 delimiter_inline=self.delimiter_inline_formula,
341 delimiter_disp=self.delimiter_disp_formula,
342 )
343 xabstract["value_html"] = html
344 lang = xabstract["lang"]
345 if lang == xresource.lang:
346 xabstract["value_xml"] = f'<abstract xml:lang="{lang}">{xml}</abstract>'
347 else:
348 xabstract[
349 "value_xml"
350 ] = f'<trans-abstract xml:lang="{lang}">{xml}</trans-abstract>'
352 if isinstance(xresource, ArticleData):
353 update_data_for_jats(xresource)
354 return xresource
356 def get(self, url: str, force_refresh=False):
357 attempt = 0
358 response = None
360 while attempt < 3:
361 # If the URL is already in the cache, we can skip the rate-limit delay
362 if isinstance(self.session, CachedSession):  # 362 ↛ 367: line 362 didn't jump to line 367 because the condition on line 362 was always true
363 if not self.session.cache.contains(url=url):
364 delta = self.next_allowed_request - time.time()
365 if delta > 0:
366 time.sleep(delta)
367 self.next_allowed_request = time.time() + self.requests_interval
368 try:
369 # For SSL Errors, use verify=False kwarg
370 verify = True
371 if url.startswith("https://hdml.di.ionio.gr/"):  # 371 ↛ 372: line 371 didn't jump to line 372 because the condition on line 371 was never true
372 verify = False
373 # self.session.cache.delete(urls=[url])
374 if isinstance(self.session, CachedSession):  # 374 ↛ 379: line 374 didn't jump to line 379 because the condition on line 374 was always true
375 response = self.session.get(
376 url, headers=self.headers, verify=verify, force_refresh=force_refresh
377 )
378 else:
379 response = self.session.get(url, headers=self.headers, verify=verify)
380 if not response.ok:
381 raise requests.exceptions.HTTPError(
382 f"Endpoint answered with code {response.status_code} : {url}",
383 response=response,
384 )
385 return response
386 except (
387 requests.ConnectionError,
388 requests.ConnectTimeout,
389 requests.exceptions.HTTPError,
390 ):
391 attempt += 1
392 raise requests.exceptions.HTTPError(f"Unable to download {url}")
394 def download_file(self, url: str, force_refresh=False):
395 """
396 Downloads a URL and returns its decoded content.
397 """
398 response = self.get(url, force_refresh=force_refresh or self.force_refresh)
399 content = self.decode_response(response)
400 if content == "" or not content:  # 400 ↛ 401: line 400 didn't jump to line 401 because the condition on line 400 was never true
401 raise requests.exceptions.HTTPError(response)
402 return content
404 def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
405 """Override this if the content-type headers from the sources are advertising something else than the actual content
406 SASA needs this"""
407 response.encoding = encoding
408 return response.text
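# Override sketch (not part of this module): a source whose pages are actually encoded in
# a legacy charset despite the advertised content-type. The charset value is illustrative.
#
#     def decode_response(self, response: requests.Response, encoding: str = "windows-1250"):
#         return super().decode_response(response, encoding)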
410 def add_xissue_into_database(self, xissue: IssueData):
411 xissue.journal = self.collection
413 if xissue.year == "":
414 raise ValueError("Failsafe : Cannot insert issue without a year")
416 xpub = create_publisherdata()
417 xpub.name = self.publisher
418 xissue.publisher = xpub
419 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()
421 attempt = 1
422 success = False
424 while not success and attempt < 4:
425 try:
426 params = {"xissue": xissue, "use_body": False}
427 cmd = xml_cmds.addOrUpdateIssueXmlCmd(params)
428 container = cmd.do()
429 success = True
430 ContainerSource.objects.create(source=self.source, container=container)
431 except SolrError:
432 attempt += 1
433 time.sleep(10)
435 def get_metadata_using_citation_meta(
436 self,
437 xarticle: ArticleData,
438 xissue: IssueData,
439 soup: BeautifulSoup,
440 what: list[CitationLiteral] = [],
441 ):
442 """
443 :param xarticle: the xarticle that will collect the metadata
444 :param xissue: the xissue that will collect the publisher
445 :param soup: the BeautifulSoup object of the article page
446 :param what: list of citation_* meta items to collect.
447 :return: None. The given article is modified in place.
448 """
450 if "title" in what:
451 # TITLE
452 citation_title_node = soup.select_one("meta[name='citation_title']")
453 if citation_title_node:  # 453 ↛ 458: line 453 didn't jump to line 458 because the condition on line 453 was always true
454 title = citation_title_node.get("content")
455 if isinstance(title, str):  # 455 ↛ 458: line 455 didn't jump to line 458 because the condition on line 455 was always true
456 xarticle.title_tex = title
458 if "author" in what:  # 458 ↛ 487: line 458 didn't jump to line 487 because the condition on line 458 was always true
459 # AUTHORS
460 citation_author_nodes = soup.select("meta[name^='citation_author']")
461 current_author: ContributorDict | None = None
462 for citation_author_node in citation_author_nodes:
463 if citation_author_node.get("name") == "citation_author":
464 text_author = citation_author_node.get("content")
465 if not isinstance(text_author, str):  # 465 ↛ 466: line 465 didn't jump to line 466 because the condition on line 465 was never true
466 raise ValueError("Cannot parse author")
467 if text_author == "":  # 467 ↛ 468: line 467 didn't jump to line 468 because the condition on line 467 was never true
468 current_author = None
469 continue
470 current_author = create_contributor(role="author", string_name=text_author)
471 xarticle.contributors.append(current_author)
472 continue
473 if current_author is None:  # 473 ↛ 474: line 473 didn't jump to line 474 because the condition on line 473 was never true
474 print("Couldn't parse citation author")
475 continue
476 if citation_author_node.get("name") == "citation_author_institution":
477 text_institution = citation_author_node.get("content")
478 if not isinstance(text_institution, str):  # 478 ↛ 479: line 478 didn't jump to line 479 because the condition on line 478 was never true
479 continue
480 current_author["addresses"].append(text_institution)
481 if citation_author_node.get("name") == "citation_author_ocrid":  # 481 ↛ 482: line 481 didn't jump to line 482 because the condition on line 481 was never true
482 text_orcid = citation_author_node.get("content")
483 if not isinstance(text_orcid, str):
484 continue
485 current_author["orcid"] = text_orcid
487 if "pdf" in what:
488 # PDF
489 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
490 if citation_pdf_node:
491 pdf_url = citation_pdf_node.get("content")
492 if isinstance(pdf_url, str):  # 492 ↛ 495: line 492 didn't jump to line 495 because the condition on line 492 was always true
493 add_pdf_link_to_xarticle(xarticle, pdf_url)
495 if "lang" in what:
496 # LANG
497 citation_lang_node = soup.select_one("meta[name='citation_language']")
498 if citation_lang_node:  # 498 ↛ 504: line 498 didn't jump to line 504 because the condition on line 498 was always true
499 # TODO: check other language code
500 content_text = citation_lang_node.get("content")
501 if isinstance(content_text, str):  # 501 ↛ 504: line 501 didn't jump to line 504 because the condition on line 501 was always true
502 xarticle.lang = standardize_tag(content_text)
504 if "abstract" in what:
505 # ABSTRACT
506 abstract_node = soup.select_one("meta[name='citation_abstract']")
507 if abstract_node is not None:
508 abstract = abstract_node.get("content")
509 if not isinstance(abstract, str):  # 509 ↛ 510: line 509 didn't jump to line 510 because the condition on line 509 was never true
510 raise ValueError("Couldn't parse abstract from meta")
511 abstract = BeautifulSoup(abstract, "html.parser").text
512 lang = abstract_node.get("lang")
513 if not isinstance(lang, str):  # 513 ↛ 514: line 513 didn't jump to line 514 because the condition on line 513 was never true
514 lang = self.detect_language(abstract, xarticle)
515 xarticle.abstracts.append(
516 {
517 "tag": "abstract",
518 "value_html": "",
519 "value_tex": abstract,
520 "value_xml": "",
521 "lang": lang,
522 }
523 )
525 if "page" in what:
526 # PAGES
527 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
528 if citation_fpage_node:
529 page = citation_fpage_node.get("content")
530 if isinstance(page, str):  # 530 ↛ 535: line 530 didn't jump to line 535 because the condition on line 530 was always true
531 page = page.split("(")[0]
532 if len(page) < 32:  # 532 ↛ 535: line 532 didn't jump to line 535 because the condition on line 532 was always true
533 xarticle.fpage = page
535 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
536 if citation_lpage_node:
537 page = citation_lpage_node.get("content")
538 if isinstance(page, str):  # 538 ↛ 543: line 538 didn't jump to line 543 because the condition on line 538 was always true
539 page = page.split("(")[0]
540 if len(page) < 32:  # 540 ↛ 543: line 540 didn't jump to line 543 because the condition on line 540 was always true
541 xarticle.lpage = page
543 if "doi" in what:
544 # DOI
545 citation_doi_node = soup.select_one("meta[name='citation_doi']")
546 if citation_doi_node:
547 doi = citation_doi_node.get("content")
548 if isinstance(doi, str):  # 548 ↛ 555: line 548 didn't jump to line 555 because the condition on line 548 was always true
549 doi = doi.strip()
550 pos = doi.find("10.")
551 if pos > 0:
552 doi = doi[pos:]
553 xarticle.doi = doi
555 if "mr" in what:
556 # MR
557 citation_mr_node = soup.select_one("meta[name='citation_mr']")
558 if citation_mr_node:  # 558 ↛ 559: line 558 didn't jump to line 559 because the condition on line 558 was never true
559 mr = citation_mr_node.get("content")
560 if isinstance(mr, str):
561 mr = mr.strip()
562 if mr.find("MR") == 0:
563 mr = mr[2:]
564 xarticle.extids.append(("mr-item-id", mr))
566 if "zbl" in what:
567 # ZBL
568 citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
569 if citation_zbl_node:
570 zbl = citation_zbl_node.get("content")
571 if isinstance(zbl, str):  # 571 ↛ 577: line 571 didn't jump to line 577 because the condition on line 571 was always true
572 zbl = zbl.strip()
573 if zbl.find("Zbl") == 0:  # 573 ↛ 577: line 573 didn't jump to line 577 because the condition on line 573 was always true
574 zbl = zbl[3:].strip()
575 xarticle.extids.append(("zbl-item-id", zbl))
577 if "publisher" in what:
578 # PUBLISHER
579 citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
580 if citation_publisher_node:
581 pub = citation_publisher_node.get("content")
582 if isinstance(pub, str):  # 582 ↛ 589: line 582 didn't jump to line 589 because the condition on line 582 was always true
583 pub = pub.strip()
584 if pub != "":  # 584 ↛ 589: line 584 didn't jump to line 589 because the condition on line 584 was always true
585 xpub = create_publisherdata()
586 xpub.name = pub
587 xissue.publisher = xpub
589 if "keywords" in what:
590 # KEYWORDS
591 citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
592 for kwd_node in citation_kwd_nodes:
593 kwds = kwd_node.get("content")
594 if isinstance(kwds, str):  # 594 ↛ 592: line 594 didn't jump to line 592 because the condition on line 594 was always true
595 kwds = kwds.split(",")
596 for kwd in kwds:
597 if kwd == "":
598 continue
599 kwd = kwd.strip()
600 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})
602 if "references" in what:
603 citation_references = soup.select("meta[name='citation_reference']")
604 for index, tag in enumerate(citation_references):
605 content = tag.get("content")
606 if not isinstance(content, str):  # 606 ↛ 607: line 606 didn't jump to line 607 because the condition on line 606 was never true
607 raise ValueError("Cannot parse citation_reference meta")
608 xarticle.bibitems.append(
609 self.__parse_meta_citation_reference(content, str(index + 1))
610 )
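# Usage sketch for get_metadata_using_citation_meta above (not part of this module),
# typically called from a parse_article_content override; the "what" values are taken
# from the branches handled above.
#
#     def parse_article_content(self, content, xissue, xarticle, url):
#         soup = BeautifulSoup(content, "html.parser")
#         self.get_metadata_using_citation_meta(
#             xarticle, xissue, soup,
#             what=["title", "author", "pdf", "abstract", "page", "doi", "references"],
#         )
#         return xarticle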
612 def create_xissue(
613 self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1"
614 ):
615 if url is not None and url.endswith("/"):
616 url = url[:-1]
617 xissue = create_issuedata()
618 xissue.url = url
620 xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number)
622 xissue.year = year
624 if volume_number is not None:  # 624 ↛ 627: line 624 didn't jump to line 627 because the condition on line 624 was always true
625 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number)
627 if issue_number is not None:
628 xissue.number = issue_number.replace(",", "-")
629 return xissue
631 def detect_language(self, text: str, article: ArticleData | None = None):
632 if article and article.lang is not None and article.lang != "und":
633 return article.lang
635 language = self.language_detector.detect_language_of(text)
637 if not language:  # 637 ↛ 638: line 637 didn't jump to line 638 because the condition on line 637 was never true
638 return "und"
639 return language.iso_code_639_1.name.lower()
641 references_mapping = {
642 "citation_title": get_article_title_xml,
643 "citation_journal_title": get_source_xml,
644 "citation_publication_date": get_year_xml,
645 "citation_firstpage": get_fpage_xml,
646 "citation_lastpage": get_lpage_xml,
647 }
649 @classmethod
650 def __parse_meta_citation_reference(cls, content: str, label=None):
651 categories = content.split(";")
653 if len(categories) == 1:
654 return cls.create_crawled_bibitem(content, label=label)
656 citation_data = [c.split("=") for c in categories if "=" in c]
657 del categories
659 xml_string = ""
660 authors_parsed = False
661 authors_strings = []
662 for data in citation_data:
663 key = data[0].strip()
664 citation_content = data[1]
665 if key == "citation_author":
666 authors_strings.append(get_author_xml(template_str=citation_content))
667 continue
668 elif not authors_parsed:
669 xml_string += ", ".join(authors_strings)
670 authors_parsed = True
672 if key in cls.references_mapping:
673 xml_string += " " + cls.references_mapping[key](citation_content)
675 return cls.create_crawled_bibitem(xml_string, label=label)
677 @classmethod
678 def get_or_create_source(cls):
679 source, created = Source.objects.get_or_create(
680 domain=cls.source_domain,
681 defaults={
682 "name": cls.source_name,
683 "website": cls.source_website,
684 },
685 )
686 if created:  # 686 ↛ 687: line 686 didn't jump to line 687 because the condition on line 686 was never true
687 source.save()
688 return source
690 @staticmethod
691 def create_crawled_bibitem(ref_value: str | JatsRef, label=None):
692 if isinstance(ref_value, str):
693 xref = RefData(lang="en")
694 value_xml = ""
695 if label:
696 value_xml += f"<label>{label}</label>"
697 # xref.citation_tex = "".join([e["value_tex"] for e in elements])
698 value_xml += f'<mixed-citation xml:space="preserve">{ref_value}</mixed-citation>'
699 xref.citation_xml = value_xml
700 else:
701 xref = ref_value
703 xref = check_bibitem_xml(xref)
705 # Bakes extlink badges into the bibliography html
706 # Maybe we should put this into another file (jats_parser ?)
707 for extid in xref.extids:
708 href = resolve_id(extid[0], extid[1])
709 if (not href) or (not xref.citation_html):  # 709 ↛ 710: line 709 didn't jump to line 710 because the condition on line 709 was never true
710 continue
711 str_format = extid[0]
712 if str_format in extids_formats:  # 712 ↛ 714: line 712 didn't jump to line 714 because the condition on line 712 was always true
713 str_format = extids_formats[str_format]
714 xref.citation_html += f" | <a href={href} class='badge bg-secondary rounded-pill ref-badge extid-badge'>{str_format}</a>"
716 return xref
718 @staticmethod
719 def create_bibliography(bibitems: Sequence[RefData]):
720 xml_str = "<ref-list>\n"
721 html_str = "<div>\n"
723 for item in bibitems:
724 xml_str += f"\t{item.citation_xml}\n"
725 html_str += f"\t<p>{item.citation_html}</p>\n"
726 xml_str += "</ref-list>"
728 # for item in bibitems:
729 # html_str =
730 # html_str += f"\t<p>{item.citation_html}</p>\n"
731 html_str += "</div>"
733 tex_str = "<div>\n"
734 for item in bibitems:
735 tex_str += f"\t<p>{item.citation_tex}</p>\n"
736 tex_str += "</div>"
738 biblio_dict = create_abstract(
739 tag="biblio",
740 value_html=html_str,
741 value_tex=tex_str,
742 value_xml=xml_str,
743 lang="en",
744 )
746 return biblio_dict
748 @staticmethod
749 def get_issue_pid(
750 collection_id: str,
751 year: str,
752 volume_number: str | None = None,
753 issue_number: str | None = None,
754 ):
755 # Replace any non-word character with an underscore
756 pid = f"{collection_id}_{year}"
757 if volume_number is not None:
758 pid += f"_{volume_number}"
759 if issue_number is not None:
760 pid += f"_{issue_number}"
761 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid))
762 return pid
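# Example (illustrative values): get_issue_pid("AMC", "2000", "6", "1") returns
# "AMC_2000_6_1"; any run of characters other than letters, digits and "-" becomes "_".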
764 @staticmethod
765 def set_pages(article: ArticleData, pages: str, separator: str = "-"):
766 pages_split = pages.split(separator)
767 if len(pages_split) == 0:  # 767 ↛ 768: line 767 didn't jump to line 768 because the condition on line 767 was never true
768 article.page_range = pages
769 if len(pages_split) > 0:  # 769 ↛ exit: line 769 didn't return from function 'set_pages' because the condition on line 769 was always true
770 if pages[0].isnumeric():
771 article.fpage = pages_split[0]
772 if (
773 len(pages_split) > 1
774 and pages_split[0] != pages_split[1]
775 and pages_split[1].isnumeric()
776 ):
777 article.lpage = pages_split[1]
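# Usage sketch for set_pages (illustrative page string): set_pages(xarticle, "101-115")
# sets xarticle.fpage = "101" and xarticle.lpage = "115"; a single page such as "42"
# only sets fpage.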