Coverage for src/crawler/base_crawler.py: 78%
428 statements
coverage.py v7.7.0, created at 2025-03-28 11:29 +0000
import time
from collections.abc import Sequence
from datetime import timedelta

import regex
import requests
from bs4 import BeautifulSoup
from django.conf import settings
from django.contrib.auth.models import User
from django.utils import timezone
from langcodes import standardize_tag
from lingua import LanguageDetectorBuilder
from ptf.cmds import xml_cmds
from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas
from ptf.cmds.xml.jats.builder.citation import (
    get_article_title_xml,
    get_author_xml,
    get_fpage_xml,
    get_lpage_xml,
    get_source_xml,
    get_year_xml,
)
from ptf.cmds.xml.jats.builder.issue import get_title_xml
from ptf.cmds.xml.jats.jats_parser import JatsRef, check_bibitem_xml
from ptf.display.resolver import extids_formats, resolve_id
from ptf.model_data import (
    ArticleData,
    ContributorDict,
    IssueData,
    RefData,
    create_abstract,
    create_contributor,
    create_extlink,
    create_issuedata,
    create_publisherdata,
)
from ptf.model_data_converter import update_data_for_jats
from pylatexenc.latex2text import LatexNodes2Text
from pysolr import SolrError
from requests_cache import CachedSession, FileCache

from crawler.models import Source
from crawler.models.container_source import ContainerSource
from crawler.types import CitationLiteral
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection
# TODO: pass a class factory instead of a dependency to a site

class BaseCollectionCrawler:
    """
    Base class for the collection crawlers.
    To create a crawler:
    1) derive a class from BaseCollectionCrawler and name it XXXCrawler
    2) override the functions parse_collection_content, parse_issue_content and parse_article_content
    3) update factory.py so that crawler_factory can return your new crawler
    """
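    # Illustrative sketch only (hypothetical "Xyz" source, not part of this module):
    #
    #     class XyzCrawler(BaseCollectionCrawler):
    #         source_name = "Xyz digital library"
    #         source_domain = "XYZ"
    #         source_website = "https://xyz.example.org"
    #
    #         def parse_collection_content(self, content):
    #             # build one IssueData per volume found in the collection page,
    #             # e.g. with self.create_xissue(url, year, volume_number)
    #             return []
    #
    #         def parse_issue_content(self, content, xissue):
    #             # create xissue.articles here, assigning pids "a0", "a1", ...
    #             pass
    #
    #         def parse_article_content(self, content, xissue, xarticle, url):
    #             # fill xarticle (title, pdf link, ...) and return it
    #             return xarticle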

    source_name = ""
    source_domain = ""
    source_website = ""

    issue_href = ""

    collection = None
    source = None
    user = None
    session: requests.Session | CachedSession
    # Updated in constructor with user agent from settings_local
    headers = {"accept_encoding": "utf-8"}

    next_allowed_request: float = time.time()

    # Seconds to wait between two HTTP requests
    requests_interval = 5

    latext_parser = LatexNodes2Text()

    # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
    # do not use the "$" to surround tex formulas
    delimiter_inline_formula = "$"
    delimiter_disp_formula = "$"

    # HACK: Workaround for tests (monkeypatching)
    # We store the class here, so we can monkeypatch it when running tests
    # subCrawlers = {
    #     LofplCrawler: None
    # }
    subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}

    language_detector = LanguageDetectorBuilder.from_all_languages().build()

    force_refresh = False

    orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"

    # Set this to False on a crawler basis to allow inserting articles without PDFs
    ignore_missing_pdf = True

    def __init__(
        self,
        *args,
        username: str,
        collection_id: str,
        collection_url: str,
        test_mode: bool = False,
        publisher: str = "mathdoc",
        force_refresh=False,
    ):
        for CrawlerClass in self.subCrawlers:
            self.subCrawlers[CrawlerClass] = CrawlerClass(
                *args,
                username=username,
                collection_id=collection_id,
                collection_url=collection_url,
                test_mode=test_mode,
                publisher=publisher,
            )

        self.username = username

        self.collection_id = collection_id
        self.collection_url = (
            collection_url  # url of the collection. Ex: https://eudml.org/journal/10098
        )

        self.test_mode = test_mode
        self.publisher = publisher

        # Skipped when running tests
        self.initialize()

        self.session = CachedSession(
            backend=FileCache(
                getattr(settings, "REQUESTS_CACHE_LOCATION", "/tmp/ptf_requests_cache"),
                decode_content=False,
            ),
            expire_after=timedelta(days=30),
        )
        self.headers.update(
            {
                "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
                "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
            }
        )

        self.force_refresh = force_refresh

    def initialize(self):
        """
        Acts as a "second" init function to skip model accesses during test data generation
        """
        self.collection = get_or_create_collection(self.collection_id)
        self.source = self.get_or_create_source()
        self.user = User.objects.get(username=self.username)

    @classmethod
    def can_crawl(cls, pid: str) -> bool:
        return True

    def parse_collection_content(self, content: str) -> list[IssueData]:
        """
        Parse the HTML content with BeautifulSoup and return a list of xissues.
        Override this function in a derived class.
        """
        return []

    def parse_issue_content(self, content: str, xissue: IssueData):
        """
        Parse the HTML content with BeautifulSoup and fill xissue.articles.
        Override this function in a derived class.

        Caveat: you are supposed to create the articles here. Please assign a PID to each
        article, built as `a + article_index`, e.g. `a0`, `a21`.
        """

    def parse_article_content(
        self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
    ) -> ArticleData | None:
        """
        Parse the HTML content with BeautifulSoup and return the xarticle.
        Override this function in a derived class.
        The xissue is passed to the function in case the article page has issue information (ex: publisher).
        The article url is also passed as a parameter.

        Caveat: you are supposed to assign the article pid again here.
        """
        return xarticle

    def crawl_collection(self):
        # TODO: Comments, filter
        """
        Crawl an entire collection. ptf.models.Container objects are created.
        - get the HTML content of the collection_url
        - parse the HTML content with BeautifulSoup to extract the list of issues
        - merge the xissues (some Sources can have multiple pages for one volume/issue; we create only one container)
        - crawl each issue if col_only is False
        - return the merged issues as a dict keyed by pid.
        Ex: the source may have Volume 6 (2000) and Volume 6 (1999);
        the pid is then made with 1999-2000__6_
        """
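        # Illustrative shape of the return value (hypothetical pids):
        #     {"MYCOL_2000__6_1": <IssueData>, "MYCOL_2001__7_2": <IssueData>, ...}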
        if self.source is None:
            raise RuntimeError("ERROR: the source is not set")

        content = self.download_file(self.collection_url)
        xissues = self.parse_collection_content(content)

        """
        Some collections split the same volumes in different pages
        Ex: Volume 6 (2000) and Volume 6 (1999)
        We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
        """
        # merged_xissues = self.merge_xissues(xissues)

        xissues_dict = {str(i.pid): i for i in xissues}

        return xissues_dict

    def crawl_issue(self, xissue: IssueData):
        """
        Crawl one web page of an issue.
        - get the HTML content of the issue
        - parse the HTML content with BeautifulSoup to extract the list of articles and/or the issue metadata
        - crawl each article
        """

        # Some sources, like EuDML, do not have separate HTML pages for an issue's table of contents.
        # The list of articles comes directly from the collection HTML page: the xissue has no url attribute.

        issue_url = xissue.url
        if issue_url is not None:
            if issue_url.endswith(".pdf"):
                add_pdf_link_to_xarticle(xissue, issue_url)
                xissue.url = None
            else:
                content = self.download_file(issue_url)
                self.parse_issue_content(content, xissue)

        xarticles = xissue.articles

        parsed_xarticles = []

        for xarticle in xarticles:
            parsed_xarticle = self.crawl_article(xarticle, xissue)
            if parsed_xarticle is not None:
                parsed_xarticles.append(parsed_xarticle)

        xissue.articles = parsed_xarticles

        article_has_pdf = self.article_has_pdf(xissue)

        if self.ignore_missing_pdf:
            xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)]

        if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf):
            self.add_xissue_into_database(xissue)

    @staticmethod
    def article_has_source(art: ArticleData | IssueData):
        return (
            next(
                (e_link for e_link in art.ext_links if e_link["rel"] == "source"),
                None,
            )
            is not None
        )

    @staticmethod
    def article_has_pdf(art: ArticleData | IssueData):
        return (
            next((link for link in art.ext_links if link["rel"] == "article-pdf"), None)
            is not None
        )

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
        # ARTICLE URL as an ExtLink (to display the link in the article page)
        if xarticle.url is None:
            if not self.article_has_source(xarticle):
                if xissue.url:
                    article_source = xissue.url
                else:
                    article_source = self.collection_url
                ext_link = create_extlink()
                ext_link["rel"] = "source"
                ext_link["location"] = article_source
                ext_link["metadata"] = self.source_domain
                xarticle.ext_links.append(ext_link)
            return self.process_article_metadata(xarticle)

        content = self.download_file(xarticle.url)

        parsed_xarticle = self.parse_article_content(content, xissue, xarticle, xarticle.url)
        if parsed_xarticle is None:
            return None

        if parsed_xarticle.doi:
            parsed_xarticle.pid = (
                parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
            )
        else:
            parsed_xarticle.pid = f"{xissue.pid}_{xarticle.pid}"

        if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
            ext_link = create_extlink()
            ext_link["rel"] = "source"
            ext_link["location"] = parsed_xarticle.url
            ext_link["metadata"] = self.source_domain
            parsed_xarticle.ext_links.append(ext_link)

        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle)

    def process_article_metadata(self, xarticle: ArticleData):
        html, xml = get_html_and_xml_from_text_with_formulas(
            xarticle.title_tex,
            delimiter_inline=self.delimiter_inline_formula,
            delimiter_disp=self.delimiter_disp_formula,
        )
        xml = get_title_xml(xml, with_tex_values=False)
        xarticle.title_html = html
        xarticle.title_xml = xml

        abstracts_to_parse = [
            xabstract for xabstract in xarticle.abstracts if xabstract["tag"] == "abstract"
        ]
        # abstract may have formulas surrounded with '$'
        if len(abstracts_to_parse) > 0:
            for xabstract in abstracts_to_parse:
                html, xml = get_html_and_xml_from_text_with_formulas(
                    xabstract["value_tex"],
                    delimiter_inline=self.delimiter_inline_formula,
                    delimiter_disp=self.delimiter_disp_formula,
                )
                xabstract["value_html"] = html
                lang = xabstract["lang"]
                if lang == xarticle.lang:
                    xabstract["value_xml"] = f'<abstract xml:lang="{lang}">{xml}</abstract>'
                else:
                    xabstract["value_xml"] = (
                        f'<trans-abstract xml:lang="{lang}">{xml}</trans-abstract>'
                    )

        update_data_for_jats(xarticle)

        return xarticle

    def get(self, url: str, force_refresh=False):
        attempt = 0
        response = None

        while attempt < 3:
            # If the response is already cached, we can skip the rate-limiting delay
            if isinstance(self.session, CachedSession):
                if not self.session.cache.contains(url=url):
                    delta = self.next_allowed_request - time.time()
                    if delta > 0:
                        time.sleep(delta)
                    self.next_allowed_request = time.time() + self.requests_interval
            try:
                # For SSL errors, use the verify=False kwarg
                verify = True
                if url.startswith("https://hdml.di.ionio.gr/"):
                    verify = False
                # self.session.cache.delete(urls=[url])
                if isinstance(self.session, CachedSession):
                    response = self.session.get(
                        url, headers=self.headers, verify=verify, force_refresh=force_refresh
                    )
                else:
                    response = self.session.get(url, headers=self.headers, verify=verify)
                if not response.ok:
                    raise requests.exceptions.HTTPError(
                        f"Endpoint answered with code {response.status_code} : {url}",
                        response=response,
                    )
                return response
            except (
                requests.ConnectionError,
                requests.ConnectTimeout,
                requests.exceptions.HTTPError,
            ):
                attempt += 1
        raise requests.exceptions.HTTPError(f"Unable to download {url}")

    def download_file(self, url: str, force_refresh=False):
        """
        Downloads a URL and returns its content (responses are cached by the session).
        """
        response = self.get(url, force_refresh=force_refresh or self.force_refresh)
        content = self.decode_response(response)
        if content == "" or not content:
            raise requests.exceptions.HTTPError(response)
        return content

    def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
        """Override this if the content-type headers from the sources advertise something
        other than the actual content (SASA needs this)."""
        response.encoding = encoding
        return response.text

    def add_xissue_into_database(self, xissue: IssueData):
        xissue.journal = self.collection

        if xissue.year == "":
            raise ValueError("Failsafe : Cannot insert issue without a year")

        xpub = create_publisherdata()
        xpub.name = self.publisher
        xissue.publisher = xpub
        xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()

        attempt = 1
        success = False

        while not success and attempt < 4:
            try:
                params = {"xissue": xissue, "use_body": False}
                cmd = xml_cmds.addOrUpdateIssueXmlCmd(params)
                container = cmd.do()
                success = True
                ContainerSource.objects.create(source=self.source, container=container)
            except SolrError:
                attempt += 1
                time.sleep(10)

    def get_metadata_using_citation_meta(
        self,
        xarticle: ArticleData,
        xissue: IssueData,
        soup: BeautifulSoup,
        what: list[CitationLiteral] = [],
    ):
        """
        :param xarticle: the xarticle that will collect the metadata
        :param xissue: the xissue that will collect the publisher
        :param soup: the BeautifulSoup object of the article page
        :param what: list of citation_* items to collect
        :return: None. The given article is modified
        """
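        # Typical use from a concrete crawler's parse_article_content (illustrative only,
        # the selection of items depends on what the source exposes):
        #     self.get_metadata_using_citation_meta(
        #         xarticle, xissue, soup, ["title", "author", "pdf", "abstract", "page"]
        #     )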
        if "title" in what:
            # TITLE
            citation_title_node = soup.select_one("meta[name='citation_title']")
            if citation_title_node:
                title = citation_title_node.get("content")
                if isinstance(title, str):
                    xarticle.title_tex = title

        if "author" in what:
            # AUTHORS
            citation_author_nodes = soup.select("meta[name^='citation_author']")
            current_author: ContributorDict | None = None
            for citation_author_node in citation_author_nodes:
                if citation_author_node.get("name") == "citation_author":
                    text_author = citation_author_node.get("content")
                    if not isinstance(text_author, str):
                        raise ValueError("Cannot parse author")
                    if text_author == "":
                        current_author = None
                        continue
                    current_author = create_contributor(role="author", string_name=text_author)
                    xarticle.contributors.append(current_author)
                    continue
                if current_author is None:
                    print("Couldn't parse citation author")
                    continue
                if citation_author_node.get("name") == "citation_author_institution":
                    text_institution = citation_author_node.get("content")
                    if not isinstance(text_institution, str):
                        continue
                    current_author["addresses"].append(text_institution)
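                # Note: "citation_author_ocrid" below may be a typo for the standard
                # "citation_author_orcid" meta name; kept as-is from the source.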
                if citation_author_node.get("name") == "citation_author_ocrid":
                    text_orcid = citation_author_node.get("content")
                    if not isinstance(text_orcid, str):
                        continue
                    current_author["orcid"] = text_orcid

        if "pdf" in what:
            # PDF
            citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
            if citation_pdf_node:
                pdf_url = citation_pdf_node.get("content")
                if isinstance(pdf_url, str):
                    add_pdf_link_to_xarticle(xarticle, pdf_url)

        if "lang" in what:
            # LANG
            citation_lang_node = soup.select_one("meta[name='citation_language']")
            if citation_lang_node:
                # TODO: check other language code
                content_text = citation_lang_node.get("content")
                if isinstance(content_text, str):
                    xarticle.lang = standardize_tag(content_text)

        if "abstract" in what:
            # ABSTRACT
            abstract_node = soup.select_one("meta[name='citation_abstract']")
            if abstract_node is not None:
                abstract = abstract_node.get("content")
                if not isinstance(abstract, str):
                    raise ValueError("Couldn't parse abstract from meta")
                abstract = BeautifulSoup(abstract, "html.parser").text
                lang = abstract_node.get("lang")
                if not isinstance(lang, str):
                    lang = self.detect_language(abstract, xarticle)
                xarticle.abstracts.append(
                    {
                        "tag": "abstract",
                        "value_html": "",
                        "value_tex": abstract,
                        "value_xml": "",
                        "lang": lang,
                    }
                )

        if "page" in what:
            # PAGES
            citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
            if citation_fpage_node:
                page = citation_fpage_node.get("content")
                if isinstance(page, str):
                    page = page.split("(")[0]
                    if len(page) < 32:
                        xarticle.fpage = page

            citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
            if citation_lpage_node:
                page = citation_lpage_node.get("content")
                if isinstance(page, str):
                    page = page.split("(")[0]
                    if len(page) < 32:
                        xarticle.lpage = page

        if "doi" in what:
            # DOI
            citation_doi_node = soup.select_one("meta[name='citation_doi']")
            if citation_doi_node:
                doi = citation_doi_node.get("content")
                if isinstance(doi, str):
                    doi = doi.strip()
                    pos = doi.find("10.")
                    if pos > 0:
                        doi = doi[pos:]
                    xarticle.doi = doi

        if "mr" in what:
            # MR
            citation_mr_node = soup.select_one("meta[name='citation_mr']")
            if citation_mr_node:
                mr = citation_mr_node.get("content")
                if isinstance(mr, str):
                    mr = mr.strip()
                    if mr.find("MR") == 0:
                        mr = mr[2:]
                        xarticle.extids.append(("mr-item-id", mr))

        if "zbl" in what:
            # ZBL
            citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
            if citation_zbl_node:
                zbl = citation_zbl_node.get("content")
                if isinstance(zbl, str):
                    zbl = zbl.strip()
                    if zbl.find("Zbl") == 0:
                        zbl = zbl[3:].strip()
                        xarticle.extids.append(("zbl-item-id", zbl))

        if "publisher" in what:
            # PUBLISHER
            citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
            if citation_publisher_node:
                pub = citation_publisher_node.get("content")
                if isinstance(pub, str):
                    pub = pub.strip()
                    if pub != "":
                        xpub = create_publisherdata()
                        xpub.name = pub
                        xissue.publisher = xpub

        if "keywords" in what:
            # KEYWORDS
            citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
            for kwd_node in citation_kwd_nodes:
                kwds = kwd_node.get("content")
                if isinstance(kwds, str):
                    kwds = kwds.split(",")
                    for kwd in kwds:
                        if kwd == "":
                            continue
                        kwd = kwd.strip()
                        xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})

        if "references" in what:
            citation_references = soup.select("meta[name='citation_reference']")
            for index, tag in enumerate(citation_references):
                content = tag.get("content")
                if not isinstance(content, str):
                    raise ValueError("Cannot parse citation_reference meta")
                xarticle.bibitems.append(
                    self.__parse_meta_citation_reference(content, str(index + 1))
                )

    def create_xissue(
        self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1"
    ):
        if url is not None and url.endswith("/"):
            url = url[:-1]
        xissue = create_issuedata()
        xissue.url = url

        xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number)

        xissue.year = year

        if volume_number is not None:
            xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number)

        if issue_number is not None:
            xissue.number = issue_number.replace(",", "-")
        return xissue

    def detect_language(self, text: str, article: ArticleData | None = None):
        if article and article.lang is not None and article.lang != "und":
            return article.lang

        language = self.language_detector.detect_language_of(text)

        if not language:
            return "und"
        return language.iso_code_639_1.name.lower()

    references_mapping = {
        "citation_title": get_article_title_xml,
        "citation_journal_title": get_source_xml,
        "citation_publication_date": get_year_xml,
        "citation_firstpage": get_fpage_xml,
        "citation_lastpage": get_lpage_xml,
    }

    @classmethod
    def __parse_meta_citation_reference(cls, content: str, label=None):
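        # A citation_reference meta content is either a plain string or a list of
        # "key=value" pairs separated by ";". Illustrative (hypothetical) example:
        #     "citation_author=J. Doe; citation_title=On something; citation_publication_date=2001"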
        categories = content.split(";")

        if len(categories) == 1:
            return cls.create_crawled_bibitem(content, label=label)

        citation_data = [c.split("=") for c in categories if "=" in c]
        del categories

        xml_string = ""
        authors_parsed = False
        authors_strings = []
        for data in citation_data:
            key = data[0].strip()
            citation_content = data[1]
            if key == "citation_author":
                authors_strings.append(get_author_xml(template_str=citation_content))
                continue
            elif not authors_parsed:
                xml_string += ", ".join(authors_strings)
                authors_parsed = True

            if key in cls.references_mapping:
                xml_string += " " + cls.references_mapping[key](citation_content)

        return cls.create_crawled_bibitem(xml_string, label=label)

    @classmethod
    def get_or_create_source(cls):
        source, created = Source.objects.get_or_create(
            domain=cls.source_domain,
            defaults={
                "name": cls.source_name,
                "website": cls.source_website,
            },
        )
        if created:
            source.save()
        return source

    @staticmethod
    def create_crawled_bibitem(ref_value: str | JatsRef, label=None):
        if isinstance(ref_value, str):
            xref = RefData(lang="en")
            value_xml = ""
            if label:
                value_xml += f"<label>{label}</label>"
            # xref.citation_tex = "".join([e["value_tex"] for e in elements])
            value_xml += f'<mixed-citation xml:space="preserve">{ref_value}</mixed-citation>'
            xref.citation_xml = value_xml
        else:
            xref = ref_value

        xref = check_bibitem_xml(xref)

        # Bakes extlink badges into the bibliography html
        # Maybe we should put this into another file (jats_parser ?)
        for extid in xref.extids:
            href = resolve_id(extid[0], extid[1])
            if (not href) or (not xref.citation_html):
                continue
            str_format = extid[0]
            if str_format in extids_formats:
                str_format = extids_formats[str_format]
            xref.citation_html += f" | <a href={href} class='badge bg-secondary rounded-pill ref-badge extid-badge'>{str_format}</a>"

        return xref

    @staticmethod
    def create_bibliography(bibitems: Sequence[RefData]):
        xml_str = "<ref-list>\n"
        html_str = "<div>\n"

        for item in bibitems:
            xml_str += f"\t{item.citation_xml}\n"
            html_str += f"\t<p>{item.citation_html}</p>\n"
        xml_str += "</ref-list>"

        # for item in bibitems:
        #     html_str =
        #     html_str += f"\t<p>{item.citation_html}</p>\n"
        html_str += "</div>"

        tex_str = "<div>\n"
        for item in bibitems:
            tex_str += f"\t<p>{item.citation_tex}</p>\n"
        tex_str += "</div>"

        biblio_dict = create_abstract(
            tag="biblio",
            value_html=html_str,
            value_tex=tex_str,
            value_xml=xml_str,
            lang="en",
        )

        return biblio_dict

    @staticmethod
    def get_issue_pid(
        collection_id: str,
        year: str,
        volume_number: str | None = None,
        issue_number: str | None = None,
    ):
        # Replace any non-word character with an underscore
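        # Illustrative (hypothetical collection id):
        #     get_issue_pid("MYCOL", "1999-2000", "6", None) -> "MYCOL_1999-2000_6"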
        pid = f"{collection_id}_{year}"
        if volume_number is not None:
            pid += f"_{volume_number}"
        if issue_number is not None:
            pid += f"_{issue_number}"
        pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid))
        return pid

    @staticmethod
    def set_pages(article: ArticleData, pages: str, separator: str = "-"):
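        # Illustrative: set_pages(article, "12-34") sets fpage="12" and lpage="34",
        # while a single page such as "12" only sets fpage.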
        pages_split = pages.split(separator)
        if len(pages_split) == 0:
            article.page_range = pages
        if len(pages_split) > 0:
            if pages[0].isnumeric():
                article.fpage = pages_split[0]
            if (
                len(pages_split) > 1
                and pages_split[0] != pages_split[1]
                and pages_split[1].isnumeric()
            ):
                article.lpage = pages_split[1]