Coverage for src/crawler/base_crawler.py: 76%
402 statements
coverage.py v7.8.2, created at 2025-06-03 13:39 +0000
1 import time
2 from datetime import timedelta
4 import regex
5 import requests
6 from bs4 import BeautifulSoup
7 from django.conf import settings
8 from django.contrib.auth.models import User
9 from django.utils import timezone
10 from langcodes import standardize_tag
11 from lingua import LanguageDetectorBuilder
12 from ptf.cmds import xml_cmds
13 from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas
14 from ptf.cmds.xml.jats.builder.citation import (
15 get_article_title_xml,
16 get_author_xml,
17 get_fpage_xml,
18 get_lpage_xml,
19 get_source_xml,
20 get_year_xml,
21 )
22 from ptf.cmds.xml.jats.builder.issue import get_title_xml
23 from ptf.cmds.xml.jats.jats_parser import JatsBase
24 from ptf.model_data import (
25 ArticleData,
26 ContributorDict,
27 IssueData,
28 ResourceData,
29 create_contributor,
30 create_extlink,
31 create_issuedata,
32 create_publisherdata,
33 )
34 from ptf.model_data_converter import update_data_for_jats
35 from pylatexenc.latex2text import LatexNodes2Text
36 from pysolr import SolrError
37 from requests_cache import CachedSession, MongoCache
39 from crawler.models import Source
40 from crawler.models.container_source import ContainerSource
41 from crawler.types import CitationLiteral
42 from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection
44 # TODO: pass a class factory instead of a dependency to a site
45 # TODO: pass a class factory instead of a dependency to a site
48 class BaseCollectionCrawler:
49 """
50 Base class for the collection crawlers.
51 To create a crawler:
52 1) derive a class from BaseCollectionCrawler and name it XXXCrawler
53 2) override the functions parse_collection_content, parse_issue_content and parse_article_content
54 3) update factory.py so that crawler_factory can return your new crawler
55 """
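# A minimal sketch of the three steps above, reusing the imports at the top of this
# file. The class name, source_* values and CSS selectors are placeholders;
# parse_issue_content and parse_article_content are sketched further down, after
# their base definitions.
class XXXCrawler(BaseCollectionCrawler):
    source_name = "Example digital library"
    source_domain = "XXX"
    source_website = "https://example.org"

    def parse_collection_content(self, content: str) -> list[IssueData]:
        # One xissue per issue link found on the collection page
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        for link in soup.select("a.issue-link"):  # placeholder selector
            href = str(link.get("href"))
            year = str(link.get("data-year", "2000"))  # placeholder attribute
            xissues.append(self.create_xissue(href, year, None))
        return xissues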
57 source_name = ""
58 source_domain = ""
59 source_website = ""
61 issue_href = ""
63 collection = None
64 source = None
65 user = None
66 session: requests.Session | CachedSession
67 # Updated in constructor with user agent from settings_local
68 headers = {"accept_encoding": "utf-8"}
70 next_allowed_request: float = time.time()
72 # seconds to wait between two http requests
73 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
75 latext_parser = LatexNodes2Text()
77 # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
78 # do not use the "$" to surround tex formulas
79 delimiter_inline_formula = "$"
80 delimiter_disp_formula = "$"
82 # HACK : Workaround for tests (monkeypatching)
83 # We store the class here, so we can monkeypatch it when running tests
84 # subCrawlers = {
85 # LofplCrawler: None
86 # }
87 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}
89 language_detector = LanguageDetectorBuilder.from_all_languages().build()
91 force_refresh = False
93 # Whether to include headers in the requests cache key
94 match_headers = False
95 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"
97 # Set this to False on a per-crawler basis to allow inserting articles without PDFs
98 ignore_missing_pdf = True
100 def __init__(
101 self,
102 *args,
103 username: str,
104 collection_id: str,
105 collection_url: str,
106 test_mode: bool = False,
107 publisher: str = "mathdoc",
108 force_refresh=False,
109 ):
110 for CrawlerClass in self.subCrawlers:
111 self.subCrawlers[CrawlerClass] = CrawlerClass(
112 *args,
113 username=username,
114 collection_id=collection_id,
115 collection_url=collection_url,
116 test_mode=test_mode,
117 publisher=publisher,
118 )
120 self.username = username
122 self.collection_id = collection_id
123 self.collection_url = (
124 collection_url # url of the collection. Ex: https://eudml.org/journal/10098
125 )
127 self.test_mode = test_mode
128 self.publisher = publisher
130 # Skipped when running tests
131 self.initialize()
133 self.session = requests.session()
135 self.force_refresh = force_refresh
137 def initialize(self):
138 """
139 Acts as a "second" init function to skip model accesses during test data generation
140 """
141 self.collection = get_or_create_collection(self.collection_id)
142 self.source = self.get_or_create_source()
143 self.user = User.objects.get(username=self.username)
144 self.session = CachedSession(
145 match_headers=self.match_headers,
146 backend=MongoCache(
147 getattr(settings, "MONGO_HOSTNAME", "localhost"),
148 ),
149 expire_after=timedelta(days=30),
150 )
152 @classmethod
153 def can_crawl(cls, pid: str) -> bool:
154 return True
156 def parse_collection_content(self, content: str) -> list[IssueData]:
157 """
158 Parse the HTML content with BeautifulSoup
159 returns a list of xissue.
160 Override this function in a derived class
161 """
162 return []
164 def parse_issue_content(self, content: str, xissue: IssueData):
165 """
166 Parse the HTML content with BeautifulSoup
167 Fills the xissue.articles
168 Override this function in a derived class.
170 Caveat: you are supposed to create the articles here. Please assign a PID to each article.
171 The PID can be `a` + article_index, like this: `a0`, `a21`
172 """
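# Continuing the XXXCrawler sketch above, a parse_issue_content override following
# the PID convention just described (a0, a1, ...). It assumes a create_articledata()
# helper in ptf.model_data analogous to create_issuedata(); the selector is a placeholder.
    def parse_issue_content(self, content: str, xissue: IssueData):
        soup = BeautifulSoup(content, "html.parser")
        for index, link in enumerate(soup.select("td.article a")):  # placeholder selector
            xarticle = create_articledata()  # assumed helper; not among the imports above
            xarticle.pid = f"a{index}"  # "a" + article_index: a0, a1, ... a21
            xarticle.url = str(link.get("href"))
            xissue.articles.append(xarticle)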
174 def parse_article_content(
175 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
176 ) -> ArticleData | None:
177 """
178 Parse the HTML content with BeautifulSoup
179 returns the xarticle.
180 Override this function in a derived class.
181 The xissue is passed to the function in case the article page has issue information (ex: publisher)
182 The article url is also passed as a parameter
184 Caveat: you are supposed to assign article PIDs again here
185 """
186 return xarticle
188 def crawl_collection(self):
189 # TODO: Comments, filter
190 """
191 Crawl an entire collection. ptf.models.Container objects are created.
192 - get the HTML content of the collection_url
193 - parse the HTML content with beautifulsoup to extract the list of issues
194 - merge the xissues (some Source can have multiple pages for 1 volume/issue. We create only 1 container)
195 - crawl each issue if col_only is False
196 - Returns the list of merged issues.
197 It is a dict {pid: xissue}.
198 The key is the pid of the merged issue.
199 Ex: the source may have Volume 6 (2000) and Volume 6 (1999);
200 the pid is then made with 1999-2000__6_
201 """
203 if self.source is None:
204 raise RuntimeError("ERROR: the source is not set")
206 content = self.download_file(self.collection_url)
207 xissues = self.parse_collection_content(content)
209 """
210 Some collections split the same volumes in different pages
211 Ex: Volume 6 (2000) and Volume 6 (1999)
212 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
213 """
214 # merged_xissues = self.merge_xissues(xissues)
216 xissues_dict = {str(i.pid): i for i in xissues}
218 return xissues_dict
220 def crawl_issue(self, xissue: IssueData):
221 """
222 Crawl one web page of an issue.
223 - get the HTML content of the issue
224 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata
225 - crawl each article
226 """
228 # Some sources, like EuDML, do not have separate HTML pages for an issue's table of contents.
229 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute
231 issue_url = xissue.url
232 if issue_url is not None:
233 if issue_url.endswith(".pdf"):
234 add_pdf_link_to_xarticle(xissue, issue_url)
235 xissue.url = None
236 else:
237 content = self.download_file(issue_url)
238 self.parse_issue_content(content, xissue)
240 xarticles = xissue.articles
242 parsed_xarticles = []
244 for xarticle in xarticles:
245 parsed_xarticle = self.crawl_article(xarticle, xissue)
246 if parsed_xarticle is not None:
247 parsed_xarticles.append(parsed_xarticle)
249 xissue.articles = parsed_xarticles
251 article_has_pdf = self.article_has_pdf(xissue)
253 if self.ignore_missing_pdf:
254 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)]
256 if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf):
257 self.process_resource_metadata(xissue)
258 self.add_xissue_into_database(xissue)
260 @staticmethod
261 def article_has_source(art: ArticleData | IssueData):
262 return (
263 next(
264 (e_link for e_link in art.ext_links if e_link["rel"] == "source"),
265 None,
266 )
267 is not None
268 )
270 @staticmethod
271 def article_has_pdf(art: ArticleData | IssueData):
272 return (
273 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None)
274 is not None
275 )
277 def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
278 # ARTICLE URL as an ExtLink (to display the link in the article page)
279 if xarticle.url is None:
280 if not self.article_has_source(xarticle):  # 280 ↛ 290: line 280 didn't jump to line 290 because the condition on line 280 was always true
281 if xissue.url:
282 article_source = xissue.url
283 else:
284 article_source = self.collection_url
285 ext_link = create_extlink()
286 ext_link["rel"] = "source"
287 ext_link["location"] = article_source
288 ext_link["metadata"] = self.source_domain
289 xarticle.ext_links.append(ext_link)
290 return self.process_resource_metadata(xarticle)
292 content = self.download_file(xarticle.url)
294 parsed_xarticle = self.parse_article_content(content, xissue, xarticle, xarticle.url)
295 if parsed_xarticle is None:  # 295 ↛ 296: line 295 didn't jump to line 296 because the condition on line 295 was never true
296 return None
298 if parsed_xarticle.doi:
299 parsed_xarticle.pid = (
300 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
301 )
302 else:
303 parsed_xarticle.pid = f"{xissue.pid}_{xarticle.pid}"
305 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
306 ext_link = create_extlink()
307 ext_link["rel"] = "source"
308 ext_link["location"] = parsed_xarticle.url
309 ext_link["metadata"] = self.source_domain
310 parsed_xarticle.ext_links.append(ext_link)
312 # The article title may have formulas surrounded with '$'
313 return self.process_resource_metadata(parsed_xarticle)
315 def process_resource_metadata(self, xresource: ResourceData):
316 # Process title tex
317 html, xml = get_html_and_xml_from_text_with_formulas(
318 xresource.title_tex,
319 delimiter_inline=self.delimiter_inline_formula,
320 delimiter_disp=self.delimiter_disp_formula,
321 )
322 xml = get_title_xml(xml, with_tex_values=False)
323 xresource.title_html = html
324 xresource.title_xml = xml
325 del xml
326 del html
328 # Process trans_title tex
329 html, xml = get_html_and_xml_from_text_with_formulas(
330 xresource.trans_title_tex,
331 delimiter_inline=self.delimiter_inline_formula,
332 delimiter_disp=self.delimiter_disp_formula,
333 )
334 xml = get_title_xml(xml, with_tex_values=False)
335 xresource.trans_title_html = html
336 xresource.trans_title_xml = xml
337 del xml
338 del html
340 abstracts_to_parse = [
341 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract"
342 ]
343 # abstract may have formulas surrounded with '$'
344 if len(abstracts_to_parse) > 0:
345 for xabstract in abstracts_to_parse:
346 html, xml = get_html_and_xml_from_text_with_formulas(
347 xabstract["value_tex"],
348 delimiter_inline=self.delimiter_inline_formula,
349 delimiter_disp=self.delimiter_disp_formula,
350 )
351 xabstract["value_html"] = html
352 lang = xabstract["lang"]
353 if lang == xresource.lang:
354 xabstract["value_xml"] = f'<abstract xml:lang="{lang}">{xml}</abstract>'
355 else:
356 xabstract["value_xml"] = (
357 f'<trans-abstract xml:lang="{lang}">{xml}</trans-abstract>'
358 )
360 if isinstance(xresource, ArticleData):
361 update_data_for_jats(xresource)
362 return xresource
364 def get(self, url: str, force_refresh=False, headers={}):
365 attempt = 0
366 response = None
368 while attempt < 3:
369 # If the URL is already in the cache, we can skip the rate-limit delay
370 if isinstance(self.session, CachedSession):  # 370 ↛ 371: line 370 didn't jump to line 371 because the condition on line 370 was never true
371 if not self.session.cache.contains(url=url) or force_refresh:
372 delta = self.next_allowed_request - time.time()
373 if delta > 0:
374 time.sleep(delta)
375 self.next_allowed_request = time.time() + self.requests_interval
376 try:
377 # For SSL Errors, use verify=False kwarg
378 verify = True
379 if url.startswith("https://hdml.di.ionio.gr/"):  # 379 ↛ 380: line 379 didn't jump to line 380 because the condition on line 379 was never true
380 verify = False
381 # self.session.cache.delete(urls=[url])
382 if isinstance(self.session, CachedSession):  # 382 ↛ 383: line 382 didn't jump to line 383 because the condition on line 382 was never true
383 response = self.session.get(
384 url,
385 headers={**self.headers, **headers},
386 verify=verify,
387 force_refresh=force_refresh,
388 )
389 else:
390 response = self.session.get(
391 url, headers={**self.headers, **headers}, verify=verify
392 )
393 if not response.ok:
394 raise requests.exceptions.HTTPError(
395 f"Endpoint answered with code {response.status_code} : {url}",
396 response=response,
397 )
398 return response
399 except (
400 requests.ConnectionError,
401 requests.ConnectTimeout,
402 requests.exceptions.HTTPError,
403 ):
404 attempt += 1
405 raise requests.exceptions.HTTPError(f"Unable to download {url}")
407 def download_file(self, url: str, force_refresh=False, headers={}):
408 """
409 Downloads a URL and returns its decoded content.
410 """
411 response = self.get(
412 url, force_refresh=force_refresh or self.force_refresh, headers=headers
413 )
414 content = self.decode_response(response)
415 if content == "" or not content:  # 415 ↛ 416: line 415 didn't jump to line 416 because the condition on line 415 was never true
416 raise requests.exceptions.HTTPError(response)
417 return content
419 def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
420 """Override this if the content-type headers from the sources advertise something other than the actual content.
421 SASA needs this."""
422 response.encoding = encoding
423 return response.text
425 def add_xissue_into_database(self, xissue: IssueData):
426 xissue.journal = self.collection
428 if xissue.year == "":
429 raise ValueError("Failsafe : Cannot insert issue without a year")
431 xpub = create_publisherdata()
432 xpub.name = self.publisher
433 xissue.publisher = xpub
434 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()
436 attempt = 1
437 success = False
439 while not success and attempt < 4:
440 try:
441 params = {"xissue": xissue, "use_body": False}
442 cmd = xml_cmds.addOrUpdateIssueXmlCmd(params)
443 container = cmd.do()
444 success = True
445 ContainerSource.objects.create(source=self.source, container=container)
446 except SolrError:
447 attempt += 1
448 time.sleep(10)
450 def get_metadata_using_citation_meta(
451 self,
452 xarticle: ArticleData,
453 xissue: IssueData,
454 soup: BeautifulSoup,
455 what: list[CitationLiteral] = [],
456 ):
457 """
458 :param xarticle: the xarticle that will collect the metadata
459 :param xissue: the xissue that will collect the publisher
460 :param soup: the BeautifulSoup object of the article page
461 :param what: list of citation_* meta items to collect.
462 :return: None. The given article is modified
463 """
465 if "title" in what:
466 # TITLE
467 citation_title_node = soup.select_one("meta[name='citation_title']")
468 if citation_title_node:  # 468 ↛ 473: line 468 didn't jump to line 473 because the condition on line 468 was always true
469 title = citation_title_node.get("content")
470 if isinstance(title, str):  # 470 ↛ 473: line 470 didn't jump to line 473 because the condition on line 470 was always true
471 xarticle.title_tex = title
473 if "author" in what:  # 473 ↛ 502: line 473 didn't jump to line 502 because the condition on line 473 was always true
474 # AUTHORS
475 citation_author_nodes = soup.select("meta[name^='citation_author']")
476 current_author: ContributorDict | None = None
477 for citation_author_node in citation_author_nodes:
478 if citation_author_node.get("name") == "citation_author":
479 text_author = citation_author_node.get("content")
480 if not isinstance(text_author, str):  # 480 ↛ 481: line 480 didn't jump to line 481 because the condition on line 480 was never true
481 raise ValueError("Cannot parse author")
482 if text_author == "":  # 482 ↛ 483: line 482 didn't jump to line 483 because the condition on line 482 was never true
483 current_author = None
484 continue
485 current_author = create_contributor(role="author", string_name=text_author)
486 xarticle.contributors.append(current_author)
487 continue
488 if current_author is None:  # 488 ↛ 489: line 488 didn't jump to line 489 because the condition on line 488 was never true
489 print("Couldn't parse citation author")
490 continue
491 if citation_author_node.get("name") == "citation_author_institution":
492 text_institution = citation_author_node.get("content")
493 if not isinstance(text_institution, str):  # 493 ↛ 494: line 493 didn't jump to line 494 because the condition on line 493 was never true
494 continue
495 current_author["addresses"].append(text_institution)
496 if citation_author_node.get("name") == "citation_author_ocrid":  # 496 ↛ 497: line 496 didn't jump to line 497 because the condition on line 496 was never true
497 text_orcid = citation_author_node.get("content")
498 if not isinstance(text_orcid, str):
499 continue
500 current_author["orcid"] = text_orcid
502 if "pdf" in what:
503 # PDF
504 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
505 if citation_pdf_node:
506 pdf_url = citation_pdf_node.get("content")
507 if isinstance(pdf_url, str):  # 507 ↛ 510: line 507 didn't jump to line 510 because the condition on line 507 was always true
508 add_pdf_link_to_xarticle(xarticle, pdf_url)
510 if "lang" in what:
511 # LANG
512 citation_lang_node = soup.select_one("meta[name='citation_language']")
513 if citation_lang_node:  # 513 ↛ 519: line 513 didn't jump to line 519 because the condition on line 513 was always true
514 # TODO: check other language code
515 content_text = citation_lang_node.get("content")
516 if isinstance(content_text, str):  # 516 ↛ 519: line 516 didn't jump to line 519 because the condition on line 516 was always true
517 xarticle.lang = standardize_tag(content_text)
519 if "abstract" in what:
520 # ABSTRACT
521 abstract_node = soup.select_one("meta[name='citation_abstract']")
522 if abstract_node is not None:
523 abstract = abstract_node.get("content")
524 if not isinstance(abstract, str):  # 524 ↛ 525: line 524 didn't jump to line 525 because the condition on line 524 was never true
525 raise ValueError("Couldn't parse abstract from meta")
526 abstract = BeautifulSoup(abstract, "html.parser").text
527 lang = abstract_node.get("lang")
528 if not isinstance(lang, str):  # 528 ↛ 529: line 528 didn't jump to line 529 because the condition on line 528 was never true
529 lang = self.detect_language(abstract, xarticle)
530 xarticle.abstracts.append(
531 {
532 "tag": "abstract",
533 "value_html": "",
534 "value_tex": abstract,
535 "value_xml": "",
536 "lang": lang,
537 }
538 )
540 if "page" in what:
541 # PAGES
542 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
543 if citation_fpage_node:
544 page = citation_fpage_node.get("content")
545 if isinstance(page, str):  # 545 ↛ 550: line 545 didn't jump to line 550 because the condition on line 545 was always true
546 page = page.split("(")[0]
547 if len(page) < 32:  # 547 ↛ 550: line 547 didn't jump to line 550 because the condition on line 547 was always true
548 xarticle.fpage = page
550 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
551 if citation_lpage_node:
552 page = citation_lpage_node.get("content")
553 if isinstance(page, str):  # 553 ↛ 558: line 553 didn't jump to line 558 because the condition on line 553 was always true
554 page = page.split("(")[0]
555 if len(page) < 32:  # 555 ↛ 558: line 555 didn't jump to line 558 because the condition on line 555 was always true
556 xarticle.lpage = page
558 if "doi" in what:
559 # DOI
560 citation_doi_node = soup.select_one("meta[name='citation_doi']")
561 if citation_doi_node:
562 doi = citation_doi_node.get("content")
563 if isinstance(doi, str):  # 563 ↛ 570: line 563 didn't jump to line 570 because the condition on line 563 was always true
564 doi = doi.strip()
565 pos = doi.find("10.")
566 if pos > 0:
567 doi = doi[pos:]
568 xarticle.doi = doi
570 if "mr" in what:
571 # MR
572 citation_mr_node = soup.select_one("meta[name='citation_mr']")
573 if citation_mr_node:  # 573 ↛ 574: line 573 didn't jump to line 574 because the condition on line 573 was never true
574 mr = citation_mr_node.get("content")
575 if isinstance(mr, str):
576 mr = mr.strip()
577 if mr.find("MR") == 0:
578 mr = mr[2:]
579 xarticle.extids.append(("mr-item-id", mr))
581 if "zbl" in what:
582 # ZBL
583 citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
584 if citation_zbl_node:
585 zbl = citation_zbl_node.get("content")
586 if isinstance(zbl, str):  # 586 ↛ 592: line 586 didn't jump to line 592 because the condition on line 586 was always true
587 zbl = zbl.strip()
588 if zbl.find("Zbl") == 0:  # 588 ↛ 592: line 588 didn't jump to line 592 because the condition on line 588 was always true
589 zbl = zbl[3:].strip()
590 xarticle.extids.append(("zbl-item-id", zbl))
592 if "publisher" in what:
593 # PUBLISHER
594 citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
595 if citation_publisher_node:
596 pub = citation_publisher_node.get("content")
597 if isinstance(pub, str):  # 597 ↛ 604: line 597 didn't jump to line 604 because the condition on line 597 was always true
598 pub = pub.strip()
599 if pub != "":  # 599 ↛ 604: line 599 didn't jump to line 604 because the condition on line 599 was always true
600 xpub = create_publisherdata()
601 xpub.name = pub
602 xissue.publisher = xpub
604 if "keywords" in what:
605 # KEYWORDS
606 citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
607 for kwd_node in citation_kwd_nodes:
608 kwds = kwd_node.get("content")
609 if isinstance(kwds, str):  # 609 ↛ 607: line 609 didn't jump to line 607 because the condition on line 609 was always true
610 kwds = kwds.split(",")
611 for kwd in kwds:
612 if kwd == "":
613 continue
614 kwd = kwd.strip()
615 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})
617 if "references" in what:
618 citation_references = soup.select("meta[name='citation_reference']")
619 for index, tag in enumerate(citation_references):
620 content = tag.get("content")
621 if not isinstance(content, str):  # 621 ↛ 622: line 621 didn't jump to line 622 because the condition on line 621 was never true
622 raise ValueError("Cannot parse citation_reference meta")
623 xarticle.bibitems.append(
624 self.__parse_meta_citation_reference(content, str(index + 1))
625 )
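# Continuing the XXXCrawler sketch, a typical parse_article_content override that
# feeds the article page's <meta name="citation_*"> tags to the helper above. The
# `what` keys listed here are among those handled by get_metadata_using_citation_meta.
    def parse_article_content(
        self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
    ) -> ArticleData | None:
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            ["title", "author", "pdf", "lang", "abstract", "page", "doi", "keywords"],
        )
        return xarticle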
627 def create_xissue(
628 self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1"
629 ):
630 if url is not None and url.endswith("/"):
631 url = url[:-1]
632 xissue = create_issuedata()
633 xissue.url = url
635 xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number)
637 xissue.year = year
639 if volume_number is not None:
640 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number)
642 if issue_number is not None:
643 xissue.number = issue_number.replace(",", "-")
644 return xissue
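# Worked example (sketch, assuming a collection_id of "CM"):
#   create_xissue("https://example.org/volume-6/", "2000", "6", "1")
# strips the trailing slash from the url and returns an xissue with year "2000",
# volume "6", number "1" and pid "CM_2000_6_1" (built by get_issue_pid below).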
646 def detect_language(self, text: str, article: ArticleData | None = None):
647 if article and article.lang is not None and article.lang != "und":
648 return article.lang
650 language = self.language_detector.detect_language_of(text)
652 if not language:  # 652 ↛ 653: line 652 didn't jump to line 653 because the condition on line 652 was never true
653 return "und"
654 return language.iso_code_639_1.name.lower()
656 references_mapping = {
657 "citation_title": get_article_title_xml,
658 "citation_journal_title": get_source_xml,
659 "citation_publication_date": get_year_xml,
660 "citation_firstpage": get_fpage_xml,
661 "citation_lastpage": get_lpage_xml,
662 }
664 @classmethod
665 def __parse_meta_citation_reference(cls, content: str, label=None):
666 categories = content.split(";")
668 if len(categories) == 1:
669 return JatsBase.bake_ref(content, label=label)
671 citation_data = [c.split("=") for c in categories if "=" in c]
672 del categories
674 xml_string = ""
675 authors_parsed = False
676 authors_strings = []
677 for data in citation_data:
678 key = data[0].strip()
679 citation_content = data[1]
680 if key == "citation_author":
681 authors_strings.append(get_author_xml(template_str=citation_content))
682 continue
683 elif not authors_parsed:
684 xml_string += ", ".join(authors_strings)
685 authors_parsed = True
687 if key in cls.references_mapping:
688 xml_string += " " + cls.references_mapping[key](citation_content)
690 return JatsBase.bake_ref(xml_string, label=label)
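# Example (sketch) of the meta content this parser expects, with placeholder values:
#   citation_author=J. Doe; citation_title=On something; citation_journal_title=Ex. J.;
#   citation_publication_date=2001; citation_firstpage=1; citation_lastpage=10
# The citation_author entries are emitted first, the remaining keys go through
# references_mapping, and the result is passed to JatsBase.bake_ref(). A content
# string without ";" is baked as-is.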
692 @classmethod
693 def get_or_create_source(cls):
694 source, created = Source.objects.get_or_create(
695 domain=cls.source_domain,
696 defaults={
697 "name": cls.source_name,
698 "website": cls.source_website,
699 },
700 )
701 if created:  # 701 ↛ 702: line 701 didn't jump to line 702 because the condition on line 701 was never true
702 source.save()
703 return source
705 @staticmethod
706 def get_issue_pid(
707 collection_id: str,
708 year: str,
709 volume_number: str | None = None,
710 issue_number: str | None = None,
711 ):
712 # Replace any non-word character with an underscore
713 pid = f"{collection_id}_{year}"
714 if volume_number is not None:
715 pid += f"_{volume_number}"
716 if issue_number is not None:
717 pid += f"_{issue_number}"
718 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid))
719 return pid
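# Worked example (sketch, with a hypothetical collection id "CM"):
#   get_issue_pid("CM", "1999-2000", "6")  ->  "CM_1999-2000_6"
# Every run of characters outside [a-zA-Z0-9-] in the assembled pid is collapsed
# into a single underscore.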
721 @staticmethod
722 def set_pages(article: ArticleData, pages: str, separator: str = "-"):
723 pages_split = pages.split(separator)
724 if len(pages_split) == 0:  # 724 ↛ 725: line 724 didn't jump to line 725 because the condition on line 724 was never true
725 article.page_range = pages
726 if len(pages_split) > 0:  # 726 ↛ exit: line 726 didn't return from function 'set_pages' because the condition on line 726 was always true
727 if pages[0].isnumeric():
728 article.fpage = pages_split[0]
729 if (
730 len(pages_split) > 1
731 and pages_split[0] != pages_split[1]
732 and pages_split[1].isnumeric()
733 ):
734 article.lpage = pages_split[1]
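# Worked examples (sketch):
#   set_pages(xarticle, "12-34")   ->  xarticle.fpage = "12", xarticle.lpage = "34"
#   set_pages(xarticle, "iii-ix")  ->  fpage/lpage left unset (pages are not numeric)
# A different separator (e.g. "--") can be passed via the `separator` argument.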