Coverage for src/crawler/base_crawler.py: 75%
396 statements
coverage.py v7.8.2, created at 2025-06-16 07:44 +0000
1import time
2from datetime import timedelta
4import regex
5import requests
6from bs4 import BeautifulSoup
7from django.conf import settings
8from django.contrib.auth.models import User
9from django.utils import timezone
10from langcodes import standardize_tag
11from lingua import LanguageDetectorBuilder
12from ptf.cmds import xml_cmds
13from ptf.cmds.xml.ckeditor.utils import (
14 build_jats_data_from_html_field,
15)
16from ptf.cmds.xml.jats.builder.citation import (
17 get_article_title_xml,
18 get_author_xml,
19 get_fpage_xml,
20 get_lpage_xml,
21 get_source_xml,
22 get_year_xml,
23)
24from ptf.cmds.xml.jats.jats_parser import JatsBase
25from ptf.model_data import (
26 ArticleData,
27 ContributorDict,
28 IssueData,
29 ResourceData,
30 create_contributor,
31 create_extlink,
32 create_issuedata,
33 create_publisherdata,
34)
35from ptf.model_data_converter import update_data_for_jats
36from pylatexenc.latex2text import LatexNodes2Text
37from pysolr import SolrError
38from requests_cache import CachedSession, MongoCache
40from crawler.models import Source
41from crawler.models.container_source import ContainerSource
42from crawler.types import CitationLiteral
43from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection
45# TODO: pass a class factory instead of a dependency to a site
49class BaseCollectionCrawler:
50 """
51 Base class for the collection crawlers.
52 To create a crawler:
53 1) derive a class from BaseCollectionCrawler and name it XXXCrawler
54 2) override the functions parse_collection_content, parse_issue_content and parse_article_content
55 3) update factory.py so that crawler_factory can return your new crawler
56 """
58 source_name = ""
59 source_domain = ""
60 source_website = ""
62 issue_href = ""
64 collection = None
65 source = None
66 user = None
67 session: requests.Session | CachedSession
68 # Updated in constructor with user agent from settings_local
69 headers = {"accept_encoding": "utf-8"}
71 next_allowed_request: float = time.time()
73 # seconds to wait between two http requests
74 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
76 latext_parser = LatexNodes2Text()
78 # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
79 # do not use the "$" to surround tex formulas
80 delimiter_inline_formula = "$"
81 delimiter_disp_formula = "$"
83 # HACK : Workaround for tests (monkeypatching)
84 # We store the class here, so we can monkeypatch it when running tests
85 # subCrawlers = {
86 # LofplCrawler: None
87 # }
88 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}
90 language_detector = LanguageDetectorBuilder.from_all_languages().build()
92 force_refresh = False
94 # Whether to include headers in the requests cache key
95 match_headers = False
96 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"
98 # Set this to False on a per-crawler basis to allow inserting articles without PDFs
99 ignore_missing_pdf = True
101 def __init__(
102 self,
103 *args,
104 username: str,
105 collection_id: str,
106 collection_url: str,
107 test_mode: bool = False,
108 publisher: str = "mathdoc",
109 force_refresh=False,
110 ):
111 for CrawlerClass in self.subCrawlers:
112 self.subCrawlers[CrawlerClass] = CrawlerClass(
113 *args,
114 username=username,
115 collection_id=collection_id,
116 collection_url=collection_url,
117 test_mode=test_mode,
118 publisher=publisher,
119 )
121 self.username = username
123 self.collection_id = collection_id
124 self.collection_url = (
125 collection_url # url of the collection. Ex: https://eudml.org/journal/10098
126 )
128 self.test_mode = test_mode
129 self.publisher = publisher
131 # Skipped when running tests
132 self.initialize()
134 self.session = requests.session()
136 self.force_refresh = force_refresh
138 def initialize(self):
139 """
140 Acts as a "second" init function to skip model accesses during test data generation
141 """
142 self.collection = get_or_create_collection(self.collection_id)
143 self.source = self.get_or_create_source()
144 self.user = User.objects.get(username=self.username)
145 self.session = CachedSession(
146 match_headers=self.match_headers,
147 backend=MongoCache(
148 getattr(settings, "MONGO_HOSTNAME", "localhost"),
149 ),
150 expire_after=timedelta(days=30),
151 )
153 @classmethod
154 def can_crawl(cls, pid: str) -> bool:
155 return True
157 def parse_collection_content(self, content: str) -> list[IssueData]:
158 """
159 Parse the HTML content with BeautifulSoup
160 and return a list of xissues.
161 Override this function in a derived class.
162 """
163 return []
165 def parse_issue_content(self, content: str, xissue: IssueData):
166 """
167 Parse the HTML content with BeautifulSoup
168 and fill xissue.articles.
169 Override this function in a derived class.
171 Caveat: you are expected to create the articles here and to assign a PID to each article.
172 The PID can be `a + article_index`, e.g. `a0`, `a21`.
173 """
175 def parse_article_content(
176 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
177 ) -> ArticleData | None:
178 """
179 Parse the HTML content with BeautifulSoup
180 and return the xarticle.
181 Override this function in a derived class.
182 The xissue is passed to the function in case the article page has issue information (e.g. the publisher).
183 The article url is also passed as a parameter.
185 Caveat: you are expected to assign the article pids again here.
186 """
187 return xarticle
189 def crawl_collection(self):
190 # TODO: Comments, filter
191 """
192 Crawl an entire collection. ptf.models.Container objects are created.
193 - get the HTML content of the collection_url
194 - parse the HTML content with BeautifulSoup to extract the list of issues
195 - merge the xissues (some sources can have multiple pages for 1 volume/issue; we create only 1 container)
196 - crawl each issue if col_only is False
197 - Returns the merged issues.
198 It is a dict {pid: xissue}.
199 The key is the pid of the merged issue.
200 Ex: the source may have Volume 6 (2000) and Volume 6 (1999);
201 the pid is then made with 1999-2000__6_
202 """
204 if self.source is None:
205 raise RuntimeError("ERROR: the source is not set")
207 content = self.download_file(self.collection_url)
208 xissues = self.parse_collection_content(content)
210 """
211 Some collections split the same volumes in different pages
212 Ex: Volume 6 (2000) and Volume 6 (1999)
213 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
214 """
215 # merged_xissues = self.merge_xissues(xissues)
217 xissues_dict = {str(i.pid): i for i in xissues}
219 return xissues_dict
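# Illustrative usage (the factory call and pids are hypothetical):
#   crawler = crawler_factory("XXX", collection_id, collection_url, username)
#   xissues = crawler.crawl_collection()   # e.g. {"XXX_2000_6_1": <IssueData>, ...}
#   for pid, xissue in xissues.items():
#       crawler.crawl_issue(xissue)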
221 def crawl_issue(self, xissue: IssueData):
222 """
223 Crawl 1 web page of an issue.
224 - get the HTML content of the issue
225 - parse the HTML content with BeautifulSoup to extract the list of articles and/or the issue metadata
226 - crawl each article
227 """
229 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.
230 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute.
232 issue_url = xissue.url
233 if issue_url is not None:
234 if issue_url.endswith(".pdf"):
235 add_pdf_link_to_xarticle(xissue, issue_url)
236 xissue.url = None
237 else:
238 content = self.download_file(issue_url)
239 self.parse_issue_content(content, xissue)
241 xarticles = xissue.articles
243 parsed_xarticles = []
245 for xarticle in xarticles:
246 parsed_xarticle = self.crawl_article(xarticle, xissue)
247 if parsed_xarticle is not None:
248 parsed_xarticles.append(parsed_xarticle)
250 xissue.articles = parsed_xarticles
252 article_has_pdf = self.article_has_pdf(xissue)
254 if self.ignore_missing_pdf:
255 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)]
257 if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf):
258 self.process_resource_metadata(xissue, resource_type="issue")
259 self.add_xissue_into_database(xissue)
261 @staticmethod
262 def article_has_source(art: ArticleData | IssueData):
263 return (
264 next(
265 (e_link for e_link in art.ext_links if e_link["rel"] == "source"),
266 None,
267 )
268 is not None
269 )
271 @staticmethod
272 def article_has_pdf(art: ArticleData | IssueData):
273 return (
274 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None)
275 is not None
276 )
278 def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
279 # ARTICLE URL as an ExtLink (to display the link on the article page)
280 if xarticle.url is None:
281 if not self.article_has_source(xarticle): 281 ↛ 291: line 281 didn't jump to line 291 because the condition on line 281 was always true
282 if xissue.url:
283 article_source = xissue.url
284 else:
285 article_source = self.collection_url
286 ext_link = create_extlink()
287 ext_link["rel"] = "source"
288 ext_link["location"] = article_source
289 ext_link["metadata"] = self.source_domain
290 xarticle.ext_links.append(ext_link)
291 return self.process_article_metadata(xarticle)
293 content = self.download_file(xarticle.url)
295 parsed_xarticle = self.parse_article_content(content, xissue, xarticle, xarticle.url)
296 if parsed_xarticle is None: 296 ↛ 297: line 296 didn't jump to line 297 because the condition on line 296 was never true
297 return None
299 if parsed_xarticle.doi:
300 parsed_xarticle.pid = (
301 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
302 )
303 else:
304 parsed_xarticle.pid = f"{xissue.pid}_{xarticle.pid}"
306 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
307 ext_link = create_extlink()
308 ext_link["rel"] = "source"
309 ext_link["location"] = parsed_xarticle.url
310 ext_link["metadata"] = self.source_domain
311 parsed_xarticle.ext_links.append(ext_link)
313 # The article title may have formulas surrounded with '$'
314 return self.process_article_metadata(parsed_xarticle)
316 def process_resource_metadata(self, xresource: ResourceData, resource_type="article"):
317 tag = "article-title" if resource_type == "article" else "issue-title"
319 # Process title tex
320 ckeditor_data = build_jats_data_from_html_field(
321 xresource.title_tex,
322 tag=tag,
323 text_lang=xresource.lang,
324 delimiter_inline=self.delimiter_inline_formula,
325 delimiter_disp=self.delimiter_disp_formula,
326 )
328 xresource.title_html = ckeditor_data["value_html"]
329 # xresource.title_tex = ckeditor_data["value_tex"]
330 xresource.title_xml = ckeditor_data["value_xml"]
332 # Process trans_title tex
333 if xresource.trans_title_tex: 333 ↛ 334: line 333 didn't jump to line 334 because the condition on line 333 was never true
334 tag = "trans-article" if resource_type == "article" else "issue-title"
336 ckeditor_data = build_jats_data_from_html_field(
337 xresource.trans_title_tex,
338 tag=tag,
339 text_lang=xresource.trans_lang,
340 resource_lang=xresource.lang,
341 delimiter_inline=self.delimiter_inline_formula,
342 delimiter_disp=self.delimiter_disp_formula,
343 )
345 xresource.titles.append(ckeditor_data["title"])
347 abstracts_to_parse = [
348 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract"
349 ]
350 # abstract may have formulas surrounded with '$'
351 if len(abstracts_to_parse) > 0:
352 for xabstract in abstracts_to_parse:
353 ckeditor_data = build_jats_data_from_html_field(
354 xabstract["value_tex"],
355 tag="abstract",
356 text_lang=xabstract["lang"],
357 resource_lang=xresource.lang,
358 field_type="abstract",
359 delimiter_inline=self.delimiter_inline_formula,
360 delimiter_disp=self.delimiter_disp_formula,
361 )
363 xabstract["value_html"] = ckeditor_data["value_html"]
364 # xabstract["value_tex"] = ckeditor_data["value_tex"]
365 xabstract["value_xml"] = ckeditor_data["value_xml"]
367 return xresource
369 def process_article_metadata(self, xresource: ResourceData):
370 self.process_resource_metadata(xresource)
371 update_data_for_jats(xresource)
373 return xresource
375 def get(self, url: str, force_refresh=False, headers={}):
376 attempt = 0
377 response = None
379 while attempt < 3:
380 # If the URL is already cached, we can skip the rate-limit delay
381 if isinstance(self.session, CachedSession): 381 ↛ 382: line 381 didn't jump to line 382 because the condition on line 381 was never true
382 if not self.session.cache.contains(url=url) or force_refresh:
383 delta = self.next_allowed_request - time.time()
384 if delta > 0:
385 time.sleep(delta)
386 self.next_allowed_request = time.time() + self.requests_interval
387 try:
388 # For SSL Errors, use verify=False kwarg
389 verify = True
390 if url.startswith("https://hdml.di.ionio.gr/"): 390 ↛ 391: line 390 didn't jump to line 391 because the condition on line 390 was never true
391 verify = False
392 # self.session.cache.delete(urls=[url])
393 if isinstance(self.session, CachedSession): 393 ↛ 394: line 393 didn't jump to line 394 because the condition on line 393 was never true
394 response = self.session.get(
395 url,
396 headers={**self.headers, **headers},
397 verify=verify,
398 force_refresh=force_refresh,
399 )
400 else:
401 response = self.session.get(
402 url, headers={**self.headers, **headers}, verify=verify
403 )
404 if not response.ok:
405 raise requests.exceptions.HTTPError(
406 f"Endpoint answered with code {response.status_code} : {url}",
407 response=response,
408 )
409 return response
410 except (
411 requests.ConnectionError,
412 requests.ConnectTimeout,
413 requests.exceptions.HTTPError,
414 ):
415 attempt += 1
416 raise requests.exceptions.HTTPError(f"Unable to download {url}")
418 def download_file(self, url: str, force_refresh=False, headers={}):
419 """
420 Downloads a URL and returns its content.
421 """
422 response = self.get(
423 url, force_refresh=force_refresh or self.force_refresh, headers=headers
424 )
425 content = self.decode_response(response)
426 if content == "" or not content: 426 ↛ 427: line 426 didn't jump to line 427 because the condition on line 426 was never true
427 raise requests.exceptions.HTTPError(response)
428 return content
430 def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
431 """Override this if the content-type headers from the sources are advertising something else than the actual content
432 SASA needs this"""
433 response.encoding = encoding
434 return response.text
436 def add_xissue_into_database(self, xissue: IssueData):
437 xissue.journal = self.collection
439 if xissue.year == "":
440 raise ValueError("Failsafe : Cannot insert issue without a year")
442 xpub = create_publisherdata()
443 xpub.name = self.publisher
444 xissue.publisher = xpub
445 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()
447 attempt = 1
448 success = False
450 while not success and attempt < 4:
451 try:
452 params = {"xissue": xissue, "use_body": False}
453 cmd = xml_cmds.addOrUpdateIssueXmlCmd(params)
454 container = cmd.do()
455 success = True
456 ContainerSource.objects.create(source=self.source, container=container)
457 except SolrError:
458 attempt += 1
459 time.sleep(10)
461 def get_metadata_using_citation_meta(
462 self,
463 xarticle: ArticleData,
464 xissue: IssueData,
465 soup: BeautifulSoup,
466 what: list[CitationLiteral] = [],
467 ):
468 """
469 :param xarticle: the xarticle that will collect the metadata
470 :param xissue: the xissue that will collect the publisher
471 :param soup: the BeautifulSoup object of the article page
472 :param what: list of citation_* meta items to collect.
473 :return: None. The given article is modified
474 """
476 if "title" in what:
477 # TITLE
478 citation_title_node = soup.select_one("meta[name='citation_title']")
479 if citation_title_node: 479 ↛ 484: line 479 didn't jump to line 484 because the condition on line 479 was always true
480 title = citation_title_node.get("content")
481 if isinstance(title, str): 481 ↛ 484: line 481 didn't jump to line 484 because the condition on line 481 was always true
482 xarticle.title_tex = title
484 if "author" in what: 484 ↛ 513line 484 didn't jump to line 513 because the condition on line 484 was always true
485 # AUTHORS
486 citation_author_nodes = soup.select("meta[name^='citation_author']")
487 current_author: ContributorDict | None = None
488 for citation_author_node in citation_author_nodes:
489 if citation_author_node.get("name") == "citation_author":
490 text_author = citation_author_node.get("content")
491 if not isinstance(text_author, str): 491 ↛ 492: line 491 didn't jump to line 492 because the condition on line 491 was never true
492 raise ValueError("Cannot parse author")
493 if text_author == "": 493 ↛ 494: line 493 didn't jump to line 494 because the condition on line 493 was never true
494 current_author = None
495 continue
496 current_author = create_contributor(role="author", string_name=text_author)
497 xarticle.contributors.append(current_author)
498 continue
499 if current_author is None: 499 ↛ 500: line 499 didn't jump to line 500 because the condition on line 499 was never true
500 print("Couldn't parse citation author")
501 continue
502 if citation_author_node.get("name") == "citation_author_institution":
503 text_institution = citation_author_node.get("content")
504 if not isinstance(text_institution, str): 504 ↛ 505: line 504 didn't jump to line 505 because the condition on line 504 was never true
505 continue
506 current_author["addresses"].append(text_institution)
507 if citation_author_node.get("name") == "citation_author_ocrid": 507 ↛ 508: line 507 didn't jump to line 508 because the condition on line 507 was never true
508 text_orcid = citation_author_node.get("content")
509 if not isinstance(text_orcid, str):
510 continue
511 current_author["orcid"] = text_orcid
513 if "pdf" in what:
514 # PDF
515 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
516 if citation_pdf_node:
517 pdf_url = citation_pdf_node.get("content")
518 if isinstance(pdf_url, str): 518 ↛ 521: line 518 didn't jump to line 521 because the condition on line 518 was always true
519 add_pdf_link_to_xarticle(xarticle, pdf_url)
521 if "lang" in what:
522 # LANG
523 citation_lang_node = soup.select_one("meta[name='citation_language']")
524 if citation_lang_node: 524 ↛ 530: line 524 didn't jump to line 530 because the condition on line 524 was always true
525 # TODO: check other language code
526 content_text = citation_lang_node.get("content")
527 if isinstance(content_text, str): 527 ↛ 530: line 527 didn't jump to line 530 because the condition on line 527 was always true
528 xarticle.lang = standardize_tag(content_text)
530 if "abstract" in what:
531 # ABSTRACT
532 abstract_node = soup.select_one("meta[name='citation_abstract']")
533 if abstract_node is not None:
534 abstract = abstract_node.get("content")
535 if not isinstance(abstract, str): 535 ↛ 536: line 535 didn't jump to line 536 because the condition on line 535 was never true
536 raise ValueError("Couldn't parse abstract from meta")
537 abstract = BeautifulSoup(abstract, "html.parser").text
538 lang = abstract_node.get("lang")
539 if not isinstance(lang, str): 539 ↛ 540: line 539 didn't jump to line 540 because the condition on line 539 was never true
540 lang = self.detect_language(abstract, xarticle)
541 xarticle.abstracts.append(
542 {
543 "tag": "abstract",
544 "value_html": "",
545 "value_tex": abstract,
546 "value_xml": "",
547 "lang": lang,
548 }
549 )
551 if "page" in what:
552 # PAGES
553 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
554 if citation_fpage_node:
555 page = citation_fpage_node.get("content")
556 if isinstance(page, str): 556 ↛ 561: line 556 didn't jump to line 561 because the condition on line 556 was always true
557 page = page.split("(")[0]
558 if len(page) < 32: 558 ↛ 561: line 558 didn't jump to line 561 because the condition on line 558 was always true
559 xarticle.fpage = page
561 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
562 if citation_lpage_node:
563 page = citation_lpage_node.get("content")
564 if isinstance(page, str): 564 ↛ 569: line 564 didn't jump to line 569 because the condition on line 564 was always true
565 page = page.split("(")[0]
566 if len(page) < 32: 566 ↛ 569: line 566 didn't jump to line 569 because the condition on line 566 was always true
567 xarticle.lpage = page
569 if "doi" in what:
570 # DOI
571 citation_doi_node = soup.select_one("meta[name='citation_doi']")
572 if citation_doi_node:
573 doi = citation_doi_node.get("content")
574 if isinstance(doi, str): 574 ↛ 581: line 574 didn't jump to line 581 because the condition on line 574 was always true
575 doi = doi.strip()
576 pos = doi.find("10.")
577 if pos > 0:
578 doi = doi[pos:]
579 xarticle.doi = doi
581 if "mr" in what:
582 # MR
583 citation_mr_node = soup.select_one("meta[name='citation_mr']")
584 if citation_mr_node: 584 ↛ 585: line 584 didn't jump to line 585 because the condition on line 584 was never true
585 mr = citation_mr_node.get("content")
586 if isinstance(mr, str):
587 mr = mr.strip()
588 if mr.find("MR") == 0:
589 mr = mr[2:]
590 xarticle.extids.append(("mr-item-id", mr))
592 if "zbl" in what:
593 # ZBL
594 citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
595 if citation_zbl_node:
596 zbl = citation_zbl_node.get("content")
597 if isinstance(zbl, str): 597 ↛ 603: line 597 didn't jump to line 603 because the condition on line 597 was always true
598 zbl = zbl.strip()
599 if zbl.find("Zbl") == 0: 599 ↛ 603line 599 didn't jump to line 603 because the condition on line 599 was always true
600 zbl = zbl[3:].strip()
601 xarticle.extids.append(("zbl-item-id", zbl))
603 if "publisher" in what:
604 # PUBLISHER
605 citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
606 if citation_publisher_node:
607 pub = citation_publisher_node.get("content")
608 if isinstance(pub, str): 608 ↛ 615: line 608 didn't jump to line 615 because the condition on line 608 was always true
609 pub = pub.strip()
610 if pub != "": 610 ↛ 615line 610 didn't jump to line 615 because the condition on line 610 was always true
611 xpub = create_publisherdata()
612 xpub.name = pub
613 xissue.publisher = xpub
615 if "keywords" in what:
616 # KEYWORDS
617 citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
618 for kwd_node in citation_kwd_nodes:
619 kwds = kwd_node.get("content")
620 if isinstance(kwds, str): 620 ↛ 618: line 620 didn't jump to line 618 because the condition on line 620 was always true
621 kwds = kwds.split(",")
622 for kwd in kwds:
623 if kwd == "":
624 continue
625 kwd = kwd.strip()
626 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})
628 if "references" in what:
629 citation_references = soup.select("meta[name='citation_reference']")
630 for index, tag in enumerate(citation_references):
631 content = tag.get("content")
632 if not isinstance(content, str): 632 ↛ 633: line 632 didn't jump to line 633 because the condition on line 632 was never true
633 raise ValueError("Cannot parse citation_reference meta")
634 xarticle.bibitems.append(
635 self.__parse_meta_citation_reference(content, str(index + 1))
636 )
638 def create_xissue(
639 self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1"
640 ):
641 if url is not None and url.endswith("/"):
642 url = url[:-1]
643 xissue = create_issuedata()
644 xissue.url = url
646 xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number)
648 xissue.year = year
650 if volume_number is not None:
651 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number)
653 if issue_number is not None:
654 xissue.number = issue_number.replace(",", "-")
655 return xissue
657 def detect_language(self, text: str, article: ArticleData | None = None):
658 if article and article.lang is not None and article.lang != "und":
659 return article.lang
661 language = self.language_detector.detect_language_of(text)
663 if not language: 663 ↛ 664: line 663 didn't jump to line 664 because the condition on line 663 was never true
664 return "und"
665 return language.iso_code_639_1.name.lower()
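# Illustrative behaviour (expected, not verified): when the article has no usable lang,
# the lingua detector is queried, e.g.
#   self.detect_language("Ceci est un résumé en français.")  # -> "fr"
# and "und" is returned if no language can be detected.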
667 references_mapping = {
668 "citation_title": get_article_title_xml,
669 "citation_journal_title": get_source_xml,
670 "citation_publication_date": get_year_xml,
671 "citation_firstpage": get_fpage_xml,
672 "citation_lastpage": get_lpage_xml,
673 }
675 @classmethod
676 def __parse_meta_citation_reference(cls, content: str, label=None):
677 categories = content.split(";")
679 if len(categories) == 1:
680 return JatsBase.bake_ref(content, label=label)
682 citation_data = [c.split("=") for c in categories if "=" in c]
683 del categories
685 xml_string = ""
686 authors_parsed = False
687 authors_strings = []
688 for data in citation_data:
689 key = data[0].strip()
690 citation_content = data[1]
691 if key == "citation_author":
692 authors_strings.append(get_author_xml(template_str=citation_content))
693 continue
694 elif not authors_parsed:
695 xml_string += ", ".join(authors_strings)
696 authors_parsed = True
698 if key in cls.references_mapping:
699 xml_string += " " + cls.references_mapping[key](citation_content)
701 return JatsBase.bake_ref(xml_string, label=label)
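# Illustrative input (the field values are hypothetical): a citation_reference meta such as
#   "citation_author=Doe, J.; citation_title=On examples; citation_publication_date=1999"
# is split on ";", the citation_author parts are rendered first, the remaining keys are
# rendered through references_mapping, and the result is wrapped by JatsBase.bake_ref.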
703 @classmethod
704 def get_or_create_source(cls):
705 source, created = Source.objects.get_or_create(
706 domain=cls.source_domain,
707 defaults={
708 "name": cls.source_name,
709 "website": cls.source_website,
710 },
711 )
712 if created: 712 ↛ 713: line 712 didn't jump to line 713 because the condition on line 712 was never true
713 source.save()
714 return source
716 @staticmethod
717 def get_issue_pid(
718 collection_id: str,
719 year: str,
720 volume_number: str | None = None,
721 issue_number: str | None = None,
722 ):
723 # Replace any character other than letters, digits and "-" with an underscore
724 pid = f"{collection_id}_{year}"
725 if volume_number is not None:
726 pid += f"_{volume_number}"
727 if issue_number is not None:
728 pid += f"_{issue_number}"
729 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid))
730 return pid
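# Illustrative values (expected; the collection id is hypothetical):
#   get_issue_pid("AMC", "2000", "6", "1")    # -> "AMC_2000_6_1"
#   get_issue_pid("AMC", "1999-2000", "6")    # -> "AMC_1999-2000_6" (hyphens are kept)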
732 @staticmethod
733 def set_pages(article: ArticleData, pages: str, separator: str = "-"):
734 pages_split = pages.split(separator)
735 if len(pages_split) == 0: 735 ↛ 736: line 735 didn't jump to line 736 because the condition on line 735 was never true
736 article.page_range = pages
737 if len(pages_split) > 0: 737 ↛ exit: line 737 didn't return from function 'set_pages' because the condition on line 737 was always true
738 if pages[0].isnumeric():
739 article.fpage = pages_split[0]
740 if (
741 len(pages_split) > 1
742 and pages_split[0] != pages_split[1]
743 and pages_split[1].isnumeric()
744 ):
745 article.lpage = pages_split[1]
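# Illustrative behaviour (expected):
#   set_pages(article, "12-34")   # article.fpage = "12", article.lpage = "34"
#   set_pages(article, "12")      # article.fpage = "12" only
#   set_pages(article, "i-xv")    # non-numeric pages leave fpage/lpage unset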