Coverage for src/crawler/base_crawler.py: 79%
439 statements
coverage.py v7.8.0, created at 2025-04-24 10:35 +0000
1import time
2from collections.abc import Sequence
3from datetime import timedelta
5import regex
6import requests
7from bs4 import BeautifulSoup
8from django.conf import settings
9from django.contrib.auth.models import User
10from django.utils import timezone
11from langcodes import standardize_tag
12from lingua import LanguageDetectorBuilder
13from ptf.cmds import xml_cmds
14from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas
15from ptf.cmds.xml.jats.builder.citation import (
16 get_article_title_xml,
17 get_author_xml,
18 get_fpage_xml,
19 get_lpage_xml,
20 get_source_xml,
21 get_year_xml,
22)
23from ptf.cmds.xml.jats.builder.issue import get_title_xml
24from ptf.cmds.xml.jats.jats_parser import JatsRef, check_bibitem_xml
25from ptf.display.resolver import extids_formats, resolve_id
26from ptf.model_data import (
27 ArticleData,
28 ContributorDict,
29 IssueData,
30 RefData,
31 ResourceData,
32 create_abstract,
33 create_contributor,
34 create_extlink,
35 create_issuedata,
36 create_publisherdata,
37)
38from ptf.model_data_converter import update_data_for_jats
39from pylatexenc.latex2text import LatexNodes2Text
40from pysolr import SolrError
41from requests_cache import CachedSession, FileCache
43from crawler.models import Source
44from crawler.models.container_source import ContainerSource
45from crawler.types import CitationLiteral
46from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection
48# TODO: pass a class factory instead of a dependency to a site
52class BaseCollectionCrawler:
53 """
54 Base class for the collection crawlers.
55 To create a crawler:
56 1) derive a class from BaseCollectionCrawler and name it XXXCrawler
57 2) override the functions parse_collection_content, parse_issue_content and parse_article_content
58 3) update factory.py so that crawler_factory can return your new crawler
59 """
61 source_name = ""
62 source_domain = ""
63 source_website = ""
65 issue_href = ""
67 collection = None
68 source = None
69 user = None
70 session: requests.Session | CachedSession
71 # Updated in constructor with user agent from settings_local
72 headers = {"accept_encoding": "utf-8"}
74 next_allowed_request: float = time.time()
76 # seconds to wait between two http requests
77 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
79 latext_parser = LatexNodes2Text()
81 # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
82 # do not use "$" to surround TeX formulas
83 delimiter_inline_formula = "$"
84 delimiter_disp_formula = "$"
86 # HACK : Workaround for tests (monkeypatching)
87 # We store the class here, so we can monkeypatch it when running tests
88 # subCrawlers = {
89 # LofplCrawler: None
90 # }
91 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}
93 language_detector = LanguageDetectorBuilder.from_all_languages().build()
95 force_refresh = False
97 # Whether to include headers in the requests cache key
98 match_headers = False
99 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"
101 # Set this to False on a per-crawler basis to allow inserting articles without PDFs
102 ignore_missing_pdf = True
104 def __init__(
105 self,
106 *args,
107 username: str,
108 collection_id: str,
109 collection_url: str,
110 test_mode: bool = False,
111 publisher: str = "mathdoc",
112 force_refresh=False,
113 ):
114 for CrawlerClass in self.subCrawlers:
115 self.subCrawlers[CrawlerClass] = CrawlerClass(
116 *args,
117 username=username,
118 collection_id=collection_id,
119 collection_url=collection_url,
120 test_mode=test_mode,
121 publisher=publisher,
122 )
124 self.username = username
126 self.collection_id = collection_id
127 self.collection_url = (
128 collection_url # url of the collection. Ex: https://eudml.org/journal/10098
129 )
131 self.test_mode = test_mode
132 self.publisher = publisher
134 # Skipped when running tests
135 self.initialize()
137 self.session = CachedSession(
138 match_headers=self.match_headers,
139 backend=FileCache(
140 getattr(settings, "REQUESTS_CACHE_LOCATION", "/tmp/ptf_requests_cache"),
141 decode_content=False,
142 ),
143 expire_after=timedelta(days=30),
144 )
145 self.headers.update(
146 {
147 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
148 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
149 }
150 )
152 self.force_refresh = force_refresh
154 def initialize(self):
155 """
156 Acts as a "second" init function to skip model accesses during test data generation
157 """
158 self.collection = get_or_create_collection(self.collection_id)
159 self.source = self.get_or_create_source()
160 self.user = User.objects.get(username=self.username)
162 @classmethod
163 def can_crawl(cls, pid: str) -> bool:
164 return True
166 def parse_collection_content(self, content: str) -> list[IssueData]:
167 """
168 Parses the HTML content with BeautifulSoup
169 and returns a list of xissues.
170 Override this function in a derived class.
171 """
172 return []
174 def parse_issue_content(self, content: str, xissue: IssueData):
175 """
176 Parse the HTML content with BeautifulSoup
177 Fills the xissue.articles
178 Override this function in a derived class.
180 CAVEAT: you are supposed to create the articles here. Please assign a PID to each article.
181 The PID can be `a` + the article index, e.g. `a0`, `a21`.
182 """
184 def parse_article_content(
185 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
186 ) -> ArticleData | None:
187 """
188 Parse the HTML content with BeautifulSoup
189 returns the xarticle.
190 Override this function in a derived class.
191 The xissue is passed to the function in case the article page has issue information (ex: publisher)
192 The article url is also passed as a parameter
194 CAVEAT: you are supposed to assign the article PID here as well.
195 """
196 return xarticle
198 def crawl_collection(self):
199 # TODO: Comments, filter
200 """
201 Crawl an entire collection. ptf.models.Container objects are created.
202 - get the HTML content of the collection_url
203 - parse the HTML content with beautifulsoup to extract the list of issues
204 - merge the xissues (some Source can have multiple pages for 1 volume/issue. We create only 1 container)
205 - crawl each issue if col_only is False
206 - Returns the issues as a dict {pid: xissue}.
208 The key is the pid of the (possibly merged) issues.
209 Ex: the source may have Volume 6 (2000) and Volume 6 (1999);
210 the pid is then made with 1999-2000__6_
211 """
213 if self.source is None:
214 raise RuntimeError("ERROR: the source is not set")
216 content = self.download_file(self.collection_url)
217 xissues = self.parse_collection_content(content)
219 """
220 Some collections split the same volumes in different pages
221 Ex: Volume 6 (2000) and Volume 6 (1999)
222 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
223 """
224 # merged_xissues = self.merge_xissues(xissues)
226 xissues_dict = {str(i.pid): i for i in xissues}
228 return xissues_dict
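    # Usage sketch: once a concrete crawler is registered in factory.py, a collection can be
    # crawled like this (the username, collection id and url below are placeholders):
    #
    #   crawler = XXXCrawler(username="crawler_bot",
    #                        collection_id="XXX",
    #                        collection_url="https://example.org/journal/1")
    #   xissues = crawler.crawl_collection()     # dict {pid: xissue}
    #   for xissue in xissues.values():
    #       crawler.crawl_issue(xissue)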
230 def crawl_issue(self, xissue: IssueData):
231 """
232 Crawl 1 web page of an issue.
233 - get the HTML content of the issue
234 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata
235 - crawl each article
236 """
238 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.
239 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute.
241 issue_url = xissue.url
242 if issue_url is not None:
243 if issue_url.endswith(".pdf"):
244 add_pdf_link_to_xarticle(xissue, issue_url)
245 xissue.url = None
246 else:
247 content = self.download_file(issue_url)
248 self.parse_issue_content(content, xissue)
250 xarticles = xissue.articles
252 parsed_xarticles = []
254 for xarticle in xarticles:
255 parsed_xarticle = self.crawl_article(xarticle, xissue)
256 if parsed_xarticle is not None:
257 parsed_xarticles.append(parsed_xarticle)
259 xissue.articles = parsed_xarticles
261 article_has_pdf = self.article_has_pdf(xissue)
263 if self.ignore_missing_pdf:
264 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)]
266 if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf):
267 self.process_resource_metadata(xissue)
268 self.add_xissue_into_database(xissue)
270 @staticmethod
271 def article_has_source(art: ArticleData | IssueData):
272 return (
273 next(
274 (e_link for e_link in art.ext_links if e_link["rel"] == "source"),
275 None,
276 )
277 is not None
278 )
280 @staticmethod
281 def article_has_pdf(art: ArticleData | IssueData):
282 return (
283 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None)
284 is not None
285 )
287 def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
288 # ARTICLE URL as an ExtLink (to display the link on the article page)
289 if xarticle.url is None:
290 if not self.article_has_source(xarticle):  # ↛ 300: condition was always true
291 if xissue.url:
292 article_source = xissue.url
293 else:
294 article_source = self.collection_url
295 ext_link = create_extlink()
296 ext_link["rel"] = "source"
297 ext_link["location"] = article_source
298 ext_link["metadata"] = self.source_domain
299 xarticle.ext_links.append(ext_link)
300 return self.process_resource_metadata(xarticle)
302 content = self.download_file(xarticle.url)
304 parsed_xarticle = self.parse_article_content(content, xissue, xarticle, xarticle.url)
305 if parsed_xarticle is None:  # ↛ 306: condition was never true
306 return None
308 if parsed_xarticle.doi:
309 parsed_xarticle.pid = (
310 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
311 )
312 else:
313 parsed_xarticle.pid = f"{xissue.pid}_{xarticle.pid}"
315 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
316 ext_link = create_extlink()
317 ext_link["rel"] = "source"
318 ext_link["location"] = parsed_xarticle.url
319 ext_link["metadata"] = self.source_domain
320 parsed_xarticle.ext_links.append(ext_link)
322 # The article title may have formulas surrounded with '$'
323 return self.process_resource_metadata(parsed_xarticle)
325 def process_resource_metadata(self, xresource: ResourceData):
326 # Process title tex
327 html, xml = get_html_and_xml_from_text_with_formulas(
328 xresource.title_tex,
329 delimiter_inline=self.delimiter_inline_formula,
330 delimiter_disp=self.delimiter_disp_formula,
331 )
332 xml = get_title_xml(xml, with_tex_values=False)
333 xresource.title_html = html
334 xresource.title_xml = xml
335 del xml
336 del html
338 # Process trans_title tex
339 html, xml = get_html_and_xml_from_text_with_formulas(
340 xresource.trans_title_tex,
341 delimiter_inline=self.delimiter_inline_formula,
342 delimiter_disp=self.delimiter_disp_formula,
343 )
344 xml = get_title_xml(xml, with_tex_values=False)
345 xresource.trans_title_html = html
346 xresource.trans_title_xml = xml
347 del xml
348 del html
350 abstracts_to_parse = [
351 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract"
352 ]
353 # abstract may have formulas surrounded with '$'
354 if len(abstracts_to_parse) > 0:
355 for xabstract in abstracts_to_parse:
356 html, xml = get_html_and_xml_from_text_with_formulas(
357 xabstract["value_tex"],
358 delimiter_inline=self.delimiter_inline_formula,
359 delimiter_disp=self.delimiter_disp_formula,
360 )
361 xabstract["value_html"] = html
362 lang = xabstract["lang"]
363 if lang == xresource.lang:
364 xabstract["value_xml"] = f'<abstract xml:lang="{lang}">{xml}</abstract>'
365 else:
366 xabstract["value_xml"] = (
367 f'<trans-abstract xml:lang="{lang}">{xml}</trans-abstract>'
368 )
370 if isinstance(xresource, ArticleData):
371 update_data_for_jats(xresource)
372 return xresource
374 def get(self, url: str, force_refresh=False, headers={}):
375 attempt = 0
376 response = None
378 while attempt < 3:
379 # If the URL is already cached, we can skip the rate-limit delay
380 if isinstance(self.session, CachedSession):  # ↛ 385: condition was always true
381 if not self.session.cache.contains(url=url) or force_refresh:
382 delta = self.next_allowed_request - time.time()
383 if delta > 0:
384 time.sleep(delta)
385 self.next_allowed_request = time.time() + self.requests_interval
386 try:
387 # For SSL Errors, use verify=False kwarg
388 verify = True
389 if url.startswith("https://hdml.di.ionio.gr/"):  # ↛ 390: condition was never true
390 verify = False
391 # self.session.cache.delete(urls=[url])
392 if isinstance(self.session, CachedSession):  # ↛ 400: condition was always true
393 response = self.session.get(
394 url,
395 headers={**self.headers, **headers},
396 verify=verify,
397 force_refresh=force_refresh,
398 )
399 else:
400 response = self.session.get(
401 url, headers={**self.headers, **headers}, verify=verify
402 )
403 if not response.ok:
404 raise requests.exceptions.HTTPError(
405 f"Endpoint answered with code {response.status_code} : {url}",
406 response=response,
407 )
408 return response
409 except (
410 requests.ConnectionError,
411 requests.ConnectTimeout,
412 requests.exceptions.HTTPError,
413 ):
414 attempt += 1
415 raise requests.exceptions.HTTPError(f"Unable to download {url}")
417 def download_file(self, url: str, force_refresh=False, headers={}):
418 """
419 Downloads a URL and returns its content (responses are cached on disk by the CachedSession).
420 """
421 response = self.get(
422 url, force_refresh=force_refresh or self.force_refresh, headers=headers
423 )
424 content = self.decode_response(response)
425 if content == "" or not content:  # ↛ 426: condition was never true
426 raise requests.exceptions.HTTPError(response)
427 return content
429 def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
430 """Override this if the source's content-type headers advertise an encoding different from the actual content.
431 SASA needs this"""
432 response.encoding = encoding
433 return response.text
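    # Example override (sketch): a source that serves windows-1250 pages while advertising
    # utf-8 could force the charset instead (the encoding here is only an illustration):
    #
    #   def decode_response(self, response, encoding="windows-1250"):
    #       return super().decode_response(response, encoding)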
435 def add_xissue_into_database(self, xissue: IssueData):
436 xissue.journal = self.collection
438 if xissue.year == "":
439 raise ValueError("Failsafe : Cannot insert issue without a year")
441 xpub = create_publisherdata()
442 xpub.name = self.publisher
443 xissue.publisher = xpub
444 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()
446 attempt = 1
447 success = False
449 while not success and attempt < 4:
450 try:
451 params = {"xissue": xissue, "use_body": False}
452 cmd = xml_cmds.addOrUpdateIssueXmlCmd(params)
453 container = cmd.do()
454 success = True
455 ContainerSource.objects.create(source=self.source, container=container)
456 except SolrError:
457 attempt += 1
458 time.sleep(10)
460 def get_metadata_using_citation_meta(
461 self,
462 xarticle: ArticleData,
463 xissue: IssueData,
464 soup: BeautifulSoup,
465 what: list[CitationLiteral] = [],
466 ):
467 """
468 :param xarticle: the xarticle that will collect the metadata
469 :param xissue: the xissue that will collect the publisher
470 :param soup: the BeautifulSoup object of the article page
471 :param what: list of citation_* meta items to collect.
472 :return: None. The given article is modified
473 """
475 if "title" in what:
476 # TITLE
477 citation_title_node = soup.select_one("meta[name='citation_title']")
478 if citation_title_node:  # ↛ 483: condition was always true
479 title = citation_title_node.get("content")
480 if isinstance(title, str):  # ↛ 483: condition was always true
481 xarticle.title_tex = title
483 if "author" in what:  # ↛ 512: condition was always true
484 # AUTHORS
485 citation_author_nodes = soup.select("meta[name^='citation_author']")
486 current_author: ContributorDict | None = None
487 for citation_author_node in citation_author_nodes:
488 if citation_author_node.get("name") == "citation_author":
489 text_author = citation_author_node.get("content")
490 if not isinstance(text_author, str):  # ↛ 491: condition was never true
491 raise ValueError("Cannot parse author")
492 if text_author == "":  # ↛ 493: condition was never true
493 current_author = None
494 continue
495 current_author = create_contributor(role="author", string_name=text_author)
496 xarticle.contributors.append(current_author)
497 continue
498 if current_author is None:  # ↛ 499: condition was never true
499 print("Couldn't parse citation author")
500 continue
501 if citation_author_node.get("name") == "citation_author_institution":
502 text_institution = citation_author_node.get("content")
503 if not isinstance(text_institution, str):  # ↛ 504: condition was never true
504 continue
505 current_author["addresses"].append(text_institution)
506 if citation_author_node.get("name") == "citation_author_ocrid":  # ↛ 507: condition was never true
507 text_orcid = citation_author_node.get("content")
508 if not isinstance(text_orcid, str):
509 continue
510 current_author["orcid"] = text_orcid
512 if "pdf" in what:
513 # PDF
514 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
515 if citation_pdf_node:
516 pdf_url = citation_pdf_node.get("content")
517 if isinstance(pdf_url, str):  # ↛ 520: condition was always true
518 add_pdf_link_to_xarticle(xarticle, pdf_url)
520 if "lang" in what:
521 # LANG
522 citation_lang_node = soup.select_one("meta[name='citation_language']")
523 if citation_lang_node:  # ↛ 529: condition was always true
524 # TODO: check other language code
525 content_text = citation_lang_node.get("content")
526 if isinstance(content_text, str):  # ↛ 529: condition was always true
527 xarticle.lang = standardize_tag(content_text)
529 if "abstract" in what:
530 # ABSTRACT
531 abstract_node = soup.select_one("meta[name='citation_abstract']")
532 if abstract_node is not None:
533 abstract = abstract_node.get("content")
534 if not isinstance(abstract, str):  # ↛ 535: condition was never true
535 raise ValueError("Couldn't parse abstract from meta")
536 abstract = BeautifulSoup(abstract, "html.parser").text
537 lang = abstract_node.get("lang")
538 if not isinstance(lang, str):  # ↛ 539: condition was never true
539 lang = self.detect_language(abstract, xarticle)
540 xarticle.abstracts.append(
541 {
542 "tag": "abstract",
543 "value_html": "",
544 "value_tex": abstract,
545 "value_xml": "",
546 "lang": lang,
547 }
548 )
550 if "page" in what:
551 # PAGES
552 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
553 if citation_fpage_node:
554 page = citation_fpage_node.get("content")
555 if isinstance(page, str):  # ↛ 560: condition was always true
556 page = page.split("(")[0]
557 if len(page) < 32:  # ↛ 560: condition was always true
558 xarticle.fpage = page
560 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
561 if citation_lpage_node:
562 page = citation_lpage_node.get("content")
563 if isinstance(page, str):  # ↛ 568: condition was always true
564 page = page.split("(")[0]
565 if len(page) < 32:  # ↛ 568: condition was always true
566 xarticle.lpage = page
568 if "doi" in what:
569 # DOI
570 citation_doi_node = soup.select_one("meta[name='citation_doi']")
571 if citation_doi_node:
572 doi = citation_doi_node.get("content")
573 if isinstance(doi, str):  # ↛ 580: condition was always true
574 doi = doi.strip()
575 pos = doi.find("10.")
576 if pos > 0:
577 doi = doi[pos:]
578 xarticle.doi = doi
580 if "mr" in what:
581 # MR
582 citation_mr_node = soup.select_one("meta[name='citation_mr']")
583 if citation_mr_node:  # ↛ 584: condition was never true
584 mr = citation_mr_node.get("content")
585 if isinstance(mr, str):
586 mr = mr.strip()
587 if mr.find("MR") == 0:
588 mr = mr[2:]
589 xarticle.extids.append(("mr-item-id", mr))
591 if "zbl" in what:
592 # ZBL
593 citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
594 if citation_zbl_node:
595 zbl = citation_zbl_node.get("content")
596 if isinstance(zbl, str):  # ↛ 602: condition was always true
597 zbl = zbl.strip()
598 if zbl.find("Zbl") == 0:  # ↛ 602: condition was always true
599 zbl = zbl[3:].strip()
600 xarticle.extids.append(("zbl-item-id", zbl))
602 if "publisher" in what:
603 # PUBLISHER
604 citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
605 if citation_publisher_node:
606 pub = citation_publisher_node.get("content")
607 if isinstance(pub, str):  # ↛ 614: condition was always true
608 pub = pub.strip()
609 if pub != "":  # ↛ 614: condition was always true
610 xpub = create_publisherdata()
611 xpub.name = pub
612 xissue.publisher = xpub
614 if "keywords" in what:
615 # KEYWORDS
616 citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
617 for kwd_node in citation_kwd_nodes:
618 kwds = kwd_node.get("content")
619 if isinstance(kwds, str):  # ↛ 617: condition was always true
620 kwds = kwds.split(",")
621 for kwd in kwds:
622 if kwd == "":
623 continue
624 kwd = kwd.strip()
625 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})
627 if "references" in what:
628 citation_references = soup.select("meta[name='citation_reference']")
629 for index, tag in enumerate(citation_references):
630 content = tag.get("content")
631 if not isinstance(content, str):  # ↛ 632: condition was never true
632 raise ValueError("Cannot parse citation_reference meta")
633 xarticle.bibitems.append(
634 self.__parse_meta_citation_reference(content, str(index + 1))
635 )
637 def create_xissue(
638 self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1"
639 ):
640 if url is not None and url.endswith("/"):
641 url = url[:-1]
642 xissue = create_issuedata()
643 xissue.url = url
645 xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number)
647 xissue.year = year
649 if volume_number is not None:  # ↛ 652: condition was always true
650 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number)
652 if issue_number is not None:
653 xissue.number = issue_number.replace(",", "-")
654 return xissue
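    # Example (hypothetical values): with collection_id "XXX",
    #   create_xissue("https://example.org/vol6/", "2000", "6", "1")
    # strips the trailing "/" from the url and returns an xissue with pid "XXX_2000_6_1",
    # volume "6" and number "1".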
656 def detect_language(self, text: str, article: ArticleData | None = None):
657 if article and article.lang is not None and article.lang != "und":
658 return article.lang
660 language = self.language_detector.detect_language_of(text)
662 if not language:  # ↛ 663: condition was never true
663 return "und"
664 return language.iso_code_639_1.name.lower()
666 references_mapping = {
667 "citation_title": get_article_title_xml,
668 "citation_journal_title": get_source_xml,
669 "citation_publication_date": get_year_xml,
670 "citation_firstpage": get_fpage_xml,
671 "citation_lastpage": get_lpage_xml,
672 }
674 @classmethod
675 def __parse_meta_citation_reference(cls, content: str, label=None):
676 categories = content.split(";")
678 if len(categories) == 1:
679 return cls.create_crawled_bibitem(content, label=label)
681 citation_data = [c.split("=") for c in categories if "=" in c]
682 del categories
684 xml_string = ""
685 authors_parsed = False
686 authors_strings = []
687 for data in citation_data:
688 key = data[0].strip()
689 citation_content = data[1]
690 if key == "citation_author":
691 authors_strings.append(get_author_xml(template_str=citation_content))
692 continue
693 elif not authors_parsed:
694 xml_string += ", ".join(authors_strings)
695 authors_parsed = True
697 if key in cls.references_mapping:
698 xml_string += " " + cls.references_mapping[key](citation_content)
700 return cls.create_crawled_bibitem(xml_string, label=label)
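    # Example input (sketch) for __parse_meta_citation_reference, as found in Highwire-style
    # "citation_reference" meta tags (the bibliographic values are made up):
    #
    #   "citation_author=J. Doe; citation_title=On examples; citation_journal_title=Ex. J.;
    #    citation_publication_date=2000; citation_firstpage=1; citation_lastpage=10"
    #
    # Each key=value pair is converted through references_mapping (authors via get_author_xml)
    # and the result is wrapped in a <mixed-citation> by create_crawled_bibitem. A content
    # string without ";" is kept verbatim as the citation text.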
702 @classmethod
703 def get_or_create_source(cls):
704 source, created = Source.objects.get_or_create(
705 domain=cls.source_domain,
706 defaults={
707 "name": cls.source_name,
708 "website": cls.source_website,
709 },
710 )
711 if created:  # ↛ 712: condition was never true
712 source.save()
713 return source
715 @staticmethod
716 def create_crawled_bibitem(ref_value: str | JatsRef, label=None):
717 if isinstance(ref_value, str):
718 xref = RefData(lang="en")
719 value_xml = ""
720 if label:
721 value_xml += f"<label>{label}</label>"
722 # xref.citation_tex = "".join([e["value_tex"] for e in elements])
723 value_xml += f'<mixed-citation xml:space="preserve">{ref_value}</mixed-citation>'
724 xref.citation_xml = value_xml
725 else:
726 xref = ref_value
728 xref = check_bibitem_xml(xref)
730 # Bakes extlink badges into the bibliography html
731 # Maybe we should put this into another file (jats_parser ?)
732 for extid in xref.extids:
733 href = resolve_id(extid[0], extid[1])
734 if (not href) or (not xref.citation_html):  # ↛ 735: condition was never true
735 continue
736 str_format = extid[0]
737 if str_format in extids_formats:  # ↛ 739: condition was always true
738 str_format = extids_formats[str_format]
739 xref.citation_html += f" | <a href={href} class='badge bg-secondary rounded-pill ref-badge extid-badge'>{str_format}</a>"
741 return xref
743 @staticmethod
744 def create_bibliography(bibitems: Sequence[RefData]):
745 xml_str = "<ref-list>\n"
746 html_str = "<div>\n"
748 for item in bibitems:
749 xml_str += f"\t{item.citation_xml}\n"
750 html_str += f"\t<p>{item.citation_html}</p>\n"
751 xml_str += "</ref-list>"
753 # for item in bibitems:
754 # html_str =
755 # html_str += f"\t<p>{item.citation_html}</p>\n"
756 html_str += "</div>"
758 tex_str = "<div>\n"
759 for item in bibitems:
760 tex_str += f"\t<p>{item.citation_tex}</p>\n"
761 tex_str += "</div>"
763 biblio_dict = create_abstract(
764 tag="biblio",
765 value_html=html_str,
766 value_tex=tex_str,
767 value_xml=xml_str,
768 lang="en",
769 )
771 return biblio_dict
773 @staticmethod
774 def get_issue_pid(
775 collection_id: str,
776 year: str,
777 volume_number: str | None = None,
778 issue_number: str | None = None,
779 ):
780 # Replace any non-word character with an underscore
781 pid = f"{collection_id}_{year}"
782 if volume_number is not None:  # ↛ 784: condition was always true
783 pid += f"_{volume_number}"
784 if issue_number is not None:
785 pid += f"_{issue_number}"
786 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid))
787 return pid
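    # Worked examples (hypothetical collection id; assumes cleanup_str leaves plain ASCII
    # unchanged):
    #   get_issue_pid("XXX", "1999-2000", "6")        -> "XXX_1999-2000_6"
    #   get_issue_pid("XXX", "2000", "6 (2)", "1")    -> "XXX_2000_6_2_1"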
789 @staticmethod
790 def set_pages(article: ArticleData, pages: str, separator: str = "-"):
791 pages_split = pages.split(separator)
792 if len(pages_split) == 0:  # ↛ 793: condition was never true
793 article.page_range = pages
794 if len(pages_split) > 0:  # ↛ exit: condition was always true
795 if pages[0].isnumeric():
796 article.fpage = pages_split[0]
797 if (
798 len(pages_split) > 1
799 and pages_split[0] != pages_split[1]
800 and pages_split[1].isnumeric()
801 ):
802 article.lpage = pages_split[1]
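    # Worked examples (sketch):
    #   set_pages(article, "12-34")  -> article.fpage = "12", article.lpage = "34"
    #   set_pages(article, "12")     -> article.fpage = "12" only
    #   set_pages(article, "i-xv")   -> nothing is set, because the first character is not numeric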