Coverage for src/crawler/base_crawler.py: 79%
433 statements
coverage.py v7.6.4, created at 2025-02-14 14:36 +0000
1import time
2from collections.abc import Sequence
3from datetime import timedelta
5import regex
6import requests
7from bs4 import BeautifulSoup
8from django.conf import settings
9from django.contrib.auth.models import User
10from django.utils import timezone
11from langcodes import standardize_tag
12from lingua import LanguageDetector, LanguageDetectorBuilder
13from ptf.cmds import xml_cmds
14from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas
15from ptf.cmds.xml.jats.builder.citation import (
16 get_article_title_xml,
17 get_author_xml,
18 get_fpage_xml,
19 get_lpage_xml,
20 get_source_xml,
21 get_year_xml,
22)
23from ptf.cmds.xml.jats.builder.issue import get_title_xml
24from ptf.cmds.xml.jats.jats_parser import JatsRef, check_bibitem_xml
25from ptf.display.resolver import extids_formats, resolve_id
26from ptf.model_data import (
27 ArticleData,
28 ContributorDict,
29 IssueData,
30 RefData,
31 create_abstract,
32 create_contributor,
33 create_extlink,
34 create_issuedata,
35 create_publisherdata,
36)
37from ptf.model_data_converter import update_data_for_jats
38from pylatexenc.latex2text import LatexNodes2Text
39from pysolr import SolrError
40from requests_cache import CachedSession, FileCache
42from crawler.models import Periode, Source
43from crawler.types import CitationLiteral
44from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection
46# TODO: pass a class factory instead of a dependency to a site
50class BaseCollectionCrawler:
51 """
52 Base collection for the crawlers.
53 To create a crawler:
54 1) derive a class from BaseCollectionCrawler and name it XXXCrawler
55 2) override the functions parse_collection_content, parse_issue_content and parse_article_content
56 3) update factory.py so that crawler_factory can return your new crawler
57 """
59 source_name = ""
60 source_domain = ""
61 source_website = ""
63 periode_begin: int = 0
64 periode_end: int = 9999
66 issue_href = ""
68 collection = None
69 source = None
70 periode = None
71 user = None
72 session: requests.Session | CachedSession
73 # Updated in constructor with user agent from settings_local
74 headers = {"accept_encoding": "utf-8"}
76 next_allowed_request: float = time.time()
78 # seconds to wait between two http requests
79 requests_interval = 5
81 latext_parser = LatexNodes2Text()
83 # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
84 # do not use the "$" to surround tex formulas
85 delimiter_inline_formula = "$"
86 delimiter_disp_formula = "$"
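# A minimal sketch of overriding these delimiters in a concrete crawler, assuming a source
# that surrounds display formulas with "$$" instead of "$" (values are illustrative):
#
# class SomeCrawler(BaseCollectionCrawler):
#     delimiter_inline_formula = "$"
#     delimiter_disp_formula = "$$"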
88 # HACK : Workaround for tests (monkeypatching)
89 # We store the class here, so we can monkeypatch it when running tests
90 # subCrawlers = {
91 # LofplCrawler: None
92 # }
93 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}
95 language_detector: LanguageDetector
97 force_refresh = False
99 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"
101 def __init__(
102 self,
103 *args,
104 username: str,
105 collection_id: str,
106 collection_url: str,
107 test_mode: bool = False,
108 publisher: str = "mathdoc",
109 force_refresh=False,
110 ):
111 for CrawlerClass in self.subCrawlers:
112 self.subCrawlers[CrawlerClass] = CrawlerClass(
113 *args,
114 username=username,
115 collection_id=collection_id,
116 collection_url=collection_url,
117 test_mode=test_mode,
118 publisher=publisher,
119 )
121 self.username = username
123 self.collection_id = collection_id
124 self.collection_url = (
125 collection_url # url of the collection. Ex: https://eudml.org/journal/10098
126 )
128 self.test_mode = test_mode
129 self.publisher = publisher
131 # EUDML sets or creates the Periode based on the <meta name="citation_year"> found in the journal page
132 # AMP sets or creates the Periode during the __init__
133 # TODO: see with other sources when to create the Periode
134 self.periode = None
135 self.periode_first_issue = None
136 self.periode_last_issue = None
138 self.language_detector = LanguageDetectorBuilder.from_all_languages().build()
140 # Skipped when running tests
141 self.initialize()
143 self.session = CachedSession(
144 backend=FileCache(
145 getattr(settings, "REQUESTS_CACHE_LOCATION", "/tmp/ptf_requests_cache"),
146 decode_content=False,
147 ),
148 expire_after=timedelta(days=30),
149 )
150 self.headers.update(
151 {
152 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
153 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
154 }
155 )
157 self.force_refresh = force_refresh
159 def initialize(self):
160 """
161 Acts as a "second" init function to skip model accesses during test data generation
162 """
163 self.collection = get_or_create_collection(self.collection_id)
164 self.source = self.get_or_create_source()
165 self.periode = self.get_or_create_periode()
166 self.user = User.objects.get(username=self.username)
168 def parse_collection_content(self, content: str) -> list[IssueData]:
169 """
170 Parse the HTML content with BeautifulSoup
171 returns a list of xissue.
172 Override this function in a derived class
173 """
174 return []
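# A hedged sketch of an override, assuming the collection page lists its issues as links
# such as <a class="issue" href="/issue/5">2001, Volume 5</a> (markup and selector are
# hypothetical):
#
# def parse_collection_content(self, content: str) -> list[IssueData]:
#     soup = BeautifulSoup(content, "html.parser")
#     xissues = []
#     for link in soup.select("a.issue"):
#         year, volume = cleanup_str(link.text).split(", Volume ")
#         xissues.append(self.create_xissue(str(link.get("href")), year, volume, None))
#     return xissues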
176 def parse_issue_content(self, content: str, xissue: IssueData):
177 """
178 Parse the HTML content with BeautifulSoup
179 Fills the xissue.articles
180 Override this function in a derived class.
182 Caveat: you are supposed to create the articles here. Please assign a PID to each article.
183 The PID can be `a` + article_index, e.g. `a0`, `a21`
184 """
186 def parse_article_content(
187 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str, pid: str
188 ) -> ArticleData | None:
189 """
190 Parse the HTML content with BeautifulSoup
191 returns the xarticle.
192 Override this function in a derived class.
193 The xissue is passed to the function in case the article page has issue information (ex: publisher)
194 The article url is also passed as a parameter
196 Caveat: you are supposed to assign the article PID again here
197 """
198 xarticle.pid = pid
199 return xarticle
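# A hedged sketch of an override that delegates most of the work to
# get_metadata_using_citation_meta() defined below (the `what` list is illustrative):
#
# def parse_article_content(self, content, xissue, xarticle, url, pid):
#     xarticle.pid = pid
#     soup = BeautifulSoup(content, "html.parser")
#     self.get_metadata_using_citation_meta(
#         xarticle, xissue, soup, ["title", "author", "pdf", "lang", "abstract"]
#     )
#     return xarticle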
201 def crawl_collection(self):
202 # TODO: Comments, filter
203 """
204 Crawl an entire collection. ptf.models.Container objects are created.
205 - get the HTML content of the collection_url
206 - parse the HTML content with beautifulsoup to extract the list of issues
207 - merge the xissues (some Source can have multiple pages for 1 volume/issue. We create only 1 container)
208 - crawl each issue if col_only is False
209 - Returns the list of merged issues.
210 It is a dict {pid: xissue}
211 The key is the pid of the merged issues.
212 Ex: the source may have Volume 6 (2000) and Volume 6 (1999);
213 the pid is then made with 1999-2000__6_
214 """
216 if self.source is None:
217 raise RuntimeError("ERROR: the source is not set")
219 content = self.download_file(self.collection_url)
220 xissues = self.parse_collection_content(content)
222 # xissues = [
223 # issue
224 # for issue in xissues
225 # if int(issue.year) >= self.periode_begin and int(issue.year) <= self.periode_end
226 # ]
228 """
229 Some collections split the same volumes in different pages
230 Ex: Volume 6 (2000) and Volume 6 (1999)
231 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
232 """
233 # merged_xissues = self.merge_xissues(xissues)
235 xissues_dict = {str(i.pid): i for i in xissues}
237 return xissues_dict
239 def crawl_issue(self, xissue: IssueData):
240 """
241 Crawl one web page of an issue.
242 - get the HTML content of the issue
243 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata
244 - crawl each article
245 """
247 # Some sources, like EuDML, do not have separate HTML pages for an issue's table of contents.
248 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute
250 issue_url = xissue.url
251 if issue_url is not None:
252 if issue_url.endswith(".pdf"):
253 add_pdf_link_to_xarticle(xissue, issue_url)
254 xissue.url = None
255 else:
256 content = self.download_file(issue_url)
257 self.parse_issue_content(content, xissue)
259 xarticles = xissue.articles
261 parsed_xarticles = []
263 for xarticle in xarticles:
264 parsed_xarticle = self.crawl_article(xarticle, xissue)
265 if parsed_xarticle is not None:
266 parsed_xarticles.append(parsed_xarticle)
268 xissue.articles = parsed_xarticles
270 article_has_pdf = (
271 next((link["mimetype"] == "application/pdf" for link in xissue.ext_links), None)
272 is not None
273 )
274 if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf):
275 self.add_xissue_into_database(xissue)
277 def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
278 # TODO : set pid in xarticle here instead of passing it to `parse_article_content`
279 def article_has_source(art):
280 return (
281 next(
282 (e_link for e_link in art.ext_links if e_link["rel"] == "source"),
283 None,
284 )
285 is not None
286 )
288 # ARTICLE URL as an ExtLink (to display the link in the article page)
289 if xarticle.url is None:
290 if not article_has_source(xarticle):
291 if xissue.url:
292 article_source = xissue.url
293 else:
294 article_source = self.collection_url
295 ext_link = create_extlink()
296 ext_link["rel"] = "source"
297 ext_link["location"] = article_source
298 ext_link["metadata"] = self.source_domain
299 xarticle.ext_links.append(ext_link)
300 return self.process_article_metadata(xarticle)
302 content = self.download_file(xarticle.url)
303 pid = f"{xissue.pid}_{xarticle.pid}"
305 parsed_xarticle = self.parse_article_content(content, xissue, xarticle, xarticle.url, pid)
306 if parsed_xarticle is None:
307 return None
309 if not article_has_source(parsed_xarticle) and parsed_xarticle.url:
310 ext_link = create_extlink()
311 ext_link["rel"] = "source"
312 ext_link["location"] = parsed_xarticle.url
313 ext_link["metadata"] = self.source_domain
314 parsed_xarticle.ext_links.append(ext_link)
316 # The article title may have formulas surrounded with '$'
317 return self.process_article_metadata(parsed_xarticle)
319 def process_article_metadata(self, xarticle: ArticleData):
320 html, xml = get_html_and_xml_from_text_with_formulas(
321 xarticle.title_tex,
322 delimiter_inline=self.delimiter_inline_formula,
323 delimiter_disp=self.delimiter_disp_formula,
324 )
325 xml = get_title_xml(xml, with_tex_values=False)
326 xarticle.title_html = html
327 xarticle.title_xml = xml
329 abstracts_to_parse = [
330 xabstract for xabstract in xarticle.abstracts if xabstract["tag"] == "abstract"
331 ]
332 # abstract may have formulas surrounded with '$'
333 if len(abstracts_to_parse) > 0:
334 for xabstract in abstracts_to_parse:
335 html, xml = get_html_and_xml_from_text_with_formulas(
336 xabstract["value_tex"],
337 delimiter_inline=self.delimiter_inline_formula,
338 delimiter_disp=self.delimiter_disp_formula,
339 )
340 xabstract["value_html"] = html
341 lang = xabstract["lang"]
342 if lang == xarticle.lang:
343 xabstract["value_xml"] = f'<abstract xml:lang="{lang}">{xml}</abstract>'
344 else:
345 xabstract[
346 "value_xml"
347 ] = f'<trans-abstract xml:lang="{lang}">{xml}</trans-abstract>'
349 update_data_for_jats(xarticle)
351 return xarticle
353 def get(self, url: str, force_refresh=False):
354 attempt = 0
355 response = None
357 while attempt < 3:
358 # If the URL is already in the cache, we can skip the rate-limiting delay
359 if isinstance(self.session, CachedSession):
360 if not self.session.cache.contains(url=url):
361 delta = self.next_allowed_request - time.time()
362 if delta > 0:
363 time.sleep(delta)
364 self.next_allowed_request = time.time() + self.requests_interval
365 try:
366 # For SSL Errors, use verify=False kwarg
367 verify = True
368 if url.startswith("https://hdml.di.ionio.gr/"):
369 verify = False
370 # self.session.cache.delete(urls=[url])
371 if isinstance(self.session, CachedSession):
372 response = self.session.get(
373 url, headers=self.headers, verify=verify, force_refresh=force_refresh
374 )
375 else:
376 response = self.session.get(url, headers=self.headers, verify=verify)
377 if not response.ok:
378 raise requests.exceptions.HTTPError(
379 f"Endpoint answered with code {response.status_code} : {url}",
380 response=response,
381 )
382 return response
383 except (
384 requests.ConnectionError,
385 requests.ConnectTimeout,
386 requests.exceptions.HTTPError,
387 ):
388 attempt += 1
389 raise requests.exceptions.HTTPError(f"Unable to download {url}")
391 def download_file(self, url: str, force_refresh=False):
392 """
393 Downloads a URL (the cached session stores the response on disk) and returns its content.
394 """
395 response = self.get(url, force_refresh=force_refresh or self.force_refresh)
396 content = self.decode_response(response)
397 if content == "" or not content:
398 raise requests.exceptions.HTTPError(response)
399 return content
401 def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
402 """Override this if the content-type headers from the sources are advertising something else than the actual content
403 SASA needs this"""
404 response.encoding = encoding
405 return response.text
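# A hedged sketch of such an override in a derived crawler (the windows-1250 value is
# purely illustrative, not the encoding of any particular source):
#
# def decode_response(self, response: requests.Response, encoding: str = "windows-1250"):
#     return super().decode_response(response, encoding)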
407 def add_xissue_into_database(self, xissue: IssueData):
408 xissue.journal = self.collection
410 xpub = create_publisherdata()
411 xpub.name = self.publisher
412 xissue.publisher = xpub
413 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()
415 attempt = 1
416 success = False
418 while not success and attempt < 4:
419 try:
420 params = {"xissue": xissue, "use_body": False}
421 cmd = xml_cmds.addOrUpdateIssueXmlCmd(params)
422 cmd.do()
423 success = True
424 except SolrError:
425 attempt += 1
426 time.sleep(10)
428 def get_metadata_using_citation_meta(
429 self,
430 xarticle: ArticleData,
431 xissue: IssueData,
432 soup: BeautifulSoup,
433 what: list[CitationLiteral] = [],
434 ):
435 """
436 :param xarticle: the xarticle that will collect the metadata
437 :param xissue: the xissue that will collect the publisher
438 :param soup: the BeautifulSoup object of the article page
439 :param what: list of citation_* meta items to collect.
440 :return: None. The given article is modified
441 """
443 if "title" in what:
444 # TITLE
445 citation_title_node = soup.select_one("meta[name='citation_title']")
446 if citation_title_node:
447 title = citation_title_node.get("content")
448 if isinstance(title, str):
449 xarticle.title_tex = title
451 if "author" in what: 451 ↛ 480line 451 didn't jump to line 480 because the condition on line 451 was always true
452 # AUTHORS
453 citation_author_nodes = soup.select("meta[name^='citation_author']")
454 current_author: ContributorDict | None = None
455 for citation_author_node in citation_author_nodes:
456 if citation_author_node.get("name") == "citation_author":
457 text_author = citation_author_node.get("content")
458 if not isinstance(text_author, str):
459 raise ValueError("Cannot parse author")
460 if text_author == "":
461 current_author = None
462 continue
463 current_author = create_contributor(role="author", string_name=text_author)
464 xarticle.contributors.append(current_author)
465 continue
466 if current_author is None:
467 print("Couldn't parse citation author")
468 continue
469 if citation_author_node.get("name") == "citation_author_institution":
470 text_institution = citation_author_node.get("content")
471 if not isinstance(text_institution, str):
472 continue
473 current_author["addresses"].append(text_institution)
474 if citation_author_node.get("name") == "citation_author_orcid":
475 text_orcid = citation_author_node.get("content")
476 if not isinstance(text_orcid, str):
477 continue
478 current_author["orcid"] = text_orcid
480 if "pdf" in what: 480 ↛ 488line 480 didn't jump to line 488 because the condition on line 480 was always true
481 # PDF
482 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
483 if citation_pdf_node:
484 pdf_url = citation_pdf_node.get("content")
485 if isinstance(pdf_url, str):
486 add_pdf_link_to_xarticle(xarticle, pdf_url)
488 if "lang" in what:
489 # LANG
490 citation_lang_node = soup.select_one("meta[name='citation_language']")
491 if citation_lang_node:
492 # TODO: check other language code
493 content_text = citation_lang_node.get("content")
494 if isinstance(content_text, str):
495 xarticle.lang = standardize_tag(content_text)
497 if "abstract" in what:
498 # ABSTRACT
499 abstract_node = soup.select_one("meta[name='citation_abstract']")
500 if abstract_node is not None:
501 abstract = abstract_node.get("content")
502 if not isinstance(abstract, str):
503 raise ValueError("Couldn't parse abstract from meta")
504 abstract = BeautifulSoup(abstract, "html.parser").text
505 lang = abstract_node.get("lang")
506 if not isinstance(lang, str):
507 lang = self.detect_language(abstract, xarticle)
508 xarticle.abstracts.append(
509 {
510 "tag": "abstract",
511 "value_html": "",
512 "value_tex": abstract,
513 "value_xml": "",
514 "lang": lang,
515 }
516 )
518 if "page" in what:
519 # PAGES
520 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
521 if citation_fpage_node:
522 page = citation_fpage_node.get("content")
523 if isinstance(page, str):
524 page = page.split("(")[0]
525 if len(page) < 32:
526 xarticle.fpage = page
528 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
529 if citation_lpage_node:
530 page = citation_lpage_node.get("content")
531 if isinstance(page, str):
532 page = page.split("(")[0]
533 if len(page) < 32:
534 xarticle.lpage = page
536 if "doi" in what:
537 # DOI
538 citation_doi_node = soup.select_one("meta[name='citation_doi']")
539 if citation_doi_node:
540 doi = citation_doi_node.get("content")
541 if isinstance(doi, str):
542 doi = doi.strip()
543 pos = doi.find("10.")
544 if pos > 0:
545 doi = doi[pos:]
546 xarticle.doi = doi
547 xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")
549 if "mr" in what:
550 # MR
551 citation_mr_node = soup.select_one("meta[name='citation_mr']")
552 if citation_mr_node:
553 mr = citation_mr_node.get("content")
554 if isinstance(mr, str):
555 mr = mr.strip()
556 if mr.find("MR") == 0:
557 mr = mr[2:]
558 xarticle.extids.append(("mr-item-id", mr))
560 if "zbl" in what:
561 # ZBL
562 citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
563 if citation_zbl_node:
564 zbl = citation_zbl_node.get("content")
565 if isinstance(zbl, str):
566 zbl = zbl.strip()
567 if zbl.find("Zbl") == 0:
568 zbl = zbl[3:].strip()
569 xarticle.extids.append(("zbl-item-id", zbl))
571 if "publisher" in what:
572 # PUBLISHER
573 citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
574 if citation_publisher_node:
575 pub = citation_publisher_node.get("content")
576 if isinstance(pub, str):
577 pub = pub.strip()
578 if pub != "":
579 xpub = create_publisherdata()
580 xpub.name = pub
581 xissue.publisher = xpub
583 if "keywords" in what:
584 # KEYWORDS
585 citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
586 for kwd_node in citation_kwd_nodes:
587 kwds = kwd_node.get("content")
588 if isinstance(kwds, str):
589 kwds = kwds.split(",")
590 for kwd in kwds:
591 if kwd == "":
592 continue
593 kwd = kwd.strip()
594 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})
596 if "references" in what:
597 citation_references = soup.select("meta[name='citation_reference']")
598 for index, tag in enumerate(citation_references):
599 content = tag.get("content")
600 if not isinstance(content, str):
601 raise ValueError("Cannot parse citation_reference meta")
602 xarticle.bibitems.append(
603 self.__parse_meta_citation_reference(content, str(index + 1))
604 )
606 def create_xissue(
607 self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1"
608 ):
609 if url is not None and url.endswith("/"):
610 url = url[:-1]
611 xissue = create_issuedata()
612 xissue.url = url
614 xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number)
616 xissue.year = year
618 if volume_number is not None:
619 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number)
621 if issue_number is not None:
622 xissue.number = issue_number.replace(",", "-")
623 return xissue
625 def detect_language(self, text: str, article: ArticleData | None = None):
626 if article and article.lang is not None and article.lang != "und":
627 return article.lang
629 language = self.language_detector.detect_language_of(text)
631 if not language:
632 return "und"
633 return language.iso_code_639_1.name.lower()
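# Illustrative usage; the detected code depends on the lingua models, so the result below
# is an assumption rather than a guaranteed output:
#
# lang = self.detect_language("Sur les équations aux dérivées partielles")  # likely "fr"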
635 def get_or_create_periode(self):
636 if self.periode is not None:
637 return self.periode
639 if self.collection is None or self.source is None:
640 raise ValueError("You need to set a collection or a source before creating a periode")
642 qs = Periode.objects.filter(collection=self.collection, source=self.source)
643 if qs.exists():
644 periode = qs.first()
645 else:
646 periode = Periode(
647 collection=self.collection,
648 source=self.source,
649 title=self.collection.title_tex,
650 issue_href=self.issue_href,
651 collection_href=self.collection_url,
652 doi_href="",
653 published=False,
654 begin=self.periode_begin,
655 end=self.periode_end,
656 first_issue=self.periode_first_issue,
657 last_issue=self.periode_last_issue,
658 )
659 periode.save()
661 return periode
663 references_mapping = {
664 "citation_title": get_article_title_xml,
665 "citation_journal_title": get_source_xml,
666 "citation_publication_date": get_year_xml,
667 "citation_firstpage": get_fpage_xml,
668 "citation_lastpage": get_lpage_xml,
669 }
671 @classmethod
672 def __parse_meta_citation_reference(cls, content: str, label=None):
673 categories = content.split(";")
675 if len(categories) == 1:
676 return cls.create_crawled_bibitem(content, label=label)
678 citation_data = [c.split("=") for c in categories if "=" in c]
679 del categories
681 xml_string = ""
682 authors_parsed = False
683 authors_strings = []
684 for data in citation_data:
685 key = data[0].strip()
686 citation_content = data[1]
687 if key == "citation_author":
688 authors_strings.append(get_author_xml(template_str=citation_content))
689 continue
690 elif not authors_parsed:
691 xml_string += ", ".join(authors_strings)
692 authors_parsed = True
694 if key in cls.references_mapping:
695 xml_string += " " + cls.references_mapping[key](citation_content)
697 return cls.create_crawled_bibitem(xml_string, label=label)
699 @classmethod
700 def get_or_create_source(cls):
701 source, created = Source.objects.get_or_create(
702 domain=cls.source_domain,
703 defaults={
704 "name": cls.source_name,
705 "website": cls.source_website,
706 "create_xissue": True,
707 "periode_href": "",
708 "article_href": "",
709 "pdf_href": "",
710 },
711 )
712 if created:
713 source.save()
714 return source
716 @staticmethod
717 def create_crawled_bibitem(ref_value: str | JatsRef, label=None):
718 if isinstance(ref_value, str):
719 xref = RefData(lang="en")
720 value_xml = ""
721 if label:
722 value_xml += f"<label>{label}</label>"
723 # xref.citation_tex = "".join([e["value_tex"] for e in elements])
724 value_xml += f'<mixed-citation xml:space="preserve">{ref_value}</mixed-citation>'
725 xref.citation_xml = value_xml
726 else:
727 xref = ref_value
729 xref = check_bibitem_xml(xref)
731 # Bakes extlink badges into the bibliography html
732 # Maybe we should put this into another file (jats_parser ?)
733 for extid in xref.extids:
734 href = resolve_id(extid[0], extid[1])
735 if (not href) or (not xref.citation_html):
736 continue
737 str_format = extid[0]
738 if str_format in extids_formats:
739 str_format = extids_formats[str_format]
740 xref.citation_html += f" | <a href={href} class='badge bg-secondary rounded-pill ref-badge extid-badge'>{str_format}</a>"
742 return xref
744 @staticmethod
745 def create_bibliography(bibitems: Sequence[RefData]):
746 xml_str = "<ref-list>\n"
747 html_str = "<div>\n"
749 for item in bibitems:
750 xml_str += f"\t{item.citation_xml}\n"
751 html_str += f"\t<p>{item.citation_html}</p>\n"
752 xml_str += "</ref-list>"
754 # for item in bibitems:
755 # html_str =
756 # html_str += f"\t<p>{item.citation_html}</p>\n"
757 html_str += "</div>"
759 tex_str = "<div>\n"
760 for item in bibitems:
761 tex_str += f"\t<p>{item.citation_tex}</p>\n"
762 tex_str += "</div>"
764 biblio_dict = create_abstract(
765 tag="biblio",
766 value_html=html_str,
767 value_tex=tex_str,
768 value_xml=xml_str,
769 lang="en",
770 )
772 return biblio_dict
774 @staticmethod
775 def get_issue_pid(
776 collection_id: str,
777 year: str,
778 volume_number: str | None = None,
779 issue_number: str | None = None,
780 ):
781 # Replace any non-word character with an underscore
782 pid = f"{collection_id}_{year}"
783 if volume_number is not None:
784 pid += f"_{volume_number}"
785 if issue_number is not None:
786 pid += f"_{issue_number}"
787 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid))
788 return pid
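# Examples, traced from the code above (the collection id "XXX" is hypothetical):
#
# get_issue_pid("XXX", "2000", "6", "1")  -> "XXX_2000_6_1"
# get_issue_pid("XXX", "1999-2000", "6")  -> "XXX_1999-2000_6"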
790 @staticmethod
791 def set_pages(article: ArticleData, pages: str, separator: str = "-"):
792 pages_split = pages.split(separator)
793 if len(pages_split) == 0:
794 article.page_range = pages
795 if len(pages_split) > 0:
796 if pages[0].isnumeric():
797 article.fpage = pages_split[0]
798 if (
799 len(pages_split) > 1
800 and pages_split[0] != pages_split[1]
801 and pages_split[1].isnumeric()
802 ):
803 article.lpage = pages_split[1]