Coverage for src/crawler/base_crawler.py: 69%
506 statements
coverage.py v7.9.0, created at 2025-10-08 15:14 +0000
1 import logging
2 import time
3 from concurrent.futures import (
4 Executor,
5 ThreadPoolExecutor,
6 )
7 from datetime import datetime, timedelta
8 from email.policy import EmailPolicy
10 import regex
11 import requests
12 from bs4 import BeautifulSoup
13 from django.conf import settings
14 from django.contrib.auth.models import User
15 from django.utils import timezone
16 from langcodes import standardize_tag
17 from lingua import LanguageDetector, LanguageDetectorBuilder
18 from opentelemetry import trace
19 from ptf.cmds.xml.ckeditor.utils import (
20 build_jats_data_from_html_field,
21 )
22 from ptf.cmds.xml.jats.builder.references import (
23 get_article_title_xml,
24 get_author_xml,
25 get_fpage_xml,
26 get_lpage_xml,
27 get_source_xml,
28 get_year_xml,
29 )
30 from ptf.cmds.xml.jats.jats_parser import JatsBase
31 from ptf.model_data import (
32 ArticleData,
33 ContributorDict,
34 IssueData,
35 ResourceData,
36 TitleDict,
37 create_abstract,
38 create_contributor,
39 create_extlink,
40 create_issuedata,
41 create_publisherdata,
42 create_titledata,
43 )
44 from ptf.model_data_converter import update_data_for_jats
45 from pylatexenc.latex2text import LatexNodes2Text
46 from pymongo.errors import DocumentTooLarge
47 from pysolr import SolrError
48 from requests.adapters import HTTPAdapter
49 from requests_cache import CachedSession, MongoCache
50 from urllib3 import Retry
52 from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd
53 from crawler.models import Source
54 from crawler.types import CitationLiteral
55 from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection
58 class CrawlerTitleDict(TitleDict):
59 title_tex: str | None
62 class BaseCollectionCrawler:
63 """
64 Base class for the collection crawlers.
65 To create a crawler:
66 1) derive a class from BaseCollectionCrawler and name it XXXCrawler
67 2) override the functions parse_collection_content, parse_issue_content and parse_article_content (a minimal sketch follows parse_article_content below)
68 3) update factory.py so that crawler_factory can return your new crawler
69 """
71 logger = logging.getLogger(__name__)
72 tracer = trace.get_tracer(__name__)
74 source_name = ""
75 source_domain = ""
76 source_website = ""
78 issue_href = ""
80 collection = None
81 source = None
82 user = None
83 session: requests.Session | CachedSession
85 verify = True
86 headers = {
87 "accept_encoding": "utf-8",
88 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
89 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
90 }
92 next_allowed_request: float = time.time()
94 # seconds to wait between two http requests
95 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
97 # seconds to wait before aborting the connection (if no bytes are received)
97 requests_timeout = 60
99 latext_parser = LatexNodes2Text()
101 # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
102 # do not use the "$" to surround tex formulas
103 delimiter_inline_formula = "$"
104 delimiter_disp_formula = "$"
106 # HACK : Workaround for tests (monkeypatching)
107 # We store the class here, so we can monkeypatch it when running tests
108 # subCrawlers = {
109 # LofplCrawler: None
110 # }
111 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}
113 _language_detector: LanguageDetector | None = None
114 _language_detector_builder = LanguageDetectorBuilder.from_all_languages()
116 force_refresh = False
118 # Whether to include headers in the requests cache key
119 match_headers = False
120 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"
122 # Set this to False on a per-crawler basis to allow inserting articles without PDFs
123 ignore_missing_pdf = True
125 database_executor: Executor
127 @classmethod
128 def get_view_id(cls):
129 return cls.source_domain
131 @property
132 def language_detector(self):
133 """Per-crawler-instance singleton for the language detector.
134 The LanguageDetector is built lazily to save memory."""
135 if not self._language_detector:
136 self._language_detector = self._language_detector_builder.build()
137 return self._language_detector
139 def __init__(
140 self,
141 *args,
142 username: str,
143 collection_id: str,
144 collection_url: str,
145 test_mode: bool = False,
146 publisher: str = "mathdoc",
147 force_refresh=False,
148 ):
149 for CrawlerClass in self.subCrawlers:  [149 ↛ 150: line 149 didn't jump to line 150 because the loop on line 149 never started]
150 self.subCrawlers[CrawlerClass] = CrawlerClass(
151 *args,
152 username=username,
153 collection_id=collection_id,
154 collection_url=collection_url,
155 test_mode=test_mode,
156 publisher=publisher,
157 )
158 self.logger = logging.getLogger(__name__ + "." + self.source_domain)
160 self.username = username
162 self.collection_id = collection_id
163 self.collection_url = (
164 collection_url # url of the collection. Ex: https://eudml.org/journal/10098
165 )
167 self.test_mode = test_mode
168 self.publisher = publisher
170 self.session = requests.session()
172 # Skipped when running tests
173 self.initialize()
174 self.session.verify = self.verify
175 self.force_refresh = force_refresh
177 # We implemented custom retry behaviour, so we don't want to make extra requests here
178 retries = Retry(
179 total=0,
180 )
181 self.session.mount("https://", HTTPAdapter(max_retries=retries))
182 self.session.mount("http://", HTTPAdapter(max_retries=retries))
184 self.database_executor = ThreadPoolExecutor(
185 max_workers=2, thread_name_prefix="crawler_database_thread"
186 )
188 def initialize(self):
189 """
190 Acts as a "second" init function to skip model accesses during test data generation
191 """
192 self.collection = get_or_create_collection(self.collection_id)
193 self.source = self.get_or_create_source()
194 self.user = User.objects.get(username=self.username)
195 self.session = CachedSession(
196 match_headers=self.match_headers,
197 headers=self.headers,
198 backend=MongoCache(
199 host=getattr(settings, "MONGO_HOSTNAME", "localhost"),
200 ),
201 expire_after=timedelta(days=30),
202 )
204 @classmethod
205 def can_crawl(cls, pid: str) -> bool:
206 return True
208 def parse_collection_content(self, content: str) -> list[IssueData]:
209 """
210 Parse the HTML content with BeautifulSoup
211 returns a list of xissue.
212 Override this function in a derived class
213 """
214 return []
216 def parse_issue_content(self, content: str, xissue: IssueData):
217 """
218 Parse the HTML content with BeautifulSoup
219 Fills the xissue.articles
220 Override this function in a derived class.
222 Caveat: you are supposed to create the articles here. Please assign a PID to each article.
223 The PID can be `a` + article_index, e.g. `a0`, `a21`.
224 """
226 def parse_article_content(
227 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
228 ) -> ArticleData | None:
229 """
230 Parse the HTML content with BeautifulSoup
231 returns the xarticle.
232 Override this function in a derived class.
233 The xissue is passed to the function in case the article page has issue information (ex: publisher)
234 The article url is also passed as a parameter
236 Caveat: you are supposed to assign the article pid again here.
237 """
238 return xarticle
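A minimal sketch of the recipe above (derive, override the three parse_* functions, register in factory.py), for a hypothetical source. The class name, source_* values, CSS selectors and URLs are illustrative assumptions, and create_articledata is assumed to exist in ptf.model_data alongside the create_* helpers imported at the top of this module; this is a sketch, not a definitive implementation.

# Illustrative sketch only; selectors, URLs and source_* values are assumptions.
from bs4 import BeautifulSoup

from crawler.base_crawler import BaseCollectionCrawler
from ptf.model_data import IssueData, create_articledata  # create_articledata assumed available


class ExampleCrawler(BaseCollectionCrawler):
    source_name = "Example Digital Library"  # assumption
    source_domain = "EXAMPLE"                # assumption
    source_website = "https://example.org"   # assumption

    def parse_collection_content(self, content: str) -> list[IssueData]:
        # One xissue per issue link found on the collection page.
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        for link in soup.select("a.issue-link"):  # hypothetical selector
            xissues.append(
                self.create_xissue(
                    url=str(link.get("href", "")),
                    year=str(link.get("data-year", "")),        # hypothetical attribute
                    volume_number=str(link.get("data-volume", "")),
                )
            )
        return xissues

    def parse_issue_content(self, content: str, xissue: IssueData):
        # Create the articles and assign their PIDs (a0, a1, ...) as documented above.
        soup = BeautifulSoup(content, "html.parser")
        for index, link in enumerate(soup.select("a.article-link")):  # hypothetical selector
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            xarticle.url = str(link.get("href", ""))
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        # Reuse the citation_* meta parser provided by the base class.
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["title", "author", "abstract", "pdf", "lang"]
        )
        return xarticle

# Step 3 of the recipe: register ExampleCrawler in factory.py so crawler_factory can return it.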
240 @tracer.start_as_current_span("crawl_collection")
241 def crawl_collection(self):
242 # TODO: Comments, filter
243 """
244 Crawl an entire collection. ptf.models.Container objects are created.
245 - get the HTML content of the collection_url
246 - parse the HTML content with beautifulsoup to extract the list of issues
247 - merge the xissues (some Sources can have multiple pages for 1 volume/issue; we create only 1 container)
248 - crawl each issue if col_only is False
249 - Returns the merged issues.
250 It is a dict {pid: xissue}.
251 The key is the pid of the merged issues.
252 Ex: the source may have Volume 6 (2000) and Volume 6 (1999);
253 the pid is then made with 1999-2000__6_
254 """
256 if self.source is None:
257 raise RuntimeError("ERROR: the source is not set")
259 content = self.download_file(self.collection_url)
260 xissues = self.parse_collection_content(content)
262 """
263 Some collections split the same volumes in different pages
264 Ex: Volume 6 (2000) and Volume 6 (1999)
265 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
266 """
267 # merged_xissues = self.merge_xissues(xissues)
269 xissues_dict = {str(i.pid): i for i in xissues}
271 return xissues_dict
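A hedged usage sketch of the two crawl entry points, assuming a concrete crawler such as the ExampleCrawler sketched earlier and an existing Django user; all argument values are placeholders.

# Hypothetical driver code; every value below is a placeholder.
crawler = ExampleCrawler(
    username="crawler_bot",  # must match an existing django User (see initialize())
    collection_id="EXAMPLE",
    collection_url="https://example.org/journal/EXAMPLE",
)
xissues = crawler.crawl_collection()  # dict {pid: IssueData}
for pid, xissue in xissues.items():
    crawler.crawl_issue(xissue)  # downloads the articles and stores the issue in the database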
273 @tracer.start_as_current_span("crawl_issue")
274 def crawl_issue(self, xissue: IssueData):
275 """
276 Crawl one web page of an issue.
277 - get the HTML content of the issue
278 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata
279 - crawl each article
280 """
282 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.
283 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute.
285 issue_url = xissue.url
286 if issue_url is not None:
287 if issue_url.endswith(".pdf"):
288 add_pdf_link_to_xarticle(xissue, issue_url)
289 xissue.url = None
290 else:
291 content = self.download_file(issue_url)
292 with self.tracer.start_as_current_span("parse_issue_content"):
293 self.parse_issue_content(content, xissue)
295 xarticles = xissue.articles
297 parsed_xarticles = []
299 for xarticle in xarticles:
300 parsed_xarticle = self.crawl_article(xarticle, xissue)
301 if parsed_xarticle is not None:
302 parsed_xarticles.append(parsed_xarticle)
304 xissue.articles = parsed_xarticles
306 article_has_pdf = self.article_has_pdf(xissue)
308 if self.ignore_missing_pdf:
309 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)]
311 if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf):
312 self.process_resource_metadata(xissue, resource_type="issue")
313 self.database_executor.submit(self.add_xissue_into_database, xissue)
315 @staticmethod
316 def article_has_source(art: ArticleData | IssueData):
317 return (
318 next(
319 (e_link for e_link in art.ext_links if e_link["rel"] == "source"),
320 None,
321 )
322 is not None
323 )
325 @staticmethod
326 def article_has_pdf(art: ArticleData | IssueData):
327 return (
328 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None)
329 is not None
330 )
332 def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
333 # ARTICLE URL as an ExtLink (to display the link in the article page)
334 if xarticle.url is None:
335 if not self.article_has_source(xarticle):  [335 ↛ 345: line 335 didn't jump to line 345 because the condition on line 335 was always true]
336 if xissue.url:
337 article_source = xissue.url
338 else:
339 article_source = self.collection_url
340 ext_link = create_extlink()
341 ext_link["rel"] = "source"
342 ext_link["location"] = article_source
343 ext_link["metadata"] = self.source_domain
344 xarticle.ext_links.append(ext_link)
345 return self.process_article_metadata(xarticle)
347 content = self.download_file(xarticle.url)
348 xarticle.pid = f"{xissue.pid}_{xarticle.pid}"
350 try:
351 with self.tracer.start_as_current_span("parse_article_content"):
352 parsed_xarticle = self.parse_article_content(
353 content, xissue, xarticle, xarticle.url
354 )
355 except ValueError as e:
356 self.logger.warning(e)
357 self.logger.warning("Retrying in 5 mins while invalidating cache")
358 time.sleep(5 * 60)
359 content = self.download_file(xarticle.url, force_refresh=True)
360 with self.tracer.start_as_current_span("parse_article_content"):
361 parsed_xarticle = self.parse_article_content(
362 content, xissue, xarticle, xarticle.url
363 )
365 if parsed_xarticle is None:  [365 ↛ 366: line 365 didn't jump to line 366 because the condition on line 365 was never true]
366 return None
368 if parsed_xarticle.doi:
369 parsed_xarticle.pid = (
370 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
371 )
373 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
374 ext_link = create_extlink()
375 ext_link["rel"] = "source"
376 ext_link["location"] = parsed_xarticle.url
377 ext_link["metadata"] = self.source_domain
378 parsed_xarticle.ext_links.append(ext_link)
380 # The article title may have formulas surrounded with '$'
381 return self.process_article_metadata(parsed_xarticle)
383 def process_resource_metadata(self, xresource: ResourceData, resource_type="article"):
384 tag = "article-title" if resource_type == "article" else "issue-title"
386 # Process title tex
387 ckeditor_data = build_jats_data_from_html_field(
388 xresource.title_tex,
389 tag=tag,
390 text_lang=xresource.lang,
391 delimiter_inline=self.delimiter_inline_formula,
392 delimiter_disp=self.delimiter_disp_formula,
393 )
395 xresource.title_html = ckeditor_data["value_html"]
396 # xresource.title_tex = ckeditor_data["value_tex"]
397 xresource.title_xml = ckeditor_data["value_xml"]
399 # Process trans_title tex
400 if xresource.trans_title_tex:  [400 ↛ 401: line 400 didn't jump to line 401 because the condition on line 400 was never true]
401 self.logger.warning(
402 "Deprecation Notice : prefer using xresource.title directly instead of xresource.trans_title_tex"
403 )
404 trans_title = self.create_trans_title(
405 xresource_lang=xresource.lang,
406 resource_type=resource_type,
407 title_tex=xresource.trans_title_tex,
408 lang=xresource.trans_lang,
409 )
410 xresource.titles.append(trans_title)
412 abstracts_to_parse = [
413 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract"
414 ]
415 # abstract may have formulas surrounded with '$'
416 if len(abstracts_to_parse) > 0:
417 for xabstract in abstracts_to_parse:
418 ckeditor_data = build_jats_data_from_html_field(
419 xabstract["value_tex"],
420 tag="abstract",
421 text_lang=xabstract["lang"],
422 resource_lang=xresource.lang,
423 field_type="abstract",
424 delimiter_inline=self.delimiter_inline_formula,
425 delimiter_disp=self.delimiter_disp_formula,
426 )
428 xabstract["value_html"] = ckeditor_data["value_html"]
429 # xabstract["value_tex"] = ckeditor_data["value_tex"]
430 xabstract["value_xml"] = ckeditor_data["value_xml"]
432 return xresource
434 def process_article_metadata(self, xarticle: ArticleData):
435 self.process_resource_metadata(xarticle)
436 for bibitem in xarticle.bibitems:
437 bibitem.type = "unknown"
438 update_data_for_jats(xarticle, with_label=False)
440 return xarticle
442 def _wait_download_delay(self, url: str, force_refresh=False):
443 # If the URL is already in the cache, we can skip the inter-request delay
444 if isinstance(self.session, CachedSession):
445 if self.session.cache.contains(url=url) and not force_refresh:
446 return
448 delta = self.next_allowed_request - time.time()
449 if delta > 0:
450 self.logger.info(f"Waiting {int(delta)}s before making another request")
451 time.sleep(delta)
452 self.next_allowed_request = time.time() + self.requests_interval
454 def _get(self, url: str, force_refresh=False, headers={}) -> requests.Response:
455 """
456 Wrapper around requests.get with delay based on the crawler class instance
457 """
459 self._wait_download_delay(url, force_refresh)
461 kwargs = {}
462 # self.session.cache.delete(urls=[url])
463 if isinstance(self.session, CachedSession):
464 kwargs["force_refresh"] = force_refresh
466 try:
467 response = self.session.get(
468 url,
469 headers={**self.headers, **headers},
470 timeout=self.requests_timeout,
471 **kwargs,
472 )
473 except DocumentTooLarge as e:
474 self.logger.error(e)
475 response = requests.get(
476 url, headers={**self.headers, **headers}, timeout=self.requests_timeout
477 )
479 if not response.ok:
480 raise requests.exceptions.HTTPError(
481 f"Endpoint answered with code {response.status_code} : {url}",
482 response=response,
483 )
485 return response
487 def download_file(self, url: str, force_refresh=False, headers={}):
488 """
489 Downloads a page and returns its content (decoded string).
490 This function handles retries and decoding
491 """
492 attempts = 0
493 while True:
494 try:
495 if attempts > 0:
496 force_refresh = True
497 response = self._get(
498 url, force_refresh=force_refresh or self.force_refresh, headers=headers
499 )
501 if getattr(response, "from_cache", False):
502 return response.text
504 content = self.decode_response(response)
505 if content == "" or not content:
506 raise requests.exceptions.HTTPError(response)
508 if isinstance(self.session, CachedSession):
509 if "Expires" in response.headers:
510 del response.headers["Expires"]
511 del response.headers["Cache-Control"]
512 try:
513 self.session.cache.save_response(response)
514 except DocumentTooLarge as e:
515 self.logger.warning(e)
516 return content
517 except (
518 requests.ConnectionError,
519 requests.ConnectTimeout,
520 requests.exceptions.HTTPError,
521 ) as e:
522 if attempts > 3:
523 raise e
524 self.logger.debug(f"Caught error : {e}", extra={"url": url})
525 attempts += 1
526 # 15 mins, 30 mins, 45 mins
527 delay_minutes = attempts * 15
528 self.logger.debug(
529 f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})",
530 extra={"url": url},
531 )
532 time.sleep(delay_minutes * 60)
534 def decode_response(self, response: requests.Response, encoding: str | None = None):
535 """Override this if the Content-Type headers from the source advertise something other than the actual content encoding.
536 SASA needs this (see the sketch after this method)."""
537 # Force the encoding if one is explicitly given
538 if encoding:
539 response.encoding = encoding
540 return response.text
542 # Attempt to get encoding using HTTP headers
543 content_type_tag = response.headers.get("Content-Type", None)
545 if content_type_tag:  [545 ↛ 552: line 545 didn't jump to line 552 because the condition on line 545 was always true]
546 charset = self.parse_content_type_charset(content_type_tag)
547 if charset:  [547 ↛ 548: line 547 didn't jump to line 548 because the condition on line 547 was never true]
548 response.encoding = charset
549 return response.text
551 # Attempt to get encoding using HTML meta charset tag
552 soup = BeautifulSoup(response.text, "html5lib")
553 charset = soup.select_one("meta[charset]")
554 if charset:
555 htmlencoding = charset.get("charset")
556 if isinstance(htmlencoding, str):  [556 ↛ 561: line 556 didn't jump to line 561 because the condition on line 556 was always true]
557 response.encoding = htmlencoding
558 return response.text
560 # Attempt to get encoding using HTML meta content type tag
561 content_type_tag = soup.select_one('meta[http-equiv="Content-Type"]')
562 if content_type_tag:
563 content_type = content_type_tag.get("content")
564 if isinstance(content_type, str):  [564 ↛ 570: line 564 didn't jump to line 570 because the condition on line 564 was always true]
565 charset = self.parse_content_type_charset(content_type)
566 if charset:  [566 ↛ 570: line 566 didn't jump to line 570 because the condition on line 566 was always true]
567 response.encoding = charset
568 return response.text
570 return response.text
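As referenced in the decode_response docstring, a sketch of a derived crawler forcing a known charset when the source advertises a wrong one; the class name and the chosen charset are assumptions.

# Hypothetical override in a derived crawler module (sketch only).
class SasaLikeCrawler(BaseCollectionCrawler):  # class name is an assumption
    def decode_response(self, response, encoding: str | None = None):
        # The source is assumed to serve windows-1250 pages while advertising another charset.
        return super().decode_response(response, encoding="windows-1250")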
572 @staticmethod
573 def parse_content_type_charset(content_type: str):
574 header = EmailPolicy.header_factory("content-type", content_type)
575 if "charset" in header.params:
576 return header.params.get("charset")
578 @tracer.start_as_current_span("add_xissue_to_database")
579 def add_xissue_into_database(self, xissue: IssueData):
580 xissue.journal = self.collection
581 xissue.source = self.source_domain
583 if xissue.year == "":
584 raise ValueError("Failsafe : Cannot insert issue without a year")
586 xpub = create_publisherdata()
587 xpub.name = self.publisher
588 xissue.publisher = xpub
589 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()
591 attempt = 1
592 success = False
594 while not success and attempt < 4:
595 try:
596 params = {"xissue": xissue, "use_body": False}
597 cmd = addOrUpdateGDMLIssueXmlCmd(params)
598 cmd.do()
599 success = True
600 self.logger.debug(f"Issue {xissue.pid} inserted in database")
601 except SolrError:
602 self.logger.warning(
603 f"Encountered SolrError while inserting issue {xissue.pid} in database"
604 )
605 attempt += 1
606 self.logger.debug(f"Attempt {attempt}. sleeping 10 seconds.")
607 time.sleep(10)
609 if success is False:
610 raise ConnectionRefusedError("Cannot connect to SolR")
612 def get_metadata_using_citation_meta(
613 self,
614 xarticle: ArticleData,
615 xissue: IssueData,
616 soup: BeautifulSoup,
617 what: list[CitationLiteral] = [],
618 ):
619 """
620 :param xarticle: the xarticle that will collect the metadata
621 :param xissue: the xissue that will collect the publisher
622 :param soup: the BeautifulSoup object of the article page
623 :param what: list of citation_* meta items to collect.
624 :return: None. The given article is modified in place (a usage sketch follows this method).
625 """
627 if "title" in what:
628 # TITLE
629 citation_title_node = soup.select_one("meta[name='citation_title']")
630 if citation_title_node:  [630 ↛ 635: line 630 didn't jump to line 635 because the condition on line 630 was always true]
631 title = citation_title_node.get("content")
632 if isinstance(title, str):  [632 ↛ 635: line 632 didn't jump to line 635 because the condition on line 632 was always true]
633 xarticle.title_tex = title
635 if "author" in what:  [635 ↛ 664: line 635 didn't jump to line 664 because the condition on line 635 was always true]
636 # AUTHORS
637 citation_author_nodes = soup.select("meta[name^='citation_author']")
638 current_author: ContributorDict | None = None
639 for citation_author_node in citation_author_nodes:
640 if citation_author_node.get("name") == "citation_author":
641 text_author = citation_author_node.get("content")
642 if not isinstance(text_author, str):  [642 ↛ 643: line 642 didn't jump to line 643 because the condition on line 642 was never true]
643 raise ValueError("Cannot parse author")
644 if text_author == "":  [644 ↛ 645: line 644 didn't jump to line 645 because the condition on line 644 was never true]
645 current_author = None
646 continue
647 current_author = create_contributor(role="author", string_name=text_author)
648 xarticle.contributors.append(current_author)
649 continue
650 if current_author is None:  [650 ↛ 651: line 650 didn't jump to line 651 because the condition on line 650 was never true]
651 self.logger.warning("Couldn't parse citation author")
652 continue
653 if citation_author_node.get("name") == "citation_author_institution":
654 text_institution = citation_author_node.get("content")
655 if not isinstance(text_institution, str):  [655 ↛ 656: line 655 didn't jump to line 656 because the condition on line 655 was never true]
656 continue
657 current_author["addresses"].append(text_institution)
658 if citation_author_node.get("name") == "citation_author_ocrid":  [658 ↛ 659: line 658 didn't jump to line 659 because the condition on line 658 was never true]
659 text_orcid = citation_author_node.get("content")
660 if not isinstance(text_orcid, str):
661 continue
662 current_author["orcid"] = text_orcid
664 if "pdf" in what:
665 # PDF
666 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
667 if citation_pdf_node:
668 pdf_url = citation_pdf_node.get("content")
669 if isinstance(pdf_url, str):  [669 ↛ 672: line 669 didn't jump to line 672 because the condition on line 669 was always true]
670 add_pdf_link_to_xarticle(xarticle, pdf_url)
672 if "lang" in what:
673 # LANG
674 citation_lang_node = soup.select_one("meta[name='citation_language']")
675 if citation_lang_node:  [675 ↛ 681: line 675 didn't jump to line 681 because the condition on line 675 was always true]
676 # TODO: check other language code
677 content_text = citation_lang_node.get("content")
678 if isinstance(content_text, str):  [678 ↛ 681: line 678 didn't jump to line 681 because the condition on line 678 was always true]
679 xarticle.lang = standardize_tag(content_text)
681 if "abstract" in what:
682 # ABSTRACT
683 abstract_node = soup.select_one("meta[name='citation_abstract']")
684 if abstract_node is not None:
685 abstract = abstract_node.get("content")
686 if not isinstance(abstract, str):  [686 ↛ 687: line 686 didn't jump to line 687 because the condition on line 686 was never true]
687 raise ValueError("Couldn't parse abstract from meta")
688 abstract = BeautifulSoup(abstract, "html.parser").text
689 lang = abstract_node.get("lang")
690 if not isinstance(lang, str):
691 lang = self.detect_language(abstract, xarticle)
692 xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))
694 if "page" in what:
695 # PAGES
696 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
697 if citation_fpage_node:
698 page = citation_fpage_node.get("content")
699 if isinstance(page, str):  [699 ↛ 704: line 699 didn't jump to line 704 because the condition on line 699 was always true]
700 page = page.split("(")[0]
701 if len(page) < 32:  [701 ↛ 704: line 701 didn't jump to line 704 because the condition on line 701 was always true]
702 xarticle.fpage = page
704 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
705 if citation_lpage_node:
706 page = citation_lpage_node.get("content")
707 if isinstance(page, str):  [707 ↛ 712: line 707 didn't jump to line 712 because the condition on line 707 was always true]
708 page = page.split("(")[0]
709 if len(page) < 32:  [709 ↛ 712: line 709 didn't jump to line 712 because the condition on line 709 was always true]
710 xarticle.lpage = page
712 if "doi" in what:
713 # DOI
714 citation_doi_node = soup.select_one("meta[name='citation_doi']")
715 if citation_doi_node:
716 doi = citation_doi_node.get("content")
717 if isinstance(doi, str):  [717 ↛ 724: line 717 didn't jump to line 724 because the condition on line 717 was always true]
718 doi = doi.strip()
719 pos = doi.find("10.")
720 if pos > 0:
721 doi = doi[pos:]
722 xarticle.doi = doi
724 if "mr" in what:
725 # MR
726 citation_mr_node = soup.select_one("meta[name='citation_mr']")
727 if citation_mr_node:
728 mr = citation_mr_node.get("content")
729 if isinstance(mr, str):  [729 ↛ 735: line 729 didn't jump to line 735 because the condition on line 729 was always true]
730 mr = mr.strip()
731 if mr.find("MR") == 0:  [731 ↛ 735: line 731 didn't jump to line 735 because the condition on line 731 was always true]
732 mr = mr[2:]
733 xarticle.extids.append(("mr-item-id", mr))
735 if "zbl" in what:
736 # ZBL
737 citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
738 if citation_zbl_node:
739 zbl = citation_zbl_node.get("content")
740 if isinstance(zbl, str):  [740 ↛ 746: line 740 didn't jump to line 746 because the condition on line 740 was always true]
741 zbl = zbl.strip()
742 if zbl.find("Zbl") == 0:  [742 ↛ 746: line 742 didn't jump to line 746 because the condition on line 742 was always true]
743 zbl = zbl[3:].strip()
744 xarticle.extids.append(("zbl-item-id", zbl))
746 if "publisher" in what:
747 # PUBLISHER
748 citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
749 if citation_publisher_node:
750 pub = citation_publisher_node.get("content")
751 if isinstance(pub, str):  [751 ↛ 758: line 751 didn't jump to line 758 because the condition on line 751 was always true]
752 pub = pub.strip()
753 if pub != "":  [753 ↛ 758: line 753 didn't jump to line 758 because the condition on line 753 was always true]
754 xpub = create_publisherdata()
755 xpub.name = pub
756 xissue.publisher = xpub
758 if "keywords" in what:
759 # KEYWORDS
760 citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
761 for kwd_node in citation_kwd_nodes:
762 kwds = kwd_node.get("content")
763 if isinstance(kwds, str):  [763 ↛ 761: line 763 didn't jump to line 761 because the condition on line 763 was always true]
764 kwds = kwds.split(",")
765 for kwd in kwds:
766 if kwd == "":
767 continue
768 kwd = kwd.strip()
769 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})
771 if "references" in what:
772 citation_references = soup.select("meta[name='citation_reference']")
773 for index, tag in enumerate(citation_references):
774 content = tag.get("content")
775 if not isinstance(content, str):  [775 ↛ 776: line 775 didn't jump to line 776 because the condition on line 775 was never true]
776 raise ValueError("Cannot parse citation_reference meta")
777 label = str(index + 1)
778 if regex.match(r"^\[\d+\].*", content):  [778 ↛ 779: line 778 didn't jump to line 779 because the condition on line 778 was never true]
779 label = None
780 xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label))
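For reference, the complete set of `what` values handled by the branches above; the constant name and the commented call are illustrative, not part of this module.

# Grounded in the branches of get_metadata_using_citation_meta above.
SUPPORTED_CITATION_META: list[str] = [
    "title", "author", "pdf", "lang", "abstract", "page",
    "doi", "mr", "zbl", "publisher", "keywords", "references",
]
# Hypothetical call from a derived parse_article_content:
#     self.get_metadata_using_citation_meta(xarticle, xissue, soup, SUPPORTED_CITATION_META)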
782 def create_xissue(
783 self,
784 url: str | None,
785 year: str,
786 volume_number: str | None,
787 issue_number: str | None = "1",
788 vseries: str | None = None,
789 ):
790 if url is not None and url.endswith("/"):
791 url = url[:-1]
792 xissue = create_issuedata()
793 xissue.url = url
795 xissue.pid = self.get_issue_pid(
796 self.collection_id, year, volume_number, issue_number, vseries
797 )
799 xissue.year = year
801 if volume_number is not None:
802 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number)
804 if issue_number is not None:
805 xissue.number = issue_number.replace(",", "-")
807 if vseries is not None:  [807 ↛ 808: line 807 didn't jump to line 808 because the condition on line 807 was never true]
808 xissue.vseries = vseries
809 return xissue
811 def detect_language(self, text: str, article: ArticleData | None = None):
812 if article and article.lang is not None and article.lang != "und":
813 return article.lang
815 language = self.language_detector.detect_language_of(text)
817 if not language:  [817 ↛ 818: line 817 didn't jump to line 818 because the condition on line 817 was never true]
818 return "und"
819 return language.iso_code_639_1.name.lower()
821 def create_trans_title(
822 self,
823 resource_type: str,
824 title_tex: str,
825 lang: str,
826 xresource_lang: str,
827 title_type: str = "main",
828 ):
829 tag = "trans-article" if resource_type == "article" else "issue-title"
831 ckeditor_data = build_jats_data_from_html_field(
832 title_tex,
833 tag=tag,
834 text_lang=lang,
835 resource_lang=xresource_lang,
836 delimiter_inline=self.delimiter_inline_formula,
837 delimiter_disp=self.delimiter_disp_formula,
838 )
840 titledata = create_titledata(
841 lang=lang,
842 type="main",
843 title_html=ckeditor_data["value_html"],
844 title_xml=ckeditor_data["value_xml"],
845 )
847 return titledata
849 references_mapping = {
850 "citation_title": get_article_title_xml,
851 "citation_journal_title": get_source_xml,
852 "citation_publication_date": get_year_xml,
853 "citation_firstpage": get_fpage_xml,
854 "citation_lastpage": get_lpage_xml,
855 }
857 @classmethod
858 def __parse_meta_citation_reference(cls, content: str, label=None):
859 categories = content.split(";")
861 if len(categories) == 1:
862 return JatsBase.bake_ref(content, label=label)
864 citation_data = [c.split("=") for c in categories if "=" in c]
865 del categories
867 xml_string = ""
868 authors_parsed = False
869 authors_strings = []
870 for data in citation_data:
871 key = data[0].strip()
872 citation_content = data[1]
873 if key == "citation_author":
874 authors_strings.append(get_author_xml(template_str=citation_content))
875 continue
876 elif not authors_parsed:
877 xml_string += ", ".join(authors_strings)
878 authors_parsed = True
880 if key in cls.references_mapping:
881 xml_string += " " + cls.references_mapping[key](citation_content)
883 return JatsBase.bake_ref(xml_string, label=label)
885 @classmethod
886 def get_or_create_source(cls):
887 source, created = Source.objects.get_or_create(
888 domain=cls.source_domain,
889 defaults={
890 "name": cls.source_name,
891 "website": cls.source_website,
892 },
893 )
894 if created:  [894 ↛ 895: line 894 didn't jump to line 895 because the condition on line 894 was never true]
895 source.save()
896 return source
898 @staticmethod
899 def get_issue_pid(
900 collection_id: str,
901 year: str,
902 volume_number: str | None = None,
903 issue_number: str | None = None,
904 series: str | None = None,
905 ):
906 # Replace any non-word character with an underscore
907 pid = f"{collection_id}_{year}"
908 if series is not None:  [908 ↛ 909: line 908 didn't jump to line 909 because the condition on line 908 was never true]
909 pid += f"_{series}"
910 if volume_number is not None:
911 pid += f"_{volume_number}"
912 if issue_number is not None:
913 pid += f"_{issue_number}"
914 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid))
915 return pid
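An illustrative pid, traced through the code above; the collection id and numbers are made up, and it assumes cleanup_str leaves an already-clean string unchanged.

# Hypothetical values, traced through get_issue_pid above.
pid = BaseCollectionCrawler.get_issue_pid("EXAMPLE", "1999-2000", volume_number="6")
# -> "EXAMPLE_1999-2000_6": letters, digits and hyphens are kept, every other run of
#    characters (including the joining underscores) is collapsed to a single underscore.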
917 @staticmethod
918 def set_pages(article: ArticleData, pages: str, separator: str = "-"):
919 pages_split = pages.split(separator)
920 if len(pages_split) == 0:  [920 ↛ 921: line 920 didn't jump to line 921 because the condition on line 920 was never true]
921 article.page_range = pages
922 if len(pages_split) > 0:  [922 ↛ exit: line 922 didn't return from function 'set_pages' because the condition on line 922 was always true]
923 if pages[0].isnumeric():  [923 ↛ exit: line 923 didn't return from function 'set_pages' because the condition on line 923 was always true]
924 article.fpage = pages_split[0]
925 if (
926 len(pages_split) > 1
927 and pages_split[0] != pages_split[1]
928 and pages_split[1].isnumeric()
929 ):
930 article.lpage = pages_split[1]