Coverage for src/crawler/base_crawler.py: 68%
516 statements
coverage.py v7.9.0, created at 2025-10-29 14:25 +0000
1import logging
2import time
3from concurrent.futures import (
4 Executor,
5 ThreadPoolExecutor,
6)
7from datetime import datetime, timedelta
8from email.policy import EmailPolicy
10import regex
11import requests
12from bs4 import BeautifulSoup
13from django.conf import settings
14from django.contrib.auth.models import User
15from django.utils import timezone
16from langcodes import standardize_tag
17from lingua import LanguageDetector, LanguageDetectorBuilder
18from opentelemetry import trace
19from ptf.cmds.xml.ckeditor.utils import (
20 build_jats_data_from_html_field,
21)
22from ptf.cmds.xml.jats.builder.references import (
23 get_article_title_xml,
24 get_author_xml,
25 get_fpage_xml,
26 get_lpage_xml,
27 get_source_xml,
28 get_year_xml,
29)
30from ptf.cmds.xml.jats.jats_parser import JatsBase
31from ptf.model_data import (
32 ArticleData,
33 ContributorDict,
34 IssueData,
35 ResourceData,
36 TitleDict,
37 create_abstract,
38 create_contributor,
39 create_extlink,
40 create_issuedata,
41 create_publisherdata,
42 create_titledata,
43)
44from ptf.model_data_converter import update_data_for_jats
45from pylatexenc.latex2text import LatexNodes2Text
46from pymongo.errors import DocumentTooLarge
47from pysolr import SolrError
48from requests.adapters import HTTPAdapter
49from requests_cache import CachedSession, MongoCache
50from urllib3 import Retry
52from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd
53from crawler.models import Source
54from crawler.types import CitationLiteral
55from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection
58class CrawlerTitleDict(TitleDict):
59 title_tex: str | None
62class BaseCollectionCrawler:
63 """
64 Base class for the collection crawlers.
65 To create a crawler:
66 1) derive a class from BaseCollectionCrawler and name it XXXCrawler
67 2) override the functions parse_collection_content, parse_issue_content and parse_article_content
68 3) update factory.py so that crawler_factory can return your new crawler
69 """
71 logger = logging.getLogger(__name__)
72 tracer = trace.get_tracer(__name__)
74 source_name = ""
75 source_domain = ""
76 source_website = ""
78 issue_href = ""
80 collection = None
81 source = None
82 user = None
83 session: requests.Session | CachedSession
85 verify = True
86 headers = {
87 "accept_encoding": "utf-8",
88 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
89 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
90 }
92 next_allowed_request: float = time.time()
94 # seconds to wait between two http requests
95 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
96 # seconds to wait before aborting the connection (if no bytes are received)
97 requests_timeout = 60
99 latext_parser = LatexNodes2Text()
101 # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
102 # do not use the "$" to surround tex formulas
103 delimiter_inline_formula = "$"
104 delimiter_disp_formula = "$"
106 # HACK : Workaround for tests (monkeypatching)
107 # We store the class here, so we can monkeypatch it when running tests
108 # subCrawlers = {
109 # LofplCrawler: None
110 # }
111 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}
113 _language_detector: LanguageDetector | None = None
114 _language_detector_builder = LanguageDetectorBuilder.from_all_languages()
116 force_refresh = False
118 # Whether to include headers in the requests cache key
119 match_headers = False
120 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"
122 # Set this to False on a per-crawler basis to allow inserting articles without PDFs
123 ignore_missing_pdf = True
125 database_executor: Executor
127 @classmethod
128 def get_view_id(cls):
129 return cls.source_domain
131 @property
132 def language_detector(self):
133 """Crawler Instance singleton for language builder.
134 Late init of LanguageDetector to save on memory"""
135 if not self._language_detector:
136 self._language_detector = self._language_detector_builder.build()
137 return self._language_detector
139 def __init__(
140 self,
141 *args,
142 username: str,
143 collection_id: str,
144 collection_url: str,
145 test_mode: bool = False,
146 publisher: str = "mathdoc",
147 force_refresh=False,
148 ):
149 for CrawlerClass in self.subCrawlers: 149 ↛ 150: line 149 didn't jump to line 150 because the loop on line 149 never started
150 self.subCrawlers[CrawlerClass] = CrawlerClass(
151 *args,
152 username=username,
153 collection_id=collection_id,
154 collection_url=collection_url,
155 test_mode=test_mode,
156 publisher=publisher,
157 )
158 self.logger = logging.getLogger(__name__ + "." + self.source_domain)
160 self.username = username
162 self.collection_id = collection_id
163 self.collection_url = (
164 collection_url # url of the collection. Ex: https://eudml.org/journal/10098
165 )
167 self.test_mode = test_mode
168 self.publisher = publisher
170 self.session = requests.session()
172 # Skipped when running tests
173 self.initialize()
174 self.session.verify = self.verify
175 self.force_refresh = force_refresh
177 # We implemented custom retry behaviour, so we don't want to make extra requests here
178 retries = Retry(
179 total=0,
180 )
181 self.session.mount("https://", HTTPAdapter(max_retries=retries))
182 self.session.mount("http://", HTTPAdapter(max_retries=retries))
184 self.database_executor = ThreadPoolExecutor(
185 max_workers=2, thread_name_prefix="crawler_database_thread"
186 )
188 def initialize(self):
189 """
190 Acts as a "second" init function to skip model accesses during test data generation
191 """
192 self.collection = get_or_create_collection(self.collection_id)
193 self.source = self.get_or_create_source()
194 self.user = User.objects.get(username=self.username)
195 self.session = CachedSession(
196 match_headers=self.match_headers,
197 headers=self.headers,
198 backend=MongoCache(
199 host=getattr(settings, "MONGO_HOSTNAME", "localhost"),
200 ),
201 expire_after=timedelta(days=30),
202 )
204 @classmethod
205 def can_crawl(cls, pid: str) -> bool:
206 return True
208 def parse_collection_content(self, content: str) -> list[IssueData]:
209 """
210 Parse the HTML content with BeautifulSoup
211 Returns a list of xissues.
212 Override this function in a derived class
213 """
214 return []
216 def parse_issue_content(self, content: str, xissue: IssueData):
217 """
218 Parse the HTML content with BeautifulSoup
219 Fills the xissue.articles
220 Override this function in a derived class.
222 Caveat: you are supposed to create the articles here. Assign a PID to each article.
223 The PID can be "a" + the article index, e.g. `a0`, `a21`
224 """
226 def parse_article_content(
227 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
228 ) -> ArticleData | None:
229 """
230 Parse the HTML content with BeautifulSoup
231 returns the xarticle.
232 Override this function in a derived class.
233 The xissue is passed to the function in case the article page has issue information (ex: publisher)
234 The article url is also passed as a parameter
236 Caveat: you are supposed to assign the article PIDs again here
237 """
238 return xarticle
240 @tracer.start_as_current_span("crawl_collection")
241 def crawl_collection(self):
242 # TODO: Comments, filter
243 """
244 Crawl an entire collection. ptf.models.Container objects are created.
245 - get the HTML content of the collection_url
246 - parse the HTML content with beautifulsoup to extract the list of issues
247 - merge the xissues (some Source can have multiple pages for 1 volume/issue. We create only 1 container)
248 - crawl each issue if col_only is False
249 - Returns the merged issues as a dict.
250 It is a dict {pid: xissue}.
251 The key is the pid of the merged issue.
252 Ex: the source may have Volume 6 (2000) and Volume 6 (1999);
253 the pid is then made with 1999-2000__6_
254 """
256 if self.source is None:
257 raise RuntimeError("ERROR: the source is not set")
259 content = self.download_file(self.collection_url)
260 if content:
261 xissues = self.parse_collection_content(content)
262 else:
263 # download_file returns None (404)
264 return None
266 """
267 Some collections split the same volumes in different pages
268 Ex: Volume 6 (2000) and Volume 6 (1999)
269 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
270 """
271 # merged_xissues = self.merge_xissues(xissues)
273 xissues_dict = {str(i.pid): i for i in xissues}
275 return xissues_dict
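# Illustrative shape of the returned dict (hypothetical pids):
#   {"EXMPL_2000_6_1": <IssueData>, "EXMPL_1999-2000_6_": <IssueData>, ...}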
277 @tracer.start_as_current_span("crawl_issue")
278 def crawl_issue(self, xissue: IssueData):
279 """
280 Crawl one web page of an issue.
281 - get the HTML content of the issue
282 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata
283 - crawl each article
284 """
286 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.
287 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute.
289 issue_url = xissue.url
290 if issue_url is not None:
291 if issue_url.endswith(".pdf"):
292 add_pdf_link_to_xarticle(xissue, issue_url)
293 xissue.url = None
294 else:
295 content = self.download_file(issue_url)
296 with self.tracer.start_as_current_span("parse_issue_content"):
297 self.parse_issue_content(content, xissue)
299 xarticles = xissue.articles
301 parsed_xarticles = []
303 for xarticle in xarticles:
304 parsed_xarticle = self.crawl_article(xarticle, xissue)
305 if parsed_xarticle is not None:
306 parsed_xarticles.append(parsed_xarticle)
308 xissue.articles = parsed_xarticles
310 article_has_pdf = self.article_has_pdf(xissue)
312 if self.ignore_missing_pdf:
313 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)]
315 if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf):
316 self.process_resource_metadata(xissue, resource_type="issue")
317 self.database_executor.submit(self.add_xissue_into_database, xissue)
319 @staticmethod
320 def article_has_source(art: ArticleData | IssueData):
321 return (
322 next(
323 (e_link for e_link in art.ext_links if e_link["rel"] == "source"),
324 None,
325 )
326 is not None
327 )
329 @staticmethod
330 def article_has_pdf(art: ArticleData | IssueData):
331 return (
332 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None)
333 is not None
334 )
336 def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
337 # ARTICLE URL as an ExtLink (to display the link on the article page)
338 if xarticle.url is None:
339 if not self.article_has_source(xarticle): 339 ↛ 349: line 339 didn't jump to line 349 because the condition on line 339 was always true
340 if xissue.url:
341 article_source = xissue.url
342 else:
343 article_source = self.collection_url
344 ext_link = create_extlink()
345 ext_link["rel"] = "source"
346 ext_link["location"] = article_source
347 ext_link["metadata"] = self.source_domain
348 xarticle.ext_links.append(ext_link)
349 return self.process_article_metadata(xarticle)
351 content = self.download_file(xarticle.url)
352 if not content: 352 ↛ 353: line 352 didn't jump to line 353 because the condition on line 352 was never true
353 return None
354 xarticle.pid = f"{xissue.pid}_{xarticle.pid}"
356 try:
357 with self.tracer.start_as_current_span("parse_article_content"):
358 parsed_xarticle = self.parse_article_content(
359 content, xissue, xarticle, xarticle.url
360 )
361 except ValueError as e:
362 self.logger.warning(e)
363 self.logger.warning("Retrying in 5 mins while invalidating cache")
364 time.sleep(5 * 60)
365 content = self.download_file(xarticle.url, force_refresh=True)
366 with self.tracer.start_as_current_span("parse_article_content"):
367 parsed_xarticle = self.parse_article_content(
368 content, xissue, xarticle, xarticle.url
369 )
371 if parsed_xarticle is None: 371 ↛ 372: line 371 didn't jump to line 372 because the condition on line 371 was never true
372 return None
374 if parsed_xarticle.doi:
375 parsed_xarticle.pid = (
376 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
377 )
379 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
380 ext_link = create_extlink()
381 ext_link["rel"] = "source"
382 ext_link["location"] = parsed_xarticle.url
383 ext_link["metadata"] = self.source_domain
384 parsed_xarticle.ext_links.append(ext_link)
386 # The article title may have formulas surrounded with '$'
387 return self.process_article_metadata(parsed_xarticle)
389 def process_resource_metadata(self, xresource: ResourceData, resource_type="article"):
390 tag = "article-title" if resource_type == "article" else "issue-title"
392 # Process title tex
393 ckeditor_data = build_jats_data_from_html_field(
394 xresource.title_tex,
395 tag=tag,
396 text_lang=xresource.lang,
397 delimiter_inline=self.delimiter_inline_formula,
398 delimiter_disp=self.delimiter_disp_formula,
399 )
401 xresource.title_html = ckeditor_data["value_html"]
402 # xresource.title_tex = ckeditor_data["value_tex"]
403 xresource.title_xml = ckeditor_data["value_xml"]
405 # Process trans_title tex
406 if xresource.trans_title_tex: 406 ↛ 407: line 406 didn't jump to line 407 because the condition on line 406 was never true
407 self.logger.warning(
408 "Deprecation Notice : prefer using xresource.title directly instead of xresource.trans_title_tex"
409 )
410 trans_title = self.create_trans_title(
411 xresource_lang=xresource.lang,
412 resource_type=resource_type,
413 title_tex=xresource.trans_title_tex,
414 lang=xresource.trans_lang,
415 )
416 xresource.titles.append(trans_title)
418 abstracts_to_parse = [
419 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract"
420 ]
421 # abstract may have formulas surrounded with '$'
422 if len(abstracts_to_parse) > 0:
423 for xabstract in abstracts_to_parse:
424 ckeditor_data = build_jats_data_from_html_field(
425 xabstract["value_tex"],
426 tag="abstract",
427 text_lang=xabstract["lang"],
428 resource_lang=xresource.lang,
429 field_type="abstract",
430 delimiter_inline=self.delimiter_inline_formula,
431 delimiter_disp=self.delimiter_disp_formula,
432 )
434 xabstract["value_html"] = ckeditor_data["value_html"]
435 # xabstract["value_tex"] = ckeditor_data["value_tex"]
436 xabstract["value_xml"] = ckeditor_data["value_xml"]
438 return xresource
440 def process_article_metadata(self, xarticle: ArticleData):
441 self.process_resource_metadata(xarticle)
442 for bibitem in xarticle.bibitems:
443 bibitem.type = "unknown"
444 update_data_for_jats(xarticle, with_label=False)
446 return xarticle
448 def _wait_download_delay(self):
449 delta = self.next_allowed_request - time.time()
450 self.next_allowed_request = time.time() + self.requests_interval
451 if delta > 0:
452 self.logger.info(f"Waiting {int(delta)}s before making another request")
453 time.sleep(delta)
455 def _get(self, url: str, force_refresh=False, headers={}) -> requests.Response:
456 """
457 Wrapper around session.get that enforces the request delay configured on the crawler instance
458 """
460 kwargs = {}
461 # self.session.cache.delete(urls=[url])
462 if isinstance(self.session, CachedSession):
463 kwargs["force_refresh"] = force_refresh
465 try:
466 response = self.session.get(
467 url,
468 headers={**self.headers, **headers},
469 timeout=self.requests_timeout,
470 **kwargs,
471 )
472 except DocumentTooLarge as e:
473 self.logger.error(e)
474 response = requests.get(
475 url, headers={**self.headers, **headers}, timeout=self.requests_timeout
476 )
478 if not response.ok:
479 raise requests.exceptions.HTTPError(
480 f"Endpoint answered with code {response.status_code} : {url}",
481 response=response,
482 )
484 if not getattr(response, "from_cache", False):
485 self._wait_download_delay()
486 return response
488 def download_file(self, url: str, force_refresh=False, headers={}):
489 """
490 Downloads a page and returns its content (decoded string).
491 This function handles retries and decoding
492 """
493 attempts = 0
494 while True:
495 try:
496 if attempts > 0:
497 force_refresh = True
498 response = self._get(
499 url, force_refresh=force_refresh or self.force_refresh, headers=headers
500 )
502 if getattr(response, "from_cache", False):
503 return response.text
505 content = self.decode_response(response)
506 if content == "" or not content:
507 raise requests.exceptions.HTTPError(response)
509 if isinstance(self.session, CachedSession):
510 if "Expires" in response.headers:
511 del response.headers["Expires"]
512 del response.headers["Cache-Control"]
513 try:
514 self.session.cache.save_response(response)
515 except DocumentTooLarge as e:
516 self.logger.warning(e)
517 return content
518 except (
519 requests.ConnectionError,
520 requests.ConnectTimeout,
521 requests.exceptions.HTTPError,
522 ) as e:
523 if isinstance(e, requests.exceptions.HTTPError):
524 # If the error is a 404 (resource not found), we skip
525 status_code = e.response.status_code
526 if status_code == 404:
527 return None
528 else:
529 raise e
530 if attempts > 3:
531 raise e
532 self.logger.debug(f"Caught error : {e}", extra={"url": url})
533 attempts += 1
534 # 15 mins, 30 mins, 45 mins
535 delay_minutes = attempts * 15
536 self.logger.debug(
537 f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})",
538 extra={"url": url},
539 )
540 time.sleep(delay_minutes * 60)
542 def decode_response(self, response: requests.Response, encoding: str | None = None):
543 """Override this if the content-type headers from the sources are advertising something else than the actual content
544 SASA needs this"""
545 # Force
546 if encoding:
547 response.encoding = encoding
548 return response.text
550 # Attempt to get encoding using HTTP headers
551 content_type_tag = response.headers.get("Content-Type", None)
553 if content_type_tag: 553 ↛ 560: line 553 didn't jump to line 560 because the condition on line 553 was always true
554 charset = self.parse_content_type_charset(content_type_tag)
555 if charset: 555 ↛ 556: line 555 didn't jump to line 556 because the condition on line 555 was never true
556 response.encoding = charset
557 return response.text
559 # Attempt to get encoding using HTML meta charset tag
560 soup = BeautifulSoup(response.text, "html5lib")
561 charset = soup.select_one("meta[charset]")
562 if charset:
563 htmlencoding = charset.get("charset")
564 if isinstance(htmlencoding, str): 564 ↛ 569: line 564 didn't jump to line 569 because the condition on line 564 was always true
565 response.encoding = htmlencoding
566 return response.text
568 # Attempt to get encoding using HTML meta content type tag
569 content_type_tag = soup.select_one('meta[http-equiv="Content-Type"]')
570 if content_type_tag:
571 content_type = content_type_tag.get("content")
572 if isinstance(content_type, str): 572 ↛ 578: line 572 didn't jump to line 578 because the condition on line 572 was always true
573 charset = self.parse_content_type_charset(content_type)
574 if charset: 574 ↛ 578: line 574 didn't jump to line 578 because the condition on line 574 was always true
575 response.encoding = charset
576 return response.text
578 return response.text
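# Illustrative per-source override (hypothetical crawler) forcing the encoding when
# the advertised Content-Type is wrong:
#
#   def decode_response(self, response, encoding=None):
#       return super().decode_response(response, encoding="windows-1250")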
580 @staticmethod
581 def parse_content_type_charset(content_type: str):
582 header = EmailPolicy.header_factory("content-type", content_type)
583 if "charset" in header.params:
584 return header.params.get("charset")
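# Examples (illustrative):
#   parse_content_type_charset("text/html; charset=utf-8")  -> "utf-8"
#   parse_content_type_charset("text/html")                 -> None (no charset parameter)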
586 @tracer.start_as_current_span("add_xissue_to_database")
587 def add_xissue_into_database(self, xissue: IssueData):
588 xissue.journal = self.collection
589 xissue.source = self.source_domain
591 if xissue.year == "":
592 raise ValueError("Failsafe : Cannot insert issue without a year")
594 xpub = create_publisherdata()
595 xpub.name = self.publisher
596 xissue.publisher = xpub
597 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()
599 attempt = 1
600 success = False
602 while not success and attempt < 4:
603 try:
604 params = {"xissue": xissue, "use_body": False}
605 cmd = addOrUpdateGDMLIssueXmlCmd(params)
606 cmd.do()
607 success = True
608 self.logger.debug(f"Issue {xissue.pid} inserted in database")
609 except SolrError:
610 self.logger.warning(
611 f"Encoutered SolrError while inserting issue {xissue.pid} in database"
612 )
613 attempt += 1
614 self.logger.debug(f"Attempt {attempt}. sleeping 10 seconds.")
615 time.sleep(10)
616 except Exception as e:
617 self.logger.error(
618 f"Got exception while attempting to insert {xissue.pid} in database : {e}"
619 )
620 raise e
622 if success is False:
623 raise ConnectionRefusedError("Cannot connect to SolR")
625 def get_metadata_using_citation_meta(
626 self,
627 xarticle: ArticleData,
628 xissue: IssueData,
629 soup: BeautifulSoup,
630 what: list[CitationLiteral] = [],
631 ):
632 """
633 :param xarticle: the xarticle that will collect the metadata
634 :param xissue: the xissue that will collect the publisher
635 :param soup: the BeautifulSoup object of the article page
636 :param what: list of citation_* meta items to collect.
637 :return: None. The given article is modified
638 """
640 if "title" in what:
641 # TITLE
642 citation_title_node = soup.select_one("meta[name='citation_title']")
643 if citation_title_node: 643 ↛ 648: line 643 didn't jump to line 648 because the condition on line 643 was always true
644 title = citation_title_node.get("content")
645 if isinstance(title, str): 645 ↛ 648: line 645 didn't jump to line 648 because the condition on line 645 was always true
646 xarticle.title_tex = title
648 if "author" in what: 648 ↛ 677line 648 didn't jump to line 677 because the condition on line 648 was always true
649 # AUTHORS
650 citation_author_nodes = soup.select("meta[name^='citation_author']")
651 current_author: ContributorDict | None = None
652 for citation_author_node in citation_author_nodes:
653 if citation_author_node.get("name") == "citation_author":
654 text_author = citation_author_node.get("content")
655 if not isinstance(text_author, str): 655 ↛ 656: line 655 didn't jump to line 656 because the condition on line 655 was never true
656 raise ValueError("Cannot parse author")
657 if text_author == "": 657 ↛ 658: line 657 didn't jump to line 658 because the condition on line 657 was never true
658 current_author = None
659 continue
660 current_author = create_contributor(role="author", string_name=text_author)
661 xarticle.contributors.append(current_author)
662 continue
663 if current_author is None: 663 ↛ 664: line 663 didn't jump to line 664 because the condition on line 663 was never true
664 self.logger.warning("Couldn't parse citation author")
665 continue
666 if citation_author_node.get("name") == "citation_author_institution":
667 text_institution = citation_author_node.get("content")
668 if not isinstance(text_institution, str): 668 ↛ 669: line 668 didn't jump to line 669 because the condition on line 668 was never true
669 continue
670 current_author["addresses"].append(text_institution)
671 if citation_author_node.get("name") == "citation_author_ocrid": 671 ↛ 672: line 671 didn't jump to line 672 because the condition on line 671 was never true
672 text_orcid = citation_author_node.get("content")
673 if not isinstance(text_orcid, str):
674 continue
675 current_author["orcid"] = text_orcid
677 if "pdf" in what:
678 # PDF
679 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
680 if citation_pdf_node:
681 pdf_url = citation_pdf_node.get("content")
682 if isinstance(pdf_url, str): 682 ↛ 685: line 682 didn't jump to line 685 because the condition on line 682 was always true
683 add_pdf_link_to_xarticle(xarticle, pdf_url)
685 if "lang" in what:
686 # LANG
687 citation_lang_node = soup.select_one("meta[name='citation_language']")
688 if citation_lang_node: 688 ↛ 694: line 688 didn't jump to line 694 because the condition on line 688 was always true
689 # TODO: check other language code
690 content_text = citation_lang_node.get("content")
691 if isinstance(content_text, str): 691 ↛ 694: line 691 didn't jump to line 694 because the condition on line 691 was always true
692 xarticle.lang = standardize_tag(content_text)
694 if "abstract" in what:
695 # ABSTRACT
696 abstract_node = soup.select_one("meta[name='citation_abstract']")
697 if abstract_node is not None:
698 abstract = abstract_node.get("content")
699 if not isinstance(abstract, str): 699 ↛ 700: line 699 didn't jump to line 700 because the condition on line 699 was never true
700 raise ValueError("Couldn't parse abstract from meta")
701 abstract = BeautifulSoup(abstract, "html.parser").text
702 lang = abstract_node.get("lang")
703 if not isinstance(lang, str):
704 lang = self.detect_language(abstract, xarticle)
705 xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))
707 if "page" in what:
708 # PAGES
709 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
710 if citation_fpage_node:
711 page = citation_fpage_node.get("content")
712 if isinstance(page, str): 712 ↛ 717: line 712 didn't jump to line 717 because the condition on line 712 was always true
713 page = page.split("(")[0]
714 if len(page) < 32: 714 ↛ 717: line 714 didn't jump to line 717 because the condition on line 714 was always true
715 xarticle.fpage = page
717 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
718 if citation_lpage_node:
719 page = citation_lpage_node.get("content")
720 if isinstance(page, str): 720 ↛ 725: line 720 didn't jump to line 725 because the condition on line 720 was always true
721 page = page.split("(")[0]
722 if len(page) < 32: 722 ↛ 725: line 722 didn't jump to line 725 because the condition on line 722 was always true
723 xarticle.lpage = page
725 if "doi" in what:
726 # DOI
727 citation_doi_node = soup.select_one("meta[name='citation_doi']")
728 if citation_doi_node:
729 doi = citation_doi_node.get("content")
730 if isinstance(doi, str): 730 ↛ 737: line 730 didn't jump to line 737 because the condition on line 730 was always true
731 doi = doi.strip()
732 pos = doi.find("10.")
733 if pos > 0:
734 doi = doi[pos:]
735 xarticle.doi = doi
737 if "mr" in what:
738 # MR
739 citation_mr_node = soup.select_one("meta[name='citation_mr']")
740 if citation_mr_node:
741 mr = citation_mr_node.get("content")
742 if isinstance(mr, str): 742 ↛ 748: line 742 didn't jump to line 748 because the condition on line 742 was always true
743 mr = mr.strip()
744 if mr.find("MR") == 0: 744 ↛ 748: line 744 didn't jump to line 748 because the condition on line 744 was always true
745 mr = mr[2:]
746 xarticle.extids.append(("mr-item-id", mr))
748 if "zbl" in what:
749 # ZBL
750 citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
751 if citation_zbl_node:
752 zbl = citation_zbl_node.get("content")
753 if isinstance(zbl, str): 753 ↛ 759: line 753 didn't jump to line 759 because the condition on line 753 was always true
754 zbl = zbl.strip()
755 if zbl.find("Zbl") == 0: 755 ↛ 759: line 755 didn't jump to line 759 because the condition on line 755 was always true
756 zbl = zbl[3:].strip()
757 xarticle.extids.append(("zbl-item-id", zbl))
759 if "publisher" in what:
760 # PUBLISHER
761 citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
762 if citation_publisher_node:
763 pub = citation_publisher_node.get("content")
764 if isinstance(pub, str): 764 ↛ 771: line 764 didn't jump to line 771 because the condition on line 764 was always true
765 pub = pub.strip()
766 if pub != "": 766 ↛ 771: line 766 didn't jump to line 771 because the condition on line 766 was always true
767 xpub = create_publisherdata()
768 xpub.name = pub
769 xissue.publisher = xpub
771 if "keywords" in what:
772 # KEYWORDS
773 citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
774 for kwd_node in citation_kwd_nodes:
775 kwds = kwd_node.get("content")
776 if isinstance(kwds, str): 776 ↛ 774: line 776 didn't jump to line 774 because the condition on line 776 was always true
777 kwds = kwds.split(",")
778 for kwd in kwds:
779 if kwd == "":
780 continue
781 kwd = kwd.strip()
782 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})
784 if "references" in what:
785 citation_references = soup.select("meta[name='citation_reference']")
786 for index, tag in enumerate(citation_references):
787 content = tag.get("content")
788 if not isinstance(content, str): 788 ↛ 789: line 788 didn't jump to line 789 because the condition on line 788 was never true
789 raise ValueError("Cannot parse citation_reference meta")
790 label = str(index + 1)
791 if regex.match(r"^\[\d+\].*", content): 791 ↛ 792: line 791 didn't jump to line 792 because the condition on line 791 was never true
792 label = None
793 xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label))
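# Illustrative call from a concrete parse_article_content override, assuming the page
# exposes Google Scholar / Highwire citation_* meta tags (hypothetical page):
#
#   soup = BeautifulSoup(content, "html.parser")
#   self.get_metadata_using_citation_meta(
#       xarticle, xissue, soup,
#       what=["title", "author", "pdf", "lang", "abstract", "page", "doi", "references"],
#   )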
795 def create_xissue(
796 self,
797 url: str | None,
798 year: str,
799 volume_number: str | None,
800 issue_number: str | None = "1",
801 vseries: str | None = None,
802 ):
803 if url is not None and url.endswith("/"):
804 url = url[:-1]
805 xissue = create_issuedata()
806 xissue.url = url
808 xissue.pid = self.get_issue_pid(
809 self.collection_id, year, volume_number, issue_number, vseries
810 )
812 xissue.year = year
814 if volume_number is not None:
815 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number)
817 if issue_number is not None:
818 xissue.number = issue_number.replace(",", "-")
820 if vseries is not None: 820 ↛ 821: line 820 didn't jump to line 821 because the condition on line 820 was never true
821 xissue.vseries = vseries
822 return xissue
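# Example (illustrative, assuming collection_id == "EXMPL"):
#   self.create_xissue("https://example.org/vol/6/", "2000", "6", "2")
# strips the trailing slash and yields an xissue with pid "EXMPL_2000_6_2",
# year "2000", volume "6" and number "2".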
824 def detect_language(self, text: str, article: ArticleData | None = None):
825 if article and article.lang is not None and article.lang != "und":
826 return article.lang
828 language = self.language_detector.detect_language_of(text)
830 if not language: 830 ↛ 831: line 830 didn't jump to line 831 because the condition on line 830 was never true
831 return "und"
832 return language.iso_code_639_1.name.lower()
834 def create_trans_title(
835 self,
836 resource_type: str,
837 title_tex: str,
838 lang: str,
839 xresource_lang: str,
840 title_type: str = "main",
841 ):
842 tag = "trans-article" if resource_type == "article" else "issue-title"
844 ckeditor_data = build_jats_data_from_html_field(
845 title_tex,
846 tag=tag,
847 text_lang=lang,
848 resource_lang=xresource_lang,
849 delimiter_inline=self.delimiter_inline_formula,
850 delimiter_disp=self.delimiter_disp_formula,
851 )
853 titledata = create_titledata(
854 lang=lang,
855 type="main",
856 title_html=ckeditor_data["value_html"],
857 title_xml=ckeditor_data["value_xml"],
858 )
860 return titledata
862 references_mapping = {
863 "citation_title": get_article_title_xml,
864 "citation_journal_title": get_source_xml,
865 "citation_publication_date": get_year_xml,
866 "citation_firstpage": get_fpage_xml,
867 "citation_lastpage": get_lpage_xml,
868 }
870 @classmethod
871 def __parse_meta_citation_reference(cls, content: str, label=None):
872 categories = content.split(";")
874 if len(categories) == 1:
875 return JatsBase.bake_ref(content, label=label)
877 citation_data = [c.split("=") for c in categories if "=" in c]
878 del categories
880 xml_string = ""
881 authors_parsed = False
882 authors_strings = []
883 for data in citation_data:
884 key = data[0].strip()
885 citation_content = data[1]
886 if key == "citation_author":
887 authors_strings.append(get_author_xml(template_str=citation_content))
888 continue
889 elif not authors_parsed:
890 xml_string += ", ".join(authors_strings)
891 authors_parsed = True
893 if key in cls.references_mapping:
894 xml_string += " " + cls.references_mapping[key](citation_content)
896 return JatsBase.bake_ref(xml_string, label=label)
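# Illustrative citation_reference content handled above (hypothetical values):
#   "citation_author=Doe, J.; citation_title=On crawling; citation_journal_title=J. Crawl.;
#    citation_publication_date=2001; citation_firstpage=1; citation_lastpage=10"
# Content that does not split on ";" is passed to JatsBase.bake_ref as-is.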
898 @classmethod
899 def get_or_create_source(cls):
900 source, created = Source.objects.get_or_create(
901 domain=cls.source_domain,
902 defaults={
903 "name": cls.source_name,
904 "website": cls.source_website,
905 },
906 )
907 if created: 907 ↛ 908: line 907 didn't jump to line 908 because the condition on line 907 was never true
908 source.save()
909 return source
911 @staticmethod
912 def get_issue_pid(
913 collection_id: str,
914 year: str,
915 volume_number: str | None = None,
916 issue_number: str | None = None,
917 series: str | None = None,
918 ):
919 # Replace any non-word character with an underscore
920 pid = f"{collection_id}_{year}"
921 if series is not None: 921 ↛ 922: line 921 didn't jump to line 922 because the condition on line 921 was never true
922 pid += f"_{series}"
923 if volume_number is not None:
924 pid += f"_{volume_number}"
925 if issue_number is not None:
926 pid += f"_{issue_number}"
927 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid))
928 return pid
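# Examples (illustrative):
#   get_issue_pid("EXMPL", "2000", "6", "1")   -> "EXMPL_2000_6_1"
#   get_issue_pid("EXMPL", "1999-2000", "6")   -> "EXMPL_1999-2000_6"
# Runs of characters other than letters, digits and "-" are collapsed to "_".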
930 @staticmethod
931 def set_pages(article: ArticleData, pages: str, separator: str = "-"):
932 pages_split = pages.split(separator)
933 if len(pages_split) == 0: 933 ↛ 934: line 933 didn't jump to line 934 because the condition on line 933 was never true
934 article.page_range = pages
935 if len(pages_split) > 0: 935 ↛ exit: line 935 didn't return from function 'set_pages' because the condition on line 935 was always true
936 if pages[0].isnumeric(): 936 ↛ exit: line 936 didn't return from function 'set_pages' because the condition on line 936 was always true
937 article.fpage = pages_split[0]
938 if (
939 len(pages_split) > 1
940 and pages_split[0] != pages_split[1]
941 and pages_split[1].isnumeric()
942 ):
943 article.lpage = pages_split[1]
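# Examples (illustrative):
#   set_pages(xarticle, "12-34")                  -> fpage="12", lpage="34"
#   set_pages(xarticle, "12--34", separator="--") -> fpage="12", lpage="34"
#   set_pages(xarticle, "ix-xii")                 -> nothing set (pages are not numeric)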