Coverage for src/crawler/base_crawler.py: 68%
520 statements
coverage.py v7.9.0, created at 2025-11-21 14:41 +0000

import logging
import time
from concurrent.futures import (
    Executor,
    ThreadPoolExecutor,
)
from datetime import datetime, timedelta
from email.policy import EmailPolicy

import regex
import requests
from bs4 import BeautifulSoup
from django.conf import settings
from django.contrib.auth.models import User
from django.utils import timezone
from langcodes import standardize_tag
from lingua import LanguageDetector, LanguageDetectorBuilder
from opentelemetry import trace
from ptf.cmds.xml.ckeditor.utils import (
    build_jats_data_from_html_field,
)
from ptf.cmds.xml.jats.builder.references import (
    get_article_title_xml,
    get_author_xml,
    get_fpage_xml,
    get_lpage_xml,
    get_source_xml,
    get_year_xml,
)
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import (
    ArticleData,
    ContributorDict,
    IssueData,
    ResourceData,
    TitleDict,
    create_abstract,
    create_contributor,
    create_extlink,
    create_issuedata,
    create_publisherdata,
    create_titledata,
)
from ptf.model_data_converter import update_data_for_jats
from pylatexenc.latex2text import LatexNodes2Text
from pymongo.errors import DocumentTooLarge
from pysolr import SolrError
from requests.adapters import HTTPAdapter
from requests_cache import CachedSession, MongoCache
from urllib3 import Retry

from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd
from crawler.models import Source
from crawler.types import CitationLiteral
from crawler.utils import (
    add_pdf_link_to_xarticle,
    cleanup_str,
    get_all_cols,
    get_or_create_collection,
)


class CrawlerTitleDict(TitleDict):
    title_tex: str | None


class BaseCollectionCrawler:
    """
    Base class for the collection crawlers.
    To create a crawler:
    1) derive a class from BaseCollectionCrawler and name it XXXCrawler
    2) override the functions parse_collection_content, parse_issue_content and parse_article_content
    3) update factory.py so that crawler_factory can return your new crawler
    """
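
    # A minimal subclass sketch (illustrative only: the class name, source_* values
    # and CSS selector below are hypothetical, not part of this module):
    #
    # class XXXCrawler(BaseCollectionCrawler):
    #     source_name = "Example source"
    #     source_domain = "XXX"
    #     source_website = "https://example.org"
    #
    #     def parse_collection_content(self, content):
    #         soup = BeautifulSoup(content, "html.parser")
    #         return [
    #             self.create_xissue(a.get("href"), "2000", "6", None)
    #             for a in soup.select("a.issue")
    #         ]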

    logger = logging.getLogger(__name__)
    tracer = trace.get_tracer(__name__)

    source_name = ""
    source_domain = ""
    source_website = ""

    issue_href = ""

    collection = None
    source = None
    user = None
    session: requests.Session | CachedSession

    verify = True
    headers = {
        "accept_encoding": "utf-8",
        "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
        "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
    }

    next_allowed_request: float = time.time()

    # seconds to wait between two http requests
    requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
    # seconds to wait before aborting the connection (if no bytes are received)
    requests_timeout = 60

    latext_parser = LatexNodes2Text()

    # Override these values in your concrete crawler if the formulas in text (titles, abstracts)
    # do not use "$" to surround tex formulas
    delimiter_inline_formula = "$"
    delimiter_disp_formula = "$"

    # HACK : Workaround for tests (monkeypatching)
    # We store the class here, so we can monkeypatch it when running tests
    # subCrawlers = {
    #     LofplCrawler: None
    # }
    subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}

    _language_detector: LanguageDetector | None = None
    _language_detector_builder = LanguageDetectorBuilder.from_all_languages()

    force_refresh = False

    # Whether to include headers in the requests cache key
    match_headers = False
    orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"
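    # Illustrative match: "https://orcid.org/0000-0002-1825-0097" (captured as group "orcid").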

    # Set this to False in a concrete crawler to allow inserting articles without PDFs
    ignore_missing_pdf = True

    database_executor: Executor

    @classmethod
    def get_view_id(cls):
        return cls.source_domain

    @property
    def language_detector(self):
        """Per-instance singleton for the language detector.
        The LanguageDetector is built lazily to save memory."""
        if not self._language_detector:
            self._language_detector = self._language_detector_builder.build()
        return self._language_detector

    def __init__(
        self,
        *args,
        username: str,
        collection_id: str,
        dry: bool = False,
        publisher: str = "",
        force_refresh=False,
        collection_url: str | None = None,
    ):
        if not collection_url:
            all_cols = get_all_cols()
            col = all_cols[collection_id]
            collection_url = col["sources"][self.source_domain]
        self.collection_url = collection_url
        for CrawlerClass in self.subCrawlers:
            self.subCrawlers[CrawlerClass] = CrawlerClass(
                *args,
                username=username,
                collection_id=collection_id,
                dry=dry,
                publisher=publisher,
                collection_url=collection_url,
            )
        self.logger = logging.getLogger(__name__ + "." + self.source_domain)

        self.username = username

        self.collection_id = collection_id

        self.dry = dry
        self.publisher = publisher

        self.session = requests.session()

        # Skipped when running tests
        self.initialize()
        self.session.verify = self.verify
        self.force_refresh = force_refresh

        # We implemented custom retry behaviour, so we don't want to make extra requests here
        retries = Retry(
            total=0,
        )
        self.session.mount("https://", HTTPAdapter(max_retries=retries))
        self.session.mount("http://", HTTPAdapter(max_retries=retries))

        self.database_executor = ThreadPoolExecutor(
            max_workers=2, thread_name_prefix="crawler_database_thread"
        )

    def initialize(self):
        """
        Acts as a "second" init function to skip model accesses during test data generation
        """
        self.collection = get_or_create_collection(self.collection_id)
        self.source = self.get_or_create_source()
        self.user = User.objects.get(username=self.username)
        self.session = CachedSession(
            match_headers=self.match_headers,
            headers=self.headers,
            backend=MongoCache(
                host=getattr(settings, "MONGO_HOSTNAME", "localhost"),
            ),
            expire_after=timedelta(days=30),
        )

    @classmethod
    def can_crawl(cls, pid: str) -> bool:
        return True

    def parse_collection_content(self, content: str) -> list[IssueData]:
        """
        Parse the HTML content with BeautifulSoup.
        Returns a list of xissues.
        Override this function in a derived class.
        """
        return []

    def parse_issue_content(self, content: str, xissue: IssueData):
        """
        Parse the HTML content with BeautifulSoup.
        Fills xissue.articles.
        Override this function in a derived class.

        Caveat: you are expected to create the articles here. Assign a PID to each article,
        built as "a" + article_index, e.g. "a0", "a21".
        """

    def parse_article_content(
        self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
    ) -> ArticleData | None:
        """
        Parse the HTML content with BeautifulSoup.
        Returns the xarticle.
        Override this function in a derived class.
        The xissue is passed to the function in case the article page has issue information (ex: publisher).
        The article url is also passed as a parameter.

        Caveat: you are expected to assign the article PIDs again here.
        """
        return xarticle

    @tracer.start_as_current_span("crawl_collection")
    def crawl_collection(self):
        # TODO: Comments, filter
        """
        Crawl an entire collection. ptf.models.Container objects are created.
        - get the HTML content of the collection_url
        - parse the HTML content with BeautifulSoup to extract the list of issues
        - merge the xissues (some sources can have multiple pages for 1 volume/issue: we create only 1 container)
        - crawl each issue if col_only is False
        - return the xissues as a dict keyed by pid.
        The key is the pid of the merged issues.
        Ex: the source may have Volume 6 (2000) and Volume 6 (1999);
        the pid is then made with 1999-2000__6_
        """

        if self.source is None:
            raise RuntimeError("ERROR: the source is not set")

        content = self.download_file(self.collection_url)
        if content:
            xissues = self.parse_collection_content(content)
        else:
            # download_file returns None (404)
            return None

        """
        Some collections split the same volumes in different pages
        Ex: Volume 6 (2000) and Volume 6 (1999)
        We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
        """
        # merged_xissues = self.merge_xissues(xissues)

        xissues_dict = {str(i.pid): i for i in xissues}

        return xissues_dict

    @tracer.start_as_current_span("crawl_issue")
    def crawl_issue(self, xissue: IssueData):
        """
        Crawl one web page of an issue.
        - get the HTML content of the issue
        - parse the HTML content with BeautifulSoup to extract the list of articles and/or the issue metadata
        - crawl each article
        """

        # Some sources, like EuDML, do not have separate HTML pages for an issue's table of contents.
        # The list of articles comes directly from the collection HTML page: the xissue has no url attribute.

        issue_url = xissue.url
        if issue_url is not None:
            if issue_url.endswith(".pdf"):
                add_pdf_link_to_xarticle(xissue, issue_url)
                xissue.url = None
            else:
                content = self.download_file(issue_url)
                with self.tracer.start_as_current_span("parse_issue_content"):
                    self.parse_issue_content(content, xissue)

        xarticles = xissue.articles

        parsed_xarticles = []

        for xarticle in xarticles:
            parsed_xarticle = self.crawl_article(xarticle, xissue)
            if parsed_xarticle is not None:
                parsed_xarticles.append(parsed_xarticle)

        xissue.articles = parsed_xarticles

        article_has_pdf = self.article_has_pdf(xissue)

        if self.ignore_missing_pdf:
            xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)]

        if not self.dry and (len(xissue.articles) > 0 or article_has_pdf):
            self.process_resource_metadata(xissue, resource_type="issue")
            self.database_executor.submit(self.add_xissue_into_database, xissue)

    @staticmethod
    def article_has_source(art: ArticleData | IssueData):
        return (
            next(
                (e_link for e_link in art.ext_links if e_link["rel"] == "source"),
                None,
            )
            is not None
        )

    @staticmethod
    def article_has_pdf(art: ArticleData | IssueData):
        return (
            next((link for link in art.ext_links if link["rel"] == "article-pdf"), None)
            is not None
        )

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
        # ARTICLE URL as an ExtLink (to display the link in the article page)
        if xarticle.url is None:
            if not self.article_has_source(xarticle):
                if xissue.url:
                    article_source = xissue.url
                else:
                    article_source = self.collection_url
                ext_link = create_extlink()
                ext_link["rel"] = "source"
                ext_link["location"] = article_source
                ext_link["metadata"] = self.source_domain
                xarticle.ext_links.append(ext_link)
            return self.process_article_metadata(xarticle)

        content = self.download_file(xarticle.url)
        if not content:
            return None
        xarticle.pid = f"{xissue.pid}_{xarticle.pid}"

        try:
            with self.tracer.start_as_current_span("parse_article_content"):
                parsed_xarticle = self.parse_article_content(
                    content, xissue, xarticle, xarticle.url
                )
        except ValueError as e:
            self.logger.warning(e)
            self.logger.warning("Retrying in 5 mins while invalidating cache")
            time.sleep(5 * 60)
            content = self.download_file(xarticle.url, force_refresh=True)
            with self.tracer.start_as_current_span("parse_article_content"):
                parsed_xarticle = self.parse_article_content(
                    content, xissue, xarticle, xarticle.url
                )

        if parsed_xarticle is None:
            return None

        if parsed_xarticle.doi:
            parsed_xarticle.pid = (
                parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
            )

        if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
            ext_link = create_extlink()
            ext_link["rel"] = "source"
            ext_link["location"] = parsed_xarticle.url
            ext_link["metadata"] = self.source_domain
            parsed_xarticle.ext_links.append(ext_link)

        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle)

    def process_resource_metadata(self, xresource: ResourceData, resource_type="article"):
        tag = "article-title" if resource_type == "article" else "issue-title"

        # Process title tex
        ckeditor_data = build_jats_data_from_html_field(
            xresource.title_tex,
            tag=tag,
            text_lang=xresource.lang,
            delimiter_inline=self.delimiter_inline_formula,
            delimiter_disp=self.delimiter_disp_formula,
        )

        xresource.title_html = ckeditor_data["value_html"]
        # xresource.title_tex = ckeditor_data["value_tex"]
        xresource.title_xml = ckeditor_data["value_xml"]

        # Process trans_title tex
        if xresource.trans_title_tex:
            self.logger.warning(
                "Deprecation Notice : prefer using xresource.title directly instead of xresource.trans_title_tex"
            )
            trans_title = self.create_trans_title(
                xresource_lang=xresource.lang,
                resource_type=resource_type,
                title_tex=xresource.trans_title_tex,
                lang=xresource.trans_lang,
            )
            xresource.titles.append(trans_title)

        abstracts_to_parse = [
            xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract"
        ]
        # abstract may have formulas surrounded with '$'
        if len(abstracts_to_parse) > 0:
            for xabstract in abstracts_to_parse:
                ckeditor_data = build_jats_data_from_html_field(
                    xabstract["value_tex"],
                    tag="abstract",
                    text_lang=xabstract["lang"],
                    resource_lang=xresource.lang,
                    field_type="abstract",
                    delimiter_inline=self.delimiter_inline_formula,
                    delimiter_disp=self.delimiter_disp_formula,
                )

                xabstract["value_html"] = ckeditor_data["value_html"]
                # xabstract["value_tex"] = ckeditor_data["value_tex"]
                xabstract["value_xml"] = ckeditor_data["value_xml"]

        return xresource

    def process_article_metadata(self, xarticle: ArticleData):
        self.process_resource_metadata(xarticle)
        for bibitem in xarticle.bibitems:
            bibitem.type = "unknown"
        update_data_for_jats(xarticle, with_label=False)

        return xarticle

    def _wait_download_delay(self):
        delta = self.next_allowed_request - time.time()
        self.next_allowed_request = time.time() + self.requests_interval
        if delta > 0:
            self.logger.info(f"Waiting {int(delta)}s before making another request")
            time.sleep(delta)

    def _get(self, url: str, force_refresh=False, headers={}) -> requests.Response:
        """
        Wrapper around requests.get with delay based on the crawler class instance
        """

        kwargs = {}
        # self.session.cache.delete(urls=[url])
        if isinstance(self.session, CachedSession):
            kwargs["force_refresh"] = force_refresh

        try:
            response = self.session.get(
                url,
                headers={**self.headers, **headers},
                timeout=self.requests_timeout,
                **kwargs,
            )
        except DocumentTooLarge as e:
            self.logger.error(e)
            response = requests.get(
                url, headers={**self.headers, **headers}, timeout=self.requests_timeout
            )

        if not response.ok:
            raise requests.exceptions.HTTPError(
                f"Endpoint answered with code {response.status_code} : {url}",
                response=response,
            )

        if not getattr(response, "from_cache", False):
            self._wait_download_delay()
        return response

    def download_file(self, url: str, force_refresh=False, headers={}):
        """
        Downloads a page and returns its content (decoded string).
        This function handles retries and decoding
        """
        attempts = 0
        while True:
            try:
                if attempts > 0:
                    force_refresh = True
                response = self._get(
                    url, force_refresh=force_refresh or self.force_refresh, headers=headers
                )

                if getattr(response, "from_cache", False):
                    return response.text

                content = self.decode_response(response)
                if content == "" or not content:
                    raise requests.exceptions.HTTPError(response)

                if isinstance(self.session, CachedSession):
                    if "Expires" in response.headers:
                        del response.headers["Expires"]
                        del response.headers["Cache-Control"]
                    try:
                        self.session.cache.save_response(response)
                    except DocumentTooLarge as e:
                        self.logger.warning(e)
                return content
            except (
                requests.ConnectionError,
                requests.ConnectTimeout,
                requests.exceptions.HTTPError,
            ) as e:
                if isinstance(e, requests.exceptions.HTTPError):
                    # if Error 404 (resource not found) we skip
                    status_code = e.response.status_code
                    if status_code == 404:
                        return None
                    else:
                        raise e
                if attempts > 3:
                    raise e
                self.logger.debug(f"Caught error : {e}", extra={"url": url})
                attempts += 1
                # 15 mins, 30 mins, 45 mins
                delay_minutes = attempts * 15
                self.logger.debug(
                    f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})",
                    extra={"url": url},
                )
                time.sleep(delay_minutes * 60)

    def decode_response(self, response: requests.Response, encoding: str | None = None):
        """Override this if the Content-Type headers from the source advertise something
        other than the actual content encoding (SASA needs this)."""
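        # Fallback order below: explicit `encoding` argument, then the HTTP Content-Type
        # charset, then <meta charset>, then <meta http-equiv="Content-Type">.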
        # Forced encoding passed by the caller
        if encoding:
            response.encoding = encoding
            return response.text

        # Attempt to get encoding using HTTP headers
        content_type_tag = response.headers.get("Content-Type", None)

        if content_type_tag:
            charset = self.parse_content_type_charset(content_type_tag)
            if charset:
                response.encoding = charset
                return response.text

        # Attempt to get encoding using HTML meta charset tag
        soup = BeautifulSoup(response.text, "html5lib")
        charset = soup.select_one("meta[charset]")
        if charset:
            htmlencoding = charset.get("charset")
            if isinstance(htmlencoding, str):
                response.encoding = htmlencoding
                return response.text

        # Attempt to get encoding using HTML meta content type tag
        content_type_tag = soup.select_one('meta[http-equiv="Content-Type"]')
        if content_type_tag:
            content_type = content_type_tag.get("content")
            if isinstance(content_type, str):
                charset = self.parse_content_type_charset(content_type)
                if charset:
                    response.encoding = charset
                    return response.text

        return response.text

    @staticmethod
    def parse_content_type_charset(content_type: str):
        header = EmailPolicy.header_factory("content-type", content_type)
        if "charset" in header.params:
            return header.params.get("charset")

    @tracer.start_as_current_span("add_xissue_to_database")
    def add_xissue_into_database(self, xissue: IssueData):
        xissue.journal = self.collection
        xissue.source = self.source_domain

        if xissue.year == "":
            raise ValueError("Failsafe : Cannot insert issue without a year")

        xpub = create_publisherdata()
        xpub.name = self.publisher
        xissue.publisher = xpub
        xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()

        attempt = 1
        success = False

        while not success and attempt < 4:
            try:
                params = {"xissue": xissue, "use_body": False}
                cmd = addOrUpdateGDMLIssueXmlCmd(params)
                cmd.do()
                success = True
                self.logger.debug(f"Issue {xissue.pid} inserted in database")
            except SolrError:
                self.logger.warning(
                    f"Encountered SolrError while inserting issue {xissue.pid} in database"
                )
                attempt += 1
                self.logger.debug(f"Attempt {attempt}. sleeping 10 seconds.")
                time.sleep(10)
            except Exception as e:
                self.logger.error(
                    f"Got exception while attempting to insert {xissue.pid} in database : {e}"
                )
                raise e

        if success is False:
            raise ConnectionRefusedError("Cannot connect to SolR")

    def get_metadata_using_citation_meta(
        self,
        xarticle: ArticleData,
        xissue: IssueData,
        soup: BeautifulSoup,
        what: list[CitationLiteral] = [],
    ):
        """
        :param xarticle: the xarticle that will collect the metadata
        :param xissue: the xissue that will collect the publisher
        :param soup: the BeautifulSoup object of the article page
        :param what: list of citation_* meta items to collect.
        :return: None. The given article is modified
        """
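        # Typical call (illustrative only; the soup/xarticle/xissue values are assumed to
        # come from parse_article_content in a concrete crawler):
        #   self.get_metadata_using_citation_meta(
        #       xarticle, xissue, soup, ["title", "author", "pdf", "lang", "abstract"]
        #   )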
        if "title" in what:
            # TITLE
            citation_title_node = soup.select_one("meta[name='citation_title']")
            if citation_title_node:
                title = citation_title_node.get("content")
                if isinstance(title, str):
                    xarticle.title_tex = title

        if "author" in what:
            # AUTHORS
            citation_author_nodes = soup.select("meta[name^='citation_author']")
            current_author: ContributorDict | None = None
            for citation_author_node in citation_author_nodes:
                if citation_author_node.get("name") == "citation_author":
                    text_author = citation_author_node.get("content")
                    if not isinstance(text_author, str):
                        raise ValueError("Cannot parse author")
                    if text_author == "":
                        current_author = None
                        continue
                    current_author = create_contributor(role="author", string_name=text_author)
                    xarticle.contributors.append(current_author)
                    continue
                if current_author is None:
                    self.logger.warning("Couldn't parse citation author")
                    continue
                if citation_author_node.get("name") == "citation_author_institution":
                    text_institution = citation_author_node.get("content")
                    if not isinstance(text_institution, str):
                        continue
                    current_author["addresses"].append(text_institution)
                # NOTE: the meta name below is matched as written ("citation_author_ocrid");
                # it may be intended as "citation_author_orcid".
                if citation_author_node.get("name") == "citation_author_ocrid":
                    text_orcid = citation_author_node.get("content")
                    if not isinstance(text_orcid, str):
                        continue
                    current_author["orcid"] = text_orcid

        if "pdf" in what:
            # PDF
            citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
            if citation_pdf_node:
                pdf_url = citation_pdf_node.get("content")
                if isinstance(pdf_url, str):
                    add_pdf_link_to_xarticle(xarticle, pdf_url)

        if "lang" in what:
            # LANG
            citation_lang_node = soup.select_one("meta[name='citation_language']")
            if citation_lang_node:
                # TODO: check other language code
                content_text = citation_lang_node.get("content")
                if isinstance(content_text, str):
                    xarticle.lang = standardize_tag(content_text)

        if "abstract" in what:
            # ABSTRACT
            abstract_node = soup.select_one("meta[name='citation_abstract']")
            if abstract_node is not None:
                abstract = abstract_node.get("content")
                if not isinstance(abstract, str):
                    raise ValueError("Couldn't parse abstract from meta")
                abstract = BeautifulSoup(abstract, "html.parser").text
                lang = abstract_node.get("lang")
                if not isinstance(lang, str):
                    lang = self.detect_language(abstract, xarticle)
                xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))

        if "page" in what:
            # PAGES
            citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
            if citation_fpage_node:
                page = citation_fpage_node.get("content")
                if isinstance(page, str):
                    page = page.split("(")[0]
                    if len(page) < 32:
                        xarticle.fpage = page

            citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
            if citation_lpage_node:
                page = citation_lpage_node.get("content")
                if isinstance(page, str):
                    page = page.split("(")[0]
                    if len(page) < 32:
                        xarticle.lpage = page

        if "doi" in what:
            # DOI
            citation_doi_node = soup.select_one("meta[name='citation_doi']")
            if citation_doi_node:
                doi = citation_doi_node.get("content")
                if isinstance(doi, str):
                    doi = doi.strip()
                    pos = doi.find("10.")
                    if pos > 0:
                        doi = doi[pos:]
                    xarticle.doi = doi

        if "mr" in what:
            # MR
            citation_mr_node = soup.select_one("meta[name='citation_mr']")
            if citation_mr_node:
                mr = citation_mr_node.get("content")
                if isinstance(mr, str):
                    mr = mr.strip()
                    if mr.find("MR") == 0:
                        mr = mr[2:]
                        xarticle.extids.append(("mr-item-id", mr))

        if "zbl" in what:
            # ZBL
            citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
            if citation_zbl_node:
                zbl = citation_zbl_node.get("content")
                if isinstance(zbl, str):
                    zbl = zbl.strip()
                    if zbl.find("Zbl") == 0:
                        zbl = zbl[3:].strip()
                        xarticle.extids.append(("zbl-item-id", zbl))

        if "publisher" in what:
            # PUBLISHER
            citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
            if citation_publisher_node:
                pub = citation_publisher_node.get("content")
                if isinstance(pub, str):
                    pub = pub.strip()
                    if pub != "":
                        xpub = create_publisherdata()
                        xpub.name = pub
                        xissue.publisher = xpub

        if "keywords" in what:
            # KEYWORDS
            citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
            for kwd_node in citation_kwd_nodes:
                kwds = kwd_node.get("content")
                if isinstance(kwds, str):
                    kwds = kwds.split(",")
                    for kwd in kwds:
                        if kwd == "":
                            continue
                        kwd = kwd.strip()
                        xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})

        if "references" in what:
            citation_references = soup.select("meta[name='citation_reference']")
            for index, tag in enumerate(citation_references):
                content = tag.get("content")
                if not isinstance(content, str):
                    raise ValueError("Cannot parse citation_reference meta")
                label = str(index + 1)
                if regex.match(r"^\[\d+\].*", content):
                    label = None
                xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label))

    def create_xissue(
        self,
        url: str | None,
        year: str,
        volume_number: str | None,
        issue_number: str | None = "1",
        vseries: str | None = None,
    ):
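        # Illustrative call: create_xissue("https://example.org/vol6/", "2000", "6")
        # strips the trailing "/" from the url and derives the pid via get_issue_pid.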
        if url is not None and url.endswith("/"):
            url = url[:-1]
        xissue = create_issuedata()
        xissue.url = url

        xissue.pid = self.get_issue_pid(
            self.collection_id, year, volume_number, issue_number, vseries
        )

        xissue.year = year

        if volume_number is not None:
            xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number)

        if issue_number is not None:
            xissue.number = issue_number.replace(",", "-")

        if vseries is not None:
            xissue.vseries = vseries
        return xissue

    def detect_language(self, text: str, article: ArticleData | None = None):
        if article and article.lang is not None and article.lang != "und":
            return article.lang

        language = self.language_detector.detect_language_of(text)

        if not language:
            return "und"
        return language.iso_code_639_1.name.lower()

    def create_trans_title(
        self,
        resource_type: str,
        title_tex: str,
        lang: str,
        xresource_lang: str,
        title_type: str = "main",
    ):
        tag = "trans-title" if resource_type == "article" else "issue-title"

        ckeditor_data = build_jats_data_from_html_field(
            title_tex,
            tag=tag,
            text_lang=lang,
            resource_lang=xresource_lang,
            delimiter_inline=self.delimiter_inline_formula,
            delimiter_disp=self.delimiter_disp_formula,
        )

        titledata = create_titledata(
            lang=lang,
            type="main",
            title_html=ckeditor_data["value_html"],
            title_xml=ckeditor_data["value_xml"],
        )

        return titledata

    references_mapping = {
        "citation_title": get_article_title_xml,
        "citation_journal_title": get_source_xml,
        "citation_publication_date": get_year_xml,
        "citation_firstpage": get_fpage_xml,
        "citation_lastpage": get_lpage_xml,
    }

    @classmethod
    def __parse_meta_citation_reference(cls, content: str, label=None):
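        # Expected content formats (illustrative): either a single free-text reference with
        # no ";"-separated fields, or a string such as
        #   "citation_author=Doe J.; citation_title=Some title; citation_journal_title=Some journal"
        # whose "key=value" pairs are rendered via references_mapping.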
        categories = content.split(";")

        if len(categories) == 1:
            return JatsBase.bake_ref(content, label=label)

        citation_data = [c.split("=") for c in categories if "=" in c]
        del categories

        xml_string = ""
        authors_parsed = False
        authors_strings = []
        for data in citation_data:
            key = data[0].strip()
            citation_content = data[1]
            if key == "citation_author":
                authors_strings.append(get_author_xml(template_str=citation_content))
                continue
            elif not authors_parsed:
                xml_string += ", ".join(authors_strings)
                authors_parsed = True

            if key in cls.references_mapping:
                xml_string += " " + cls.references_mapping[key](citation_content)

        return JatsBase.bake_ref(xml_string, label=label)

    @classmethod
    def get_or_create_source(cls):
        source, created = Source.objects.get_or_create(
            domain=cls.source_domain,
            defaults={
                "name": cls.source_name,
                "website": cls.source_website,
                "view_id": cls.get_view_id(),
            },
        )
        if created:
            source.save()
        return source

    @staticmethod
    def get_issue_pid(
        collection_id: str,
        year: str,
        volume_number: str | None = None,
        issue_number: str | None = None,
        series: str | None = None,
    ):
        # Replace any non-word character with an underscore
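        # Illustrative result (assuming cleanup_str leaves plain ASCII unchanged):
        #   get_issue_pid("AMBP", "1999-2000", "6") -> "AMBP_1999-2000_6"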
        pid = f"{collection_id}_{year}"
        if series is not None:
            pid += f"_{series}"
        if volume_number is not None:
            pid += f"_{volume_number}"
        if issue_number is not None:
            pid += f"_{issue_number}"
        pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid))
        return pid

    @staticmethod
    def set_pages(article: ArticleData, pages: str, separator: str = "-"):
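        # Illustrative: set_pages(article, "12-34") sets article.fpage = "12" and article.lpage = "34".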
        pages_split = pages.split(separator)
        if len(pages_split) == 0:
            article.page_range = pages
        if len(pages_split) > 0:
            if pages[0].isnumeric():
                article.fpage = pages_split[0]
                if (
                    len(pages_split) > 1
                    and pages_split[0] != pages_split[1]
                    and pages_split[1].isnumeric()
                ):
                    article.lpage = pages_split[1]