Coverage for src/crawler/base_crawler.py: 67%
553 statements
coverage.py v7.12.0, created at 2025-12-11 14:57 +0000
1import logging
2import time
3from concurrent.futures import (
4 Executor,
5 ThreadPoolExecutor,
6)
7from datetime import datetime, timedelta
8from email.policy import EmailPolicy
9from typing import TYPE_CHECKING, Any
11import regex
12import requests
13from bs4 import BeautifulSoup
14from django.conf import settings
15from django.contrib.auth.models import User
16from django.utils import timezone
17from langcodes import standardize_tag
18from lingua import LanguageDetector, LanguageDetectorBuilder
19from opentelemetry import trace
20from ptf.cmds.xml.ckeditor.utils import (
21 build_jats_data_from_html_field,
22)
23from ptf.cmds.xml.jats.builder.references import (
24 get_article_title_xml,
25 get_author_xml,
26 get_fpage_xml,
27 get_lpage_xml,
28 get_source_xml,
29 get_year_xml,
30)
31from ptf.cmds.xml.jats.jats_parser import JatsBase
32from ptf.model_data import (
33 ArticleData,
34 ContributorDict,
35 IssueData,
36 ResourceData,
37 TitleDict,
38 create_abstract,
39 create_contributor,
40 create_extlink,
41 create_issuedata,
42 create_publisherdata,
43 create_titledata,
44)
45from ptf.model_data_converter import update_data_for_jats
46from ptf.models import ExtLink
47from pylatexenc.latex2text import LatexNodes2Text
48from pymongo.errors import DocumentTooLarge
49from pysolr import SolrError
50from requests.adapters import HTTPAdapter
51from requests.models import Response
52from requests_cache import CachedSession, MongoCache
53from urllib3 import Retry
55from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd
56from crawler.models import Source
57from crawler.models.extlink_checked import ExtlinkChecked
58from crawler.types import CitationLiteral
59from crawler.utils import (
60 add_pdf_link_to_xarticle,
61 cleanup_str,
62 get_all_cols,
63 get_or_create_collection,
64)
66if TYPE_CHECKING:
67 from concurrent.futures import Future
70class CrawlerTitleDict(TitleDict):
71 title_tex: str | None
74class BaseCollectionCrawler:
75 """
76 Base collection for the crawlers.
77 To create a crawler:
78 1) derive a class from BaseCollectionCrawler and name it XXXCrawler
79 2) override the functions parse_collection_content, parse_issue_content and parse_article_content
80 3) update factory.py so that crawler_factory can return your new crawler
81 """
83 logger = logging.getLogger(__name__)
84 tracer = trace.get_tracer(__name__)
86 source_name = ""
87 source_domain = ""
88 source_website = ""
90 issue_href = ""
92 collection = None
93 source = None
94 user = None
95 session: requests.Session | CachedSession
97 verify = True
98 headers = {
99 "accept_encoding": "utf-8",
100 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
101 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
102 }
104 next_allowed_request: float = time.time()
106 # seconds to wait between two http requests
107 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
108 # seconds to wait before aborting the connection (if no bytes are received)
109 requests_timeout = 60
111 latext_parser = LatexNodes2Text()
113 # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
114 # do not use the "$" to surround tex formulas
115 delimiter_inline_formula = "$"
116 delimiter_disp_formula = "$"
118 # HACK : Workaround for tests (monkeypatching)
119 # We store the class here, so we can monkeypatch it when running tests
120 # subCrawlers = {
121 # LofplCrawler: None
122 # }
123 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}
125 _language_detector: LanguageDetector | None = None
126 _language_detector_builder = LanguageDetectorBuilder.from_all_languages()
128 force_refresh = False
130 # Whether to include headers in the requests cache key
131 match_headers = False
132 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"
134 # Set this to False in a concrete crawler to allow inserting articles without PDFs
135 ignore_missing_pdf = True
137 database_executor: Executor
138 exception: Exception | None = None
140 @classmethod
141 def get_view_id(cls):
142 return cls.source_domain
144 @property
145 def language_detector(self):
146 """Crawler Instance singleton for language builder.
147 Late init of LanguageDetector to save on memory"""
148 if not self._language_detector:
149 self._language_detector = self._language_detector_builder.build()
150 return self._language_detector
152 def __init__(
153 self,
154 *args,
155 username: str,
156 collection_id: str,
157 dry: bool = False,
158 publisher: str = "",
159 force_refresh=False,
160 collection_url: str | None = None,
161 ):
162 if not collection_url: 162 ↛ 163
163 all_cols = get_all_cols()
164 col = all_cols[collection_id]
166 collection_url = col["sources"].get(self.source_domain, None)
167 if collection_url is None:
168 raise ValueError(
169 f"Source {self.source_domain} not found for collection {collection_id}"
170 )
171 self.collection_url = collection_url
172 for CrawlerClass in self.subCrawlers: 172 ↛ 173
173 self.subCrawlers[CrawlerClass] = CrawlerClass(
174 *args,
175 username=username,
176 collection_id=collection_id,
177 dry=dry,
178 publisher=publisher,
179 collection_url=collection_url,
180 )
181 self.logger = logging.getLogger(__name__ + "." + self.source_domain)
183 self.username = username
185 self.collection_id = collection_id
187 self.dry = dry
188 self.publisher = publisher
190 self.session = requests.session()
192 # Skipped when running tests
193 self.initialize()
194 self.session.verify = self.verify
195 self.force_refresh = force_refresh
197 # We implemented custom retry behaviour, so we don't want to make extra requests here
198 retries = Retry(
199 total=0,
200 )
201 self.session.mount("https://", HTTPAdapter(max_retries=retries))
202 self.session.mount("http://", HTTPAdapter(max_retries=retries))
204 self.database_executor = ThreadPoolExecutor(
205 max_workers=1, thread_name_prefix="crawler_database_thread"
206 )
208 def initialize(self):
209 """
210 Acts as a "second" init function to skip model accesses during test data generation
211 """
212 self.collection = get_or_create_collection(self.collection_id)
213 self.source = self.get_or_create_source()
214 self.user = User.objects.get(username=self.username)
215 self.session = CachedSession(
216 match_headers=self.match_headers,
217 headers=self.headers,
218 backend=MongoCache(
219 host=getattr(settings, "MONGO_HOSTNAME", "localhost"), decode_content=False
220 ),
221 expire_after=timedelta(days=30),
222 )
224 @classmethod
225 def can_crawl(cls, pid: str) -> bool:
226 return True
228 def parse_collection_content(self, content: str) -> list[IssueData]:
229 """
230 Parse the HTML content with BeautifulSoup
231 Returns a list of xissues.
232 Override this function in a derived class.
233 """
234 return []
236 def parse_issue_content(self, content: str, xissue: IssueData):
237 """
238 Parse the HTML content with BeautifulSoup
239 Fills the xissue.articles
240 Override this function in a derived class.
242 CAVEAT: You are supposed to create the articles here. Please assign a PID to each article.
243 The PID can be `a + article_index`, e.g. `a0`, `a21`
244 """
246 def parse_article_content(
247 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
248 ) -> ArticleData | None:
249 """
250 Parse the HTML content with BeautifulSoup
251 Returns the xarticle.
252 Override this function in a derived class.
253 The xissue is passed to the function in case the article page has issue information (ex: publisher)
254 The article url is also passed as a parameter
256 CAVEAT: You are supposed to assign the article's PID again here
257 """
258 return xarticle
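# Illustrative sketch only: a typical override of parse_article_content in a concrete
# crawler, reusing the helpers defined below and in crawler.utils. The list passed as
# `what` is an example selection, not a required value.
#
# def parse_article_content(self, content, xissue, xarticle, url):
#     soup = BeautifulSoup(content, "html5lib")
#     self.get_metadata_using_citation_meta(
#         xarticle, xissue, soup, ["title", "author", "pdf", "abstract", "lang"]
#     )
#     return xarticle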
260 @tracer.start_as_current_span("crawl_collection")
261 def crawl_collection(self):
262 # TODO: Comments, filter
263 """
264 Crawl an entire collection. ptf.models.Container objects are created.
265 - get the HTML content of the collection_url
266 - parse the HTML content with beautifulsoup to extract the list of issues
267 - merge the xissues (some Source can have multiple pages for 1 volume/issue. We create only 1 container)
268 - crawl each issue if col_only is False
269 - Returns the merged issues.
270 It is a dict {pid: xissue}.
271 The key is the pid of the merged issue.
272 Ex: the source may have Volume 6 (2000) and Volume 6 (1999);
273 the pid is then made with 1999-2000__6_
274 """
276 if self.source is None:
277 raise RuntimeError("ERROR: the source is not set")
279 content = self.download_file(self.collection_url)
280 if content:
281 xissues = self.parse_collection_content(content)
282 else:
283 # download_file returns None (404)
284 return None
286 """
287 Some collections split the same volume across different pages
288 Ex: Volume 6 (2000) and Volume 6 (1999)
289 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
290 """
291 # merged_xissues = self.merge_xissues(xissues)
293 xissues_dict = {str(i.pid): i for i in xissues}
295 return xissues_dict
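# Illustrative usage sketch (names are hypothetical): a caller such as a management
# command could drive a concrete crawler like this. crawl_collection() may return
# None if the collection page could not be downloaded.
#
# crawler = ExampleCrawler(username="bot", collection_id="EXAMPLE_COL")
# xissues = crawler.crawl_collection() or {}
# for pid, xissue in xissues.items():
#     crawler.crawl_issue(xissue)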
297 @tracer.start_as_current_span("crawl_issue")
298 def crawl_issue(self, xissue: IssueData):
299 """
300 Crawl one web page of an issue.
301 - get the HTML content of the issue
302 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata
303 - crawl each article
304 """
306 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.
307 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute
308 issue_url = xissue.url
309 if issue_url is not None:
310 if issue_url.endswith(".pdf"):
311 add_pdf_link_to_xarticle(xissue, issue_url)
312 xissue.url = None
313 else:
314 content = self.download_file(issue_url)
315 with self.tracer.start_as_current_span("parse_issue_content"):
316 self.parse_issue_content(content, xissue)
318 xarticles = xissue.articles
320 parsed_xarticles = []
322 for xarticle in xarticles:
323 parsed_xarticle = self.crawl_article(xarticle, xissue)
324 if parsed_xarticle is not None:
325 parsed_xarticles.append(parsed_xarticle)
327 xissue.articles = parsed_xarticles
329 article_has_pdf = self.article_has_pdf(xissue)
331 if self.ignore_missing_pdf:
332 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)]
334 if not self.dry and (len(xissue.articles) > 0 or article_has_pdf):
335 self.process_resource_metadata(xissue, resource_type="issue")
336 if self.exception:
337 raise self.exception
338 self.database_executor.submit(self.add_xissue_into_database, xissue).add_done_callback(
339 self._issue_added_callback
340 )
342 def _issue_added_callback(self, future: "Future"):
343 exception = future.exception()
344 if exception:
345 self.exception = exception
346 self.database_executor.shutdown(wait=False, cancel_futures=True)
348 @staticmethod
349 def article_has_source(art: ArticleData | IssueData):
350 return (
351 next(
352 (e_link for e_link in art.ext_links if e_link["rel"] == "source"),
353 None,
354 )
355 is not None
356 )
358 @staticmethod
359 def article_has_pdf(art: ArticleData | IssueData):
360 return (
361 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None)
362 is not None
363 )
365 def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
366 # ARTICLE URL as an ExtLink (to display the link on the article page)
367 if xarticle.url is None:
368 if not self.article_has_source(xarticle): 368 ↛ 378
369 if xissue.url:
370 article_source = xissue.url
371 else:
372 article_source = self.collection_url
373 ext_link = create_extlink()
374 ext_link["rel"] = "source"
375 ext_link["location"] = article_source
376 ext_link["metadata"] = self.source_domain
377 xarticle.ext_links.append(ext_link)
378 return self.process_article_metadata(xarticle)
380 content = self.download_file(xarticle.url)
381 xarticle.pid = f"{xissue.pid}_{xarticle.pid}"
383 try:
384 with self.tracer.start_as_current_span("parse_article_content"):
385 parsed_xarticle = self.parse_article_content(
386 content, xissue, xarticle, xarticle.url
387 )
388 except ValueError as e:
389 self.logger.warning(e)
390 self.logger.warning("Retrying in 5 mins while invalidating cache")
391 time.sleep(5 * 60)
392 content = self.download_file(xarticle.url, force_refresh=True)
393 with self.tracer.start_as_current_span("parse_article_content"):
394 parsed_xarticle = self.parse_article_content(
395 content, xissue, xarticle, xarticle.url
396 )
398 if parsed_xarticle is None: 398 ↛ 399
399 return None
401 if parsed_xarticle.doi:
402 parsed_xarticle.pid = (
403 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
404 )
406 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
407 ext_link = create_extlink()
408 ext_link["rel"] = "source"
409 ext_link["location"] = parsed_xarticle.url
410 ext_link["metadata"] = self.source_domain
411 parsed_xarticle.ext_links.append(ext_link)
413 # The article title may have formulas surrounded with '$'
414 return self.process_article_metadata(parsed_xarticle)
416 def process_resource_metadata(self, xresource: ResourceData, resource_type="article"):
417 tag = "article-title" if resource_type == "article" else "issue-title"
419 # Process title tex
420 ckeditor_data = build_jats_data_from_html_field(
421 xresource.title_tex,
422 tag=tag,
423 text_lang=xresource.lang,
424 delimiter_inline=self.delimiter_inline_formula,
425 delimiter_disp=self.delimiter_disp_formula,
426 )
428 xresource.title_html = ckeditor_data["value_html"]
429 # xresource.title_tex = ckeditor_data["value_tex"]
430 xresource.title_xml = ckeditor_data["value_xml"]
432 # Process trans_title tex
433 if xresource.trans_title_tex: 433 ↛ 434
434 self.logger.warning(
435 "Deprecation Notice : prefer using xresource.title directly instead of xresource.trans_title_tex"
436 )
437 trans_title = self.create_trans_title(
438 xresource_lang=xresource.lang,
439 resource_type=resource_type,
440 title_tex=xresource.trans_title_tex,
441 lang=xresource.trans_lang,
442 )
443 xresource.titles.append(trans_title)
445 abstracts_to_parse = [
446 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract"
447 ]
448 # abstract may have formulas surrounded with '$'
449 if len(abstracts_to_parse) > 0:
450 for xabstract in abstracts_to_parse:
451 ckeditor_data = build_jats_data_from_html_field(
452 xabstract["value_tex"],
453 tag="abstract",
454 text_lang=xabstract["lang"],
455 resource_lang=xresource.lang,
456 field_type="abstract",
457 delimiter_inline=self.delimiter_inline_formula,
458 delimiter_disp=self.delimiter_disp_formula,
459 )
461 xabstract["value_html"] = ckeditor_data["value_html"]
462 # xabstract["value_tex"] = ckeditor_data["value_tex"]
463 xabstract["value_xml"] = ckeditor_data["value_xml"]
465 return xresource
467 def process_article_metadata(self, xarticle: ArticleData):
468 self.process_resource_metadata(xarticle)
469 for bibitem in xarticle.bibitems:
470 bibitem.type = "unknown"
471 update_data_for_jats(xarticle, with_label=False)
473 return xarticle
475 def _wait_download_delay(self):
476 delta = self.next_allowed_request - time.time()
477 self.next_allowed_request = time.time() + self.requests_interval
478 if delta > 0: 478 ↛ 479
479 self.logger.info(f"Waiting {int(delta)}s before making another request")
480 time.sleep(delta)
482 def _get(self, url: str, force_refresh=False, headers={}) -> requests.Response:
483 """
484 Wrapper around requests.get with delay based on the crawler class instance
485 """
487 kwargs = {}
488 # self.session.cache.delete(urls=[url])
489 if isinstance(self.session, CachedSession):
490 kwargs["force_refresh"] = force_refresh
492 try:
493 response = self.session.get(
494 url,
495 headers={**self.headers, **headers},
496 timeout=self.requests_timeout,
497 **kwargs,
498 )
499 except DocumentTooLarge as e:
500 self.logger.error(e)
501 response = requests.get(
502 url, headers={**self.headers, **headers}, timeout=self.requests_timeout
503 )
505 if not response.ok:
506 raise requests.exceptions.HTTPError(
507 f"Endpoint answered with code {response.status_code} : {url}",
508 response=response,
509 )
511 if not getattr(response, "from_cache", False):
512 self._wait_download_delay()
513 return response
515 def download_file(self, url: str, force_refresh=False, headers={}):
516 """
517 Downloads a page and returns its content (decoded string).
518 This function handles retries and decoding
519 """
520 attempts = 0
521 while True:
522 try:
523 if attempts > 0:
524 force_refresh = True
525 response = self._get(
526 url, force_refresh=force_refresh or self.force_refresh, headers=headers
527 )
529 content = self.decode_response(response)
530 if content == "" or not content:
531 raise requests.exceptions.HTTPError(response)
533 return content
534 except (
535 requests.ConnectionError,
536 requests.ConnectTimeout,
537 requests.exceptions.HTTPError,
538 ) as e:
539 if attempts > 3:
540 raise e
541 self.logger.debug(f"Caught error : {e}", extra={"url": url})
542 attempts += 1
543 # 15 mins, 30 mins, 45 mins, 60 mins
544 delay_minutes = attempts * 15
545 self.logger.debug(
546 f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})",
547 extra={"url": url},
548 )
549 time.sleep(delay_minutes * 60)
551 def decode_response(self, response: requests.Response, encoding: str | None = None):
552 """Override this if the content-type headers from the sources are advertising something else than the actual content
553 SASA needs this"""
554 # Force
555 if encoding:
556 response.encoding = encoding
557 return response.text
559 # Attempt to get encoding using HTTP headers
560 content_type_tag = response.headers.get("Content-Type", None)
562 if content_type_tag: 562 ↛ 569
563 charset = self.parse_content_type_charset(content_type_tag)
564 if charset: 564 ↛ 565
565 response.encoding = charset
566 return response.text
568 # Attempt to get encoding using HTML meta charset tag
569 soup = BeautifulSoup(response.text, "html5lib")
570 charset = soup.select_one("meta[charset]")
571 if charset:
572 htmlencoding = charset.get("charset")
573 if isinstance(htmlencoding, str): 573 ↛ 578
574 response.encoding = htmlencoding
575 return response.text
577 # Attempt to get encoding using HTML meta content type tag
578 content_type_tag = soup.select_one(
579 'meta[http-equiv="Content-Type"],meta[http-equiv="content-type"]'
580 )
581 if content_type_tag:
582 content_type = content_type_tag.get("content")
583 if isinstance(content_type, str): 583 ↛ 589
584 charset = self.parse_content_type_charset(content_type)
585 if charset: 585 ↛ 589
586 response.encoding = charset
587 return response.text
589 return response.text
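# Illustrative sketch only: a concrete crawler whose source advertises a wrong
# charset could override decode_response and force one (the "windows-1250" value
# is a hypothetical example, not taken from any crawler in this project).
#
# def decode_response(self, response, encoding=None):
#     return super().decode_response(response, encoding="windows-1250")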
591 @staticmethod
592 def parse_content_type_charset(content_type: str):
593 header = EmailPolicy.header_factory("content-type", content_type)
594 if "charset" in header.params:
595 return header.params.get("charset")
597 @tracer.start_as_current_span("add_xissue_to_database")
598 def add_xissue_into_database(self, xissue: IssueData):
599 xissue.journal = self.collection
600 xissue.source = self.source_domain
602 if xissue.year == "":
603 raise ValueError("Failsafe : Cannot insert issue without a year")
605 xpub = create_publisherdata()
606 xpub.name = self.publisher
607 xissue.publisher = xpub
608 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()
610 attempt = 1
611 success = False
613 while not success and attempt < 4:
614 try:
615 params = {"xissue": xissue, "use_body": False}
616 cmd = addOrUpdateGDMLIssueXmlCmd(params)
617 cmd.do()
618 success = True
619 self.logger.debug(f"Issue {xissue.pid} inserted in database")
620 except SolrError:
621 self.logger.warning(
622 f"Encoutered SolrError while inserting issue {xissue.pid} in database"
623 )
624 attempt += 1
625 self.logger.debug(f"Attempt {attempt}. sleeping 10 seconds.")
626 time.sleep(10)
627 except Exception as e:
628 self.logger.error(
629 f"Got exception while attempting to insert {xissue.pid} in database : {e}"
630 )
631 raise e
633 if success is False:
634 raise ConnectionRefusedError("Cannot connect to SolR")
636 def get_metadata_using_citation_meta(
637 self,
638 xarticle: ArticleData,
639 xissue: IssueData,
640 soup: BeautifulSoup,
641 what: list[CitationLiteral] = [],
642 ):
643 """
644 :param xarticle: the xarticle that will collect the metadata
645 :param xissue: the xissue that will collect the publisher
646 :param soup: the BeautifulSoup object of the article page
647 :param what: list of citation_* items to collect.
648 :return: None. The given article is modified in place.
649 """
651 if "title" in what:
652 # TITLE
653 citation_title_node = soup.select_one("meta[name='citation_title']")
654 if citation_title_node: 654 ↛ 659
655 title = citation_title_node.get("content")
656 if isinstance(title, str): 656 ↛ 659
657 xarticle.title_tex = title
659 if "author" in what: 659 ↛ 688line 659 didn't jump to line 688 because the condition on line 659 was always true
660 # AUTHORS
661 citation_author_nodes = soup.select("meta[name^='citation_author']")
662 current_author: ContributorDict | None = None
663 for citation_author_node in citation_author_nodes:
664 if citation_author_node.get("name") == "citation_author":
665 text_author = citation_author_node.get("content")
666 if not isinstance(text_author, str): 666 ↛ 667
667 raise ValueError("Cannot parse author")
668 if text_author == "": 668 ↛ 669line 668 didn't jump to line 669 because the condition on line 668 was never true
669 current_author = None
670 continue
671 current_author = create_contributor(role="author", string_name=text_author)
672 xarticle.contributors.append(current_author)
673 continue
674 if current_author is None: 674 ↛ 675
675 self.logger.warning("Couldn't parse citation author")
676 continue
677 if citation_author_node.get("name") == "citation_author_institution":
678 text_institution = citation_author_node.get("content")
679 if not isinstance(text_institution, str): 679 ↛ 680
680 continue
681 current_author["addresses"].append(text_institution)
682 if citation_author_node.get("name") == "citation_author_ocrid": 682 ↛ 683line 682 didn't jump to line 683 because the condition on line 682 was never true
683 text_orcid = citation_author_node.get("content")
684 if not isinstance(text_orcid, str):
685 continue
686 current_author["orcid"] = text_orcid
688 if "pdf" in what:
689 # PDF
690 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
691 if citation_pdf_node:
692 pdf_url = citation_pdf_node.get("content")
693 if isinstance(pdf_url, str): 693 ↛ 696
694 add_pdf_link_to_xarticle(xarticle, pdf_url)
696 if "lang" in what:
697 # LANG
698 citation_lang_node = soup.select_one("meta[name='citation_language']")
699 if citation_lang_node: 699 ↛ 705
700 # TODO: check other language code
701 content_text = citation_lang_node.get("content")
702 if isinstance(content_text, str): 702 ↛ 705
703 xarticle.lang = standardize_tag(content_text)
705 if "abstract" in what:
706 # ABSTRACT
707 abstract_node = soup.select_one("meta[name='citation_abstract']")
708 if abstract_node is not None:
709 abstract = abstract_node.get("content")
710 if not isinstance(abstract, str): 710 ↛ 711
711 raise ValueError("Couldn't parse abstract from meta")
712 abstract = BeautifulSoup(abstract, "html.parser").text
713 lang = abstract_node.get("lang")
714 if not isinstance(lang, str):
715 lang = self.detect_language(abstract, xarticle)
716 xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))
718 if "page" in what:
719 # PAGES
720 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
721 if citation_fpage_node:
722 page = citation_fpage_node.get("content")
723 if isinstance(page, str): 723 ↛ 728
724 page = page.split("(")[0]
725 if len(page) < 32: 725 ↛ 728
726 xarticle.fpage = page
728 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
729 if citation_lpage_node:
730 page = citation_lpage_node.get("content")
731 if isinstance(page, str): 731 ↛ 736
732 page = page.split("(")[0]
733 if len(page) < 32: 733 ↛ 736
734 xarticle.lpage = page
736 if "doi" in what:
737 # DOI
738 citation_doi_node = soup.select_one("meta[name='citation_doi']")
739 if citation_doi_node:
740 doi = citation_doi_node.get("content")
741 if isinstance(doi, str): 741 ↛ 748
742 doi = doi.strip()
743 pos = doi.find("10.")
744 if pos > 0:
745 doi = doi[pos:]
746 xarticle.doi = doi
748 if "mr" in what:
749 # MR
750 citation_mr_node = soup.select_one("meta[name='citation_mr']")
751 if citation_mr_node:
752 mr = citation_mr_node.get("content")
753 if isinstance(mr, str): 753 ↛ 759
754 mr = mr.strip()
755 if mr.find("MR") == 0: 755 ↛ 759line 755 didn't jump to line 759 because the condition on line 755 was always true
756 mr = mr[2:]
757 xarticle.extids.append(("mr-item-id", mr))
759 if "zbl" in what:
760 # ZBL
761 citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
762 if citation_zbl_node:
763 zbl = citation_zbl_node.get("content")
764 if isinstance(zbl, str): 764 ↛ 770
765 zbl = zbl.strip()
766 if zbl.find("Zbl") == 0: 766 ↛ 770line 766 didn't jump to line 770 because the condition on line 766 was always true
767 zbl = zbl[3:].strip()
768 xarticle.extids.append(("zbl-item-id", zbl))
770 if "publisher" in what:
771 # PUBLISHER
772 citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
773 if citation_publisher_node:
774 pub = citation_publisher_node.get("content")
775 if isinstance(pub, str): 775 ↛ 782
776 pub = pub.strip()
777 if pub != "": 777 ↛ 782line 777 didn't jump to line 782 because the condition on line 777 was always true
778 xpub = create_publisherdata()
779 xpub.name = pub
780 xissue.publisher = xpub
782 if "keywords" in what:
783 # KEYWORDS
784 citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
785 for kwd_node in citation_kwd_nodes:
786 kwds = kwd_node.get("content")
787 if isinstance(kwds, str): 787 ↛ 785
788 kwds = kwds.split(",")
789 for kwd in kwds:
790 if kwd == "":
791 continue
792 kwd = kwd.strip()
793 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})
795 if "references" in what:
796 citation_references = soup.select("meta[name='citation_reference']")
797 for index, tag in enumerate(citation_references):
798 content = tag.get("content")
799 if not isinstance(content, str): 799 ↛ 800
800 raise ValueError("Cannot parse citation_reference meta")
801 label = str(index + 1)
802 if regex.match(r"^\[\d+\].*", content): 802 ↛ 803line 802 didn't jump to line 803 because the condition on line 802 was never true
803 label = None
804 xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label))
806 def create_xissue(
807 self,
808 url: str | None,
809 year: str,
810 volume_number: str | None,
811 issue_number: str | None = None,
812 vseries: str | None = None,
813 ):
814 if url is not None and url.endswith("/"):
815 url = url[:-1]
816 xissue = create_issuedata()
817 xissue.url = url
819 xissue.pid = self.get_issue_pid(
820 self.collection_id, year, volume_number, issue_number, vseries
821 )
823 xissue.year = year
825 if volume_number is not None:
826 xissue.volume = regex.sub(r"[^\w-]+", "_", volume_number)
828 if issue_number is not None:
829 xissue.number = issue_number.replace(",", "-")
831 if vseries is not None: 831 ↛ 832
832 xissue.vseries = vseries
833 return xissue
835 def detect_language(self, text: str, article: ArticleData | None = None):
836 if article and article.lang is not None and article.lang != "und":
837 return article.lang
839 language = self.language_detector.detect_language_of(text)
841 if not language: 841 ↛ 842
842 return "und"
843 return language.iso_code_639_1.name.lower()
845 def create_trans_title(
846 self,
847 resource_type: str,
848 title_tex: str,
849 lang: str,
850 xresource_lang: str,
851 title_type: str = "main",
852 ):
853 tag = "trans-title" if resource_type == "article" else "issue-title"
855 ckeditor_data = build_jats_data_from_html_field(
856 title_tex,
857 tag=tag,
858 text_lang=lang,
859 resource_lang=xresource_lang,
860 delimiter_inline=self.delimiter_inline_formula,
861 delimiter_disp=self.delimiter_disp_formula,
862 )
864 titledata = create_titledata(
865 lang=lang,
866 type="main",
867 title_html=ckeditor_data["value_html"],
868 title_xml=ckeditor_data["value_xml"],
869 )
871 return titledata
873 references_mapping = {
874 "citation_title": get_article_title_xml,
875 "citation_journal_title": get_source_xml,
876 "citation_publication_date": get_year_xml,
877 "citation_firstpage": get_fpage_xml,
878 "citation_lastpage": get_lpage_xml,
879 }
881 @classmethod
882 def __parse_meta_citation_reference(cls, content: str, label=None):
883 categories = content.split(";")
885 if len(categories) == 1:
886 return JatsBase.bake_ref(content, label=label)
888 citation_data = [c.split("=") for c in categories if "=" in c]
889 del categories
891 xml_string = ""
892 authors_parsed = False
893 authors_strings = []
894 for data in citation_data:
895 key = data[0].strip()
896 citation_content = data[1]
897 if key == "citation_author":
898 authors_strings.append(get_author_xml(template_str=citation_content))
899 continue
900 elif not authors_parsed:
901 xml_string += ", ".join(authors_strings)
902 authors_parsed = True
904 if key in cls.references_mapping:
905 xml_string += " " + cls.references_mapping[key](citation_content)
907 return JatsBase.bake_ref(xml_string, label=label)
909 @classmethod
910 def get_or_create_source(cls):
911 source, created = Source.objects.get_or_create(
912 domain=cls.source_domain,
913 defaults={
914 "name": cls.source_name,
915 "website": cls.source_website,
916 "view_id": cls.get_view_id(),
917 },
918 )
919 if created: 919 ↛ 920
920 source.save()
921 return source
923 @staticmethod
924 def get_issue_pid(
925 collection_id: str,
926 year: str,
927 volume_number: str | None = None,
928 issue_number: str | None = None,
929 series: str | None = None,
930 ):
931 # Replace any non-word character with an underscore
932 pid = f"{collection_id}_{year}"
933 if series is not None: 933 ↛ 934
934 pid += f"_{series}"
935 if volume_number is not None:
936 pid += f"_{volume_number}"
937 if issue_number is not None:
938 pid += f"_{issue_number}"
939 pid = regex.sub(r"[^\w-]+", "_", cleanup_str(pid))
940 return pid
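# For illustration (values are hypothetical): get_issue_pid("EXAMPLE", "2000", "6", "2")
# returns "EXAMPLE_2000_6_2"; any character that is neither a word character nor "-"
# is replaced by "_" by the regex above.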
942 @staticmethod
943 def set_pages(article: ArticleData, pages: str, separator: str = "-"):
944 pages_split = pages.split(separator)
945 if len(pages_split) == 0: 945 ↛ 946
946 article.page_range = pages
947 if len(pages_split) > 0: 947 ↛ exit
948 if pages[0].isnumeric(): 948 ↛ exit
949 article.fpage = pages_split[0]
950 if ( 950 ↛ 955
951 len(pages_split) > 1
952 and pages_split[0] != pages_split[1]
953 and pages_split[1].isnumeric()
954 ):
955 article.lpage = pages_split[1]
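# Illustrative usage (hypothetical values): set_pages(xarticle, "12-34") sets
# xarticle.fpage = "12" and xarticle.lpage = "34"; a single page string such as "7"
# only sets fpage.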
957 @classmethod
958 def check_pdf_link_validity(
959 cls, url: str, verify=True, session=requests.Session()
960 ) -> "tuple[bool, Response, dict[str, Any]]":
961 # Avoid downloading the whole PDF
962 CHUNK_SIZE = 10 # number of characters fetched
963 header = {
964 "Range": f"bytes=0-{CHUNK_SIZE}",
965 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
966 }
967 with session.get(
968 url, stream=True, allow_redirects=True, headers=header, verify=verify
969 ) as response:
970 content_type = response.headers.get("Content-Type")
971 if not content_type or "application/pdf" not in content_type: 971 ↛ 973
972 # Content type is wrong, let's check the PDF header
973 try:
974 pdf_header = next(response.iter_lines(chunk_size=CHUNK_SIZE))
975 if regex.match(r"^%PDF-\d\.\d", pdf_header.decode()) is None:
976 return (
977 False,
978 response,
979 {
980 "status": ExtlinkChecked.Status.ERROR,
981 "message": f"Content-Type header: {content_type}; PDF Header not found : got {pdf_header}",
982 },
983 )
984 else:
985 return (
986 True,
987 response,
988 {
989 "status": ExtlinkChecked.Status.WARNING,
990 "message": f"Content-Type header: {content_type}",
991 },
992 )
993 except StopIteration:
994 return (
995 False,
996 response,
997 {
998 "status": ExtlinkChecked.Status.ERROR,
999 "message": f"Content-Type header: {content_type}.",
1000 },
1001 )
1002 try:
1003 pdf_header = next(response.iter_lines(chunk_size=CHUNK_SIZE))
1004 if regex.match(r"^%PDF-\d\.\d", pdf_header.decode()) is None: 1004 ↛ 1005line 1004 didn't jump to line 1005 because the condition on line 1004 was never true
1005 return (
1006 False,
1007 response,
1008 {
1009 "status": ExtlinkChecked.Status.ERROR,
1010 "message": "PDF Header not found : got {pdf_header}",
1011 },
1012 )
1013 except StopIteration:
1014 return (
1015 False,
1016 response,
1017 {
1018 "status": ExtlinkChecked.Status.ERROR,
1019 "message": f"Content-Type header: {content_type}.",
1020 },
1021 )
1023 # if response.status_code not in (200, 206):
1024 # raise ValueError("Invalid status code")
1026 return (
1027 True,
1028 response,
1029 {
1030 "status": ExtlinkChecked.Status.OK,
1031 "message": "",
1032 },
1033 )
1035 @classmethod
1036 def check_extlink_validity(cls, extlink: "ExtLink"):
1037 """
1038 Method used by rot_monitoring to check if links have expired
1039 """
1040 defaults: dict = {"date": time.time(), "status": ExtlinkChecked.Status.OK}
1041 # CHUNK_SIZE = 100
1042 header = {
1043 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0"
1044 }
1045 verify = True
1046 if not cls.verify:
1047 verify = False
1048 if extlink.rel == "article-pdf":
1049 isok, response, message = cls.check_pdf_link_validity(extlink.location, verify)
1050 defaults.update(message)
1051 else:
1052 # check the article page
1053 response = requests.get(
1054 url=extlink.location,
1055 headers=header,
1056 stream=False,
1057 allow_redirects=True,
1058 verify=verify,
1059 )
1061 defaults["http_status"] = response.status_code
1063 if response.status_code not in (200, 206):
1064 defaults["status"] = ExtlinkChecked.Status.ERROR
1066 ExtlinkChecked.objects.update_or_create(extlink=extlink, defaults=defaults)