Coverage for src / crawler / base_crawler.py: 65%
588 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-02-17 12:56 +0000
1import asyncio
2import logging
3import time
4from datetime import datetime, timedelta
5from email.policy import EmailPolicy
6from typing import TYPE_CHECKING, Iterable, Literal
8import aiohttp
9import regex
10import requests
11from bs4 import BeautifulSoup
12from django.conf import settings
13from django.contrib.auth.models import User
14from django.db.utils import IntegrityError
15from django.utils import timezone
16from langcodes import standardize_tag
17from lingua import LanguageDetector, LanguageDetectorBuilder
18from opentelemetry import trace
19from ptf.cmds.xml.ckeditor.utils import (
20 build_jats_data_from_html_field,
21)
22from ptf.cmds.xml.jats.builder.references import (
23 get_article_title_xml,
24 get_author_xml,
25 get_fpage_xml,
26 get_lpage_xml,
27 get_source_xml,
28 get_year_xml,
29)
30from ptf.cmds.xml.jats.jats_parser import JatsBase
31from ptf.model_data import (
32 ArticleData,
33 ContributorDict,
34 IssueData,
35 ResourceData,
36 TitleDict,
37 create_abstract,
38 create_contributor,
39 create_extlink,
40 create_issuedata,
41 create_publisherdata,
42 create_subj,
43 create_titledata,
44)
45from ptf.model_data_converter import update_data_for_jats
46from ptf.models import ExtLink
47from pylatexenc.latex2text import LatexNodes2Text
48from pysolr import SolrError
49from requests.adapters import HTTPAdapter
50from requests_cache import CachedSession
51from urllib3 import Retry
53from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd
54from crawler.models import Source
55from crawler.models.extlink_checked import ExtlinkChecked
56from crawler.types import CitationLiteral
57from crawler.utils import (
58 add_pdf_link_to_xarticle,
59 cleanup_str,
60 get_all_cols,
61 get_or_create_collection,
62 get_session,
63)
if TYPE_CHECKING:
    # Tag is only needed for type annotations; bs4 itself is already a runtime import.
    from bs4 import Tag
class CrawlerTitleDict(TitleDict):
    """TitleDict extended with the raw TeX form of the title (None when absent)."""

    title_tex: str | None
class BaseCollectionCrawler:
    """
    Base collection for the crawlers.
    To create a crawler:
    1) derive a class from BaseCollectionCrawler and name it XXXCrawler
    2) override the functions parse_collection_content, parse_issue_content and parse_article_content
    3) update factory.py so that crawler_factory can return your new crawler
    """

    logger = logging.getLogger(__name__)
    tracer = trace.get_tracer(__name__)

    # Identity of the crawled source; override in each concrete crawler.
    source_name = ""
    source_domain = ""
    source_website = ""

    issue_href = ""

    # Filled in by initialize(); None until then.
    collection = None
    source = None
    user = None
    # Shared at class level so the session can be used without instantiating
    # a crawler (e.g. for rot monitoring).
    session: requests.Session | CachedSession
    async_session: aiohttp.ClientSession
    is_checkable = True
    # TLS certificate verification toggle, applied to the session in initialize().
    verify = True
    headers = {
        "accept_encoding": "utf-8",
        "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
        "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
    }

    # seconds to wait between two http requests
    requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
    # seconds to wait before aborting the connection (if no bytes are received)
    requests_timeout = 60

    latext_parser = LatexNodes2Text()

    # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
    # do not use the "$" to surround tex formulas
    delimiter_inline_formula = "$"
    delimiter_disp_formula = "$"

    # HACK : Workaround for tests (monkeypatching)
    # We store the class here, so we can monkeypatch it when running tests
    # subCrawlers = {
    #     LofplCrawler: None
    # }
    subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}

    # Lazily built by the language_detector property to save memory.
    _language_detector: LanguageDetector | None = None
    _language_detector_builder = LanguageDetectorBuilder.from_all_languages()

    force_refresh = False

    # Whether to include headers in the requests cache key
    match_headers = False
    orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"

    # Set this to False on a Crawler-basis to allow inserting articles without PDFs
    ignore_missing_pdf = True
135 @classmethod
136 def get_view_id(cls):
137 return cls.source_domain
    @property
    def language_detector(self):
        """Crawler Instance singleton for language builder.
        Late init of LanguageDetector to save on memory"""
        # Build the lingua detector on first access only; subsequent accesses
        # reuse the cached instance.
        if not self._language_detector:
            self._language_detector = self._language_detector_builder.build()
        return self._language_detector
    def __init__(
        self,
        *args,
        username: str,
        collection_id: str,
        dry: bool = False,
        publisher: str = "",
        force_refresh=False,
        collection_url: str | None = None,
    ):
        """
        :param username: Django username used to attribute database writes.
        :param collection_id: pid of the collection to crawl.
        :param dry: when True, crawl without writing to the database.
        :param publisher: default publisher name for inserted issues.
        :param force_refresh: bypass the HTTP cache when downloading.
        :param collection_url: source URL of the collection; looked up from
            the collection registry when not provided.
        """
        if not collection_url:
            # Resolve the URL for this source from the global collection registry.
            all_cols = get_all_cols()
            col = all_cols[collection_id]

            collection_url = col["sources"].get(self.source_domain, None)
            if collection_url is None:
                raise ValueError(
                    f"Source {self.source_domain} not found for collection {collection_id}"
                )
        self.collection_url = collection_url
        # Instantiate any declared sub-crawlers with the same configuration.
        for CrawlerClass in self.subCrawlers:
            self.subCrawlers[CrawlerClass] = CrawlerClass(
                *args,
                username=username,
                collection_id=collection_id,
                dry=dry,
                publisher=publisher,
                collection_url=collection_url,
            )
        # Per-source child logger (e.g. "crawler.base_crawler.EUDML").
        self.logger = logging.getLogger(__name__ + "." + self.source_domain)

        self.username = username

        self.collection_id = collection_id

        self.dry = dry
        self.publisher = publisher

        # Classproperty : We sometimes want to use the session without initializing the class (rot monitoring)
        BaseCollectionCrawler.session: requests.Session

        # Skipped when running tests
        self.initialize()

        self.force_refresh = force_refresh

    # We implemented custom retry behaviour, so we don't want to make extra requests here
196 def initialize(self):
197 """
198 Acts as a "second" init function to skip model accesses during test data generation
199 """
200 self.collection = get_or_create_collection(self.collection_id)
201 self.source = self.get_or_create_source()
202 self.user = User.objects.get(username=self.username)
203 BaseCollectionCrawler.session = get_session()
204 BaseCollectionCrawler.session.verify = self.verify
205 self.session.delay = self.requests_interval
206 retries = Retry(
207 total=0,
208 )
209 self.session.mount("https://", HTTPAdapter(max_retries=retries))
210 self.session.mount("http://", HTTPAdapter(max_retries=retries))
212 @classmethod
213 def can_crawl(cls, pid: str) -> bool:
214 return True
216 def parse_collection_content(self, content: str) -> list[IssueData]:
217 """
218 Parse the HTML content with BeautifulSoup
219 returns a list of xissue.
220 Override this function in a derived class
221 """
222 return []
224 def parse_issue_content(self, content: str, xissue: IssueData):
225 """
226 Parse the HTML content with BeautifulSoup
227 Fills the xissue.articles
228 Override this function in a derived class.
230 CAV : You are supposed to create articles there. Please assign a PID to each article.
231 The PID can be `a + article_index`, like this : `a0` `a21`
232 """
234 def parse_article_content(
235 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
236 ) -> ArticleData | None:
237 """
238 Parse the HTML content with BeautifulSoup
239 returns the xarticle.
240 Override this function in a derived class.
241 The xissue is passed to the function in case the article page has issue information (ex: publisher)
242 The article url is also passed as a parameter
244 CAV : You are supposed to assign articles pid again here
245 """
246 return xarticle
248 @tracer.start_as_current_span("crawl_collection")
249 def crawl_collection(self):
250 # TODO: Comments, filter
251 """
252 Crawl an entire collection. ptf.models.Container objects are created.
253 - get the HTML content of the collection_url
254 - parse the HTML content with beautifulsoup to extract the list of issues
255 - merge the xissues (some Source can have multiple pages for 1 volume/issue. We create only 1 container)
256 - crawl each issue if col_only is False
257 - Returns the list of merged issues.
258 It is an OrderedDict {pid: {"issues": xissues}}
259 The key is the pid of the merged issues.
260 Ex: The source may have Ex: Volume 6 (2000) and Volume 6 (1999)
261 the pid is then made with 1999-2000__6_
262 """
264 if self.source is None:
265 raise RuntimeError("ERROR: the source is not set")
267 content = self.download_file(self.collection_url)
268 if content:
269 xissues = self.parse_collection_content(content)
270 else:
271 # download_file returns None (404)
272 return None
274 """
275 Some collections split the same volumes in different pages
276 Ex: Volume 6 (2000) and Volume 6 (1999)
277 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
278 """
279 # merged_xissues = self.merge_xissues(xissues)
281 xissues_dict = {str(i.pid): i for i in xissues}
283 return xissues_dict
285 @tracer.start_as_current_span("crawl_issue")
286 def crawl_issue(self, xissue: IssueData):
287 """
288 Crawl 1 wag page of an issue.
289 - get the HTML content of the issue
290 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata
291 - crawl each article
292 """
294 # Some source, like EuDML do not have a separate HTML pages for an issue's table of content.
295 # The list of articles directly come from the collection HTML page: the xissue has no url attribute
296 issue_url = xissue.url
297 if issue_url is not None:
298 if issue_url.endswith(".pdf"):
299 add_pdf_link_to_xarticle(xissue, issue_url)
300 xissue.url = None
301 else:
302 content = self.download_file(issue_url)
303 with self.tracer.start_as_current_span("parse_issue_content"):
304 self.parse_issue_content(content, xissue)
306 xarticles = xissue.articles
308 parsed_xarticles = []
310 for xarticle in xarticles:
311 parsed_xarticle = self.crawl_article(xarticle, xissue)
312 if parsed_xarticle is not None:
313 parsed_xarticles.append(parsed_xarticle)
315 xissue.articles = parsed_xarticles
317 issue_has_pdf = self.article_has_pdf(xissue)
319 if self.ignore_missing_pdf:
320 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)]
321 if self.dry:
322 return
323 if len(xissue.articles) == 0 and not issue_has_pdf:
324 return
325 self.process_resource_metadata(xissue, resource_type="issue")
327 self.add_xissue_into_database(xissue)
329 @staticmethod
330 def article_has_source(art: ArticleData | IssueData):
331 return (
332 next(
333 (e_link for e_link in art.ext_links if e_link["rel"] == "source"),
334 None,
335 )
336 is not None
337 )
339 @staticmethod
340 def article_has_pdf(art: ArticleData | IssueData):
341 return (
342 next(
343 (link for link in art.ext_links if link["rel"] in ["article-pdf", "article-html"]),
344 None,
345 )
346 is not None
347 )
349 def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
350 # ARTICLE URL as en ExtLink (to display the link in the article page)
351 if xarticle.url is None:
352 if not self.article_has_source(xarticle): 352 ↛ 362line 352 didn't jump to line 362 because the condition on line 352 was always true
353 if xissue.url:
354 article_source = xissue.url
355 else:
356 article_source = self.collection_url
357 ext_link = create_extlink()
358 ext_link["rel"] = "source"
359 ext_link["location"] = article_source
360 ext_link["metadata"] = self.source_domain
361 xarticle.ext_links.append(ext_link)
362 return self.process_article_metadata(xarticle)
364 content = self.download_file(xarticle.url)
365 xarticle.pid = f"{xissue.pid}_{xarticle.pid}"
367 try:
368 with self.tracer.start_as_current_span("parse_article_content"):
369 parsed_xarticle = self.parse_article_content(
370 content, xissue, xarticle, xarticle.url
371 )
372 except ValueError as e:
373 self.logger.warning(e)
374 self.logger.warning("Retrying in 5 mins while invalidating cache")
375 time.sleep(5 * 60)
376 content = self.download_file(xarticle.url, force_refresh=True)
377 with self.tracer.start_as_current_span("parse_article_content"):
378 parsed_xarticle = self.parse_article_content(
379 content, xissue, xarticle, xarticle.url
380 )
382 if parsed_xarticle is None: 382 ↛ 383line 382 didn't jump to line 383 because the condition on line 382 was never true
383 return None
385 if parsed_xarticle.doi:
386 parsed_xarticle.pid = (
387 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
388 )
390 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
391 ext_link = create_extlink()
392 ext_link["rel"] = "source"
393 ext_link["location"] = parsed_xarticle.url
394 ext_link["metadata"] = self.source_domain
395 parsed_xarticle.ext_links.append(ext_link)
397 # The article title may have formulas surrounded with '$'
398 return self.process_article_metadata(parsed_xarticle)
400 def process_resource_metadata(self, xresource: ResourceData, resource_type="article"):
401 tag = "article-title" if resource_type == "article" else "issue-title"
403 # Process title tex
404 ckeditor_data = build_jats_data_from_html_field(
405 xresource.title_tex,
406 tag=tag,
407 text_lang=xresource.lang,
408 delimiter_inline=self.delimiter_inline_formula,
409 delimiter_disp=self.delimiter_disp_formula,
410 )
412 xresource.title_html = ckeditor_data["value_html"]
413 # xresource.title_tex = ckeditor_data["value_tex"]
414 xresource.title_xml = ckeditor_data["value_xml"]
416 # Process trans_title tex
417 if xresource.trans_title_tex: 417 ↛ 418line 417 didn't jump to line 418 because the condition on line 417 was never true
418 self.logger.warning(
419 "Deprecation Notice : prefer using xresource.title directly instead of xresource.trans_title_tex"
420 )
421 trans_title = self.create_trans_title(
422 xresource_lang=xresource.lang,
423 resource_type=resource_type,
424 title_tex=xresource.trans_title_tex,
425 lang=xresource.trans_lang,
426 )
427 xresource.titles.append(trans_title)
429 abstracts_to_parse = [
430 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract"
431 ]
432 # abstract may have formulas surrounded with '$'
433 if len(abstracts_to_parse) > 0:
434 for xabstract in abstracts_to_parse:
435 ckeditor_data = build_jats_data_from_html_field(
436 xabstract["value_tex"],
437 tag="abstract",
438 text_lang=xabstract["lang"],
439 resource_lang=xresource.lang,
440 field_type="abstract",
441 delimiter_inline=self.delimiter_inline_formula,
442 delimiter_disp=self.delimiter_disp_formula,
443 )
445 xabstract["value_html"] = ckeditor_data["value_html"]
446 # xabstract["value_tex"] = ckeditor_data["value_tex"]
447 xabstract["value_xml"] = ckeditor_data["value_xml"]
449 return xresource
451 def process_article_metadata(self, xarticle: ArticleData):
452 self.process_resource_metadata(xarticle)
453 for bibitem in xarticle.bibitems:
454 bibitem.type = "unknown"
455 update_data_for_jats(xarticle, with_label=False)
457 return xarticle
459 def download_file(self, url: str, force_refresh=False, headers={}):
460 """
461 Downloads a page and returns its content (decoded string).
462 This function handles retries and decoding
463 """
464 current_exception: Exception | None = None
465 for attempt in range(3):
466 try:
467 kwargs = {
468 "url": url,
469 "headers": {**self.headers, **headers},
470 "timeout": self.requests_timeout,
471 }
472 if attempt > 0 and isinstance(self.session, CachedSession):
473 kwargs["force_refresh"] = True
474 response = self.session.get(**kwargs)
476 content = self.decode_response(response)
477 if content == "" or not content:
478 raise requests.exceptions.HTTPError(response)
480 return content
481 except (
482 requests.ConnectionError,
483 requests.ConnectTimeout,
484 requests.exceptions.HTTPError,
485 ) as e:
486 current_exception = e
487 self.logger.debug(f"Caught error : {e}", extra={"url": url})
488 # 15 mins, 30 mins, 45 mins
489 delay_minutes = attempt * 15
490 self.logger.debug(
491 f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})",
492 extra={"url": url},
493 )
494 time.sleep(delay_minutes * 60)
496 raise current_exception
498 def decode_response(self, response: requests.Response, encoding: str | None = None):
499 """Override this if the content-type headers from the sources are advertising something else than the actual content
500 SASA needs this"""
501 # Force
502 if encoding:
503 response.encoding = encoding
504 return response.text
506 # Attempt to get encoding using HTTP headers
507 content_type_tag = response.headers.get("Content-Type", None)
509 if content_type_tag: 509 ↛ 516line 509 didn't jump to line 516 because the condition on line 509 was always true
510 charset = self.parse_content_type_charset(content_type_tag)
511 if charset: 511 ↛ 512line 511 didn't jump to line 512 because the condition on line 511 was never true
512 response.encoding = charset
513 return response.text
515 # Attempt to get encoding using HTML meta charset tag
516 soup = BeautifulSoup(response.text, "html5lib")
517 charset = soup.select_one("meta[charset]")
518 if charset:
519 htmlencoding = charset.get("charset")
520 if isinstance(htmlencoding, str): 520 ↛ 525line 520 didn't jump to line 525 because the condition on line 520 was always true
521 response.encoding = htmlencoding
522 return response.text
524 # Attempt to get encoding using HTML meta content type tag
525 content_type_tag = soup.select_one(
526 'meta[http-equiv="Content-Type"],meta[http-equiv="content-type"]'
527 )
528 if content_type_tag:
529 content_type = content_type_tag.get("content")
530 if isinstance(content_type, str): 530 ↛ 536line 530 didn't jump to line 536 because the condition on line 530 was always true
531 charset = self.parse_content_type_charset(content_type)
532 if charset: 532 ↛ 536line 532 didn't jump to line 536 because the condition on line 532 was always true
533 response.encoding = charset
534 return response.text
536 return response.text
538 @staticmethod
539 def parse_content_type_charset(content_type: str):
540 header = EmailPolicy.header_factory("content-type", content_type)
541 if "charset" in header.params:
542 return header.params.get("charset")
544 @tracer.start_as_current_span("add_xissue_to_database")
545 def add_xissue_into_database(self, xissue: IssueData) -> IssueData:
546 xissue.journal = self.collection
547 xissue.source = self.source_domain
549 if xissue.year == "":
550 raise ValueError("Failsafe : Cannot insert issue without a year")
552 xpub = create_publisherdata()
553 xpub.name = self.publisher
554 xissue.publisher = xpub
555 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()
557 attempt = 1
558 success = False
560 while not success and attempt < 4:
561 try:
562 params = {"xissue": xissue, "use_body": False}
563 cmd = addOrUpdateGDMLIssueXmlCmd(params)
564 cmd.do()
565 success = True
566 self.logger.debug(f"Issue {xissue.pid} inserted in database")
567 return xissue
568 except SolrError:
569 self.logger.warning(
570 f"Encoutered SolrError while inserting issue {xissue.pid} in database"
571 )
572 attempt += 1
573 self.logger.debug(f"Attempt {attempt}. sleeping 10 seconds.")
574 time.sleep(10)
575 except Exception as e:
576 self.logger.error(
577 f"Got exception while attempting to insert {xissue.pid} in database : {e}"
578 )
579 raise e
581 if success is False:
582 raise ConnectionRefusedError("Cannot connect to SolR")
584 assert False, "Unreachable"
586 def get_metadata_using_citation_meta(
587 self,
588 xarticle: ArticleData,
589 xissue: IssueData,
590 soup: BeautifulSoup,
591 what: list[CitationLiteral] = [],
592 ):
593 """
594 :param xarticle: the xarticle that will collect the metadata
595 :param xissue: the xissue that will collect the publisher
596 :param soup: the BeautifulSoup object of tha article page
597 :param what: list of citation_ items to collect.
598 :return: None. The given article is modified
599 """
601 if "title" in what:
602 # TITLE
603 citation_title_node = soup.select_one("meta[name='citation_title']")
604 if citation_title_node: 604 ↛ 609line 604 didn't jump to line 609 because the condition on line 604 was always true
605 title = citation_title_node.get("content")
606 if isinstance(title, str): 606 ↛ 609line 606 didn't jump to line 609 because the condition on line 606 was always true
607 xarticle.title_tex = title
609 if "author" in what: 609 ↛ 638line 609 didn't jump to line 638 because the condition on line 609 was always true
610 # AUTHORS
611 citation_author_nodes = soup.select("meta[name^='citation_author']")
612 current_author: ContributorDict | None = None
613 for citation_author_node in citation_author_nodes:
614 if citation_author_node.get("name") == "citation_author":
615 text_author = citation_author_node.get("content")
616 if not isinstance(text_author, str): 616 ↛ 617line 616 didn't jump to line 617 because the condition on line 616 was never true
617 raise ValueError("Cannot parse author")
618 if text_author == "": 618 ↛ 619line 618 didn't jump to line 619 because the condition on line 618 was never true
619 current_author = None
620 continue
621 current_author = create_contributor(role="author", string_name=text_author)
622 xarticle.contributors.append(current_author)
623 continue
624 if current_author is None: 624 ↛ 625line 624 didn't jump to line 625 because the condition on line 624 was never true
625 self.logger.warning("Couldn't parse citation author")
626 continue
627 if citation_author_node.get("name") == "citation_author_institution":
628 text_institution = citation_author_node.get("content")
629 if not isinstance(text_institution, str): 629 ↛ 630line 629 didn't jump to line 630 because the condition on line 629 was never true
630 continue
631 current_author["addresses"].append(text_institution)
632 if citation_author_node.get("name") == "citation_author_ocrid": 632 ↛ 633line 632 didn't jump to line 633 because the condition on line 632 was never true
633 text_orcid = citation_author_node.get("content")
634 if not isinstance(text_orcid, str):
635 continue
636 current_author["orcid"] = text_orcid
638 if "pdf" in what:
639 # PDF
640 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
641 if citation_pdf_node:
642 pdf_url = citation_pdf_node.get("content")
643 if isinstance(pdf_url, str): 643 ↛ 646line 643 didn't jump to line 646 because the condition on line 643 was always true
644 add_pdf_link_to_xarticle(xarticle, pdf_url)
646 if "lang" in what:
647 # LANG
648 citation_lang_node = soup.select_one("meta[name='citation_language']")
649 if citation_lang_node: 649 ↛ 655line 649 didn't jump to line 655 because the condition on line 649 was always true
650 # TODO: check other language code
651 content_text = citation_lang_node.get("content")
652 if isinstance(content_text, str): 652 ↛ 655line 652 didn't jump to line 655 because the condition on line 652 was always true
653 xarticle.lang = standardize_tag(content_text)
655 if "abstract" in what:
656 # ABSTRACT
657 abstract_node = soup.select_one("meta[name='citation_abstract']")
658 if abstract_node is not None:
659 abstract = abstract_node.get("content")
660 if not isinstance(abstract, str): 660 ↛ 661line 660 didn't jump to line 661 because the condition on line 660 was never true
661 raise ValueError("Couldn't parse abstract from meta")
662 abstract = BeautifulSoup(abstract, "html.parser").text
663 lang = abstract_node.get("lang")
664 if not isinstance(lang, str):
665 lang = self.detect_language(abstract, xarticle)
666 xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))
668 if "page" in what:
669 # PAGES
670 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
671 if citation_fpage_node:
672 page = citation_fpage_node.get("content")
673 if isinstance(page, str): 673 ↛ 678line 673 didn't jump to line 678 because the condition on line 673 was always true
674 page = page.split("(")[0]
675 if len(page) < 32: 675 ↛ 678line 675 didn't jump to line 678 because the condition on line 675 was always true
676 xarticle.fpage = page
678 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
679 if citation_lpage_node:
680 page = citation_lpage_node.get("content")
681 if isinstance(page, str): 681 ↛ 686line 681 didn't jump to line 686 because the condition on line 681 was always true
682 page = page.split("(")[0]
683 if len(page) < 32: 683 ↛ 686line 683 didn't jump to line 686 because the condition on line 683 was always true
684 xarticle.lpage = page
686 if "doi" in what:
687 # DOI
688 citation_doi_node = soup.select_one("meta[name='citation_doi']")
689 if citation_doi_node:
690 doi = citation_doi_node.get("content")
691 if isinstance(doi, str): 691 ↛ 698line 691 didn't jump to line 698 because the condition on line 691 was always true
692 doi = doi.strip()
693 pos = doi.find("10.")
694 if pos > 0:
695 doi = doi[pos:]
696 xarticle.doi = doi
698 if "mr" in what:
699 # MR
700 citation_mr_node = soup.select_one("meta[name='citation_mr']")
701 if citation_mr_node:
702 mr = citation_mr_node.get("content")
703 if isinstance(mr, str): 703 ↛ 709line 703 didn't jump to line 709 because the condition on line 703 was always true
704 mr = mr.strip()
705 if mr.find("MR") == 0: 705 ↛ 709line 705 didn't jump to line 709 because the condition on line 705 was always true
706 mr = mr[2:]
707 xarticle.extids.append(("mr-item-id", mr))
709 if "zbl" in what:
710 # ZBL
711 citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
712 if citation_zbl_node:
713 zbl = citation_zbl_node.get("content")
714 if isinstance(zbl, str): 714 ↛ 720line 714 didn't jump to line 720 because the condition on line 714 was always true
715 zbl = zbl.strip()
716 if zbl.find("Zbl") == 0: 716 ↛ 720line 716 didn't jump to line 720 because the condition on line 716 was always true
717 zbl = zbl[3:].strip()
718 xarticle.extids.append(("zbl-item-id", zbl))
720 if "publisher" in what:
721 # PUBLISHER
722 citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
723 if citation_publisher_node:
724 pub = citation_publisher_node.get("content")
725 if isinstance(pub, str): 725 ↛ 732line 725 didn't jump to line 732 because the condition on line 725 was always true
726 pub = pub.strip()
727 if pub != "": 727 ↛ 732line 727 didn't jump to line 732 because the condition on line 727 was always true
728 xpub = create_publisherdata()
729 xpub.name = pub
730 xissue.publisher = xpub
732 if "keywords" in what:
733 # KEYWORDS
734 citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
735 for kwd_node in citation_kwd_nodes:
736 kwds = kwd_node.get("content")
737 if isinstance(kwds, str): 737 ↛ 735line 737 didn't jump to line 735 because the condition on line 737 was always true
738 kwds = kwds.split(",")
739 for kwd in kwds:
740 if kwd == "":
741 continue
742 kwd = kwd.strip()
743 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})
745 if "references" in what:
746 citation_references = soup.select("meta[name='citation_reference']")
747 for index, tag in enumerate(citation_references):
748 content = tag.get("content")
749 if not isinstance(content, str): 749 ↛ 750line 749 didn't jump to line 750 because the condition on line 749 was never true
750 raise ValueError("Cannot parse citation_reference meta")
751 label = str(index + 1)
752 if regex.match(r"^\[\d+\].*", content): 752 ↛ 753line 752 didn't jump to line 753 because the condition on line 752 was never true
753 label = None
754 xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label))
756 def get_metadata_using_dcterms(
757 self,
758 xarticle: ArticleData,
759 soup: "Tag",
760 what: "Iterable[Literal['abstract', 'keywords', 'date_published', 'article_type']]",
761 ):
762 if "abstract" in what: 762 ↛ 770line 762 didn't jump to line 770 because the condition on line 762 was always true
763 abstract_tag = soup.select_one("meta[name='DCTERMS.abstract']")
764 if abstract_tag: 764 ↛ 770line 764 didn't jump to line 770 because the condition on line 764 was always true
765 abstract_text = self.get_str_attr(abstract_tag, "content")
766 xarticle.abstracts.append(
767 create_abstract(lang="en", value_tex=cleanup_str(abstract_text))
768 )
770 if "keywords" in what: 770 ↛ 779line 770 didn't jump to line 779 because the condition on line 770 was always true
771 keyword_tags = soup.select("meta[name='DC.subject']")
772 for tag in keyword_tags:
773 kwd_text = tag.get("content")
774 if not isinstance(kwd_text, str) or len(kwd_text) == 0: 774 ↛ 775line 774 didn't jump to line 775 because the condition on line 774 was never true
775 continue
776 kwd = create_subj(value=kwd_text)
777 xarticle.kwds.append(kwd)
779 if "date_published" in what: 779 ↛ 780line 779 didn't jump to line 780 because the condition on line 779 was never true
780 published_tag = soup.select_one("meta[name='DC.Date.created']")
781 if published_tag:
782 published_text = self.get_str_attr(published_tag, "content")
783 xarticle.date_published = published_text
785 if "article_type" in what: 785 ↛ 786line 785 didn't jump to line 786 because the condition on line 785 was never true
786 type_tag = soup.select_one("meta[name='DC.Type.articleType']")
787 if type_tag:
788 type_text = self.get_str_attr(type_tag, "content")
789 xarticle.atype = type_text
791 def create_xissue(
792 self,
793 url: str | None,
794 year: str,
795 volume_number: str | None,
796 issue_number: str | None = None,
797 vseries: str | None = None,
798 ):
799 if url is not None and url.endswith("/"):
800 url = url[:-1]
801 xissue = create_issuedata()
802 xissue.url = url
804 xissue.pid = self.get_issue_pid(
805 self.collection_id, year, volume_number, issue_number, vseries
806 )
808 xissue.year = year
810 if volume_number is not None:
811 xissue.volume = regex.sub(r"[^\w-]+", "_", volume_number)
813 if issue_number is not None:
814 xissue.number = issue_number.replace(",", "-")
816 if vseries is not None: 816 ↛ 817line 816 didn't jump to line 817 because the condition on line 816 was never true
817 xissue.vseries = vseries
818 return xissue
820 def detect_language(self, text: str, article: ArticleData | None = None):
821 if article and article.lang is not None and article.lang != "und":
822 return article.lang
824 language = self.language_detector.detect_language_of(text)
826 if not language: 826 ↛ 827line 826 didn't jump to line 827 because the condition on line 826 was never true
827 return "und"
828 return language.iso_code_639_1.name.lower()
830 def get_str_attr(self, tag: "Tag", attr: str):
831 """Equivalent of `tag.get(attr)`, but ensures the return value is a string"""
832 node_attr = tag.get(attr)
833 if isinstance(node_attr, list): 833 ↛ 834line 833 didn't jump to line 834 because the condition on line 833 was never true
834 raise ValueError(
835 f"[{self.source_domain}] {self.collection_id} : html tag has multiple {attr} attributes."
836 )
837 if node_attr is None: 837 ↛ 838line 837 didn't jump to line 838 because the condition on line 837 was never true
838 raise ValueError(
839 f"[{self.source_domain}] {self.collection_id} : html tag doesn't have any {attr} attributes"
840 )
841 return node_attr
843 def create_trans_title(
844 self,
845 resource_type: str,
846 title_tex: str,
847 lang: str,
848 xresource_lang: str,
849 title_type: str = "main",
850 ):
851 tag = "trans-title" if resource_type == "article" else "issue-title"
853 ckeditor_data = build_jats_data_from_html_field(
854 title_tex,
855 tag=tag,
856 text_lang=lang,
857 resource_lang=xresource_lang,
858 delimiter_inline=self.delimiter_inline_formula,
859 delimiter_disp=self.delimiter_disp_formula,
860 )
862 titledata = create_titledata(
863 lang=lang,
864 type="main",
865 title_html=ckeditor_data["value_html"],
866 title_xml=ckeditor_data["value_xml"],
867 )
869 return titledata
    # Maps "citation_*" meta field names to the JATS builder that renders the
    # corresponding XML fragment of a bibliographic reference. Consumed by
    # __parse_meta_citation_reference; citation_author is handled separately
    # there (authors are accumulated and joined before the other fields).
    references_mapping = {
        "citation_title": get_article_title_xml,
        "citation_journal_title": get_source_xml,
        "citation_publication_date": get_year_xml,
        "citation_firstpage": get_fpage_xml,
        "citation_lastpage": get_lpage_xml,
    }
879 @classmethod
880 def __parse_meta_citation_reference(cls, content: str, label=None):
881 categories = content.split(";")
883 if len(categories) == 1:
884 return JatsBase.bake_ref(content, label=label)
886 citation_data = [c.split("=") for c in categories if "=" in c]
887 del categories
889 xml_string = ""
890 authors_parsed = False
891 authors_strings = []
892 for data in citation_data:
893 key = data[0].strip()
894 citation_content = data[1]
895 if key == "citation_author":
896 authors_strings.append(get_author_xml(template_str=citation_content))
897 continue
898 elif not authors_parsed:
899 xml_string += ", ".join(authors_strings)
900 authors_parsed = True
902 if key in cls.references_mapping:
903 xml_string += " " + cls.references_mapping[key](citation_content)
905 return JatsBase.bake_ref(xml_string, label=label)
907 @classmethod
908 def get_or_create_source(cls):
909 source, created = Source.objects.get_or_create(
910 domain=cls.source_domain,
911 defaults={
912 "name": cls.source_name,
913 "website": cls.source_website,
914 "view_id": cls.get_view_id(),
915 },
916 )
917 if created: 917 ↛ 918line 917 didn't jump to line 918 because the condition on line 917 was never true
918 source.save()
919 return source
921 @staticmethod
922 def get_issue_pid(
923 collection_id: str,
924 year: str,
925 volume_number: str | None = None,
926 issue_number: str | None = None,
927 series: str | None = None,
928 ):
929 # Replace any non-word character with an underscore
930 pid = f"{collection_id}_{year}"
931 if series is not None: 931 ↛ 932line 931 didn't jump to line 932 because the condition on line 931 was never true
932 pid += f"_{series}"
933 if volume_number is not None:
934 pid += f"_{volume_number}"
935 if issue_number is not None:
936 pid += f"_{issue_number}"
937 pid = regex.sub(r"[^\w-]+", "_", cleanup_str(pid))
938 return pid
940 @staticmethod
941 def set_pages(article: ArticleData, pages: str, separator: str = "-"):
942 pages_split = pages.split(separator)
943 if len(pages_split) == 0: 943 ↛ 944line 943 didn't jump to line 944 because the condition on line 943 was never true
944 article.page_range = pages
945 if len(pages_split) > 0: 945 ↛ exitline 945 didn't return from function 'set_pages' because the condition on line 945 was always true
946 if pages[0].isnumeric(): 946 ↛ exitline 946 didn't return from function 'set_pages' because the condition on line 946 was always true
947 article.fpage = pages_split[0]
948 if ( 948 ↛ 953line 948 didn't jump to line 953 because the condition on line 948 was never true
949 len(pages_split) > 1
950 and pages_split[0] != pages_split[1]
951 and pages_split[1].isnumeric()
952 ):
953 article.lpage = pages_split[1]
955 @staticmethod
956 def _process_pdf_header(chunk: str, response: requests.Response | aiohttp.ClientResponse):
957 content_type = response.headers.get("Content-Type")
958 if regex.match(rb"^%PDF-\d\.\d", chunk):
959 if content_type and "application/pdf" in content_type:
960 # The file is unmistakably a pdf
961 return [
962 True,
963 response,
964 {
965 "status": ExtlinkChecked.Status.OK,
966 "message": "",
967 },
968 ]
969 # The file is a pdf, but the content type advertised by the server is wrong
970 return [
971 True,
972 response,
973 {
974 "status": ExtlinkChecked.Status.WARNING,
975 "message": f"Content-Type header: {content_type}",
976 },
977 ]
979 # Reaching here means we couldn't find the pdf.
980 if not content_type or "application/pdf" not in content_type:
981 return [
982 False,
983 response,
984 {
985 "status": ExtlinkChecked.Status.ERROR,
986 "message": f"Content-Type header: {content_type}; PDF Header not found: got {chunk}",
987 },
988 ]
990 return [
991 False,
992 response,
993 {
994 "status": ExtlinkChecked.Status.ERROR,
995 "message": f"PDF Header not found: got {chunk}",
996 },
997 ]
999 @classmethod
1000 async def a_check_pdf_link_validity(
1001 cls, url: str, verify=True
1002 ) -> tuple[bool, aiohttp.ClientResponse, dict]:
1003 """
1004 Check the validity of the PDF links.
1005 """
1006 CHUNK_SIZE = 10 # Nombre de caractères à récupérer
1007 header = {
1008 "Range": f"bytes=0-{CHUNK_SIZE}",
1009 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
1010 }
1011 async with cls.async_session.get(
1012 url, headers=header, allow_redirects=True, ssl=verify
1013 ) as response:
1014 try:
1015 chunk = await response.content.read(CHUNK_SIZE)
1016 return BaseCollectionCrawler._process_pdf_header(chunk, response)
1017 except StopIteration:
1018 return [
1019 False,
1020 response,
1021 {
1022 "status": ExtlinkChecked.Status.ERROR,
1023 "message": "Error reading PDF header",
1024 },
1025 ]
1027 @classmethod
1028 def check_pdf_link_validity(
1029 cls, url: str, verify=True
1030 ) -> tuple[bool, requests.Response | None, dict]:
1031 """
1032 Check the validity of the PDF links.
1033 """
1034 CHUNK_SIZE = 10 # Nombre de caractères à récupérer
1035 header = {
1036 "Range": f"bytes=0-{CHUNK_SIZE}",
1037 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
1038 }
1039 with cls.session.get(
1040 url, headers=header, allow_redirects=True, verify=verify, stream=True
1041 ) as response:
1042 try:
1043 chunk = next(response.iter_content(CHUNK_SIZE))
1044 return BaseCollectionCrawler._process_pdf_header(chunk, response)
1045 except StopIteration:
1046 return [
1047 False,
1048 response,
1049 {
1050 "status": ExtlinkChecked.Status.ERROR,
1051 "message": "Error reading PDF header",
1052 },
1053 ]
    @classmethod
    async def check_extlink_validity(cls, extlink: "ExtLink"):
        """
        Method used by rot_monitoring to check if links have expired

        Probes `extlink.location` over HTTP ("article-pdf" links get a PDF
        header sniff via a_check_pdf_link_validity, everything else a plain
        GET) and records the outcome in an ExtlinkChecked row. The DB write
        happens in `finally`, so a result row is recorded even when the
        request raised an exception not caught below.
        """
        # NOTE(review): datetime.now() is naive; the file imports
        # django.utils.timezone — confirm whether an aware timestamp is wanted.
        defaults: dict = {"date": datetime.now(), "status": ExtlinkChecked.Status.OK}
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0"
        }
        # Mirror the crawler's TLS-verification setting for this probe.
        verify = True
        if not cls.verify:
            verify = False
        try:
            if extlink.rel == "article-pdf":
                # PDF links get a content sniff, not just an HTTP status check.
                # (isok is unused here: the status lives in `message`.)
                isok, response, message = await cls.a_check_pdf_link_validity(
                    extlink.location, verify
                )
                defaults.update(message)
                defaults["http_status"] = response.status
            else:
                async with cls.async_session.get(
                    url=extlink.location,
                    headers=header,
                    allow_redirects=True,
                    ssl=verify,
                ) as response:
                    defaults["http_status"] = response.status
                    # Only 200 and 206 (partial content) count as success.
                    if response.status not in (200, 206):
                        defaults["status"] = ExtlinkChecked.Status.ERROR

        except aiohttp.ClientSSLError:
            cls.logger.error("SSL error for the url: %s", extlink.location)
            defaults["status"] = ExtlinkChecked.Status.ERROR
            defaults["message"] = "SSL error"
        except aiohttp.ClientConnectionError:
            cls.logger.error("Connection error for the url: %s", extlink.location)
            defaults["status"] = ExtlinkChecked.Status.ERROR
            defaults["message"] = "Connection error"
        except asyncio.TimeoutError:
            cls.logger.error("Timeout error for the url: %s", extlink.location)
            defaults["status"] = ExtlinkChecked.Status.ERROR
            defaults["message"] = "Timeout error"
        finally:
            # Always persist the check result; the extlink may have been
            # deleted concurrently, in which case the FK write fails.
            try:
                await ExtlinkChecked.objects.aupdate_or_create(extlink=extlink, defaults=defaults)
                cls.logger.info(
                    "DB Update, source: %s, url: %s", cls.source_domain, extlink.location
                )
            except IntegrityError:
                cls.logger.error(
                    "Extlink was deleted, source: %s, url: %s", cls.source_domain, extlink.location
                )