Coverage for src / crawler / abstract_crawlers / base_crawler.py: 65%
613 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-06-19 13:33 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-06-19 13:33 +0000
1import logging
2import time
3from collections.abc import Iterable
4from datetime import datetime, timedelta
5from email.policy import EmailPolicy
6from typing import TYPE_CHECKING, Literal
8import aiohttp
9import regex
10import requests
11from bs4 import BeautifulSoup
12from django.conf import settings
13from django.contrib.auth.models import User
14from django.db.utils import IntegrityError
15from django.utils import timezone
16from langcodes import standardize_tag
17from lingua import LanguageDetector, LanguageDetectorBuilder
18from opentelemetry import trace
19from ptf.cmds.xml.ckeditor.utils import (
20 build_jats_data_from_html_field,
21)
22from ptf.cmds.xml.jats.builder.references import (
23 get_article_title_xml,
24 get_author_xml,
25 get_fpage_xml,
26 get_lpage_xml,
27 get_source_xml,
28 get_year_xml,
29)
30from ptf.cmds.xml.jats.jats_parser import JatsBase
31from ptf.model_data import (
32 ArticleData,
33 ContributorDict,
34 IssueData,
35 ResourceData,
36 TitleDict,
37 create_abstract,
38 create_contributor,
39 create_extlink,
40 create_issuedata,
41 create_publisherdata,
42 create_subj,
43 create_titledata,
44)
45from ptf.model_data_converter import update_data_for_jats
46from ptf.models import ExtLink
47from pylatexenc.latex2text import LatexNodes2Text
48from pysolr import SolrError
49from requests.adapters import HTTPAdapter
50from requests_cache import CachedSession
51from urllib3 import Retry
53from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd
54from crawler.models import Source
55from crawler.models.extlink_checked import ExtlinkChecked
56from crawler.types import CitationLiteral
57from crawler.utils import (
58 add_pdf_link_to_xarticle,
59 cleanup_str,
60 get_all_cols,
61 get_or_create_collection,
62 get_session,
63)
65if TYPE_CHECKING:
66 from typing import Callable
68 from bs4 import Tag
class CrawlerTitleDict(TitleDict):
    """``TitleDict`` variant that additionally declares a ``title_tex`` entry (string or ``None``)."""

    title_tex: str | None
75class BaseCollectionCrawler:
76 """
77 Base collection for the crawlers.
78 To create a crawler:
79 1) derive a class from BaseCollectionCrawler and name it XXXCrawler
80 2) override the functions parse_collection_content, parse_issue_content and parse_article_content
81 3) update factory.py so that crawler_factory can return your new crawler
82 """
# Class-level logger; __init__ replaces it on the instance with a per-source child logger.
logger = logging.getLogger(__name__)
# OpenTelemetry tracer used to wrap the crawl/parse steps in spans.
tracer = trace.get_tracer(__name__)

# Identity of the crawled source. `source_domain` is used as the view id, as the
# lookup key in the collection "sources" mapping and as ext-link metadata.
source_name = ""
source_domain = ""
source_website = ""

issue_href = ""

# Filled in by initialize().
collection = None
source = None
user = None
# Shared at class level: __init__/initialize() assign it on BaseCollectionCrawler itself.
session: requests.Session | CachedSession
async_session: aiohttp.ClientSession
is_checkable = True
# Copied onto the shared session's `verify` attribute in initialize().
verify = True
# Default headers, merged with the per-call headers in get().
# NOTE(review): "accept_encoding" is not the standard "Accept-Encoding" header name,
# and "utf-8" is a charset rather than a content coding — confirm this is intentional.
headers = {
    "accept_encoding": "utf-8",
    "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
    "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
}

requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
"seconds to wait between two http requests"
requests_timeout = 60
"seconds to wait before aborting the connection (if no bytes are recieved)"

latext_parser = LatexNodes2Text()

# Override the values in your concrete crawler if the formulas in text (titles, abstracts)
# do not use the "$" to surround tex formulas
delimiter_inline_formula = "$"
delimiter_disp_formula = "$"

# HACK : Workaround for tests (monkeypatching)
# We store the class here, so we can monkeypatch it when running tests
# subCrawlers = {
#     LofplCrawler: None
# }
subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}

# Lazily built by the `language_detector` property.
_language_detector: LanguageDetector | None = None
_language_detector_builder = LanguageDetectorBuilder.from_all_languages()

force_refresh = False

match_headers = False
"Whereas to include headers in requests cache key"
# Pattern capturing the ORCID identifier out of an https://orcid.org/ URL.
orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"

ignore_missing_pdf = True
"Set this to False on a Crawler-basis to allow inserting articles without PDFs"
pid_year_restrictions: dict[str, int] = {}
"pid -> excluded years count"

pause_function: "Callable[[int], None]"
"Overridable the pause function (used in celery tasks to speedup aborting)"
@classmethod
def get_view_id(cls):
    """Return the identifier of this crawler's view, i.e. its ``source_domain``."""
    view_id = cls.source_domain
    return view_id
@property
def language_detector(self):
    """Language detector of this crawler instance, built on first access.

    The build is deferred until the detector is actually needed in order to
    save memory.
    """
    detector = self._language_detector
    if not detector:
        detector = self._language_detector_builder.build()
        self._language_detector = detector
    return detector
def __init__(
    self,
    *args,
    username: str,
    collection_id: str,
    dry: bool = False,
    publisher: str = "",
    force_refresh=False,
    collection_url: str | None = None,
    backend=None,
    pause_function=staticmethod(time.sleep),
):
    """
    Set up a crawler for one collection.

    :param username: name of the user looked up by ``initialize``
    :param collection_id: identifier of the collection to crawl
    :param dry: when True, ``crawl_issue`` stops before writing to the database
    :param publisher: publisher name set on the inserted issues
    :param force_refresh: stored on the instance as ``force_refresh``
    :param collection_url: URL of the collection; when empty it is looked up in
        ``get_all_cols()`` under this crawler's ``source_domain``
    :param backend: stored on the instance as ``backend``
    :param pause_function: callable used wherever the crawler has to wait
    :raises ValueError: when no URL is registered for ``source_domain`` in the collection
    """
    if not collection_url:
        all_cols = get_all_cols()
        col = all_cols[collection_id]

        collection_url = col["sources"].get(self.source_domain, None)
        if collection_url is None:
            raise ValueError(
                f"Source {self.source_domain} not found for collection {collection_id}"
            )
    self.collection_url = collection_url
    # NOTE(review): sub-crawlers do not receive force_refresh, backend nor pause_function
    # — confirm that this is intended.
    for CrawlerClass in self.subCrawlers:
        self.subCrawlers[CrawlerClass] = CrawlerClass(
            *args,
            username=username,
            collection_id=collection_id,
            dry=dry,
            publisher=publisher,
            collection_url=collection_url,
        )
    self.logger = logging.getLogger(__name__ + "." + self.source_domain)
    # self.logger = logging.getLogger(__name__)

    self.username = username

    self.collection_id = collection_id

    self.dry = dry
    self.publisher = publisher

    # Every constructor argument is stored before initialize() runs, so that
    # initialize() (and anything it calls) sees the values given by the caller
    # instead of the class-level defaults.
    self.force_refresh = force_refresh
    self.backend = backend

    # Classproperty : We sometimes want to use the session without initializing the class (rot monitoring)
    BaseCollectionCrawler.session = requests.Session()

    self.pause_function = pause_function

    # Skipped when running tests
    self.initialize()
def initialize(self):
    """
    Second-stage constructor holding everything that touches the models or the
    HTTP session, so that it can be skipped during test data generation.
    """
    self.collection = get_or_create_collection(self.collection_id)
    self.source = self.get_or_create_source()
    self.user = User.objects.get(username=self.username)

    # The session lives on the base class: it is shared by every crawler.
    shared_session = get_session()
    BaseCollectionCrawler.session = shared_session
    shared_session.verify = self.verify

    self.session.pause_function = self.pause_function
    self.session.delay = self.requests_interval

    # Retries are handled by get()/download_file(), not by urllib3.
    no_retry = Retry(total=0)
    for scheme in ("https://", "http://"):
        self.session.mount(scheme, HTTPAdapter(max_retries=no_retry))
@classmethod
def can_crawl(cls, pid: str) -> bool:
    """Tell whether this crawler handles ``pid``; the base implementation accepts every pid."""
    accepted = True
    return accepted
def parse_collection_content(self, content: str) -> list[IssueData]:
    """
    Extract the issues listed in the HTML of a collection page.
    Meant to be overridden in derived classes: the base implementation
    finds no issue and returns an empty list.
    """
    no_issues: list[IssueData] = []
    return no_issues
def parse_issue_content(self, content: str, xissue: IssueData):
    """
    Extract the articles of an issue from its HTML and store them in xissue.articles.
    Meant to be overridden in derived classes: the base implementation does nothing.

    CAV : The override is expected to create the articles and to give each of them a PID.
    The PID can be `a + article_index`, like this : `a0` `a21`
    """
    return None
def parse_article_content(
    self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
) -> ArticleData | None:
    """
    Extract the metadata of an article from its HTML page and return the xarticle.
    Meant to be overridden in derived classes: the base implementation returns
    the given xarticle untouched.
    The xissue is provided because the article page may hold issue information (ex: publisher),
    and the url of the article is provided as well.

    CAV : The override is expected to assign the article pid again
    """
    parsed = xarticle
    return parsed
@tracer.start_as_current_span("crawl_collection")
def crawl_collection(self):
    # TODO: Comments, filter
    """
    Crawl the table of contents of a collection.
    - download the HTML content of the collection_url
    - parse it with parse_collection_content to get the list of issues
    - return the issues as a dict {pid: xissue}, the key being str(xissue.pid)

    Returns None when the downloaded content is empty.
    Raises RuntimeError when the source of the crawler is not set.
    """
    if self.source is None:
        raise RuntimeError("ERROR: the source is not set")

    content = self.download_file(self.collection_url)
    if not content:
        # Nothing was downloaded (404)
        return None

    xissues = self.parse_collection_content(content)

    # Some collections split the same volumes in different pages
    # Ex: Volume 6 (2000) and Volume 6 (1999)
    # Merging them into Volume 6 (1999-2000) is currently disabled:
    # merged_xissues = self.merge_xissues(xissues)

    xissues_dict = {}
    for xissue in xissues:
        xissues_dict[str(xissue.pid)] = xissue
    return xissues_dict
def start_process_issue(self, xissue: IssueData):
    """
    Fetch and parse the table of contents of an issue, when it has one.

    - no url: nothing to do. Some sources, like EuDML, list the articles directly
      on the collection page
    - url of a PDF: the PDF is attached to the issue and the url is cleared
    - otherwise: the page is downloaded and handed to parse_issue_content
    """
    issue_url = xissue.url
    if issue_url is None:
        return

    if issue_url.endswith(".pdf"):
        add_pdf_link_to_xarticle(xissue, issue_url)
        xissue.url = None
        return

    content = self.download_file(issue_url)
    with self.tracer.start_as_current_span("parse_issue_content"):
        self.parse_issue_content(content, xissue)
@tracer.start_as_current_span("crawl_issue")
def crawl_issue(self, xissue: IssueData):
    """
    Crawl one issue.
    - fetch and parse the issue page to get the list of articles and/or the issue metadata
    - crawl each article, dropping those for which crawl_article returns None
    - unless `dry` is set, insert the issue in the database, provided it still has
      articles or has a PDF/HTML link of its own
    """
    self.start_process_issue(xissue)

    xissue.articles = [
        parsed
        for xarticle in xissue.articles
        if (parsed := self.crawl_article(xarticle, xissue)) is not None
    ]

    issue_has_pdf = self.article_has_pdf(xissue)

    if self.ignore_missing_pdf:
        xissue.articles = list(filter(self.article_has_pdf, xissue.articles))
    if self.dry:
        return
    if not (xissue.articles or issue_has_pdf):
        # Nothing worth inserting
        return

    self.process_resource_metadata(xissue, resource_type="issue")
    self.add_xissue_into_database(xissue)
343 @staticmethod
344 def article_has_source(art: ArticleData | IssueData):
345 return (
346 next(
347 (e_link for e_link in art.ext_links if e_link["rel"] == "source"),
348 None,
349 )
350 is not None
351 )
353 @staticmethod
354 def article_has_pdf(art: ArticleData | IssueData):
355 return (
356 next(
357 (link for link in art.ext_links if link["rel"] in ["article-pdf", "article-html"]),
358 None,
359 )
360 is not None
361 )
def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
    """
    Crawl one article and return it with its metadata processed, or None when
    parse_article_content rejects it.

    An article without url is not downloaded: it only receives a "source" link
    (issue url, or collection url as a fallback). Otherwise, when the crawler
    overrides parse_article_content, the page is downloaded and parsed; a
    ValueError raised by the parser triggers a single retry, 5 minutes later,
    with a refreshed download.
    """

    def source_link(location):
        # ARTICLE URL as an ExtLink (to display the link in the article page)
        link = create_extlink()
        link["rel"] = "source"
        link["location"] = location
        link["metadata"] = self.source_domain
        return link

    if xarticle.url is None:
        if not self.article_has_source(xarticle):
            fallback_url = xissue.url or self.collection_url
            xarticle.ext_links.append(source_link(fallback_url))
        return self.process_article_metadata(xarticle)

    parsed_xarticle = xarticle
    parser_is_overridden = (
        self.parse_article_content.__func__ != BaseCollectionCrawler.parse_article_content
    )
    if parser_is_overridden:
        content = self.download_file(xarticle.url)
        xarticle.pid = f"{xissue.pid}_{xarticle.pid}"

        def parse(page):
            with self.tracer.start_as_current_span("parse_article_content"):
                return self.parse_article_content(page, xissue, xarticle, xarticle.url)

        try:
            parsed_xarticle = parse(content)
        except ValueError as e:
            self.logger.warning(e)
            self.logger.warning("Retrying in 5 mins while invalidating cache")
            self.pause_function(5 * 60)
            content = self.download_file(xarticle.url, force_refresh=True)
            parsed_xarticle = parse(content)

    if parsed_xarticle is None:
        return None

    if parsed_xarticle.doi:
        # The pid is derived from the DOI, with "/", "." and "-" turned into "_"
        pid = parsed_xarticle.doi
        for separator in ("/", ".", "-"):
            pid = pid.replace(separator, "_")
        parsed_xarticle.pid = pid

    if parsed_xarticle.url and not self.article_has_source(parsed_xarticle):
        parsed_xarticle.ext_links.append(source_link(parsed_xarticle.url))

    # The article title may have formulas surrounded with '$'
    return self.process_article_metadata(parsed_xarticle)
def process_resource_metadata(self, xresource: ResourceData, resource_type="article"):
    """
    Build the HTML and XML forms of the title and of the abstracts of a resource
    from their TeX form, using the formula delimiters of the crawler.

    Only the abstracts whose tag is "abstract" are processed. The TeX values are
    left as they are. Returns the given resource.
    """
    title_tag = "article-title" if resource_type == "article" else "issue-title"

    # Process title tex
    title_data = build_jats_data_from_html_field(
        xresource.title_tex,
        tag=title_tag,
        text_lang=xresource.lang,
        delimiter_inline=self.delimiter_inline_formula,
        delimiter_disp=self.delimiter_disp_formula,
    )
    xresource.title_html = title_data["value_html"]
    xresource.title_xml = title_data["value_xml"]

    # abstract may have formulas surrounded with '$'
    plain_abstracts = [entry for entry in xresource.abstracts if entry["tag"] == "abstract"]
    for entry in plain_abstracts:
        abstract_data = build_jats_data_from_html_field(
            entry["value_tex"],
            tag="abstract",
            text_lang=entry["lang"],
            resource_lang=xresource.lang,
            field_type="abstract",
            delimiter_inline=self.delimiter_inline_formula,
            delimiter_disp=self.delimiter_disp_formula,
        )
        for key in ("value_html", "value_xml"):
            entry[key] = abstract_data[key]

    return xresource
def process_article_metadata(self, xarticle: ArticleData):
    """
    Finalize the metadata of an article: title/abstracts processing, references
    typed as "unknown", then JATS data update (without labels).
    Returns the given article.
    """
    self.process_resource_metadata(xarticle)

    for reference in xarticle.bibitems:
        reference.type = "unknown"

    update_data_for_jats(xarticle, with_label=False)
    return xarticle
def download_file(self, url: str, force_refresh=False, headers=None):
    """
    Downloads a page and returns its content (decoded string).

    The download is attempted up to 3 times. When the decoded content is empty,
    the crawler waits (15 mins, then 30 mins) and tries again, bypassing the
    cache so that a cached empty response is not served a second time.

    :param url: address of the page
    :param force_refresh: forwarded to `get` for the first attempt (retries always refresh)
    :param headers: extra HTTP headers, forwarded to `get`
    :return: the non-empty decoded content
    :raises ValueError: when every attempt returned an empty content
    """
    max_attempts = 3
    request_headers = {} if headers is None else headers

    for attempt in range(max_attempts):
        response = self.get(
            url,
            force_refresh=force_refresh or attempt > 0,
            headers=request_headers,
            pause_function=self.pause_function,
        )

        content = self.decode_response(response)
        if content:
            return content

        self.logger.debug("Got empty content while fetching ! ")
        if attempt == max_attempts - 1:
            # No attempt left: waiting would only delay the error
            break

        # 15 mins, 30 mins
        delay_minutes = (attempt + 1) * 15
        self.logger.debug(
            f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})",
            extra={"url": url},
        )
        self.pause_function(delay_minutes * 60)

    raise ValueError(f"Could not decode content at {url}")
@classmethod
def get(cls, url, *args, headers=None, force_refresh=False, pause_function=time.sleep, **kwargs):
    """
    GET `url` through the shared session, trying up to 3 times.

    A connection or HTTP error triggers a new attempt after a pause (15 mins,
    then 30 mins). From the second attempt on, a `CachedSession` is asked to
    refresh its cache. When every attempt failed, the last error is raised.

    :param url: address to fetch
    :param headers: extra headers, merged over the class-level `headers`
    :param force_refresh: forwarded to the session
    :param pause_function: callable receiving the number of seconds to wait
    :param kwargs: extra keyword arguments for `session.get`; they take precedence
        over the values computed here (e.g. `timeout`)
    :return: the response returned by the session
    """
    # NOTE(review): `force_refresh` is always forwarded, and a plain `requests.Session`
    # does not accept that keyword — confirm get_session() never returns one.
    max_attempts = 3
    request_kwargs = {
        "url": url,
        "headers": {**cls.headers, **(headers or {})},
        "timeout": cls.requests_timeout,
        "force_refresh": force_refresh,
        **kwargs,
    }
    current_exception = Exception(f"Could not fetch url {url}")

    for attempt in range(max_attempts):
        if attempt > 0 and isinstance(cls.session, CachedSession):
            request_kwargs["force_refresh"] = True
        try:
            return cls.session.get(*args, **request_kwargs)
        except (
            requests.ConnectionError,
            requests.ConnectTimeout,
            requests.exceptions.HTTPError,
        ) as e:
            current_exception = e
            cls.logger.debug(f"Caught error : {e}", extra={"url": url})
            if attempt == max_attempts - 1:
                # No attempt left: waiting would only delay the error
                break
            # 15 mins, 30 mins
            delay_minutes = (attempt + 1) * 15
            cls.logger.debug(
                f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})",
                extra={"url": url},
            )
            pause_function(delay_minutes * 60)

    raise current_exception
def decode_response(self, response: requests.Response, encoding: str | None = None):
    """
    Return the text of the response, decoded with the best encoding found.

    The encoding is taken, in that order, from: the `encoding` argument, the
    Content-Type HTTP header, the `<meta charset>` tag, the
    `<meta http-equiv="Content-Type">` tag. When none of them gives a charset,
    the response text is returned as is.

    Override this if the content-type headers from the sources are advertising
    something else than the actual content (SASA needs this)
    """

    def text_decoded_as(charset_name):
        response.encoding = charset_name
        return response.text

    # Forced encoding
    if encoding:
        return text_decoded_as(encoding)

    # HTTP headers
    header_value = response.headers.get("Content-Type", None)
    if header_value:
        header_charset = self.parse_content_type_charset(header_value)
        if header_charset:
            return text_decoded_as(header_charset)

    # HTML meta charset tag
    soup = BeautifulSoup(response.text, "html5lib")
    meta_charset = soup.select_one("meta[charset]")
    if meta_charset:
        declared = meta_charset.get("charset")
        if isinstance(declared, str):
            return text_decoded_as(declared)

    # HTML meta content type tag
    meta_content_type = soup.select_one(
        'meta[http-equiv="Content-Type"],meta[http-equiv="content-type"]'
    )
    if meta_content_type:
        declared = meta_content_type.get("content")
        if isinstance(declared, str):
            meta_charset_name = self.parse_content_type_charset(declared)
            if meta_charset_name:
                return text_decoded_as(meta_charset_name)

    return response.text
562 @staticmethod
563 def parse_content_type_charset(content_type: str):
564 header = EmailPolicy.header_factory("content-type", content_type)
565 if "charset" in header.params:
566 return header.params.get("charset")
@tracer.start_as_current_span("add_xissue_to_database")
def add_xissue_into_database(self, xissue: IssueData) -> IssueData:
    """
    Insert (or update) an issue in the database and return it.

    The issue first receives its journal, source, publisher and modification
    date. The insertion is attempted up to 3 times, waiting 10 seconds after
    each SolrError.

    Raises ValueError when the issue has no year, ConnectionRefusedError when
    the 3 attempts ended with a SolrError. Any other exception is logged and
    raised again.
    """
    xissue.journal = self.collection
    xissue.source = self.source_domain

    if xissue.year == "":
        raise ValueError("Failsafe : Cannot insert issue without a year")

    xpub = create_publisherdata()
    xpub.name = self.publisher
    xissue.publisher = xpub
    xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()

    for attempt in range(1, 4):
        try:
            cmd = addOrUpdateGDMLIssueXmlCmd({"xissue": xissue, "use_body": False})
            cmd.do()
            self.logger.debug(f"Issue {xissue.pid} inserted in database")
            return xissue
        except SolrError:
            self.logger.warning(
                f"Encoutered SolrError while inserting issue {xissue.pid} in database"
            )
            self.logger.debug(f"Attempt {attempt + 1}. sleeping 10 seconds.")
            self.pause_function(10)
        except Exception as e:
            self.logger.error(
                f"Got exception while attempting to insert {xissue.pid} in database : {e}"
            )
            raise e

    raise ConnectionRefusedError("Cannot connect to SolR")
def get_metadata_using_citation_meta(
    self,
    xarticle: ArticleData,
    xissue: IssueData,
    soup: BeautifulSoup,
    what: list[CitationLiteral] | None = None,
):
    """
    Collect article metadata from the `citation_*` meta tags of an article page.

    :param xarticle: the xarticle that will collect the metadata
    :param xissue: the xissue that will collect the publisher
    :param soup: the BeautifulSoup object of the article page
    :param what: list of citation_ items to collect. Nothing is collected when omitted.
    :return: None. The given article is modified
    :raises ValueError: when an author, abstract or reference tag has no usable content
    """
    requested = [] if what is None else what

    def meta_content(meta_name):
        # `content` of the first <meta name="..."> tag, None when absent or not a string
        node = soup.select_one(f"meta[name='{meta_name}']")
        if node is None:
            return None
        value = node.get("content")
        return value if isinstance(value, str) else None

    if "title" in requested:
        # TITLE
        title = meta_content("citation_title")
        if title is not None:
            xarticle.title_tex = title

    if "author" in requested:
        # AUTHORS
        # The institution/orcid tags apply to the last citation_author tag seen before them
        current_author: ContributorDict | None = None
        for citation_author_node in soup.select("meta[name^='citation_author']"):
            meta_name = citation_author_node.get("name")
            if meta_name == "citation_author":
                text_author = citation_author_node.get("content")
                if not isinstance(text_author, str):
                    raise ValueError("Cannot parse author")
                if text_author == "":
                    current_author = None
                    continue
                current_author = create_contributor(role="author", string_name=text_author)
                xarticle.contributors.append(current_author)
                continue
            if current_author is None:
                self.logger.warning("Couldn't parse citation author")
                continue
            if meta_name == "citation_author_institution":
                text_institution = citation_author_node.get("content")
                if not isinstance(text_institution, str):
                    continue
                current_author["addresses"].append(text_institution)
            # "citation_author_ocrid" is the misspelling this code used to look for;
            # it is still accepted alongside the actual tag name.
            if meta_name in ("citation_author_orcid", "citation_author_ocrid"):
                text_orcid = citation_author_node.get("content")
                if not isinstance(text_orcid, str):
                    continue
                current_author["orcid"] = text_orcid

    if "pdf" in requested:
        # PDF
        pdf_url = meta_content("citation_pdf_url")
        if pdf_url is not None:
            add_pdf_link_to_xarticle(xarticle, pdf_url)

    if "lang" in requested:
        # LANG
        # TODO: check other language code
        content_text = meta_content("citation_language")
        if content_text is not None:
            xarticle.lang = standardize_tag(content_text)

    if "abstract" in requested:
        # ABSTRACT
        abstract_node = soup.select_one("meta[name='citation_abstract']")
        if abstract_node is not None:
            abstract = abstract_node.get("content")
            if not isinstance(abstract, str):
                raise ValueError("Couldn't parse abstract from meta")
            # Keep the text only: the meta content may hold HTML markup
            abstract = BeautifulSoup(abstract, "html.parser").text
            lang = abstract_node.get("lang")
            if not isinstance(lang, str):
                lang = self.detect_language(abstract, xarticle)
            xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))

    if "page" in requested:
        # PAGES
        for meta_name, page_attr in (
            ("citation_firstpage", "fpage"),
            ("citation_lastpage", "lpage"),
        ):
            page = meta_content(meta_name)
            if page is None:
                continue
            # Drop anything from the first "(" on; overlong values are ignored
            page = page.split("(")[0]
            if len(page) < 32:
                setattr(xarticle, page_attr, page)

    if "doi" in requested:
        # DOI
        doi = meta_content("citation_doi")
        if doi is not None:
            doi = doi.strip()
            # Drop whatever precedes the "10." prefix (e.g. a resolver URL)
            pos = doi.find("10.")
            if pos > 0:
                doi = doi[pos:]
            xarticle.doi = doi

    if "mr" in requested:
        # MR
        mr = meta_content("citation_mr")
        if mr is not None:
            mr = mr.strip()
            if mr.find("MR") == 0:
                mr = mr[2:]
                xarticle.extids.append(("mr-item-id", mr))

    if "zbl" in requested:
        # ZBL
        zbl = meta_content("citation_zbl")
        if zbl is not None:
            zbl = zbl.strip()
            if zbl.find("Zbl") == 0:
                zbl = zbl[3:].strip()
                xarticle.extids.append(("zbl-item-id", zbl))

    if "publisher" in requested:
        # PUBLISHER
        pub = meta_content("citation_publisher")
        if pub is not None:
            pub = pub.strip()
            if pub != "":
                xpub = create_publisherdata()
                xpub.name = pub
                xissue.publisher = xpub

    if "keywords" in requested:
        # KEYWORDS
        for kwd_node in soup.select("meta[name='citation_keywords']"):
            kwds = kwd_node.get("content")
            if not isinstance(kwds, str):
                continue
            for kwd in kwds.split(","):
                # Strip first, so that blank entries ("a, ,b") are skipped as well
                kwd = kwd.strip()
                if kwd == "":
                    continue
                xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})

    if "references" in requested:
        citation_references = soup.select("meta[name='citation_reference']")
        for index, tag in enumerate(citation_references):
            content = tag.get("content")
            if not isinstance(content, str):
                raise ValueError("Cannot parse citation_reference meta")
            # No generated label when the reference text already starts with "[n]"
            label = str(index + 1)
            if regex.match(r"^\[\d+\].*", content):
                label = None
            xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label))
def get_metadata_using_dcterms(
    self,
    xarticle: ArticleData,
    soup: "Tag",
    what: "Iterable[Literal['abstract', 'keywords', 'date_published', 'article_type']]",
):
    """
    Collect article metadata from the Dublin Core (`DC.*` / `DCTERMS.*`) meta tags.

    :param xarticle: the article receiving the metadata
    :param soup: the parsed article page
    :param what: the items to collect
    :return: None. The given article is modified
    """
    if "abstract" in what:
        abstract_tag = soup.select_one("meta[name='DCTERMS.abstract']")
        if abstract_tag:
            # The abstract is recorded as English text
            abstract_text = cleanup_str(self.get_str_attr(abstract_tag, "content"))
            xarticle.abstracts.append(create_abstract(lang="en", value_tex=abstract_text))

    if "keywords" in what:
        for keyword_tag in soup.select("meta[name='DC.subject']"):
            kwd_text = keyword_tag.get("content")
            if isinstance(kwd_text, str) and len(kwd_text) != 0:
                xarticle.kwds.append(create_subj(value=kwd_text))

    if "date_published" in what:
        published_tag = soup.select_one("meta[name='DC.Date.created']")
        if published_tag:
            xarticle.date_published = self.get_str_attr(published_tag, "content")

    if "article_type" in what:
        type_tag = soup.select_one("meta[name='DC.Type.articleType']")
        if type_tag:
            xarticle.atype = self.get_str_attr(type_tag, "content")
def create_xissue(
    self,
    url: str | None,
    year: str,
    volume_number: str | None,
    issue_number: str | None = None,
    vseries: str | None = None,
):
    """
    Create the IssueData of an issue of the current collection.

    One trailing "/" is removed from the url. In the volume, every run of
    characters other than word characters and "-" becomes "_"; in the issue
    number, "," becomes "-". The pid is computed by get_issue_pid.
    """
    if url is not None:
        url = url.removesuffix("/")

    xissue = create_issuedata()
    xissue.url = url
    xissue.pid = self.get_issue_pid(
        self.collection_id, year, volume_number, issue_number, vseries
    )
    xissue.year = year

    if volume_number is not None:
        xissue.volume = regex.sub(r"[^\w-]+", "_", volume_number)
    if issue_number is not None:
        xissue.number = issue_number.replace(",", "-")
    if vseries is not None:
        xissue.vseries = vseries

    return xissue
def detect_language(self, text: str, article: ArticleData | None = None):
    """
    Return the language code of `text`.

    The language of the article wins when it is set to something else than
    "und". Otherwise the language detector is asked, and "und" is returned
    when it cannot tell.
    """
    known_lang = article.lang if article else None
    if known_lang is not None and known_lang != "und":
        return known_lang

    detected = self.language_detector.detect_language_of(text)
    return detected.iso_code_639_1.name.lower() if detected else "und"
def get_str_attr(self, tag: "Tag", attr: str):
    """
    Equivalent of `tag.get(attr)` for attributes that must hold a single value.

    Raises ValueError when the attribute is missing or holds a list of values.
    """
    value = tag.get(attr)
    if value is None:
        raise ValueError(
            f"[{self.source_domain}] {self.collection_id} : html tag doesn't have any {attr} attributes"
        )
    if isinstance(value, list):
        raise ValueError(
            f"[{self.source_domain}] {self.collection_id} : html tag has multiple {attr} attributes."
        )
    return value
def create_trans_title(
    self,
    resource_type: str,
    title_str: str,
    lang: str,
    xresource_lang: str,
    title_type: str = "main",
):
    """Build the title data of a translated title.

    Args:
        resource_type: "article" selects the "trans-title" tag; any other
            value selects "issue-title".
        title_str: HTML of the title, converted with
            `build_jats_data_from_html_field`.
        lang: language of the title text.
        xresource_lang: language of the resource the title belongs to.
        title_type: value forwarded as `type` to `create_titledata`.

    Returns:
        The object returned by `create_titledata`.
    """
    tag = "trans-title" if resource_type == "article" else "issue-title"

    ckeditor_data = build_jats_data_from_html_field(
        title_str,
        tag=tag,
        text_lang=lang,
        resource_lang=xresource_lang,
        delimiter_inline=self.delimiter_inline_formula,
        delimiter_disp=self.delimiter_disp_formula,
    )

    # Forward the caller's title_type: it used to be ignored in favour of a
    # hard-coded "main", which is still the default value.
    return create_titledata(
        lang=lang,
        type=title_type,
        title_html=ckeditor_data["value_html"],
        title_xml=ckeditor_data["value_xml"],
    )
# Maps a `citation_*` key of a meta citation reference to the builder that
# turns its value into the matching XML fragment. `citation_author` is absent
# on purpose: the citation parser handles authors separately.
references_mapping = {
    "citation_title": get_article_title_xml,
    "citation_journal_title": get_source_xml,
    "citation_publication_date": get_year_xml,
    "citation_firstpage": get_fpage_xml,
    "citation_lastpage": get_lpage_xml,
}
@classmethod
def __parse_meta_citation_reference(cls, content: str, label=None):
    """Convert the content of a meta citation reference into a baked reference.

    `content` is expected to look like `key=value; key=value; ...`. When it
    contains no ";" it is handed verbatim to `JatsBase.bake_ref`. Otherwise
    the authors (`citation_author`) come first, joined with ", ", followed by
    every field listed in `references_mapping`, each preceded by a space.
    Entries without "=" and unknown keys are ignored.

    Args:
        content: raw text of the citation reference.
        label: forwarded to `JatsBase.bake_ref`.

    Returns:
        The value returned by `JatsBase.bake_ref`.
    """
    categories = content.split(";")
    if len(categories) == 1:
        return JatsBase.bake_ref(content, label=label)

    authors_xml = []
    fields_xml = []
    for category in categories:
        # Split on the first "=" only, so a value containing "=" stays whole.
        key, separator, value = category.partition("=")
        if not separator:
            continue
        key = key.strip()
        if key == "citation_author":
            authors_xml.append(get_author_xml(template_str=value))
        elif key in cls.references_mapping:
            fields_xml.append(cls.references_mapping[key](value))

    # Authors are emitted wherever they appeared in `content`. Previously they
    # were lost when no other field followed them, or when they came after
    # the first non-author field.
    xml_string = ", ".join(authors_xml) + "".join(" " + field for field in fields_xml)
    return JatsBase.bake_ref(xml_string, label=label)
@classmethod
def get_or_create_source(cls):
    """Return the `Source` whose domain is `cls.source_domain`, creating it if absent.

    A newly created row takes its name, website and view id from the class
    (`source_name`, `source_website`, `get_view_id()`) and is saved once more
    before being returned.
    """
    lookup_domain = cls.source_domain
    creation_defaults = {
        "name": cls.source_name,
        "website": cls.source_website,
        "view_id": cls.get_view_id(),
    }
    src, was_created = Source.objects.get_or_create(
        domain=lookup_domain, defaults=creation_defaults
    )
    if not was_created:
        return src
    src.save()
    return src
@staticmethod
def get_issue_pid(
    collection_id: str,
    year: str,
    volume_number: str | None = None,
    issue_number: str | None = None,
    series: str | None = None,
):
    """Build an issue pid from its identifying parts.

    The parts are joined with "_" in the order collection id, year, series,
    volume number, issue number; the last three are skipped when None. The
    result goes through `cleanup_str` and is then sanitised.
    """
    segments = [collection_id, year]
    segments.extend(
        part for part in (series, volume_number, issue_number) if part is not None
    )
    raw_pid = "_".join(f"{segment}" for segment in segments)
    # Replace every run of characters other than word characters and "-" with an underscore
    return regex.sub(r"[^\w-]+", "_", cleanup_str(raw_pid))
964 @staticmethod
965 def set_pages(article: ArticleData, pages: str, separator: str = "-"):
966 pages_split = pages.split(separator)
967 if len(pages_split) == 0: 967 ↛ 968line 967 didn't jump to line 968 because the condition on line 967 was never true
968 article.page_range = pages
969 if len(pages_split) > 0: 969 ↛ exitline 969 didn't return from function 'set_pages' because the condition on line 969 was always true
970 if pages[0].isnumeric(): 970 ↛ exitline 970 didn't return from function 'set_pages' because the condition on line 970 was always true
971 article.fpage = pages_split[0]
972 if ( 972 ↛ 977line 972 didn't jump to line 977 because the condition on line 972 was never true
973 len(pages_split) > 1
974 and pages_split[0] != pages_split[1]
975 and pages_split[1].isnumeric()
976 ):
977 article.lpage = pages_split[1]
@staticmethod
def _process_pdf_header(chunk: bytes, response: requests.Response | aiohttp.ClientResponse):
    """Classify a download from its first bytes and its Content-Type header.

    Args:
        chunk: first bytes of the body; it is matched against a bytes
            pattern, so it must be `bytes` (the former `str` annotation was
            wrong).
        response: the HTTP response the chunk was read from.

    Returns:
        A 3-item list `[is_pdf, response, {"status": ..., "message": ...}]`:
        - PDF header found and Content-Type mentions application/pdf: OK.
        - PDF header found but Content-Type does not: WARNING.
        - PDF header not found: ERROR (the message also reports the
          Content-Type when it does not mention application/pdf).
    """
    content_type = response.headers.get("Content-Type")
    advertises_pdf = bool(content_type) and "application/pdf" in content_type
    has_pdf_header = regex.match(rb"^%PDF-\d\.\d", chunk) is not None

    if has_pdf_header and advertises_pdf:
        # The file is unmistakably a pdf
        status = ExtlinkChecked.Status.OK
        message = ""
    elif has_pdf_header:
        # The file is a pdf, but the content type advertised by the server is wrong
        status = ExtlinkChecked.Status.WARNING
        message = f"Content-Type header: {content_type}"
    elif advertises_pdf:
        status = ExtlinkChecked.Status.ERROR
        message = f"PDF Header not found: got {chunk}"
    else:
        status = ExtlinkChecked.Status.ERROR
        message = f"Content-Type header: {content_type}; PDF Header not found: got {chunk}"

    return [has_pdf_header, response, {"status": status, "message": message}]
@classmethod
async def a_check_pdf_link_validity(
    cls, url: str, verify=True
) -> list[bool | aiohttp.ClientResponse | dict]:
    """
    Check the validity of a PDF link (asynchronous version).

    Fetches the beginning of `url` through `cls.async_session` and lets
    `_process_pdf_header` classify it. Returns a 3-item list
    `[is_valid, response, {"status": ..., "message": ...}]`.
    """
    CHUNK_SIZE = 10  # Number of characters to fetch
    request_headers = {
        "Range": f"bytes=0-{CHUNK_SIZE}",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
    }
    pending_request = cls.async_session.get(
        url, headers=request_headers, allow_redirects=True, ssl=verify
    )
    async with pending_request as response:
        try:
            first_bytes = await response.content.read(CHUNK_SIZE)
            return BaseCollectionCrawler._process_pdf_header(first_bytes, response)
        except StopIteration:
            failure = {
                "status": ExtlinkChecked.Status.ERROR,
                "message": "Error reading PDF header",
            }
            return [False, response, failure]
@classmethod
def check_pdf_link_validity(
    cls, url: str, verify=True
) -> list[bool | requests.Response | None | dict]:
    """
    Check the validity of a PDF link (synchronous version).

    Streams the beginning of `url` through `cls.get` and lets
    `_process_pdf_header` classify it. Returns a 3-item list
    `[is_valid, response, {"status": ..., "message": ...}]`; an ERROR entry
    is returned when the body yields no chunk at all.
    """
    CHUNK_SIZE = 10  # Number of characters to fetch
    request_headers = {
        "Range": f"bytes=0-{CHUNK_SIZE}",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
    }
    with cls.get(
        url, headers=request_headers, allow_redirects=True, verify=verify, stream=True
    ) as response:
        try:
            first_bytes = next(response.iter_content(CHUNK_SIZE))
            return BaseCollectionCrawler._process_pdf_header(first_bytes, response)
        except StopIteration:
            failure = {
                "status": ExtlinkChecked.Status.ERROR,
                "message": "Error reading PDF header",
            }
            return [False, response, failure]
@classmethod
async def check_extlink_validity(cls, extlink: "ExtLink"):
    """
    Method used by rot_monitoring to check if links have expired

    The outcome (date, status, and when available http_status and message) is
    written to `ExtlinkChecked` for `extlink` in every case, including when
    the check itself fails. SSL, connection and timeout errors are logged and
    recorded as ERROR. Any other exception is recorded as ERROR as well and
    then re-raised to the caller.
    """
    error_status = ExtlinkChecked.Status.ERROR
    # timezone.now() follows the project's USE_TZ setting, unlike the naive
    # datetime.now() used before.
    defaults: dict = {"date": timezone.now(), "status": ExtlinkChecked.Status.OK}
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0"
    }
    verify = bool(cls.verify)
    try:
        # For the GDZ links, we just check if the http response is 200 or 206
        if (
            extlink.rel == "article-pdf"
            and "gdz.sub.uni-goettingen.de" not in extlink.location
        ):
            _, response, message = await cls.a_check_pdf_link_validity(
                extlink.location, verify
            )
            defaults.update(message)
            defaults["http_status"] = response.status
        else:
            async with cls.async_session.get(
                url=extlink.location,
                headers=header,
                allow_redirects=True,
                ssl=verify,
            ) as response:
                defaults["http_status"] = response.status
                if response.status not in (200, 206):
                    defaults["status"] = error_status

    except aiohttp.ClientSSLError:
        cls.logger.error("SSL error for the url: %s", extlink.location)
        defaults["status"] = error_status
        defaults["message"] = "SSL error"
    except aiohttp.ClientConnectionError:
        cls.logger.error("Connection error for the url: %s", extlink.location)
        defaults["status"] = error_status
        defaults["message"] = "Connection error"
    except TimeoutError:
        cls.logger.error("Timeout error for the url: %s", extlink.location)
        defaults["status"] = error_status
        defaults["message"] = "Timeout error"
    except Exception:
        # Without this, the `finally` clause below would persist the initial
        # OK status for a check that actually crashed. The exception still
        # propagates to the caller.
        defaults["status"] = error_status
        defaults["message"] = "Unexpected error"
        raise
    finally:
        try:
            await ExtlinkChecked.objects.aupdate_or_create(extlink=extlink, defaults=defaults)
            cls.logger.info(
                "DB Update, source: %s, url: %s", cls.source_domain, extlink.location
            )
        except IntegrityError:
            cls.logger.error(
                "Extlink was deleted, source: %s, url: %s", cls.source_domain, extlink.location
            )
def resolve_year_end(self, pid: str, default: int) -> int:
    """Return the last year to consider for `pid`.

    When `pid` has an entry in `self.pid_year_restrictions`, the result is the
    current year minus that entry; otherwise `default` is returned.
    """
    restrictions = self.pid_year_restrictions
    if pid not in restrictions:
        return default
    return datetime.now().year - restrictions[pid]