Coverage for src / crawler / abstract_crawlers / base_crawler.py: 65%
590 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-05-21 12:58 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-05-21 12:58 +0000
1import logging
2import time
3from collections.abc import Iterable
4from datetime import datetime, timedelta
5from email.policy import EmailPolicy
6from typing import TYPE_CHECKING, Literal
8import aiohttp
9import regex
10import requests
11from bs4 import BeautifulSoup
12from django.conf import settings
13from django.contrib.auth.models import User
14from django.db.utils import IntegrityError
15from django.utils import timezone
16from langcodes import standardize_tag
17from lingua import LanguageDetector, LanguageDetectorBuilder
18from opentelemetry import trace
19from ptf.cmds.xml.ckeditor.utils import (
20 build_jats_data_from_html_field,
21)
22from ptf.cmds.xml.jats.builder.references import (
23 get_article_title_xml,
24 get_author_xml,
25 get_fpage_xml,
26 get_lpage_xml,
27 get_source_xml,
28 get_year_xml,
29)
30from ptf.cmds.xml.jats.jats_parser import JatsBase
31from ptf.model_data import (
32 ArticleData,
33 ContributorDict,
34 IssueData,
35 ResourceData,
36 TitleDict,
37 create_abstract,
38 create_contributor,
39 create_extlink,
40 create_issuedata,
41 create_publisherdata,
42 create_subj,
43 create_titledata,
44)
45from ptf.model_data_converter import update_data_for_jats
46from ptf.models import ExtLink
47from pylatexenc.latex2text import LatexNodes2Text
48from pysolr import SolrError
49from requests.adapters import HTTPAdapter
50from requests_cache import CachedSession
51from urllib3 import Retry
53from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd
54from crawler.models import Source
55from crawler.models.extlink_checked import ExtlinkChecked
56from crawler.types import CitationLiteral
57from crawler.utils import (
58 add_pdf_link_to_xarticle,
59 cleanup_str,
60 get_all_cols,
61 get_or_create_collection,
62 get_session,
63)
65if TYPE_CHECKING:
66 from bs4 import Tag
class CrawlerTitleDict(TitleDict):
    """TitleDict variant used by the crawlers, with an extra `title_tex` entry."""

    # Title as a TeX string; the annotation allows None.
    title_tex: str | None
class BaseCollectionCrawler:
    """
    Base class for the collection crawlers.

    To create a crawler:
    1) derive a class from BaseCollectionCrawler and name it XXXCrawler
    2) override the functions parse_collection_content, parse_issue_content and parse_article_content
    3) update factory.py so that crawler_factory can return your new crawler
    """

    logger = logging.getLogger(__name__)
    tracer = trace.get_tracer(__name__)

    # Identity of the crawled source; concrete crawlers are expected to override these.
    source_name = ""
    source_domain = ""
    source_website = ""

    issue_href = ""

    # Filled in by initialize()
    collection = None
    source = None
    user = None
    # HTTP session, stored on the class (see __init__ / initialize)
    session: requests.Session | CachedSession
    async_session: aiohttp.ClientSession
    is_checkable = True
    # Copied onto session.verify in initialize()
    verify = True
    # Default headers merged into every request made by download_file()
    # NOTE(review): "accept_encoding" is not the standard "Accept-Encoding" header name — confirm intent
    headers = {
        "accept_encoding": "utf-8",
        "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
        "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
    }

    # seconds to wait between two http requests
    requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
    # seconds to wait before aborting the connection (if no bytes are received)
    requests_timeout = 60

    latext_parser = LatexNodes2Text()

    # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
    # do not use the "$" to surround tex formulas
    delimiter_inline_formula = "$"
    delimiter_disp_formula = "$"

    # HACK : Workaround for tests (monkeypatching)
    # We store the class here, so we can monkeypatch it when running tests
    # subCrawlers = {
    #     LofplCrawler: None
    # }
    subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}

    # Built lazily by the `language_detector` property
    _language_detector: LanguageDetector | None = None
    _language_detector_builder = LanguageDetectorBuilder.from_all_languages()

    force_refresh = False

    # Whether to include headers in the requests cache key
    match_headers = False
    # Matches an ORCID URL and captures the identifier in the `orcid` group
    orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"

    # Set this to False on a Crawler-basis to allow inserting articles without PDFs
    ignore_missing_pdf = True

    pid_year_restrictions: dict[str, int] = {}  # pid -> excluded years count
@classmethod
def get_view_id(cls):
    """Return the view identifier of this crawler class, i.e. its `source_domain`."""
    return cls.source_domain
@property
def language_detector(self):
    """Language detector, built on first access.

    The detector is only built when first needed (to save memory) and is
    then cached in `_language_detector` for subsequent accesses.
    """
    detector = self._language_detector
    if not detector:
        detector = self._language_detector_builder.build()
        self._language_detector = detector
    return detector
def __init__(
    self,
    *args,
    username: str,
    collection_id: str,
    dry: bool = False,
    publisher: str = "",
    force_refresh=False,
    collection_url: str | None = None,
    backend=None,
):
    """
    Set up a crawler for one collection.

    :param username: username of the Django user looked up in `initialize`
    :param collection_id: id of the collection to crawl
    :param dry: when True, `crawl_issue` stops before writing to the database
    :param publisher: publisher name set on the issues inserted in database
    :param force_refresh: stored on the instance as `self.force_refresh`
    :param collection_url: url of the collection; when empty it is looked up
        in `get_all_cols()` using `collection_id` and `source_domain`
    :param backend: stored on the instance as `self.backend`
    :raises ValueError: if no url is given and none is registered for this source
    """
    if not collection_url:
        known_sources = get_all_cols()[collection_id]["sources"]
        collection_url = known_sources.get(self.source_domain)
        if collection_url is None:
            raise ValueError(
                f"Source {self.source_domain} not found for collection {collection_id}"
            )
    self.collection_url = collection_url

    # Instantiate every registered sub-crawler with the same settings.
    # Note: force_refresh and backend are not forwarded.
    sub_crawler_kwargs = {
        "username": username,
        "collection_id": collection_id,
        "dry": dry,
        "publisher": publisher,
        "collection_url": collection_url,
    }
    for crawler_class in self.subCrawlers:
        self.subCrawlers[crawler_class] = crawler_class(*args, **sub_crawler_kwargs)

    # One logger per source domain
    self.logger = logging.getLogger(".".join((__name__, self.source_domain)))

    self.username = username
    self.collection_id = collection_id
    self.dry = dry
    self.publisher = publisher

    # Class attribute: we sometimes want to use the session without
    # initializing the class (rot monitoring)
    BaseCollectionCrawler.session = requests.Session()

    # Skipped when running tests.
    # initialize() also disables transport-level retries on the session:
    # download_file implements its own retry behaviour.
    self.initialize()

    self.force_refresh = force_refresh
    self.backend = backend
def initialize(self):
    """
    Second-stage constructor: everything that touches the database or builds
    the real HTTP session lives here, so it can be skipped during test data
    generation.
    """
    self.collection = get_or_create_collection(self.collection_id)
    self.source = self.get_or_create_source()
    self.user = User.objects.get(username=self.username)

    # The session is stored on the base class
    BaseCollectionCrawler.session = get_session()
    BaseCollectionCrawler.session.verify = self.verify
    self.session.delay = self.requests_interval

    # download_file has its own retry loop, so the transport must not retry
    no_retry = Retry(total=0)
    for scheme in ("https://", "http://"):
        self.session.mount(scheme, HTTPAdapter(max_retries=no_retry))
@classmethod
def can_crawl(cls, pid: str) -> bool:
    """Tell whether this crawler can crawl the given pid. The base implementation always returns True."""
    return True
def parse_collection_content(self, content: str) -> list[IssueData]:
    """
    Parse the HTML content of the collection page (with BeautifulSoup)
    and return the list of xissues found.

    Override this function in a derived class: the base implementation
    returns an empty list.
    """
    return []
def parse_issue_content(self, content: str, xissue: IssueData):
    """
    Parse the HTML content of an issue page (with BeautifulSoup)
    and fill xissue.articles.

    Override this function in a derived class: the base implementation does nothing.

    Caveat: you are supposed to create the articles here. Please assign a PID to each article.
    The PID can be `a + article_index`, like this : `a0` `a21`
    """
def parse_article_content(
    self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
) -> ArticleData | None:
    """
    Parse the HTML content of an article page (with BeautifulSoup)
    and return the xarticle.

    Override this function in a derived class: the base implementation
    returns the xarticle unchanged.
    The xissue is passed to the function in case the article page has issue information (ex: publisher)
    The article url is also passed as a parameter

    Caveat: you are supposed to assign the article pid again here
    """
    return xarticle
@tracer.start_as_current_span("crawl_collection")
def crawl_collection(self):
    """
    Download and parse the collection page.

    - get the HTML content of the collection_url
    - parse it with `parse_collection_content` to extract the list of issues

    Returns a dict {pid: xissue} (pids are converted to str), or None when
    the download yields no content.
    The issues themselves are not crawled here: see `crawl_issue`.

    :raises RuntimeError: if the source is not set
    """
    if self.source is None:
        raise RuntimeError("ERROR: the source is not set")

    content = self.download_file(self.collection_url)
    if not content:
        # No content could be downloaded (the original comment mentions a 404)
        return None

    # Some collections split the same volumes in different pages
    # Ex: Volume 6 (2000) and Volume 6 (1999)
    # Merging such xissues is currently disabled; they are only indexed by pid.
    return {str(xissue.pid): xissue for xissue in self.parse_collection_content(content)}
@tracer.start_as_current_span("crawl_issue")
def crawl_issue(self, xissue: IssueData):
    """
    Crawl one issue.

    - get the HTML content of the issue page (if the issue has one)
    - parse it to extract the list of articles and/or the issue metadata
    - crawl each article
    - unless `self.dry` is set, insert the issue in database
    """
    # Some sources, like EuDML, do not have a separate HTML page for an issue's
    # table of contents: the articles come from the collection page and the
    # xissue has no url.
    url = xissue.url
    if url is not None and url.endswith(".pdf"):
        # The "issue page" is the PDF itself: keep it as a link, nothing to parse
        add_pdf_link_to_xarticle(xissue, url)
        xissue.url = None
    elif url is not None:
        content = self.download_file(url)
        with self.tracer.start_as_current_span("parse_issue_content"):
            self.parse_issue_content(content, xissue)

    # Crawl every article; drop those for which crawl_article returned None
    crawled = (self.crawl_article(xarticle, xissue) for xarticle in xissue.articles)
    xissue.articles = [xarticle for xarticle in crawled if xarticle is not None]

    issue_has_pdf = self.article_has_pdf(xissue)

    if self.ignore_missing_pdf:
        xissue.articles = [
            xarticle for xarticle in xissue.articles if self.article_has_pdf(xarticle)
        ]

    if self.dry:
        return
    # Nothing worth inserting: no article left and no PDF on the issue itself
    if not xissue.articles and not issue_has_pdf:
        return

    self.process_resource_metadata(xissue, resource_type="issue")
    self.add_xissue_into_database(xissue)
@staticmethod
def article_has_source(art: ArticleData | IssueData):
    """Return True if `art` has at least one ext_link whose rel is "source"."""
    return any(ext_link["rel"] == "source" for ext_link in art.ext_links)
@staticmethod
def article_has_pdf(art: ArticleData | IssueData):
    """Return True if `art` has at least one "article-pdf" or "article-html" ext_link."""
    full_text_rels = ("article-pdf", "article-html")
    return any(ext_link["rel"] in full_text_rels for ext_link in art.ext_links)
def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
    """
    Crawl one article and return it with its metadata processed.

    An article without url is not downloaded: it only gets a "source" link
    (the issue url, or the collection url as a fallback).
    Otherwise the article page is downloaded and handed to `parse_article_content`;
    if parsing raises a ValueError, the page is downloaded again after 5 minutes,
    bypassing the cache, and parsed a second time.

    Returns None when `parse_article_content` returns None.
    """

    def add_source_link(target, location):
        # ARTICLE URL as an ExtLink (to display the link in the article page)
        link = create_extlink()
        link["rel"] = "source"
        link["location"] = location
        link["metadata"] = self.source_domain
        target.ext_links.append(link)

    def parse(page_content):
        with self.tracer.start_as_current_span("parse_article_content"):
            return self.parse_article_content(page_content, xissue, xarticle, xarticle.url)

    if xarticle.url is None:
        if not self.article_has_source(xarticle):
            add_source_link(xarticle, xissue.url if xissue.url else self.collection_url)
        return self.process_article_metadata(xarticle)

    content = self.download_file(xarticle.url)
    # Prefix the article pid with the pid of its issue
    xarticle.pid = f"{xissue.pid}_{xarticle.pid}"

    try:
        parsed_xarticle = parse(content)
    except ValueError as e:
        self.logger.warning(e)
        self.logger.warning("Retrying in 5 mins while invalidating cache")
        time.sleep(5 * 60)
        content = self.download_file(xarticle.url, force_refresh=True)
        parsed_xarticle = parse(content)

    if parsed_xarticle is None:
        return None

    if parsed_xarticle.doi:
        # When a DOI is known, the pid is derived from it: "/", "." and "-" become "_"
        parsed_xarticle.pid = parsed_xarticle.doi.translate(str.maketrans("/.-", "___"))

    if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
        add_source_link(parsed_xarticle, parsed_xarticle.url)

    # The article title may have formulas surrounded with '$'
    return self.process_article_metadata(parsed_xarticle)
def process_resource_metadata(self, xresource: ResourceData, resource_type="article"):
    """
    Compute the HTML and XML versions of the title and of the abstracts
    of a resource from their TeX versions.

    :param xresource: the resource (article or issue) to update in place
    :param resource_type: "article" selects the "article-title" tag,
        anything else selects "issue-title"
    :return: the same xresource
    """
    # Formula delimiters are the same for the title and the abstracts
    delimiters = {
        "delimiter_inline": self.delimiter_inline_formula,
        "delimiter_disp": self.delimiter_disp_formula,
    }

    # Title (title_tex itself is left untouched)
    title_data = build_jats_data_from_html_field(
        xresource.title_tex,
        tag="article-title" if resource_type == "article" else "issue-title",
        text_lang=xresource.lang,
        **delimiters,
    )
    xresource.title_html = title_data["value_html"]
    xresource.title_xml = title_data["value_xml"]

    # Abstracts may have formulas surrounded with '$'.
    # Only entries tagged "abstract" are converted; value_tex is left untouched.
    plain_abstracts = [
        candidate for candidate in xresource.abstracts if candidate["tag"] == "abstract"
    ]
    for xabstract in plain_abstracts:
        abstract_data = build_jats_data_from_html_field(
            xabstract["value_tex"],
            tag="abstract",
            text_lang=xabstract["lang"],
            resource_lang=xresource.lang,
            field_type="abstract",
            **delimiters,
        )
        xabstract["value_html"] = abstract_data["value_html"]
        xabstract["value_xml"] = abstract_data["value_xml"]

    return xresource
def process_article_metadata(self, xarticle: ArticleData):
    """
    Finalize the metadata of an article: process its title and abstracts
    (see process_resource_metadata), set the type of every bibitem to "unknown",
    then call update_data_for_jats on the article.

    :return: the same xarticle
    """
    self.process_resource_metadata(xarticle)
    for bibitem in xarticle.bibitems:
        bibitem.type = "unknown"
    update_data_for_jats(xarticle, with_label=False)

    return xarticle
def download_file(self, url: str, force_refresh=False, headers=None):
    """
    Downloads a page and returns its content (decoded string).
    This function handles retries and decoding.

    Up to 3 attempts are made. The second attempt follows the first one
    immediately, the third one happens 15 minutes later. Retries always
    bypass the cache when the session is a CachedSession.

    :param url: the url to download
    :param force_refresh: when True, bypass the cache on the first attempt too
        (only has an effect when the session is a CachedSession)
    :param headers: extra headers, merged over the crawler default `headers`
    :return: the decoded, non-empty content of the page
    :raises requests.ConnectionError, requests.ConnectTimeout,
        requests.exceptions.HTTPError: the error of the last attempt.
        An empty body is reported as an HTTPError.
    """
    max_attempts = 3
    request_headers = {**self.headers, **(headers or {})}

    for attempt in range(max_attempts):
        try:
            kwargs = {
                "url": url,
                "headers": request_headers,
                "timeout": self.requests_timeout,
            }
            # force_refresh is only understood by CachedSession
            if (force_refresh or attempt > 0) and isinstance(self.session, CachedSession):
                kwargs["force_refresh"] = True
            response = self.session.get(**kwargs)

            content = self.decode_response(response)
            if not content:
                raise requests.exceptions.HTTPError(
                    f"Empty response body for {url}", response=response
                )

            return content
        except (
            requests.ConnectionError,
            requests.ConnectTimeout,
            requests.exceptions.HTTPError,
        ) as e:
            self.logger.debug(f"Caught error : {e}", extra={"url": url})
            if attempt == max_attempts - 1:
                # Last attempt: give up right away instead of sleeping before raising
                raise
            # 0 min after the first failure, 15 mins after the second
            delay_minutes = attempt * 15
            self.logger.debug(
                f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})",
                extra={"url": url},
            )
            time.sleep(delay_minutes * 60)
def decode_response(self, response: requests.Response, encoding: str | None = None):
    """
    Return the text of a response, after choosing the encoding used to decode it.

    Override this if the content-type headers from the sources are advertising
    something else than the actual content (SASA needs this).

    The encoding is the first one found among:
    1) the `encoding` parameter (forced encoding)
    2) the charset of the HTTP Content-Type header
    3) the HTML `<meta charset>` tag
    4) the charset of the HTML `<meta http-equiv="Content-Type">` tag
    When none is found, response.encoding is left unchanged.
    """
    not_found = object()

    def sniff_encoding():
        # Attempt to get encoding using HTTP headers
        header_value = response.headers.get("Content-Type", None)
        if header_value:
            header_charset = self.parse_content_type_charset(header_value)
            if header_charset:
                return header_charset

        # The HTML is only parsed when the headers did not give a charset
        soup = BeautifulSoup(response.text, "html5lib")

        # Attempt to get encoding using HTML meta charset tag
        meta_charset_tag = soup.select_one("meta[charset]")
        if meta_charset_tag:
            declared = meta_charset_tag.get("charset")
            if isinstance(declared, str):
                return declared

        # Attempt to get encoding using HTML meta content type tag
        meta_content_type_tag = soup.select_one(
            'meta[http-equiv="Content-Type"],meta[http-equiv="content-type"]'
        )
        if meta_content_type_tag:
            meta_value = meta_content_type_tag.get("content")
            if isinstance(meta_value, str):
                meta_charset = self.parse_content_type_charset(meta_value)
                if meta_charset:
                    return meta_charset

        return not_found

    chosen = encoding if encoding else sniff_encoding()
    if chosen is not not_found:
        response.encoding = chosen
    return response.text
528 @staticmethod
529 def parse_content_type_charset(content_type: str):
530 header = EmailPolicy.header_factory("content-type", content_type)
531 if "charset" in header.params:
532 return header.params.get("charset")
@tracer.start_as_current_span("add_xissue_to_database")
def add_xissue_into_database(self, xissue: IssueData) -> IssueData:
    """
    Insert (or update) an issue in database.

    The issue is first completed with the collection, the source domain,
    the publisher and a last-modified date.
    The insertion is attempted up to 3 times; a SolrError triggers a
    10 seconds pause before the next attempt.

    :return: the inserted xissue
    :raises ValueError: if the issue has no year
    :raises ConnectionRefusedError: if all attempts ended with a SolrError
    """
    xissue.journal = self.collection
    xissue.source = self.source_domain

    if xissue.year == "":
        raise ValueError("Failsafe : Cannot insert issue without a year")

    publisher_data = create_publisherdata()
    publisher_data.name = self.publisher
    xissue.publisher = publisher_data
    xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()

    # The log message announces the number of the attempt that comes next
    for next_attempt in (2, 3, 4):
        try:
            addOrUpdateGDMLIssueXmlCmd({"xissue": xissue, "use_body": False}).do()
            self.logger.debug(f"Issue {xissue.pid} inserted in database")
            return xissue
        except SolrError:
            self.logger.warning(
                f"Encoutered SolrError while inserting issue {xissue.pid} in database"
            )
            self.logger.debug(f"Attempt {next_attempt}. sleeping 10 seconds.")
            time.sleep(10)
        except Exception as e:
            self.logger.error(
                f"Got exception while attempting to insert {xissue.pid} in database : {e}"
            )
            raise e

    raise ConnectionRefusedError("Cannot connect to SolR")
def get_metadata_using_citation_meta(
    self,
    xarticle: ArticleData,
    xissue: IssueData,
    soup: BeautifulSoup,
    what: Iterable[CitationLiteral] = (),
):
    """
    Collect article metadata from the `<meta name="citation_...">` tags of a page.

    :param xarticle: the xarticle that will collect the metadata
    :param xissue: the xissue that will collect the publisher
    :param soup: the BeautifulSoup object of the article page
    :param what: citation_ items to collect, among "title", "author", "pdf", "lang",
        "abstract", "page", "doi", "mr", "zbl", "publisher", "keywords", "references"
    :return: None. The given article (and issue, for the publisher) is modified
    :raises ValueError: if an author, abstract or reference tag has no usable content
    """
    wanted = frozenset(what)

    def meta_content(name: str):
        """Content of the first <meta name=...> tag; None if the tag or a string content is missing."""
        node = soup.select_one(f"meta[name='{name}']")
        if node is None:
            return None
        value = node.get("content")
        return value if isinstance(value, str) else None

    if "title" in wanted:
        # TITLE
        title = meta_content("citation_title")
        if title is not None:
            xarticle.title_tex = title

    if "author" in wanted:
        # AUTHORS
        # citation_author_* tags apply to the closest preceding citation_author tag
        current_author: ContributorDict | None = None
        for author_node in soup.select("meta[name^='citation_author']"):
            meta_name = author_node.get("name")
            meta_value = author_node.get("content")

            if meta_name == "citation_author":
                if not isinstance(meta_value, str):
                    raise ValueError("Cannot parse author")
                if meta_value == "":
                    current_author = None
                    continue
                current_author = create_contributor(role="author", string_name=meta_value)
                xarticle.contributors.append(current_author)
                continue

            if current_author is None:
                self.logger.warning("Couldn't parse citation author")
                continue
            if not isinstance(meta_value, str):
                continue

            if meta_name == "citation_author_institution":
                current_author["addresses"].append(meta_value)
            elif meta_name == "citation_author_orcid":
                # The tag name was previously misspelled ("ocrid") and never matched
                current_author["orcid"] = meta_value

    if "pdf" in wanted:
        # PDF
        pdf_url = meta_content("citation_pdf_url")
        if pdf_url is not None:
            add_pdf_link_to_xarticle(xarticle, pdf_url)

    if "lang" in wanted:
        # LANG
        # TODO: check other language code
        lang_code = meta_content("citation_language")
        if lang_code is not None:
            xarticle.lang = standardize_tag(lang_code)

    if "abstract" in wanted:
        # ABSTRACT
        abstract_node = soup.select_one("meta[name='citation_abstract']")
        if abstract_node is not None:
            raw_abstract = abstract_node.get("content")
            if not isinstance(raw_abstract, str):
                raise ValueError("Couldn't parse abstract from meta")
            # Keep only the text of the abstract, without HTML markup
            abstract = BeautifulSoup(raw_abstract, "html.parser").text
            lang = abstract_node.get("lang")
            if not isinstance(lang, str):
                lang = self.detect_language(abstract, xarticle)
            xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))

    if "page" in wanted:
        # PAGES
        for meta_name, attribute in (
            ("citation_firstpage", "fpage"),
            ("citation_lastpage", "lpage"),
        ):
            page = meta_content(meta_name)
            if page is None:
                continue
            # Drop anything from the first "(" on; overlong values are ignored
            page = page.split("(")[0]
            if len(page) < 32:
                setattr(xarticle, attribute, page)

    if "doi" in wanted:
        # DOI
        doi = meta_content("citation_doi")
        if doi is not None:
            doi = doi.strip()
            # Drop any prefix (e.g. a resolver url) found before the "10." of the DOI
            pos = doi.find("10.")
            if pos > 0:
                doi = doi[pos:]
            xarticle.doi = doi

    if "mr" in wanted:
        # MR: only values starting with "MR" are kept, without that prefix
        mr = meta_content("citation_mr")
        if mr is not None:
            mr = mr.strip()
            if mr.startswith("MR"):
                xarticle.extids.append(("mr-item-id", mr[2:]))

    if "zbl" in wanted:
        # ZBL: only values starting with "Zbl" are kept, without that prefix
        zbl = meta_content("citation_zbl")
        if zbl is not None:
            zbl = zbl.strip()
            if zbl.startswith("Zbl"):
                xarticle.extids.append(("zbl-item-id", zbl[3:].strip()))

    if "publisher" in wanted:
        # PUBLISHER
        pub = meta_content("citation_publisher")
        if pub is not None:
            pub = pub.strip()
            if pub != "":
                xpub = create_publisherdata()
                xpub.name = pub
                xissue.publisher = xpub

    if "keywords" in wanted:
        # KEYWORDS: each tag holds a comma-separated list
        for kwd_node in soup.select("meta[name='citation_keywords']"):
            kwds = kwd_node.get("content")
            if not isinstance(kwds, str):
                continue
            for kwd in kwds.split(","):
                # Strip first, so that whitespace-only keywords are skipped too
                kwd = kwd.strip()
                if kwd == "":
                    continue
                xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})

    if "references" in wanted:
        # REFERENCES
        for index, tag in enumerate(soup.select("meta[name='citation_reference']")):
            content = tag.get("content")
            if not isinstance(content, str):
                raise ValueError("Cannot parse citation_reference meta")
            # No generated label when the reference text already starts with "[n]"
            label = None if regex.match(r"^\[\d+\].*", content) else str(index + 1)
            xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label))
def get_metadata_using_dcterms(
    self,
    xarticle: ArticleData,
    soup: "Tag",
    what: "Iterable[Literal['abstract', 'keywords', 'date_published', 'article_type']]",
):
    """
    Collect article metadata from the Dublin Core `<meta>` tags of a page.

    :param xarticle: the xarticle that will collect the metadata
    :param soup: the parsed article page
    :param what: items to collect ("abstract", "keywords", "date_published", "article_type")
    :return: None. The given article is modified
    """
    if "abstract" in what:
        # The abstract is always recorded with lang="en"
        if dc_abstract := soup.select_one("meta[name='DCTERMS.abstract']"):
            raw_abstract = self.get_str_attr(dc_abstract, "content")
            xarticle.abstracts.append(
                create_abstract(lang="en", value_tex=cleanup_str(raw_abstract))
            )

    if "keywords" in what:
        # One DC.subject tag per keyword; empty or non-string contents are skipped
        for subject_tag in soup.select("meta[name='DC.subject']"):
            subject = subject_tag.get("content")
            if isinstance(subject, str) and len(subject) != 0:
                xarticle.kwds.append(create_subj(value=subject))

    if "date_published" in what:
        if dc_date := soup.select_one("meta[name='DC.Date.created']"):
            xarticle.date_published = self.get_str_attr(dc_date, "content")

    if "article_type" in what:
        if dc_type := soup.select_one("meta[name='DC.Type.articleType']"):
            xarticle.atype = self.get_str_attr(dc_type, "content")
def create_xissue(
    self,
    url: str | None,
    year: str,
    volume_number: str | None,
    issue_number: str | None = None,
    vseries: str | None = None,
):
    """
    Create an IssueData for this collection.

    :param url: url of the issue page, if any; one trailing "/" is removed
    :param year: year of the issue
    :param volume_number: volume, stored with every run of characters other
        than word characters and "-" replaced by "_"
    :param issue_number: issue number, stored with "," replaced by "-"
    :param vseries: series of the volume
    :return: the new xissue, with its pid computed by `get_issue_pid`
    """
    if url is not None:
        url = url.removesuffix("/")

    xissue = create_issuedata()
    xissue.url = url
    # The pid is computed from the raw (not normalized) values
    xissue.pid = self.get_issue_pid(
        self.collection_id, year, volume_number, issue_number, vseries
    )
    xissue.year = year

    if volume_number is not None:
        xissue.volume = regex.sub(r"[^\w-]+", "_", volume_number)
    if issue_number is not None:
        xissue.number = issue_number.replace(",", "-")
    if vseries is not None:
        xissue.vseries = vseries

    return xissue
def detect_language(self, text: str, article: ArticleData | None = None):
    """
    Return a language code for `text`.

    When an article is given and its lang is set to something other than "und",
    that lang is returned and no detection is run. Otherwise the language
    detector is used; "und" is returned if it finds nothing.
    """
    if article:
        article_lang = article.lang
        if article_lang is not None and article_lang != "und":
            return article_lang

    detected = self.language_detector.detect_language_of(text)
    return detected.iso_code_639_1.name.lower() if detected else "und"
def get_str_attr(self, tag: "Tag", attr: str):
    """
    Equivalent of `tag.get(attr)`, but ensures the return value is a string.

    :raises ValueError: if the attribute is missing or holds a list of values
    """
    value = tag.get(attr)
    context = f"[{self.source_domain}] {self.collection_id}"
    if value is None:
        raise ValueError(f"{context} : html tag doesn't have any {attr} attributes")
    if isinstance(value, list):
        raise ValueError(f"{context} : html tag has multiple {attr} attributes.")
    return value
def create_trans_title(
    self,
    resource_type: str,
    title_str: str,
    lang: str,
    xresource_lang: str,
    title_type: str = "main",
):
    """
    Create the title data of a translated title.

    :param resource_type: "article" selects the "trans-title" tag,
        anything else selects "issue-title"
    :param title_str: the title, possibly with formulas between the crawler delimiters
    :param lang: language of the title
    :param xresource_lang: language of the resource the title belongs to
    :param title_type: type of the title data (previously ignored: "main" was always used)
    :return: the title data, with its HTML and XML versions filled in
    """
    tag = "trans-title" if resource_type == "article" else "issue-title"

    ckeditor_data = build_jats_data_from_html_field(
        title_str,
        tag=tag,
        text_lang=lang,
        resource_lang=xresource_lang,
        delimiter_inline=self.delimiter_inline_formula,
        delimiter_disp=self.delimiter_disp_formula,
    )

    return create_titledata(
        lang=lang,
        type=title_type,
        title_html=ckeditor_data["value_html"],
        title_xml=ckeditor_data["value_xml"],
    )
# Maps the keys found in a "citation_reference" meta content to the function
# that builds the XML of the corresponding value.
# Keys that are not listed here are ignored when a reference is parsed
# ("citation_author" is handled separately).
references_mapping = {
    "citation_title": get_article_title_xml,
    "citation_journal_title": get_source_xml,
    "citation_publication_date": get_year_xml,
    "citation_firstpage": get_fpage_xml,
    "citation_lastpage": get_lpage_xml,
}
@classmethod
def __parse_meta_citation_reference(cls, content: str, label=None):
    """
    Build a reference from the content of a "citation_reference" meta tag.

    The content is either free text (no ";"), used as is, or a list of
    "key=value" items separated by ";".
    In the second case the authors come first, separated by ", ", followed by
    the other known fields (see `references_mapping`) in their order of appearance.

    :param content: content of the meta tag
    :param label: label of the reference, if any
    :return: the reference built by JatsBase.bake_ref
    """
    categories = content.split(";")

    if len(categories) == 1:
        return JatsBase.bake_ref(content, label=label)

    authors_strings = []
    fields_strings = []
    for category in categories:
        # Split on the first "=" only: the value itself may contain "="
        key, separator, citation_content = category.partition("=")
        if not separator:
            continue
        key = key.strip()
        if key == "citation_author":
            authors_strings.append(get_author_xml(template_str=citation_content))
        elif key in cls.references_mapping:
            fields_strings.append(cls.references_mapping[key](citation_content))

    # Authors are always emitted, wherever they appear in the content
    xml_string = ", ".join(authors_strings)
    xml_string += "".join(" " + field for field in fields_strings)

    return JatsBase.bake_ref(xml_string, label=label)
@classmethod
def get_or_create_source(cls):
    """Fetch the ``Source`` whose domain is ``cls.source_domain``, creating it if missing.

    A newly created source is initialised with the crawler's name, website
    and view id, and is saved once more before being returned.
    """
    creation_values = {
        "name": cls.source_name,
        "website": cls.source_website,
        "view_id": cls.get_view_id(),
    }
    source, is_new = Source.objects.get_or_create(
        domain=cls.source_domain,
        defaults=creation_values,
    )
    if not is_new:
        return source
    source.save()
    return source
@staticmethod
def get_issue_pid(
    collection_id: str,
    year: str,
    volume_number: str | None = None,
    issue_number: str | None = None,
    series: str | None = None,
):
    """Build an issue pid from its collection id, year and optional numbering.

    The parts are joined with underscores in the order collection id, year,
    series, volume number, issue number; parts that are ``None`` are left out.
    """
    segments = [f"{collection_id}_{year}"]
    segments.extend(
        f"{part}" for part in (series, volume_number, issue_number) if part is not None
    )
    raw_pid = "_".join(segments)
    # Any run of characters that are neither word characters nor "-" becomes one underscore.
    return regex.sub(r"[^\w-]+", "_", cleanup_str(raw_pid))
930 @staticmethod
931 def set_pages(article: ArticleData, pages: str, separator: str = "-"):
932 pages_split = pages.split(separator)
933 if len(pages_split) == 0: 933 ↛ 934line 933 didn't jump to line 934 because the condition on line 933 was never true
934 article.page_range = pages
935 if len(pages_split) > 0: 935 ↛ exitline 935 didn't return from function 'set_pages' because the condition on line 935 was always true
936 if pages[0].isnumeric(): 936 ↛ exitline 936 didn't return from function 'set_pages' because the condition on line 936 was always true
937 article.fpage = pages_split[0]
938 if ( 938 ↛ 943line 938 didn't jump to line 943 because the condition on line 938 was never true
939 len(pages_split) > 1
940 and pages_split[0] != pages_split[1]
941 and pages_split[1].isnumeric()
942 ):
943 article.lpage = pages_split[1]
@staticmethod
def _process_pdf_header(chunk: bytes, response: requests.Response | aiohttp.ClientResponse):
    """Decide whether a response is a PDF from its first bytes and its Content-Type.

    Args:
        chunk: first bytes of the response body (matched against a bytes pattern).
        response: the response the chunk was read from; returned untouched.

    Returns:
        A three-item list ``[is_pdf, response, details]`` where ``details``
        holds a ``"status"`` (an ``ExtlinkChecked.Status``) and a ``"message"``:

        * PDF header found and Content-Type advertises a PDF: OK, empty message;
        * PDF header found but Content-Type is missing or wrong: WARNING;
        * PDF header not found: ERROR.
    """
    content_type = response.headers.get("Content-Type")
    # Media types are case-insensitive, so compare on the lowercased header value.
    advertised_as_pdf = bool(content_type) and "application/pdf" in content_type.lower()
    has_pdf_header = regex.match(rb"^%PDF-\d\.\d", chunk) is not None

    if has_pdf_header and advertised_as_pdf:
        # The file is unmistakably a pdf
        status, message = ExtlinkChecked.Status.OK, ""
    elif has_pdf_header:
        # The file is a pdf, but the content type advertised by the server is wrong
        status = ExtlinkChecked.Status.WARNING
        message = f"Content-Type header: {content_type}"
    elif advertised_as_pdf:
        status = ExtlinkChecked.Status.ERROR
        message = f"PDF Header not found: got {chunk}"
    else:
        status = ExtlinkChecked.Status.ERROR
        message = f"Content-Type header: {content_type}; PDF Header not found: got {chunk}"

    return [has_pdf_header, response, {"status": status, "message": message}]
@classmethod
async def a_check_pdf_link_validity(
    cls, url: str, verify=True
) -> tuple[bool, aiohttp.ClientResponse, dict]:
    """Check asynchronously that ``url`` serves a PDF.

    Only the beginning of the body is requested (``Range`` header) and read.

    Args:
        url: link to check.
        verify: forwarded as the ``ssl`` argument of the request.

    Returns:
        Three items ``(is_pdf, response, details)``; ``details`` holds a
        ``"status"`` and a ``"message"``.
    """
    CHUNK_SIZE = 10  # Number of bytes to read from the body
    header = {
        "Range": f"bytes=0-{CHUNK_SIZE}",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
    }
    async with cls.async_session.get(
        url, headers=header, allow_redirects=True, ssl=verify
    ) as response:
        chunk = await response.content.read(CHUNK_SIZE)
        # An empty body yields an empty read, not StopIteration: report it
        # with the same message as the synchronous check.
        if not chunk:
            return [
                False,
                response,
                {
                    "status": ExtlinkChecked.Status.ERROR,
                    "message": "Error reading PDF header",
                },
            ]
        return BaseCollectionCrawler._process_pdf_header(chunk, response)
@classmethod
def check_pdf_link_validity(
    cls, url: str, verify=True
) -> tuple[bool, requests.Response | None, dict]:
    """Check that ``url`` serves a PDF by streaming the first bytes of the response.

    Returns three items ``(is_pdf, response, details)``; ``details`` holds a
    ``"status"`` and a ``"message"``.
    """
    CHUNK_SIZE = 10  # Number of bytes to fetch
    request_headers = {
        "Range": f"bytes=0-{CHUNK_SIZE}",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
    }
    with cls.session.get(
        url, headers=request_headers, allow_redirects=True, verify=verify, stream=True
    ) as response:
        # `None` means the body iterator was exhausted before yielding anything.
        first_chunk = next(response.iter_content(CHUNK_SIZE), None)
        if first_chunk is None:
            failure = {
                "status": ExtlinkChecked.Status.ERROR,
                "message": "Error reading PDF header",
            }
            return [False, response, failure]
        return BaseCollectionCrawler._process_pdf_header(first_chunk, response)
@classmethod
async def check_extlink_validity(cls, extlink: "ExtLink"):
    """Check a link and store the outcome in ``ExtlinkChecked``.

    Method used by rot_monitoring to check if links have expired.

    ``article-pdf`` links (except those hosted on gdz.sub.uni-goettingen.de)
    are checked with ``a_check_pdf_link_validity``; for every other link only
    the HTTP status is checked (200 and 206 are accepted).

    The outcome is always written to the database, including when the request
    fails. Errors that are not handled here are recorded with an ERROR status
    and then re-raised.
    """
    # `timezone.now()` follows the project's time zone settings, unlike `datetime.now()`.
    defaults: dict = {"date": timezone.now(), "status": ExtlinkChecked.Status.OK}
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0"
    }
    verify = bool(cls.verify)
    try:
        # For the GDZ links, we just check if the http response is 200 or 206
        if (
            extlink.rel == "article-pdf"
            and "gdz.sub.uni-goettingen.de" not in extlink.location
        ):
            isok, response, message = await cls.a_check_pdf_link_validity(
                extlink.location, verify
            )
            defaults.update(message)
            defaults["http_status"] = response.status
        else:
            async with cls.async_session.get(
                url=extlink.location,
                headers=header,
                allow_redirects=True,
                ssl=verify,
            ) as response:
                defaults["http_status"] = response.status
                if response.status not in (200, 206):
                    defaults["status"] = ExtlinkChecked.Status.ERROR

    except aiohttp.ClientSSLError:
        cls.logger.error("SSL error for the url: %s", extlink.location)
        defaults["status"] = ExtlinkChecked.Status.ERROR
        defaults["message"] = "SSL error"
    except aiohttp.ClientConnectionError:
        cls.logger.error("Connection error for the url: %s", extlink.location)
        defaults["status"] = ExtlinkChecked.Status.ERROR
        defaults["message"] = "Connection error"
    except TimeoutError:
        cls.logger.error("Timeout error for the url: %s", extlink.location)
        defaults["status"] = ExtlinkChecked.Status.ERROR
        defaults["message"] = "Timeout error"
    except Exception as exc:
        # Without this clause the `finally` below would record the link as OK
        # although the check failed. The error still propagates to the caller.
        defaults["status"] = ExtlinkChecked.Status.ERROR
        defaults["message"] = f"Unexpected error: {type(exc).__name__}"
        raise
    finally:
        try:
            await ExtlinkChecked.objects.aupdate_or_create(extlink=extlink, defaults=defaults)
            cls.logger.info(
                "DB Update, source: %s, url: %s", cls.source_domain, extlink.location
            )
        except IntegrityError:
            cls.logger.error(
                "Extlink was deleted, source: %s, url: %s", cls.source_domain, extlink.location
            )
def resolve_year_end(self, pid: str, default: int) -> int:
    """Return the last year allowed for ``pid``.

    When ``pid`` has an entry in ``self.pid_year_restrictions``, the result is
    the current year minus that entry; otherwise ``default`` is returned.
    """
    restrictions = self.pid_year_restrictions
    if pid not in restrictions:
        return default
    return datetime.now().year - restrictions[pid]