Coverage for src / crawler / abstract_crawlers / base_crawler.py: 65%
588 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-04-08 09:35 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-04-08 09:35 +0000
1import logging
2import time
3from collections.abc import Iterable
4from datetime import datetime, timedelta
5from email.policy import EmailPolicy
6from typing import TYPE_CHECKING, Literal
8import aiohttp
9import regex
10import requests
11from bs4 import BeautifulSoup
12from django.conf import settings
13from django.contrib.auth.models import User
14from django.db.utils import IntegrityError
15from django.utils import timezone
16from langcodes import standardize_tag
17from lingua import LanguageDetector, LanguageDetectorBuilder
18from opentelemetry import trace
19from ptf.cmds.xml.ckeditor.utils import (
20 build_jats_data_from_html_field,
21)
22from ptf.cmds.xml.jats.builder.references import (
23 get_article_title_xml,
24 get_author_xml,
25 get_fpage_xml,
26 get_lpage_xml,
27 get_source_xml,
28 get_year_xml,
29)
30from ptf.cmds.xml.jats.jats_parser import JatsBase
31from ptf.model_data import (
32 ArticleData,
33 ContributorDict,
34 IssueData,
35 ResourceData,
36 TitleDict,
37 create_abstract,
38 create_contributor,
39 create_extlink,
40 create_issuedata,
41 create_publisherdata,
42 create_subj,
43 create_titledata,
44)
45from ptf.model_data_converter import update_data_for_jats
46from ptf.models import ExtLink
47from pylatexenc.latex2text import LatexNodes2Text
48from pysolr import SolrError
49from requests.adapters import HTTPAdapter
50from requests_cache import CachedSession
51from urllib3 import Retry
53from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd
54from crawler.models import Source
55from crawler.models.extlink_checked import ExtlinkChecked
56from crawler.types import CitationLiteral
57from crawler.utils import (
58 add_pdf_link_to_xarticle,
59 cleanup_str,
60 get_all_cols,
61 get_or_create_collection,
62 get_session,
63)
65if TYPE_CHECKING:
66 from bs4 import Tag
class CrawlerTitleDict(TitleDict):
    """TitleDict extended with the raw TeX form of the title."""

    # Raw TeX source of the title; None when no TeX variant exists.
    title_tex: str | None
class BaseCollectionCrawler:
    """
    Base collection for the crawlers.
    To create a crawler:
    1) derive a class from BaseCollectionCrawler and name it XXXCrawler
    2) override the functions parse_collection_content, parse_issue_content and parse_article_content
    3) update factory.py so that crawler_factory can return your new crawler
    """

    logger = logging.getLogger(__name__)
    tracer = trace.get_tracer(__name__)

    # Identification of the crawled source; filled in by each concrete crawler.
    source_name = ""
    source_domain = ""
    source_website = ""

    issue_href = ""

    # Filled by initialize(); None until then so tests can skip model access.
    collection = None
    source = None
    user = None
    session: requests.Session | CachedSession
    async_session: aiohttp.ClientSession
    is_checkable = True
    # Passed to the shared requests session (TLS certificate verification).
    verify = True
    headers = {
        "accept_encoding": "utf-8",
        "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
        "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
    }

    # seconds to wait between two http requests
    requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
    # seconds to wait before aborting the connection (if no bytes are received)
    requests_timeout = 60

    # Shared LaTeX-to-text converter (attribute name typo kept for compatibility).
    latext_parser = LatexNodes2Text()

    # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
    # do not use the "$" to surround tex formulas
    delimiter_inline_formula = "$"
    delimiter_disp_formula = "$"

    # HACK : Workaround for tests (monkeypatching)
    # We store the class here, so we can monkeypatch it when running tests
    # subCrawlers = {
    #     LofplCrawler: None
    # }
    subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}

    # Lazily built by the language_detector property to save memory.
    _language_detector: LanguageDetector | None = None
    _language_detector_builder = LanguageDetectorBuilder.from_all_languages()

    force_refresh = False

    # Whether to include headers in the requests cache key
    match_headers = False

    orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"

    # Set this to False on a Crawler-basis to allow inserting articles without PDFs
    ignore_missing_pdf = True
    @classmethod
    def get_view_id(cls):
        """Identifier recorded as the Source's view_id (defaults to the source domain)."""
        return cls.source_domain
139 @property
140 def language_detector(self):
141 """Crawler Instance singleton for language builder.
142 Late init of LanguageDetector to save on memory"""
143 if not self._language_detector:
144 self._language_detector = self._language_detector_builder.build()
145 return self._language_detector
    def __init__(
        self,
        *args,
        username: str,
        collection_id: str,
        dry: bool = False,
        publisher: str = "",
        force_refresh=False,
        collection_url: str | None = None,
    ):
        """Set up the crawler for one collection.

        :param username: name of the Django user the crawl runs as
        :param collection_id: pid of the collection to crawl
        :param dry: when True, crawl_issue parses but does not write to the database
        :param publisher: publisher name recorded on inserted issues
        :param force_refresh: stored on the instance (cache bypass flag)
        :param collection_url: entry page of the collection on this source;
            looked up in get_all_cols() when omitted
        :raises ValueError: when the source has no URL for this collection
        """
        if not collection_url:
            all_cols = get_all_cols()
            col = all_cols[collection_id]
            collection_url = col["sources"].get(self.source_domain, None)
            if collection_url is None:
                raise ValueError(
                    f"Source {self.source_domain} not found for collection {collection_id}"
                )
        self.collection_url = collection_url
        # Instantiate any declared sub-crawlers with the same configuration.
        for CrawlerClass in self.subCrawlers:
            self.subCrawlers[CrawlerClass] = CrawlerClass(
                *args,
                username=username,
                collection_id=collection_id,
                dry=dry,
                publisher=publisher,
                collection_url=collection_url,
            )
        # Per-source child logger so log records carry the source domain.
        self.logger = logging.getLogger(__name__ + "." + self.source_domain)

        self.username = username
        self.collection_id = collection_id
        self.dry = dry
        self.publisher = publisher

        # Classproperty : We sometimes want to use the session without initializing the class (rot monitoring)
        BaseCollectionCrawler.session = requests.Session()

        # Skipped when running tests
        self.initialize()

        self.force_refresh = force_refresh

    # We implemented custom retry behaviour, so we don't want to make extra requests here
    def initialize(self):
        """
        Acts as a "second" init function to skip model accesses during test data generation

        Resolves the Collection, Source and User rows and installs the shared
        HTTP session on the class (so it is reachable without an instance).
        """
        self.collection = get_or_create_collection(self.collection_id)
        self.source = self.get_or_create_source()
        self.user = User.objects.get(username=self.username)
        BaseCollectionCrawler.session = get_session()
        BaseCollectionCrawler.session.verify = self.verify
        self.session.delay = self.requests_interval
        # total=0 disables urllib3's own retries: download_file implements its
        # own retry loop and we don't want extra requests underneath it.
        retries = Retry(
            total=0,
        )
        self.session.mount("https://", HTTPAdapter(max_retries=retries))
        self.session.mount("http://", HTTPAdapter(max_retries=retries))
    @classmethod
    def can_crawl(cls, pid: str) -> bool:
        """Return True when this crawler accepts the given pid. Override to filter."""
        return True
    def parse_collection_content(self, content: str) -> list[IssueData]:
        """
        Parse the HTML content (with BeautifulSoup) of the collection page and
        return the list of xissues found there.
        Override this function in a derived class.
        """
        return []
    def parse_issue_content(self, content: str, xissue: IssueData):
        """
        Parse the HTML content (with BeautifulSoup) of an issue page and
        fill xissue.articles in place.
        Override this function in a derived class.

        CAVEAT : You are supposed to create articles there. Please assign a PID to each article.
        The PID can be `a + article_index`, like this : `a0` `a21`
        """
    def parse_article_content(
        self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
    ) -> ArticleData | None:
        """
        Parse the HTML content (with BeautifulSoup) of an article page and
        return the completed xarticle (or None to drop the article).
        Override this function in a derived class.
        The xissue is passed to the function in case the article page has issue information (ex: publisher)
        The article url is also passed as a parameter

        CAVEAT : You are supposed to assign the article pid again here
        """
        return xarticle
248 @tracer.start_as_current_span("crawl_collection")
249 def crawl_collection(self):
250 # TODO: Comments, filter
251 """
252 Crawl an entire collection. ptf.models.Container objects are created.
253 - get the HTML content of the collection_url
254 - parse the HTML content with beautifulsoup to extract the list of issues
255 - merge the xissues (some Source can have multiple pages for 1 volume/issue. We create only 1 container)
256 - crawl each issue if col_only is False
257 - Returns the list of merged issues.
258 It is an OrderedDict {pid: {"issues": xissues}}
259 The key is the pid of the merged issues.
260 Ex: The source may have Ex: Volume 6 (2000) and Volume 6 (1999)
261 the pid is then made with 1999-2000__6_
262 """
264 if self.source is None:
265 raise RuntimeError("ERROR: the source is not set")
267 content = self.download_file(self.collection_url)
268 if content:
269 xissues = self.parse_collection_content(content)
270 else:
271 # download_file returns None (404)
272 return None
274 """
275 Some collections split the same volumes in different pages
276 Ex: Volume 6 (2000) and Volume 6 (1999)
277 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
278 """
279 # merged_xissues = self.merge_xissues(xissues)
281 xissues_dict = {str(i.pid): i for i in xissues}
283 return xissues_dict
    @tracer.start_as_current_span("crawl_issue")
    def crawl_issue(self, xissue: IssueData):
        """
        Crawl 1 web page of an issue.
        - get the HTML content of the issue
        - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata
        - crawl each article
        - unless self.dry, insert the completed issue into the database
        """
        # Some source, like EuDML do not have a separate HTML pages for an issue's table of content.
        # The list of articles directly come from the collection HTML page: the xissue has no url attribute
        issue_url = xissue.url
        if issue_url is not None:
            if issue_url.endswith(".pdf"):
                # The "issue page" is itself a PDF: record it as the issue's PDF link.
                add_pdf_link_to_xarticle(xissue, issue_url)
                xissue.url = None
            else:
                content = self.download_file(issue_url)
                with self.tracer.start_as_current_span("parse_issue_content"):
                    self.parse_issue_content(content, xissue)

        xarticles = xissue.articles

        parsed_xarticles = []

        # Crawl every article; crawl_article may return None to drop one.
        for xarticle in xarticles:
            parsed_xarticle = self.crawl_article(xarticle, xissue)
            if parsed_xarticle is not None:
                parsed_xarticles.append(parsed_xarticle)

        xissue.articles = parsed_xarticles

        # Checked before filtering: an issue-level PDF also counts as full text.
        issue_has_pdf = self.article_has_pdf(xissue)

        if self.ignore_missing_pdf:
            # Drop articles without a full-text link ("article-pdf"/"article-html").
            xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)]

        if self.dry:
            return
        # Nothing with full text at all: skip the database insert.
        if len(xissue.articles) == 0 and not issue_has_pdf:
            return
        self.process_resource_metadata(xissue, resource_type="issue")
        self.add_xissue_into_database(xissue)
328 @staticmethod
329 def article_has_source(art: ArticleData | IssueData):
330 return (
331 next(
332 (e_link for e_link in art.ext_links if e_link["rel"] == "source"),
333 None,
334 )
335 is not None
336 )
338 @staticmethod
339 def article_has_pdf(art: ArticleData | IssueData):
340 return (
341 next(
342 (link for link in art.ext_links if link["rel"] in ["article-pdf", "article-html"]),
343 None,
344 )
345 is not None
346 )
    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
        """Crawl one article page and return the completed ArticleData (or None).

        Articles without a url are finalized directly; others are downloaded,
        parsed, and retried once (bypassing the cache) on a ValueError.
        """
        # ARTICLE URL as an ExtLink (to display the link in the article page)
        if xarticle.url is None:
            if not self.article_has_source(xarticle):
                # No article page of its own: point the "source" link at the
                # issue page, or failing that at the collection page.
                if xissue.url:
                    article_source = xissue.url
                else:
                    article_source = self.collection_url
                ext_link = create_extlink()
                ext_link["rel"] = "source"
                ext_link["location"] = article_source
                ext_link["metadata"] = self.source_domain
                xarticle.ext_links.append(ext_link)
            return self.process_article_metadata(xarticle)

        content = self.download_file(xarticle.url)
        # Qualify the article pid with its issue pid.
        xarticle.pid = f"{xissue.pid}_{xarticle.pid}"

        try:
            with self.tracer.start_as_current_span("parse_article_content"):
                parsed_xarticle = self.parse_article_content(
                    content, xissue, xarticle, xarticle.url
                )
        except ValueError as e:
            # Parsers raise ValueError on malformed pages; re-download once
            # with the cache invalidated in case the cached copy was bad.
            self.logger.warning(e)
            self.logger.warning("Retrying in 5 mins while invalidating cache")
            time.sleep(5 * 60)
            content = self.download_file(xarticle.url, force_refresh=True)
            with self.tracer.start_as_current_span("parse_article_content"):
                parsed_xarticle = self.parse_article_content(
                    content, xissue, xarticle, xarticle.url
                )

        if parsed_xarticle is None:
            return None

        # Prefer a DOI-derived pid when a DOI was found.
        if parsed_xarticle.doi:
            parsed_xarticle.pid = (
                parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
            )

        if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
            ext_link = create_extlink()
            ext_link["rel"] = "source"
            ext_link["location"] = parsed_xarticle.url
            ext_link["metadata"] = self.source_domain
            parsed_xarticle.ext_links.append(ext_link)

        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle)
399 def process_resource_metadata(self, xresource: ResourceData, resource_type="article"):
400 tag = "article-title" if resource_type == "article" else "issue-title"
402 # Process title tex
403 ckeditor_data = build_jats_data_from_html_field(
404 xresource.title_tex,
405 tag=tag,
406 text_lang=xresource.lang,
407 delimiter_inline=self.delimiter_inline_formula,
408 delimiter_disp=self.delimiter_disp_formula,
409 )
411 xresource.title_html = ckeditor_data["value_html"]
412 # xresource.title_tex = ckeditor_data["value_tex"]
413 xresource.title_xml = ckeditor_data["value_xml"]
415 # Process trans_title tex
416 if xresource.trans_title_tex: 416 ↛ 417line 416 didn't jump to line 417 because the condition on line 416 was never true
417 self.logger.warning(
418 "Deprecation Notice : prefer using xresource.title directly instead of xresource.trans_title_tex"
419 )
420 trans_title = self.create_trans_title(
421 xresource_lang=xresource.lang,
422 resource_type=resource_type,
423 title_tex=xresource.trans_title_tex,
424 lang=xresource.trans_lang,
425 )
426 xresource.titles.append(trans_title)
428 abstracts_to_parse = [
429 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract"
430 ]
431 # abstract may have formulas surrounded with '$'
432 if len(abstracts_to_parse) > 0:
433 for xabstract in abstracts_to_parse:
434 ckeditor_data = build_jats_data_from_html_field(
435 xabstract["value_tex"],
436 tag="abstract",
437 text_lang=xabstract["lang"],
438 resource_lang=xresource.lang,
439 field_type="abstract",
440 delimiter_inline=self.delimiter_inline_formula,
441 delimiter_disp=self.delimiter_disp_formula,
442 )
444 xabstract["value_html"] = ckeditor_data["value_html"]
445 # xabstract["value_tex"] = ckeditor_data["value_tex"]
446 xabstract["value_xml"] = ckeditor_data["value_xml"]
448 return xresource
450 def process_article_metadata(self, xarticle: ArticleData):
451 self.process_resource_metadata(xarticle)
452 for bibitem in xarticle.bibitems:
453 bibitem.type = "unknown"
454 update_data_for_jats(xarticle, with_label=False)
456 return xarticle
458 def download_file(self, url: str, force_refresh=False, headers={}):
459 """
460 Downloads a page and returns its content (decoded string).
461 This function handles retries and decoding
462 """
463 current_exception: Exception | None = None
464 for attempt in range(3):
465 try:
466 kwargs = {
467 "url": url,
468 "headers": {**self.headers, **headers},
469 "timeout": self.requests_timeout,
470 }
471 if attempt > 0 and isinstance(self.session, CachedSession):
472 kwargs["force_refresh"] = True
473 response = self.session.get(**kwargs)
475 content = self.decode_response(response)
476 if content == "" or not content:
477 raise requests.exceptions.HTTPError(response)
479 return content
480 except (
481 requests.ConnectionError,
482 requests.ConnectTimeout,
483 requests.exceptions.HTTPError,
484 ) as e:
485 current_exception = e
486 self.logger.debug(f"Caught error : {e}", extra={"url": url})
487 # 15 mins, 30 mins, 45 mins
488 delay_minutes = attempt * 15
489 self.logger.debug(
490 f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})",
491 extra={"url": url},
492 )
493 time.sleep(delay_minutes * 60)
495 raise current_exception
    def decode_response(self, response: requests.Response, encoding: str | None = None):
        """Decode a response body, guessing the charset when needed.

        Tries, in order: a caller-forced encoding, the HTTP Content-Type
        charset, an HTML `<meta charset>` tag, and an HTML meta Content-Type
        tag, falling back to requests' default decoding.

        Override this if the content-type headers from the sources are advertising something else than the actual content
        SASA needs this
        """
        # Encoding forced by the caller
        if encoding:
            response.encoding = encoding
            return response.text

        # Attempt to get encoding using HTTP headers
        content_type_tag = response.headers.get("Content-Type", None)

        if content_type_tag:
            charset = self.parse_content_type_charset(content_type_tag)
            if charset:
                response.encoding = charset
                return response.text

        # Attempt to get encoding using HTML meta charset tag
        soup = BeautifulSoup(response.text, "html5lib")
        charset = soup.select_one("meta[charset]")
        if charset:
            htmlencoding = charset.get("charset")
            if isinstance(htmlencoding, str):
                response.encoding = htmlencoding
                return response.text

        # Attempt to get encoding using HTML meta content type tag
        content_type_tag = soup.select_one(
            'meta[http-equiv="Content-Type"],meta[http-equiv="content-type"]'
        )
        if content_type_tag:
            content_type = content_type_tag.get("content")
            if isinstance(content_type, str):
                charset = self.parse_content_type_charset(content_type)
                if charset:
                    response.encoding = charset
                    return response.text

        return response.text
537 @staticmethod
538 def parse_content_type_charset(content_type: str):
539 header = EmailPolicy.header_factory("content-type", content_type)
540 if "charset" in header.params:
541 return header.params.get("charset")
    @tracer.start_as_current_span("add_xissue_to_database")
    def add_xissue_into_database(self, xissue: IssueData) -> IssueData:
        """Insert (or update) the issue in the database and Solr index.

        Retries up to 3 times on SolrError (10 s apart); any other exception
        is logged and re-raised.

        :raises ValueError: when the issue has no year
        :raises ConnectionRefusedError: when all Solr attempts failed
        """
        xissue.journal = self.collection
        xissue.source = self.source_domain

        if xissue.year == "":
            raise ValueError("Failsafe : Cannot insert issue without a year")

        xpub = create_publisherdata()
        xpub.name = self.publisher
        xissue.publisher = xpub
        xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()

        attempt = 1
        success = False

        while not success and attempt < 4:
            try:
                params = {"xissue": xissue, "use_body": False}
                cmd = addOrUpdateGDMLIssueXmlCmd(params)
                cmd.do()
                success = True
                self.logger.debug(f"Issue {xissue.pid} inserted in database")
                return xissue
            except SolrError:
                # Transient Solr failure: wait and retry.
                self.logger.warning(
                    f"Encoutered SolrError while inserting issue {xissue.pid} in database"
                )
                attempt += 1
                self.logger.debug(f"Attempt {attempt}. sleeping 10 seconds.")
                time.sleep(10)
            except Exception as e:
                # Anything else is fatal: log and propagate.
                self.logger.error(
                    f"Got exception while attempting to insert {xissue.pid} in database : {e}"
                )
                raise e

        if success is False:
            raise ConnectionRefusedError("Cannot connect to SolR")

        assert False, "Unreachable"
585 def get_metadata_using_citation_meta(
586 self,
587 xarticle: ArticleData,
588 xissue: IssueData,
589 soup: BeautifulSoup,
590 what: list[CitationLiteral] = [],
591 ):
592 """
593 :param xarticle: the xarticle that will collect the metadata
594 :param xissue: the xissue that will collect the publisher
595 :param soup: the BeautifulSoup object of tha article page
596 :param what: list of citation_ items to collect.
597 :return: None. The given article is modified
598 """
600 if "title" in what:
601 # TITLE
602 citation_title_node = soup.select_one("meta[name='citation_title']")
603 if citation_title_node: 603 ↛ 608line 603 didn't jump to line 608 because the condition on line 603 was always true
604 title = citation_title_node.get("content")
605 if isinstance(title, str): 605 ↛ 608line 605 didn't jump to line 608 because the condition on line 605 was always true
606 xarticle.title_tex = title
608 if "author" in what: 608 ↛ 637line 608 didn't jump to line 637 because the condition on line 608 was always true
609 # AUTHORS
610 citation_author_nodes = soup.select("meta[name^='citation_author']")
611 current_author: ContributorDict | None = None
612 for citation_author_node in citation_author_nodes:
613 if citation_author_node.get("name") == "citation_author":
614 text_author = citation_author_node.get("content")
615 if not isinstance(text_author, str): 615 ↛ 616line 615 didn't jump to line 616 because the condition on line 615 was never true
616 raise ValueError("Cannot parse author")
617 if text_author == "": 617 ↛ 618line 617 didn't jump to line 618 because the condition on line 617 was never true
618 current_author = None
619 continue
620 current_author = create_contributor(role="author", string_name=text_author)
621 xarticle.contributors.append(current_author)
622 continue
623 if current_author is None: 623 ↛ 624line 623 didn't jump to line 624 because the condition on line 623 was never true
624 self.logger.warning("Couldn't parse citation author")
625 continue
626 if citation_author_node.get("name") == "citation_author_institution":
627 text_institution = citation_author_node.get("content")
628 if not isinstance(text_institution, str): 628 ↛ 629line 628 didn't jump to line 629 because the condition on line 628 was never true
629 continue
630 current_author["addresses"].append(text_institution)
631 if citation_author_node.get("name") == "citation_author_ocrid": 631 ↛ 632line 631 didn't jump to line 632 because the condition on line 631 was never true
632 text_orcid = citation_author_node.get("content")
633 if not isinstance(text_orcid, str):
634 continue
635 current_author["orcid"] = text_orcid
637 if "pdf" in what:
638 # PDF
639 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
640 if citation_pdf_node:
641 pdf_url = citation_pdf_node.get("content")
642 if isinstance(pdf_url, str): 642 ↛ 645line 642 didn't jump to line 645 because the condition on line 642 was always true
643 add_pdf_link_to_xarticle(xarticle, pdf_url)
645 if "lang" in what:
646 # LANG
647 citation_lang_node = soup.select_one("meta[name='citation_language']")
648 if citation_lang_node: 648 ↛ 654line 648 didn't jump to line 654 because the condition on line 648 was always true
649 # TODO: check other language code
650 content_text = citation_lang_node.get("content")
651 if isinstance(content_text, str): 651 ↛ 654line 651 didn't jump to line 654 because the condition on line 651 was always true
652 xarticle.lang = standardize_tag(content_text)
654 if "abstract" in what:
655 # ABSTRACT
656 abstract_node = soup.select_one("meta[name='citation_abstract']")
657 if abstract_node is not None:
658 abstract = abstract_node.get("content")
659 if not isinstance(abstract, str): 659 ↛ 660line 659 didn't jump to line 660 because the condition on line 659 was never true
660 raise ValueError("Couldn't parse abstract from meta")
661 abstract = BeautifulSoup(abstract, "html.parser").text
662 lang = abstract_node.get("lang")
663 if not isinstance(lang, str):
664 lang = self.detect_language(abstract, xarticle)
665 xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))
667 if "page" in what:
668 # PAGES
669 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
670 if citation_fpage_node:
671 page = citation_fpage_node.get("content")
672 if isinstance(page, str): 672 ↛ 677line 672 didn't jump to line 677 because the condition on line 672 was always true
673 page = page.split("(")[0]
674 if len(page) < 32: 674 ↛ 677line 674 didn't jump to line 677 because the condition on line 674 was always true
675 xarticle.fpage = page
677 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
678 if citation_lpage_node:
679 page = citation_lpage_node.get("content")
680 if isinstance(page, str): 680 ↛ 685line 680 didn't jump to line 685 because the condition on line 680 was always true
681 page = page.split("(")[0]
682 if len(page) < 32: 682 ↛ 685line 682 didn't jump to line 685 because the condition on line 682 was always true
683 xarticle.lpage = page
685 if "doi" in what:
686 # DOI
687 citation_doi_node = soup.select_one("meta[name='citation_doi']")
688 if citation_doi_node:
689 doi = citation_doi_node.get("content")
690 if isinstance(doi, str): 690 ↛ 697line 690 didn't jump to line 697 because the condition on line 690 was always true
691 doi = doi.strip()
692 pos = doi.find("10.")
693 if pos > 0:
694 doi = doi[pos:]
695 xarticle.doi = doi
697 if "mr" in what:
698 # MR
699 citation_mr_node = soup.select_one("meta[name='citation_mr']")
700 if citation_mr_node:
701 mr = citation_mr_node.get("content")
702 if isinstance(mr, str): 702 ↛ 708line 702 didn't jump to line 708 because the condition on line 702 was always true
703 mr = mr.strip()
704 if mr.find("MR") == 0: 704 ↛ 708line 704 didn't jump to line 708 because the condition on line 704 was always true
705 mr = mr[2:]
706 xarticle.extids.append(("mr-item-id", mr))
708 if "zbl" in what:
709 # ZBL
710 citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
711 if citation_zbl_node:
712 zbl = citation_zbl_node.get("content")
713 if isinstance(zbl, str): 713 ↛ 719line 713 didn't jump to line 719 because the condition on line 713 was always true
714 zbl = zbl.strip()
715 if zbl.find("Zbl") == 0: 715 ↛ 719line 715 didn't jump to line 719 because the condition on line 715 was always true
716 zbl = zbl[3:].strip()
717 xarticle.extids.append(("zbl-item-id", zbl))
719 if "publisher" in what:
720 # PUBLISHER
721 citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
722 if citation_publisher_node:
723 pub = citation_publisher_node.get("content")
724 if isinstance(pub, str): 724 ↛ 731line 724 didn't jump to line 731 because the condition on line 724 was always true
725 pub = pub.strip()
726 if pub != "": 726 ↛ 731line 726 didn't jump to line 731 because the condition on line 726 was always true
727 xpub = create_publisherdata()
728 xpub.name = pub
729 xissue.publisher = xpub
731 if "keywords" in what:
732 # KEYWORDS
733 citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
734 for kwd_node in citation_kwd_nodes:
735 kwds = kwd_node.get("content")
736 if isinstance(kwds, str): 736 ↛ 734line 736 didn't jump to line 734 because the condition on line 736 was always true
737 kwds = kwds.split(",")
738 for kwd in kwds:
739 if kwd == "":
740 continue
741 kwd = kwd.strip()
742 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})
744 if "references" in what:
745 citation_references = soup.select("meta[name='citation_reference']")
746 for index, tag in enumerate(citation_references):
747 content = tag.get("content")
748 if not isinstance(content, str): 748 ↛ 749line 748 didn't jump to line 749 because the condition on line 748 was never true
749 raise ValueError("Cannot parse citation_reference meta")
750 label = str(index + 1)
751 if regex.match(r"^\[\d+\].*", content): 751 ↛ 752line 751 didn't jump to line 752 because the condition on line 751 was never true
752 label = None
753 xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label))
755 def get_metadata_using_dcterms(
756 self,
757 xarticle: ArticleData,
758 soup: "Tag",
759 what: "Iterable[Literal['abstract', 'keywords', 'date_published', 'article_type']]",
760 ):
761 if "abstract" in what: 761 ↛ 769line 761 didn't jump to line 769 because the condition on line 761 was always true
762 abstract_tag = soup.select_one("meta[name='DCTERMS.abstract']")
763 if abstract_tag: 763 ↛ 769line 763 didn't jump to line 769 because the condition on line 763 was always true
764 abstract_text = self.get_str_attr(abstract_tag, "content")
765 xarticle.abstracts.append(
766 create_abstract(lang="en", value_tex=cleanup_str(abstract_text))
767 )
769 if "keywords" in what: 769 ↛ 778line 769 didn't jump to line 778 because the condition on line 769 was always true
770 keyword_tags = soup.select("meta[name='DC.subject']")
771 for tag in keyword_tags:
772 kwd_text = tag.get("content")
773 if not isinstance(kwd_text, str) or len(kwd_text) == 0: 773 ↛ 774line 773 didn't jump to line 774 because the condition on line 773 was never true
774 continue
775 kwd = create_subj(value=kwd_text)
776 xarticle.kwds.append(kwd)
778 if "date_published" in what: 778 ↛ 779line 778 didn't jump to line 779 because the condition on line 778 was never true
779 published_tag = soup.select_one("meta[name='DC.Date.created']")
780 if published_tag:
781 published_text = self.get_str_attr(published_tag, "content")
782 xarticle.date_published = published_text
784 if "article_type" in what: 784 ↛ 785line 784 didn't jump to line 785 because the condition on line 784 was never true
785 type_tag = soup.select_one("meta[name='DC.Type.articleType']")
786 if type_tag:
787 type_text = self.get_str_attr(type_tag, "content")
788 xarticle.atype = type_text
790 def create_xissue(
791 self,
792 url: str | None,
793 year: str,
794 volume_number: str | None,
795 issue_number: str | None = None,
796 vseries: str | None = None,
797 ):
798 if url is not None and url.endswith("/"):
799 url = url[:-1]
800 xissue = create_issuedata()
801 xissue.url = url
803 xissue.pid = self.get_issue_pid(
804 self.collection_id, year, volume_number, issue_number, vseries
805 )
807 xissue.year = year
809 if volume_number is not None:
810 xissue.volume = regex.sub(r"[^\w-]+", "_", volume_number)
812 if issue_number is not None:
813 xissue.number = issue_number.replace(",", "-")
815 if vseries is not None: 815 ↛ 816line 815 didn't jump to line 816 because the condition on line 815 was never true
816 xissue.vseries = vseries
817 return xissue
819 def detect_language(self, text: str, article: ArticleData | None = None):
820 if article and article.lang is not None and article.lang != "und":
821 return article.lang
823 language = self.language_detector.detect_language_of(text)
825 if not language: 825 ↛ 826line 825 didn't jump to line 826 because the condition on line 825 was never true
826 return "und"
827 return language.iso_code_639_1.name.lower()
829 def get_str_attr(self, tag: "Tag", attr: str):
830 """Equivalent of `tag.get(attr)`, but ensures the return value is a string"""
831 node_attr = tag.get(attr)
832 if isinstance(node_attr, list): 832 ↛ 833line 832 didn't jump to line 833 because the condition on line 832 was never true
833 raise ValueError(
834 f"[{self.source_domain}] {self.collection_id} : html tag has multiple {attr} attributes."
835 )
836 if node_attr is None: 836 ↛ 837line 836 didn't jump to line 837 because the condition on line 836 was never true
837 raise ValueError(
838 f"[{self.source_domain}] {self.collection_id} : html tag doesn't have any {attr} attributes"
839 )
840 return node_attr
842 def create_trans_title(
843 self,
844 resource_type: str,
845 title_str: str,
846 lang: str,
847 xresource_lang: str,
848 title_type: str = "main",
849 ):
850 tag = "trans-title" if resource_type == "article" else "issue-title"
852 ckeditor_data = build_jats_data_from_html_field(
853 title_str,
854 tag=tag,
855 text_lang=lang,
856 resource_lang=xresource_lang,
857 delimiter_inline=self.delimiter_inline_formula,
858 delimiter_disp=self.delimiter_disp_formula,
859 )
861 titledata = create_titledata(
862 lang=lang,
863 type="main",
864 title_html=ckeditor_data["value_html"],
865 title_xml=ckeditor_data["value_xml"],
866 )
868 return titledata
    # Maps Highwire Press `citation_*` meta keys to the JATS builder that
    # renders the matching XML fragment; consumed by
    # __parse_meta_citation_reference when assembling a reference.
    references_mapping = {
        "citation_title": get_article_title_xml,
        "citation_journal_title": get_source_xml,
        "citation_publication_date": get_year_xml,
        "citation_firstpage": get_fpage_xml,
        "citation_lastpage": get_lpage_xml,
    }
878 @classmethod
879 def __parse_meta_citation_reference(cls, content: str, label=None):
880 categories = content.split(";")
882 if len(categories) == 1:
883 return JatsBase.bake_ref(content, label=label)
885 citation_data = [c.split("=") for c in categories if "=" in c]
886 del categories
888 xml_string = ""
889 authors_parsed = False
890 authors_strings = []
891 for data in citation_data:
892 key = data[0].strip()
893 citation_content = data[1]
894 if key == "citation_author":
895 authors_strings.append(get_author_xml(template_str=citation_content))
896 continue
897 elif not authors_parsed:
898 xml_string += ", ".join(authors_strings)
899 authors_parsed = True
901 if key in cls.references_mapping:
902 xml_string += " " + cls.references_mapping[key](citation_content)
904 return JatsBase.bake_ref(xml_string, label=label)
906 @classmethod
907 def get_or_create_source(cls):
908 source, created = Source.objects.get_or_create(
909 domain=cls.source_domain,
910 defaults={
911 "name": cls.source_name,
912 "website": cls.source_website,
913 "view_id": cls.get_view_id(),
914 },
915 )
916 if created: 916 ↛ 917line 916 didn't jump to line 917 because the condition on line 916 was never true
917 source.save()
918 return source
920 @staticmethod
921 def get_issue_pid(
922 collection_id: str,
923 year: str,
924 volume_number: str | None = None,
925 issue_number: str | None = None,
926 series: str | None = None,
927 ):
928 # Replace any non-word character with an underscore
929 pid = f"{collection_id}_{year}"
930 if series is not None: 930 ↛ 931line 930 didn't jump to line 931 because the condition on line 930 was never true
931 pid += f"_{series}"
932 if volume_number is not None:
933 pid += f"_{volume_number}"
934 if issue_number is not None:
935 pid += f"_{issue_number}"
936 pid = regex.sub(r"[^\w-]+", "_", cleanup_str(pid))
937 return pid
939 @staticmethod
940 def set_pages(article: ArticleData, pages: str, separator: str = "-"):
941 pages_split = pages.split(separator)
942 if len(pages_split) == 0: 942 ↛ 943line 942 didn't jump to line 943 because the condition on line 942 was never true
943 article.page_range = pages
944 if len(pages_split) > 0: 944 ↛ exitline 944 didn't return from function 'set_pages' because the condition on line 944 was always true
945 if pages[0].isnumeric(): 945 ↛ exitline 945 didn't return from function 'set_pages' because the condition on line 945 was always true
946 article.fpage = pages_split[0]
947 if ( 947 ↛ 952line 947 didn't jump to line 952 because the condition on line 947 was never true
948 len(pages_split) > 1
949 and pages_split[0] != pages_split[1]
950 and pages_split[1].isnumeric()
951 ):
952 article.lpage = pages_split[1]
954 @staticmethod
955 def _process_pdf_header(chunk: str, response: requests.Response | aiohttp.ClientResponse):
956 content_type = response.headers.get("Content-Type")
957 if regex.match(rb"^%PDF-\d\.\d", chunk):
958 if content_type and "application/pdf" in content_type:
959 # The file is unmistakably a pdf
960 return [
961 True,
962 response,
963 {
964 "status": ExtlinkChecked.Status.OK,
965 "message": "",
966 },
967 ]
968 # The file is a pdf, but the content type advertised by the server is wrong
969 return [
970 True,
971 response,
972 {
973 "status": ExtlinkChecked.Status.WARNING,
974 "message": f"Content-Type header: {content_type}",
975 },
976 ]
978 # Reaching here means we couldn't find the pdf.
979 if not content_type or "application/pdf" not in content_type:
980 return [
981 False,
982 response,
983 {
984 "status": ExtlinkChecked.Status.ERROR,
985 "message": f"Content-Type header: {content_type}; PDF Header not found: got {chunk}",
986 },
987 ]
989 return [
990 False,
991 response,
992 {
993 "status": ExtlinkChecked.Status.ERROR,
994 "message": f"PDF Header not found: got {chunk}",
995 },
996 ]
    @classmethod
    async def a_check_pdf_link_validity(
        cls, url: str, verify=True
    ) -> tuple[bool, aiohttp.ClientResponse, dict]:
        """
        Check the validity of the PDF links.

        Async twin of `check_pdf_link_validity`: requests only the first few
        bytes of `url` (via a Range header) and delegates the `%PDF` magic /
        Content-Type check to `_process_pdf_header`.

        Returns (is_pdf, response, extlink-check defaults).
        """
        CHUNK_SIZE = 10  # Number of bytes to fetch
        header = {
            "Range": f"bytes=0-{CHUNK_SIZE}",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
        }
        async with cls.async_session.get(
            url, headers=header, allow_redirects=True, ssl=verify
        ) as response:
            try:
                chunk = await response.content.read(CHUNK_SIZE)
                return BaseCollectionCrawler._process_pdf_header(chunk, response)
            except StopIteration:
                # NOTE(review): aiohttp's StreamReader.read returns b"" at
                # EOF rather than raising StopIteration, so this handler
                # looks unreachable — likely copied from the sync variant
                # (where next() does raise it); confirm intended exception.
                return [
                    False,
                    response,
                    {
                        "status": ExtlinkChecked.Status.ERROR,
                        "message": "Error reading PDF header",
                    },
                ]
1026 @classmethod
1027 def check_pdf_link_validity(
1028 cls, url: str, verify=True
1029 ) -> tuple[bool, requests.Response | None, dict]:
1030 """
1031 Check the validity of the PDF links.
1032 """
1033 CHUNK_SIZE = 10 # Nombre de caractères à récupérer
1034 header = {
1035 "Range": f"bytes=0-{CHUNK_SIZE}",
1036 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
1037 }
1038 with cls.session.get(
1039 url, headers=header, allow_redirects=True, verify=verify, stream=True
1040 ) as response:
1041 try:
1042 chunk = next(response.iter_content(CHUNK_SIZE))
1043 return BaseCollectionCrawler._process_pdf_header(chunk, response)
1044 except StopIteration:
1045 return [
1046 False,
1047 response,
1048 {
1049 "status": ExtlinkChecked.Status.ERROR,
1050 "message": "Error reading PDF header",
1051 },
1052 ]
1054 @classmethod
1055 async def check_extlink_validity(cls, extlink: "ExtLink"):
1056 """
1057 Method used by rot_monitoring to check if links have expired
1058 """
1059 defaults: dict = {"date": datetime.now(), "status": ExtlinkChecked.Status.OK}
1060 header = {
1061 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0"
1062 }
1063 verify = True
1064 if not cls.verify:
1065 verify = False
1066 try:
1067 if extlink.rel == "article-pdf":
1068 isok, response, message = await cls.a_check_pdf_link_validity(
1069 extlink.location, verify
1070 )
1071 defaults.update(message)
1072 defaults["http_status"] = response.status
1073 else:
1074 async with cls.async_session.get(
1075 url=extlink.location,
1076 headers=header,
1077 allow_redirects=True,
1078 ssl=verify,
1079 ) as response:
1080 defaults["http_status"] = response.status
1081 if response.status not in (200, 206):
1082 defaults["status"] = ExtlinkChecked.Status.ERROR
1084 except aiohttp.ClientSSLError:
1085 cls.logger.error("SSL error for the url: %s", extlink.location)
1086 defaults["status"] = ExtlinkChecked.Status.ERROR
1087 defaults["message"] = "SSL error"
1088 except aiohttp.ClientConnectionError:
1089 cls.logger.error("Connection error for the url: %s", extlink.location)
1090 defaults["status"] = ExtlinkChecked.Status.ERROR
1091 defaults["message"] = "Connection error"
1092 except TimeoutError:
1093 cls.logger.error("Timeout error for the url: %s", extlink.location)
1094 defaults["status"] = ExtlinkChecked.Status.ERROR
1095 defaults["message"] = "Timeout error"
1096 finally:
1097 try:
1098 await ExtlinkChecked.objects.aupdate_or_create(extlink=extlink, defaults=defaults)
1099 cls.logger.info(
1100 "DB Update, source: %s, url: %s", cls.source_domain, extlink.location
1101 )
1102 except IntegrityError:
1103 cls.logger.error(
1104 "Extlink was deleted, source: %s, url: %s", cls.source_domain, extlink.location
1105 )