Coverage for src/crawler/base_crawler.py: 66%
575 statements
coverage.py v7.12.0, created at 2026-02-02 15:55 +0000
1 import logging
2 import time
3 from datetime import datetime, timedelta
4 from email.policy import EmailPolicy
5 from typing import TYPE_CHECKING, Iterable, Literal
7 import aiohttp
8 import regex
9 import requests
10 from bs4 import BeautifulSoup
11 from django.conf import settings
12 from django.contrib.auth.models import User
13 from django.utils import timezone
14 from langcodes import standardize_tag
15 from lingua import LanguageDetector, LanguageDetectorBuilder
16 from opentelemetry import trace
17 from ptf.cmds.xml.ckeditor.utils import (
18 build_jats_data_from_html_field,
19 )
20 from ptf.cmds.xml.jats.builder.references import (
21 get_article_title_xml,
22 get_author_xml,
23 get_fpage_xml,
24 get_lpage_xml,
25 get_source_xml,
26 get_year_xml,
27 )
28 from ptf.cmds.xml.jats.jats_parser import JatsBase
29 from ptf.model_data import (
30 ArticleData,
31 ContributorDict,
32 IssueData,
33 ResourceData,
34 TitleDict,
35 create_abstract,
36 create_contributor,
37 create_extlink,
38 create_issuedata,
39 create_publisherdata,
40 create_subj,
41 create_titledata,
42 )
43 from ptf.model_data_converter import update_data_for_jats
44 from ptf.models import ExtLink
45 from pylatexenc.latex2text import LatexNodes2Text
46 from pysolr import SolrError
47 from requests.adapters import HTTPAdapter
48 from requests_cache import CachedSession
49 from urllib3 import Retry
51 from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd
52 from crawler.models import Source
53 from crawler.models.extlink_checked import ExtlinkChecked
54 from crawler.types import CitationLiteral
55 from crawler.utils import (
56 add_pdf_link_to_xarticle,
57 cleanup_str,
58 get_all_cols,
59 get_or_create_collection,
60 get_session,
61 )
63 if TYPE_CHECKING:
64 from bs4 import Tag
67 class CrawlerTitleDict(TitleDict):
68 title_tex: str | None
71 class BaseCollectionCrawler:
72 """
73 Base class for the collection crawlers.
74 To create a crawler:
75 1) derive a class from BaseCollectionCrawler and name it XXXCrawler
76 2) override the functions parse_collection_content, parse_issue_content and parse_article_content
77 3) update factory.py so that crawler_factory can return your new crawler (a commented sketch follows below)
78 """
80 logger = logging.getLogger(__name__)
81 tracer = trace.get_tracer(__name__)
83 source_name = ""
84 source_domain = ""
85 source_website = ""
87 issue_href = ""
89 collection = None
90 source = None
91 user = None
92 session: requests.Session | CachedSession
93 async_session: aiohttp.ClientSession
94 is_checkable = True
95 verify = True
96 headers = {
97 "accept_encoding": "utf-8",
98 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
99 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
100 }
102 # seconds to wait between two HTTP requests
103 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
104 # seconds to wait before aborting the connection (if no bytes are received)
105 requests_timeout = 60
107 latext_parser = LatexNodes2Text()
109 # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
110 # do not use the "$" to surround tex formulas
111 delimiter_inline_formula = "$"
112 delimiter_disp_formula = "$"
114 # HACK: Workaround for tests (monkeypatching)
115 # We store the class here, so we can monkeypatch it when running tests
116 # subCrawlers = {
117 # LofplCrawler: None
118 # }
119 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}
121 _language_detector: LanguageDetector | None = None
122 _language_detector_builder = LanguageDetectorBuilder.from_all_languages()
124 force_refresh = False
126 # Whether to include headers in the requests cache key
127 match_headers = False
128 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"
130 # Set this to False on a per-crawler basis to allow inserting articles without PDFs
131 ignore_missing_pdf = True
133 @classmethod
134 def get_view_id(cls):
135 return cls.source_domain
137 @property
138 def language_detector(self):
139 """Crawler Instance singleton for language builder.
140 Late init of LanguageDetector to save on memory"""
141 if not self._language_detector:
142 self._language_detector = self._language_detector_builder.build()
143 return self._language_detector
145 def __init__(
146 self,
147 *args,
148 username: str,
149 collection_id: str,
150 dry: bool = False,
151 publisher: str = "",
152 force_refresh=False,
153 collection_url: str | None = None,
154 ):
155 if not collection_url: 155 ↛ 156 [line 155 didn't jump to line 156 because the condition on line 155 was never true]
156 all_cols = get_all_cols()
157 col = all_cols[collection_id]
159 collection_url = col["sources"].get(self.source_domain, None)
160 if collection_url is None:
161 raise ValueError(
162 f"Source {self.source_domain} not found for collection {collection_id}"
163 )
164 self.collection_url = collection_url
165 for CrawlerClass in self.subCrawlers: 165 ↛ 166 [line 165 didn't jump to line 166 because the loop on line 165 never started]
166 self.subCrawlers[CrawlerClass] = CrawlerClass(
167 *args,
168 username=username,
169 collection_id=collection_id,
170 dry=dry,
171 publisher=publisher,
172 collection_url=collection_url,
173 )
174 self.logger = logging.getLogger(__name__ + "." + self.source_domain)
175 # self.logger = logging.getLogger(__name__)
177 self.username = username
179 self.collection_id = collection_id
181 self.dry = dry
182 self.publisher = publisher
184 # Class property: we sometimes want to use the session without initializing the class (rot monitoring)
185 BaseCollectionCrawler.session: requests.Session
187 # Skipped when running tests
188 self.initialize()
190 self.force_refresh = force_refresh
192 # We implemented custom retry behaviour, so we don't want to make extra requests here
194 def initialize(self):
195 """
196 Acts as a "second" init function to skip model accesses during test data generation
197 """
198 self.collection = get_or_create_collection(self.collection_id)
199 self.source = self.get_or_create_source()
200 self.user = User.objects.get(username=self.username)
201 BaseCollectionCrawler.session = get_session()
202 BaseCollectionCrawler.session.verify = self.verify
203 self.session.delay = self.requests_interval
204 retries = Retry(
205 total=0,
206 )
207 self.session.mount("https://", HTTPAdapter(max_retries=retries))
208 self.session.mount("http://", HTTPAdapter(max_retries=retries))
210 @classmethod
211 def can_crawl(cls, pid: str) -> bool:
212 return True
214 def parse_collection_content(self, content: str) -> list[IssueData]:
215 """
216 Parse the HTML content with BeautifulSoup
217 and return a list of xissues.
218 Override this function in a derived class.
219 """
220 return []
222 def parse_issue_content(self, content: str, xissue: IssueData):
223 """
224 Parse the HTML content with BeautifulSoup
225 and fill xissue.articles.
226 Override this function in a derived class.
228 Caveat: you are supposed to create the articles here. Please assign a PID to each article.
229 The PID can be `a + article_index`, like this: `a0`, `a21`
230 """
232 def parse_article_content(
233 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
234 ) -> ArticleData | None:
235 """
236 Parse the HTML content with BeautifulSoup
237 and return the xarticle.
238 Override this function in a derived class.
239 The xissue is passed to the function in case the article page has issue information (ex: publisher).
240 The article url is also passed as a parameter.
242 Caveat: you are supposed to assign the article pid again here.
243 """
244 return xarticle
246 @tracer.start_as_current_span("crawl_collection")
247 def crawl_collection(self):
248 # TODO: Comments, filter
249 """
250 Crawl an entire collection. ptf.models.Container objects are created.
251 - get the HTML content of the collection_url
252 - parse the HTML content with beautifulsoup to extract the list of issues
253 - merge the xissues (some Source can have multiple pages for 1 volume/issue. We create only 1 container)
254 - crawl each issue if col_only is False
255 - Returns the issues.
256 It is a dict {pid: xissue}.
257 The key is the pid of the merged issue.
258 Ex: the source may have Volume 6 (2000) and Volume 6 (1999);
259 the pid is then made with 1999-2000__6_
260 """
262 if self.source is None:
263 raise RuntimeError("ERROR: the source is not set")
265 content = self.download_file(self.collection_url)
266 if content:
267 xissues = self.parse_collection_content(content)
268 else:
269 # download_file returns None (404)
270 return None
272 """
273 Some collections split the same volumes in different pages
274 Ex: Volume 6 (2000) and Volume 6 (1999)
275 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
276 """
277 # merged_xissues = self.merge_xissues(xissues)
279 xissues_dict = {str(i.pid): i for i in xissues}
281 return xissues_dict
283 @tracer.start_as_current_span("crawl_issue")
284 def crawl_issue(self, xissue: IssueData):
285 """
286 Crawl one web page of an issue.
287 - get the HTML content of the issue
288 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata
289 - crawl each article
290 """
292 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.
293 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute
294 issue_url = xissue.url
295 if issue_url is not None:
296 if issue_url.endswith(".pdf"):
297 add_pdf_link_to_xarticle(xissue, issue_url)
298 xissue.url = None
299 else:
300 content = self.download_file(issue_url)
301 with self.tracer.start_as_current_span("parse_issue_content"):
302 self.parse_issue_content(content, xissue)
304 xarticles = xissue.articles
306 parsed_xarticles = []
308 for xarticle in xarticles:
309 parsed_xarticle = self.crawl_article(xarticle, xissue)
310 if parsed_xarticle is not None:
311 parsed_xarticles.append(parsed_xarticle)
313 xissue.articles = parsed_xarticles
315 issue_has_pdf = self.article_has_pdf(xissue)
317 if self.ignore_missing_pdf:
318 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)]
319 if self.dry:
320 return
321 if len(xissue.articles) == 0 and not issue_has_pdf:
322 return
323 self.process_resource_metadata(xissue, resource_type="issue")
325 self.add_xissue_into_database(xissue)
327 @staticmethod
328 def article_has_source(art: ArticleData | IssueData):
329 return (
330 next(
331 (e_link for e_link in art.ext_links if e_link["rel"] == "source"),
332 None,
333 )
334 is not None
335 )
337 @staticmethod
338 def article_has_pdf(art: ArticleData | IssueData):
339 return (
340 next(
341 (link for link in art.ext_links if link["rel"] in ["article-pdf", "article-html"]),
342 None,
343 )
344 is not None
345 )
347 def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
348 # ARTICLE URL as an ExtLink (to display the link in the article page)
349 if xarticle.url is None:
350 if not self.article_has_source(xarticle): 350 ↛ 360 [line 350 didn't jump to line 360 because the condition on line 350 was always true]
351 if xissue.url:
352 article_source = xissue.url
353 else:
354 article_source = self.collection_url
355 ext_link = create_extlink()
356 ext_link["rel"] = "source"
357 ext_link["location"] = article_source
358 ext_link["metadata"] = self.source_domain
359 xarticle.ext_links.append(ext_link)
360 return self.process_article_metadata(xarticle)
362 content = self.download_file(xarticle.url)
363 xarticle.pid = f"{xissue.pid}_{xarticle.pid}"
365 try:
366 with self.tracer.start_as_current_span("parse_article_content"):
367 parsed_xarticle = self.parse_article_content(
368 content, xissue, xarticle, xarticle.url
369 )
370 except ValueError as e:
371 self.logger.warning(e)
372 self.logger.warning("Retrying in 5 mins while invalidating cache")
373 time.sleep(5 * 60)
374 content = self.download_file(xarticle.url, force_refresh=True)
375 with self.tracer.start_as_current_span("parse_article_content"):
376 parsed_xarticle = self.parse_article_content(
377 content, xissue, xarticle, xarticle.url
378 )
380 if parsed_xarticle is None: 380 ↛ 381 [line 380 didn't jump to line 381 because the condition on line 380 was never true]
381 return None
383 if parsed_xarticle.doi:
384 parsed_xarticle.pid = (
385 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
386 )
388 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
389 ext_link = create_extlink()
390 ext_link["rel"] = "source"
391 ext_link["location"] = parsed_xarticle.url
392 ext_link["metadata"] = self.source_domain
393 parsed_xarticle.ext_links.append(ext_link)
395 # The article title may have formulas surrounded with '$'
396 return self.process_article_metadata(parsed_xarticle)
398 def process_resource_metadata(self, xresource: ResourceData, resource_type="article"):
399 tag = "article-title" if resource_type == "article" else "issue-title"
401 # Process title tex
402 ckeditor_data = build_jats_data_from_html_field(
403 xresource.title_tex,
404 tag=tag,
405 text_lang=xresource.lang,
406 delimiter_inline=self.delimiter_inline_formula,
407 delimiter_disp=self.delimiter_disp_formula,
408 )
410 xresource.title_html = ckeditor_data["value_html"]
411 # xresource.title_tex = ckeditor_data["value_tex"]
412 xresource.title_xml = ckeditor_data["value_xml"]
414 # Process trans_title tex
415 if xresource.trans_title_tex: 415 ↛ 416 [line 415 didn't jump to line 416 because the condition on line 415 was never true]
416 self.logger.warning(
417 "Deprecation Notice : prefer using xresource.title directly instead of xresource.trans_title_tex"
418 )
419 trans_title = self.create_trans_title(
420 xresource_lang=xresource.lang,
421 resource_type=resource_type,
422 title_tex=xresource.trans_title_tex,
423 lang=xresource.trans_lang,
424 )
425 xresource.titles.append(trans_title)
427 abstracts_to_parse = [
428 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract"
429 ]
430 # abstract may have formulas surrounded with '$'
431 if len(abstracts_to_parse) > 0:
432 for xabstract in abstracts_to_parse:
433 ckeditor_data = build_jats_data_from_html_field(
434 xabstract["value_tex"],
435 tag="abstract",
436 text_lang=xabstract["lang"],
437 resource_lang=xresource.lang,
438 field_type="abstract",
439 delimiter_inline=self.delimiter_inline_formula,
440 delimiter_disp=self.delimiter_disp_formula,
441 )
443 xabstract["value_html"] = ckeditor_data["value_html"]
444 # xabstract["value_tex"] = ckeditor_data["value_tex"]
445 xabstract["value_xml"] = ckeditor_data["value_xml"]
447 return xresource
449 def process_article_metadata(self, xarticle: ArticleData):
450 self.process_resource_metadata(xarticle)
451 for bibitem in xarticle.bibitems:
452 bibitem.type = "unknown"
453 update_data_for_jats(xarticle, with_label=False)
455 return xarticle
457 def download_file(self, url: str, force_refresh=False, headers={}):
458 """
459 Downloads a page and returns its content (decoded string).
460 This function handles retries and decoding
461 """
462 current_exception: Exception | None = None
463 for attempt in range(3):
464 try:
465 kwargs = {
466 "url": url,
467 "headers": {**self.headers, **headers},
468 "timeout": self.requests_timeout,
469 }
470 if (force_refresh or attempt > 0) and isinstance(self.session, CachedSession):
471 kwargs["force_refresh"] = True
472 response = self.session.get(**kwargs)
474 content = self.decode_response(response)
475 if content == "" or not content:
476 raise requests.exceptions.HTTPError(response)
478 return content
479 except (
480 requests.ConnectionError,
481 requests.ConnectTimeout,
482 requests.exceptions.HTTPError,
483 ) as e:
484 current_exception = e
485 self.logger.debug(f"Caught error: {e}", extra={"url": url})
486 # 0 min, then 15 mins, then 30 mins
487 delay_minutes = attempt * 15
488 self.logger.debug(
489 f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})",
490 extra={"url": url},
491 )
492 time.sleep(delay_minutes * 60)
494 raise current_exception
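# Usage sketch (illustrative, not part of the measured source; the URL is a
# placeholder): download_file() tries up to three times, forcing a cache refresh
# on retries, and re-raises the last network error if all attempts fail.
#
#   html = crawler.download_file("https://example.org/issues/", headers={"Accept": "text/html"})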
496 def decode_response(self, response: requests.Response, encoding: str | None = None):
497 """Override this if the content-type headers from the sources are advertising something else than the actual content
498 SASA needs this"""
499 # Force
500 if encoding:
501 response.encoding = encoding
502 return response.text
504 # Attempt to get encoding using HTTP headers
505 content_type_tag = response.headers.get("Content-Type", None)
507 if content_type_tag: 507 ↛ 514 [line 507 didn't jump to line 514 because the condition on line 507 was always true]
508 charset = self.parse_content_type_charset(content_type_tag)
509 if charset: 509 ↛ 510 [line 509 didn't jump to line 510 because the condition on line 509 was never true]
510 response.encoding = charset
511 return response.text
513 # Attempt to get encoding using HTML meta charset tag
514 soup = BeautifulSoup(response.text, "html5lib")
515 charset = soup.select_one("meta[charset]")
516 if charset:
517 htmlencoding = charset.get("charset")
518 if isinstance(htmlencoding, str): 518 ↛ 523 [line 518 didn't jump to line 523 because the condition on line 518 was always true]
519 response.encoding = htmlencoding
520 return response.text
522 # Attempt to get encoding using HTML meta content type tag
523 content_type_tag = soup.select_one(
524 'meta[http-equiv="Content-Type"],meta[http-equiv="content-type"]'
525 )
526 if content_type_tag:
527 content_type = content_type_tag.get("content")
528 if isinstance(content_type, str): 528 ↛ 534 [line 528 didn't jump to line 534 because the condition on line 528 was always true]
529 charset = self.parse_content_type_charset(content_type)
530 if charset: 530 ↛ 534 [line 530 didn't jump to line 534 because the condition on line 530 was always true]
531 response.encoding = charset
532 return response.text
534 return response.text
536 @staticmethod
537 def parse_content_type_charset(content_type: str):
538 header = EmailPolicy.header_factory("content-type", content_type)
539 if "charset" in header.params:
540 return header.params.get("charset")
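# Example (illustrative): the charset parameter is read from the parsed
# Content-Type header, so
#
#   BaseCollectionCrawler.parse_content_type_charset("text/html; charset=utf-8")
#
# is expected to return "utf-8", and None when no charset parameter is present.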
542 @tracer.start_as_current_span("add_xissue_to_database")
543 def add_xissue_into_database(self, xissue: IssueData) -> IssueData:
544 xissue.journal = self.collection
545 xissue.source = self.source_domain
547 if xissue.year == "":
548 raise ValueError("Failsafe : Cannot insert issue without a year")
550 xpub = create_publisherdata()
551 xpub.name = self.publisher
552 xissue.publisher = xpub
553 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()
555 attempt = 1
556 success = False
558 while not success and attempt < 4:
559 try:
560 params = {"xissue": xissue, "use_body": False}
561 cmd = addOrUpdateGDMLIssueXmlCmd(params)
562 cmd.do()
563 success = True
564 self.logger.debug(f"Issue {xissue.pid} inserted in database")
565 return xissue
566 except SolrError:
567 self.logger.warning(
568 f"Encoutered SolrError while inserting issue {xissue.pid} in database"
569 )
570 attempt += 1
571 self.logger.debug(f"Attempt {attempt}. sleeping 10 seconds.")
572 time.sleep(10)
573 except Exception as e:
574 self.logger.error(
575 f"Got exception while attempting to insert {xissue.pid} in database : {e}"
576 )
577 raise e
579 if success is False:
580 raise ConnectionRefusedError("Cannot connect to SolR")
582 assert False, "Unreachable"
584 def get_metadata_using_citation_meta(
585 self,
586 xarticle: ArticleData,
587 xissue: IssueData,
588 soup: BeautifulSoup,
589 what: list[CitationLiteral] = [],
590 ):
591 """
592 :param xarticle: the xarticle that will collect the metadata
593 :param xissue: the xissue that will collect the publisher
594 :param soup: the BeautifulSoup object of the article page
595 :param what: list of citation_* meta items to collect.
596 :return: None. The given article is modified
597 """
599 if "title" in what:
600 # TITLE
601 citation_title_node = soup.select_one("meta[name='citation_title']")
602 if citation_title_node: 602 ↛ 607 [line 602 didn't jump to line 607 because the condition on line 602 was always true]
603 title = citation_title_node.get("content")
604 if isinstance(title, str): 604 ↛ 607 [line 604 didn't jump to line 607 because the condition on line 604 was always true]
605 xarticle.title_tex = title
607 if "author" in what: 607 ↛ 636 [line 607 didn't jump to line 636 because the condition on line 607 was always true]
608 # AUTHORS
609 citation_author_nodes = soup.select("meta[name^='citation_author']")
610 current_author: ContributorDict | None = None
611 for citation_author_node in citation_author_nodes:
612 if citation_author_node.get("name") == "citation_author":
613 text_author = citation_author_node.get("content")
614 if not isinstance(text_author, str): 614 ↛ 615 [line 614 didn't jump to line 615 because the condition on line 614 was never true]
615 raise ValueError("Cannot parse author")
616 if text_author == "": 616 ↛ 617 [line 616 didn't jump to line 617 because the condition on line 616 was never true]
617 current_author = None
618 continue
619 current_author = create_contributor(role="author", string_name=text_author)
620 xarticle.contributors.append(current_author)
621 continue
622 if current_author is None: 622 ↛ 623 [line 622 didn't jump to line 623 because the condition on line 622 was never true]
623 self.logger.warning("Couldn't parse citation author")
624 continue
625 if citation_author_node.get("name") == "citation_author_institution":
626 text_institution = citation_author_node.get("content")
627 if not isinstance(text_institution, str): 627 ↛ 628 [line 627 didn't jump to line 628 because the condition on line 627 was never true]
628 continue
629 current_author["addresses"].append(text_institution)
630 if citation_author_node.get("name") == "citation_author_ocrid": 630 ↛ 631 [line 630 didn't jump to line 631 because the condition on line 630 was never true]
631 text_orcid = citation_author_node.get("content")
632 if not isinstance(text_orcid, str):
633 continue
634 current_author["orcid"] = text_orcid
636 if "pdf" in what:
637 # PDF
638 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
639 if citation_pdf_node:
640 pdf_url = citation_pdf_node.get("content")
641 if isinstance(pdf_url, str): 641 ↛ 644 [line 641 didn't jump to line 644 because the condition on line 641 was always true]
642 add_pdf_link_to_xarticle(xarticle, pdf_url)
644 if "lang" in what:
645 # LANG
646 citation_lang_node = soup.select_one("meta[name='citation_language']")
647 if citation_lang_node: 647 ↛ 653 [line 647 didn't jump to line 653 because the condition on line 647 was always true]
648 # TODO: check other language code
649 content_text = citation_lang_node.get("content")
650 if isinstance(content_text, str): 650 ↛ 653 [line 650 didn't jump to line 653 because the condition on line 650 was always true]
651 xarticle.lang = standardize_tag(content_text)
653 if "abstract" in what:
654 # ABSTRACT
655 abstract_node = soup.select_one("meta[name='citation_abstract']")
656 if abstract_node is not None:
657 abstract = abstract_node.get("content")
658 if not isinstance(abstract, str): 658 ↛ 659 [line 658 didn't jump to line 659 because the condition on line 658 was never true]
659 raise ValueError("Couldn't parse abstract from meta")
660 abstract = BeautifulSoup(abstract, "html.parser").text
661 lang = abstract_node.get("lang")
662 if not isinstance(lang, str):
663 lang = self.detect_language(abstract, xarticle)
664 xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))
666 if "page" in what:
667 # PAGES
668 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
669 if citation_fpage_node:
670 page = citation_fpage_node.get("content")
671 if isinstance(page, str): 671 ↛ 676 [line 671 didn't jump to line 676 because the condition on line 671 was always true]
672 page = page.split("(")[0]
673 if len(page) < 32: 673 ↛ 676 [line 673 didn't jump to line 676 because the condition on line 673 was always true]
674 xarticle.fpage = page
676 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
677 if citation_lpage_node:
678 page = citation_lpage_node.get("content")
679 if isinstance(page, str): 679 ↛ 684 [line 679 didn't jump to line 684 because the condition on line 679 was always true]
680 page = page.split("(")[0]
681 if len(page) < 32: 681 ↛ 684 [line 681 didn't jump to line 684 because the condition on line 681 was always true]
682 xarticle.lpage = page
684 if "doi" in what:
685 # DOI
686 citation_doi_node = soup.select_one("meta[name='citation_doi']")
687 if citation_doi_node:
688 doi = citation_doi_node.get("content")
689 if isinstance(doi, str): 689 ↛ 696 [line 689 didn't jump to line 696 because the condition on line 689 was always true]
690 doi = doi.strip()
691 pos = doi.find("10.")
692 if pos > 0:
693 doi = doi[pos:]
694 xarticle.doi = doi
696 if "mr" in what:
697 # MR
698 citation_mr_node = soup.select_one("meta[name='citation_mr']")
699 if citation_mr_node:
700 mr = citation_mr_node.get("content")
701 if isinstance(mr, str): 701 ↛ 707 [line 701 didn't jump to line 707 because the condition on line 701 was always true]
702 mr = mr.strip()
703 if mr.find("MR") == 0: 703 ↛ 707 [line 703 didn't jump to line 707 because the condition on line 703 was always true]
704 mr = mr[2:]
705 xarticle.extids.append(("mr-item-id", mr))
707 if "zbl" in what:
708 # ZBL
709 citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
710 if citation_zbl_node:
711 zbl = citation_zbl_node.get("content")
712 if isinstance(zbl, str): 712 ↛ 718 [line 712 didn't jump to line 718 because the condition on line 712 was always true]
713 zbl = zbl.strip()
714 if zbl.find("Zbl") == 0: 714 ↛ 718 [line 714 didn't jump to line 718 because the condition on line 714 was always true]
715 zbl = zbl[3:].strip()
716 xarticle.extids.append(("zbl-item-id", zbl))
718 if "publisher" in what:
719 # PUBLISHER
720 citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
721 if citation_publisher_node:
722 pub = citation_publisher_node.get("content")
723 if isinstance(pub, str): 723 ↛ 730 [line 723 didn't jump to line 730 because the condition on line 723 was always true]
724 pub = pub.strip()
725 if pub != "": 725 ↛ 730 [line 725 didn't jump to line 730 because the condition on line 725 was always true]
726 xpub = create_publisherdata()
727 xpub.name = pub
728 xissue.publisher = xpub
730 if "keywords" in what:
731 # KEYWORDS
732 citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
733 for kwd_node in citation_kwd_nodes:
734 kwds = kwd_node.get("content")
735 if isinstance(kwds, str): 735 ↛ 733 [line 735 didn't jump to line 733 because the condition on line 735 was always true]
736 kwds = kwds.split(",")
737 for kwd in kwds:
738 if kwd == "":
739 continue
740 kwd = kwd.strip()
741 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})
743 if "references" in what:
744 citation_references = soup.select("meta[name='citation_reference']")
745 for index, tag in enumerate(citation_references):
746 content = tag.get("content")
747 if not isinstance(content, str): 747 ↛ 748 [line 747 didn't jump to line 748 because the condition on line 747 was never true]
748 raise ValueError("Cannot parse citation_reference meta")
749 label = str(index + 1)
750 if regex.match(r"^\[\d+\].*", content): 750 ↛ 751 [line 750 didn't jump to line 751 because the condition on line 750 was never true]
751 label = None
752 xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label))
754 def get_metadata_using_dcterms(
755 self,
756 xarticle: ArticleData,
757 soup: "Tag",
758 what: "Iterable[Literal['abstract', 'keywords', 'date_published', 'article_type']]",
759 ):
760 if "abstract" in what: 760 ↛ 768line 760 didn't jump to line 768 because the condition on line 760 was always true
761 abstract_tag = soup.select_one("meta[name='DCTERMS.abstract']")
762 if abstract_tag: 762 ↛ 768line 762 didn't jump to line 768 because the condition on line 762 was always true
763 abstract_text = self.get_str_attr(abstract_tag, "content")
764 xarticle.abstracts.append(
765 create_abstract(lang="en", value_tex=cleanup_str(abstract_text))
766 )
768 if "keywords" in what: 768 ↛ 777line 768 didn't jump to line 777 because the condition on line 768 was always true
769 keyword_tags = soup.select("meta[name='DC.subject']")
770 for tag in keyword_tags:
771 kwd_text = tag.get("content")
772 if not isinstance(kwd_text, str) or len(kwd_text) == 0: 772 ↛ 773 [line 772 didn't jump to line 773 because the condition on line 772 was never true]
773 continue
774 kwd = create_subj(value=kwd_text)
775 xarticle.kwds.append(kwd)
777 if "date_published" in what: 777 ↛ 778line 777 didn't jump to line 778 because the condition on line 777 was never true
778 published_tag = soup.select_one("meta[name='DC.Date.created']")
779 if published_tag:
780 published_text = self.get_str_attr(published_tag, "content")
781 xarticle.date_published = published_text
783 if "article_type" in what: 783 ↛ 784line 783 didn't jump to line 784 because the condition on line 783 was never true
784 type_tag = soup.select_one("meta[name='DC.Type.articleType']")
785 if type_tag:
786 type_text = self.get_str_attr(type_tag, "content")
787 xarticle.atype = type_text
789 def create_xissue(
790 self,
791 url: str | None,
792 year: str,
793 volume_number: str | None,
794 issue_number: str | None = None,
795 vseries: str | None = None,
796 ):
797 if url is not None and url.endswith("/"):
798 url = url[:-1]
799 xissue = create_issuedata()
800 xissue.url = url
802 xissue.pid = self.get_issue_pid(
803 self.collection_id, year, volume_number, issue_number, vseries
804 )
806 xissue.year = year
808 if volume_number is not None:
809 xissue.volume = regex.sub(r"[^\w-]+", "_", volume_number)
811 if issue_number is not None:
812 xissue.number = issue_number.replace(",", "-")
814 if vseries is not None: 814 ↛ 815 [line 814 didn't jump to line 815 because the condition on line 814 was never true]
815 xissue.vseries = vseries
816 return xissue
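# Example (illustrative; URL and numbers are placeholders): a call such as
#
#   xissue = crawler.create_xissue("https://example.org/vol6/iss2/", "2000", "6", "2")
#
# strips the trailing slash, fills year/volume/number and builds the pid through
# get_issue_pid, e.g. "<collection_id>_2000_6_2".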
818 def detect_language(self, text: str, article: ArticleData | None = None):
819 if article and article.lang is not None and article.lang != "und":
820 return article.lang
822 language = self.language_detector.detect_language_of(text)
824 if not language: 824 ↛ 825 [line 824 didn't jump to line 825 because the condition on line 824 was never true]
825 return "und"
826 return language.iso_code_639_1.name.lower()
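# Usage note (illustrative): if the article already has a language other than
# "und", that language is returned unchanged; otherwise lingua guesses from the
# text, so detect_language("This abstract is written in English.") is expected
# to return "en".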
828 def get_str_attr(self, tag: "Tag", attr: str):
829 """Equivalent of `tag.get(attr)`, but ensures the return value is a string"""
830 node_attr = tag.get(attr)
831 if isinstance(node_attr, list): 831 ↛ 832 [line 831 didn't jump to line 832 because the condition on line 831 was never true]
832 raise ValueError(
833 f"[{self.source_domain}] {self.collection_id} : html tag has multiple {attr} attributes."
834 )
835 if node_attr is None: 835 ↛ 836 [line 835 didn't jump to line 836 because the condition on line 835 was never true]
836 raise ValueError(
837 f"[{self.source_domain}] {self.collection_id} : html tag doesn't have any {attr} attributes"
838 )
839 return node_attr
841 def create_trans_title(
842 self,
843 resource_type: str,
844 title_tex: str,
845 lang: str,
846 xresource_lang: str,
847 title_type: str = "main",
848 ):
849 tag = "trans-title" if resource_type == "article" else "issue-title"
851 ckeditor_data = build_jats_data_from_html_field(
852 title_tex,
853 tag=tag,
854 text_lang=lang,
855 resource_lang=xresource_lang,
856 delimiter_inline=self.delimiter_inline_formula,
857 delimiter_disp=self.delimiter_disp_formula,
858 )
860 titledata = create_titledata(
861 lang=lang,
862 type="main",
863 title_html=ckeditor_data["value_html"],
864 title_xml=ckeditor_data["value_xml"],
865 )
867 return titledata
869 references_mapping = {
870 "citation_title": get_article_title_xml,
871 "citation_journal_title": get_source_xml,
872 "citation_publication_date": get_year_xml,
873 "citation_firstpage": get_fpage_xml,
874 "citation_lastpage": get_lpage_xml,
875 }
877 @classmethod
878 def __parse_meta_citation_reference(cls, content: str, label=None):
879 categories = content.split(";")
881 if len(categories) == 1:
882 return JatsBase.bake_ref(content, label=label)
884 citation_data = [c.split("=") for c in categories if "=" in c]
885 del categories
887 xml_string = ""
888 authors_parsed = False
889 authors_strings = []
890 for data in citation_data:
891 key = data[0].strip()
892 citation_content = data[1]
893 if key == "citation_author":
894 authors_strings.append(get_author_xml(template_str=citation_content))
895 continue
896 elif not authors_parsed:
897 xml_string += ", ".join(authors_strings)
898 authors_parsed = True
900 if key in cls.references_mapping:
901 xml_string += " " + cls.references_mapping[key](citation_content)
903 return JatsBase.bake_ref(xml_string, label=label)
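# Illustrative example (the metadata values are made up): a citation_reference
# meta content such as
#
#   "citation_author=Doe, J.; citation_title=On examples; citation_publication_date=1999"
#
# is split on ";" then on "=", the citation_author parts are rendered first, and
# the remaining keys are converted through references_mapping to build the JATS
# reference XML.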
905 @classmethod
906 def get_or_create_source(cls):
907 source, created = Source.objects.get_or_create(
908 domain=cls.source_domain,
909 defaults={
910 "name": cls.source_name,
911 "website": cls.source_website,
912 "view_id": cls.get_view_id(),
913 },
914 )
915 if created: 915 ↛ 916 [line 915 didn't jump to line 916 because the condition on line 915 was never true]
916 source.save()
917 return source
919 @staticmethod
920 def get_issue_pid(
921 collection_id: str,
922 year: str,
923 volume_number: str | None = None,
924 issue_number: str | None = None,
925 series: str | None = None,
926 ):
927 # Replace any non-word character with an underscore
928 pid = f"{collection_id}_{year}"
929 if series is not None: 929 ↛ 930 [line 929 didn't jump to line 930 because the condition on line 929 was never true]
930 pid += f"_{series}"
931 if volume_number is not None:
932 pid += f"_{volume_number}"
933 if issue_number is not None:
934 pid += f"_{issue_number}"
935 pid = regex.sub(r"[^\w-]+", "_", cleanup_str(pid))
936 return pid
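# Example (illustrative collection id): non-word characters other than "-" are
# collapsed to underscores, so
#
#   BaseCollectionCrawler.get_issue_pid("AMBP", "1999-2000", "6")
#
# is expected to return "AMBP_1999-2000_6".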
938 @staticmethod
939 def set_pages(article: ArticleData, pages: str, separator: str = "-"):
940 pages_split = pages.split(separator)
941 if len(pages_split) == 0: 941 ↛ 942 [line 941 didn't jump to line 942 because the condition on line 941 was never true]
942 article.page_range = pages
943 if len(pages_split) > 0: 943 ↛ exit [line 943 didn't return from function 'set_pages' because the condition on line 943 was always true]
944 if pages[0].isnumeric(): 944 ↛ exit [line 944 didn't return from function 'set_pages' because the condition on line 944 was always true]
945 article.fpage = pages_split[0]
946 if ( 946 ↛ 951 [line 946 didn't jump to line 951 because the condition on line 946 was never true]
947 len(pages_split) > 1
948 and pages_split[0] != pages_split[1]
949 and pages_split[1].isnumeric()
950 ):
951 article.lpage = pages_split[1]
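# Example (illustrative): set_pages(article, "12-34") is expected to set
# article.fpage = "12" and article.lpage = "34"; a single page string like "7"
# only sets article.fpage.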
953 @staticmethod
954 def _process_pdf_header(chunk: str, response: requests.Response | aiohttp.ClientResponse):
955 content_type = response.headers.get("Content-Type")
956 if regex.match(rb"^%PDF-\d\.\d", chunk):
957 if content_type and "application/pdf" in content_type:
958 # The file is unmistakably a pdf
959 return [
960 True,
961 response,
962 {
963 "status": ExtlinkChecked.Status.OK,
964 "message": "",
965 },
966 ]
967 # The file is a pdf, but the content type advertised by the server is wrong
968 return [
969 True,
970 response,
971 {
972 "status": ExtlinkChecked.Status.WARNING,
973 "message": f"Content-Type header: {content_type}",
974 },
975 ]
977 # Reaching here means we couldn't find the pdf.
978 if not content_type or "application/pdf" not in content_type:
979 return [
980 False,
981 response,
982 {
983 "status": ExtlinkChecked.Status.ERROR,
984 "message": f"Content-Type header: {content_type}; PDF Header not found: got {chunk}",
985 },
986 ]
988 return [
989 False,
990 response,
991 {
992 "status": ExtlinkChecked.Status.ERROR,
993 "message": f"PDF Header not found: got {chunk}",
994 },
995 ]
997 @classmethod
998 async def a_check_pdf_link_validity(
999 cls, url: str, verify=True
1000 ) -> tuple[bool, aiohttp.ClientResponse, dict]:
1001 """
1002 Check the validity of the PDF links.
1003 """
1004 CHUNK_SIZE = 10 # Number of characters to fetch
1005 header = {
1006 "Range": f"bytes=0-{CHUNK_SIZE}",
1007 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
1008 }
1009 async with cls.async_session.get(
1010 url, headers=header, allow_redirects=True, ssl=verify
1011 ) as response:
1012 try:
1013 chunk = await response.content.read(CHUNK_SIZE)
1014 return BaseCollectionCrawler._process_pdf_header(chunk, response)
1015 except StopIteration:
1016 return [
1017 False,
1018 response,
1019 {
1020 "status": ExtlinkChecked.Status.ERROR,
1021 "message": "Error reading PDF header",
1022 },
1023 ]
1025 @classmethod
1026 def check_pdf_link_validity(
1027 cls, url: str, verify=True
1028 ) -> tuple[bool, requests.Response | None, dict]:
1029 """
1030 Check the validity of the PDF links.
1031 """
1032 CHUNK_SIZE = 10 # Number of characters to fetch
1033 header = {
1034 "Range": f"bytes=0-{CHUNK_SIZE}",
1035 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
1036 }
1037 with cls.session.get(
1038 url, headers=header, allow_redirects=True, verify=verify, stream=True
1039 ) as response:
1040 try:
1041 chunk = next(response.iter_content(CHUNK_SIZE))
1042 return BaseCollectionCrawler._process_pdf_header(chunk, response)
1043 except StopIteration:
1044 return [
1045 False,
1046 response,
1047 {
1048 "status": ExtlinkChecked.Status.ERROR,
1049 "message": "Error reading PDF header",
1050 },
1051 ]
1053 @classmethod
1054 async def check_extlink_validity(cls, extlink: "ExtLink"):
1055 """
1056 Method used by rot_monitoring to check if links have expired
1057 """
1058 defaults: dict = {"date": datetime.now(), "status": ExtlinkChecked.Status.OK}
1059 header = {
1060 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0"
1061 }
1062 verify = True
1063 if not cls.verify:
1064 verify = False
1065 try:
1066 if extlink.rel == "article-pdf":
1067 isok, response, message = await cls.a_check_pdf_link_validity(
1068 extlink.location, verify
1069 )
1070 defaults.update(message)
1071 defaults["http_status"] = response.status
1072 else:
1073 async with cls.async_session.get(
1074 url=extlink.location,
1075 headers=header,
1076 allow_redirects=True,
1077 ssl=verify,
1078 ) as response:
1079 defaults["http_status"] = response.status
1080 if response.status not in (200, 206):
1081 defaults["status"] = ExtlinkChecked.Status.ERROR
1083 await ExtlinkChecked.objects.aupdate_or_create(extlink=extlink, defaults=defaults)
1084 cls.logger.info("DB Update, source: %s, url: %s", cls.source_domain, extlink.location)
1086 except aiohttp.ClientSSLError:
1087 cls.logger.error("SSL error for the url: %s", extlink.location)
1088 except aiohttp.ClientConnectionError:
1089 cls.logger.error("Connection error for the url: %s", extlink.location)