Coverage for src/crawler/abstract_crawlers/base_crawler.py: 66% (584 statements)
coverage.py v7.13.1, created at 2026-04-30 12:41 +0000

import logging
import time
from collections.abc import Iterable
from datetime import datetime, timedelta
from email.policy import EmailPolicy
from typing import TYPE_CHECKING, Literal

import aiohttp
import regex
import requests
from bs4 import BeautifulSoup
from django.conf import settings
from django.contrib.auth.models import User
from django.db.utils import IntegrityError
from django.utils import timezone
from langcodes import standardize_tag
from lingua import LanguageDetector, LanguageDetectorBuilder
from opentelemetry import trace
from ptf.cmds.xml.ckeditor.utils import (
    build_jats_data_from_html_field,
)
from ptf.cmds.xml.jats.builder.references import (
    get_article_title_xml,
    get_author_xml,
    get_fpage_xml,
    get_lpage_xml,
    get_source_xml,
    get_year_xml,
)
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import (
    ArticleData,
    ContributorDict,
    IssueData,
    ResourceData,
    TitleDict,
    create_abstract,
    create_contributor,
    create_extlink,
    create_issuedata,
    create_publisherdata,
    create_subj,
    create_titledata,
)
from ptf.model_data_converter import update_data_for_jats
from ptf.models import ExtLink
from pylatexenc.latex2text import LatexNodes2Text
from pysolr import SolrError
from requests.adapters import HTTPAdapter
from requests_cache import CachedSession
from urllib3 import Retry

from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd
from crawler.models import Source
from crawler.models.extlink_checked import ExtlinkChecked
from crawler.types import CitationLiteral
from crawler.utils import (
    add_pdf_link_to_xarticle,
    cleanup_str,
    get_all_cols,
    get_or_create_collection,
    get_session,
)

if TYPE_CHECKING:
    from bs4 import Tag

class CrawlerTitleDict(TitleDict):
    title_tex: str | None

class BaseCollectionCrawler:
    """
    Base class for the collection crawlers.
    To create a crawler:
    1) derive a class from BaseCollectionCrawler and name it XXXCrawler
    2) override the functions parse_collection_content, parse_issue_content and parse_article_content
    3) update factory.py so that crawler_factory can return your new crawler
    """
    logger = logging.getLogger(__name__)
    tracer = trace.get_tracer(__name__)

    source_name = ""
    source_domain = ""
    source_website = ""

    issue_href = ""

    collection = None
    source = None
    user = None
    session: requests.Session | CachedSession
    async_session: aiohttp.ClientSession
    is_checkable = True
    verify = True
    headers = {
        "accept_encoding": "utf-8",
        "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
        "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
    }

    # seconds to wait between two http requests
    requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
    # seconds to wait before aborting the connection (if no bytes are received)
    requests_timeout = 60

    latext_parser = LatexNodes2Text()

    # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
    # do not use the "$" to surround tex formulas
    delimiter_inline_formula = "$"
    delimiter_disp_formula = "$"

    # HACK: workaround for tests (monkeypatching)
    # We store the class here, so we can monkeypatch it when running tests
    # subCrawlers = {
    #     LofplCrawler: None
    # }
    subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}

    _language_detector: LanguageDetector | None = None
    _language_detector_builder = LanguageDetectorBuilder.from_all_languages()

    force_refresh = False

    # Whether to include headers in the requests cache key
    match_headers = False
    orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"

    # Set this to False on a per-crawler basis to allow inserting articles without PDFs
    ignore_missing_pdf = True
    @classmethod
    def get_view_id(cls):
        return cls.source_domain

    @property
    def language_detector(self):
        """Per-crawler-instance singleton for the language detector.
        Late init of LanguageDetector to save memory"""
        if not self._language_detector:
            self._language_detector = self._language_detector_builder.build()
        return self._language_detector
    def __init__(
        self,
        *args,
        username: str,
        collection_id: str,
        dry: bool = False,
        publisher: str = "",
        force_refresh=False,
        collection_url: str | None = None,
    ):
        if not collection_url:
            all_cols = get_all_cols()
            col = all_cols[collection_id]

            collection_url = col["sources"].get(self.source_domain, None)
            if collection_url is None:
                raise ValueError(
                    f"Source {self.source_domain} not found for collection {collection_id}"
                )
        self.collection_url = collection_url
        for CrawlerClass in self.subCrawlers:
            self.subCrawlers[CrawlerClass] = CrawlerClass(
                *args,
                username=username,
                collection_id=collection_id,
                dry=dry,
                publisher=publisher,
                collection_url=collection_url,
            )
        self.logger = logging.getLogger(__name__ + "." + self.source_domain)
        # self.logger = logging.getLogger(__name__)

        self.username = username

        self.collection_id = collection_id

        self.dry = dry
        self.publisher = publisher

        # Class attribute: we sometimes want to use the session without initializing the class (rot monitoring)
        BaseCollectionCrawler.session = requests.Session()

        # Skipped when running tests
        self.initialize()

        self.force_refresh = force_refresh

    # We implemented custom retry behaviour, so we don't want to make extra requests here
    def initialize(self):
        """
        Acts as a "second" init function to skip model accesses during test data generation
        """
        self.collection = get_or_create_collection(self.collection_id)
        self.source = self.get_or_create_source()
        self.user = User.objects.get(username=self.username)
        BaseCollectionCrawler.session = get_session()
        BaseCollectionCrawler.session.verify = self.verify
        self.session.delay = self.requests_interval
        retries = Retry(
            total=0,
        )
        self.session.mount("https://", HTTPAdapter(max_retries=retries))
        self.session.mount("http://", HTTPAdapter(max_retries=retries))
    @classmethod
    def can_crawl(cls, pid: str) -> bool:
        return True

    def parse_collection_content(self, content: str) -> list[IssueData]:
        """
        Parse the HTML content with BeautifulSoup
        returns a list of xissue.
        Override this function in a derived class
        """
        return []

    def parse_issue_content(self, content: str, xissue: IssueData):
        """
        Parse the HTML content with BeautifulSoup
        Fills the xissue.articles
        Override this function in a derived class.

        Caveat: you are supposed to create the articles here. Please assign a PID to each article.
        The PID can be `a + article_index`, like this: `a0`, `a21`
        """

    def parse_article_content(
        self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
    ) -> ArticleData | None:
        """
        Parse the HTML content with BeautifulSoup
        returns the xarticle.
        Override this function in a derived class.
        The xissue is passed to the function in case the article page has issue information (ex: publisher)
        The article url is also passed as a parameter

        Caveat: you are supposed to assign the article pids again here
        """
        return xarticle
    @tracer.start_as_current_span("crawl_collection")
    def crawl_collection(self):
        # TODO: Comments, filter
        """
        Crawl an entire collection. ptf.models.Container objects are created.
        - get the HTML content of the collection_url
        - parse the HTML content with beautifulsoup to extract the list of issues
        - merge the xissues (some Sources can have multiple pages for 1 volume/issue. We create only 1 container)
        - crawl each issue if col_only is False
        - Returns a dict of the xissues, keyed by pid.
        The key is the pid of the merged issues.
        Ex: the source may have Volume 6 (2000) and Volume 6 (1999);
        the pid is then made with 1999-2000__6_
        """

        if self.source is None:
            raise RuntimeError("ERROR: the source is not set")

        content = self.download_file(self.collection_url)
        if content:
            xissues = self.parse_collection_content(content)
        else:
            # download_file returns None (404)
            return None

        """
        Some collections split the same volumes in different pages
        Ex: Volume 6 (2000) and Volume 6 (1999)
        We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
        """
        # merged_xissues = self.merge_xissues(xissues)

        xissues_dict = {str(i.pid): i for i in xissues}

        return xissues_dict
    @tracer.start_as_current_span("crawl_issue")
    def crawl_issue(self, xissue: IssueData):
        """
        Crawl one web page of an issue.
        - get the HTML content of the issue
        - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata
        - crawl each article
        """
        # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.
        # The list of articles comes directly from the collection HTML page: the xissue has no url attribute
        issue_url = xissue.url
        if issue_url is not None:
            if issue_url.endswith(".pdf"):
                add_pdf_link_to_xarticle(xissue, issue_url)
                xissue.url = None
            else:
                content = self.download_file(issue_url)
                with self.tracer.start_as_current_span("parse_issue_content"):
                    self.parse_issue_content(content, xissue)

        xarticles = xissue.articles

        parsed_xarticles = []

        for xarticle in xarticles:
            parsed_xarticle = self.crawl_article(xarticle, xissue)
            if parsed_xarticle is not None:
                parsed_xarticles.append(parsed_xarticle)

        xissue.articles = parsed_xarticles

        issue_has_pdf = self.article_has_pdf(xissue)

        if self.ignore_missing_pdf:
            xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)]
        if self.dry:
            return
        if len(xissue.articles) == 0 and not issue_has_pdf:
            return
        self.process_resource_metadata(xissue, resource_type="issue")

        self.add_xissue_into_database(xissue)
    @staticmethod
    def article_has_source(art: ArticleData | IssueData):
        return (
            next(
                (e_link for e_link in art.ext_links if e_link["rel"] == "source"),
                None,
            )
            is not None
        )

    @staticmethod
    def article_has_pdf(art: ArticleData | IssueData):
        return (
            next(
                (link for link in art.ext_links if link["rel"] in ["article-pdf", "article-html"]),
                None,
            )
            is not None
        )
    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
        # ARTICLE URL as an ExtLink (to display the link in the article page)
        if xarticle.url is None:
            if not self.article_has_source(xarticle):
                if xissue.url:
                    article_source = xissue.url
                else:
                    article_source = self.collection_url
                ext_link = create_extlink()
                ext_link["rel"] = "source"
                ext_link["location"] = article_source
                ext_link["metadata"] = self.source_domain
                xarticle.ext_links.append(ext_link)
            return self.process_article_metadata(xarticle)

        content = self.download_file(xarticle.url)
        xarticle.pid = f"{xissue.pid}_{xarticle.pid}"

        try:
            with self.tracer.start_as_current_span("parse_article_content"):
                parsed_xarticle = self.parse_article_content(
                    content, xissue, xarticle, xarticle.url
                )
        except ValueError as e:
            self.logger.warning(e)
            self.logger.warning("Retrying in 5 mins while invalidating cache")
            time.sleep(5 * 60)
            content = self.download_file(xarticle.url, force_refresh=True)
            with self.tracer.start_as_current_span("parse_article_content"):
                parsed_xarticle = self.parse_article_content(
                    content, xissue, xarticle, xarticle.url
                )

        if parsed_xarticle is None:
            return None

        if parsed_xarticle.doi:
            parsed_xarticle.pid = (
                parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
            )

        if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
            ext_link = create_extlink()
            ext_link["rel"] = "source"
            ext_link["location"] = parsed_xarticle.url
            ext_link["metadata"] = self.source_domain
            parsed_xarticle.ext_links.append(ext_link)

        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle)
    def process_resource_metadata(self, xresource: ResourceData, resource_type="article"):
        tag = "article-title" if resource_type == "article" else "issue-title"

        # Process title tex
        ckeditor_data = build_jats_data_from_html_field(
            xresource.title_tex,
            tag=tag,
            text_lang=xresource.lang,
            delimiter_inline=self.delimiter_inline_formula,
            delimiter_disp=self.delimiter_disp_formula,
        )

        xresource.title_html = ckeditor_data["value_html"]
        # xresource.title_tex = ckeditor_data["value_tex"]
        xresource.title_xml = ckeditor_data["value_xml"]

        abstracts_to_parse = [
            xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract"
        ]
        # abstract may have formulas surrounded with '$'
        if len(abstracts_to_parse) > 0:
            for xabstract in abstracts_to_parse:
                ckeditor_data = build_jats_data_from_html_field(
                    xabstract["value_tex"],
                    tag="abstract",
                    text_lang=xabstract["lang"],
                    resource_lang=xresource.lang,
                    field_type="abstract",
                    delimiter_inline=self.delimiter_inline_formula,
                    delimiter_disp=self.delimiter_disp_formula,
                )

                xabstract["value_html"] = ckeditor_data["value_html"]
                # xabstract["value_tex"] = ckeditor_data["value_tex"]
                xabstract["value_xml"] = ckeditor_data["value_xml"]

        return xresource
    def process_article_metadata(self, xarticle: ArticleData):
        self.process_resource_metadata(xarticle)
        for bibitem in xarticle.bibitems:
            bibitem.type = "unknown"
        update_data_for_jats(xarticle, with_label=False)

        return xarticle
    def download_file(self, url: str, force_refresh=False, headers={}):
        """
        Downloads a page and returns its content (decoded string).
        This function handles retries and decoding
        """
        current_exception: Exception | None = None
        for attempt in range(3):
            try:
                kwargs = {
                    "url": url,
                    "headers": {**self.headers, **headers},
                    "timeout": self.requests_timeout,
                }
                # Honor an explicit force_refresh, and refresh the cache on retries
                if (force_refresh or attempt > 0) and isinstance(self.session, CachedSession):
                    kwargs["force_refresh"] = True
                response = self.session.get(**kwargs)

                content = self.decode_response(response)
                if content == "" or not content:
                    raise requests.exceptions.HTTPError(response)

                return content
            except (
                requests.ConnectionError,
                requests.ConnectTimeout,
                requests.exceptions.HTTPError,
            ) as e:
                current_exception = e
                self.logger.debug(f"Caught error: {e}", extra={"url": url})
                # 0 mins, 15 mins, 30 mins
                delay_minutes = attempt * 15
                self.logger.debug(
                    f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})",
                    extra={"url": url},
                )
                time.sleep(delay_minutes * 60)

        raise current_exception
    def decode_response(self, response: requests.Response, encoding: str | None = None):
        """Override this if the content-type headers from the sources advertise something
        other than the actual content. SASA needs this"""
        # Forced encoding passed by the caller
        if encoding:
            response.encoding = encoding
            return response.text

        # Attempt to get encoding using HTTP headers
        content_type_tag = response.headers.get("Content-Type", None)

        if content_type_tag:
            charset = self.parse_content_type_charset(content_type_tag)
            if charset:
                response.encoding = charset
                return response.text

        # Attempt to get encoding using HTML meta charset tag
        soup = BeautifulSoup(response.text, "html5lib")
        charset = soup.select_one("meta[charset]")
        if charset:
            htmlencoding = charset.get("charset")
            if isinstance(htmlencoding, str):
                response.encoding = htmlencoding
                return response.text

        # Attempt to get encoding using HTML meta content type tag
        content_type_tag = soup.select_one(
            'meta[http-equiv="Content-Type"],meta[http-equiv="content-type"]'
        )
        if content_type_tag:
            content_type = content_type_tag.get("content")
            if isinstance(content_type, str):
                charset = self.parse_content_type_charset(content_type)
                if charset:
                    response.encoding = charset
                    return response.text

        return response.text
    @staticmethod
    def parse_content_type_charset(content_type: str):
        header = EmailPolicy.header_factory("content-type", content_type)
        if "charset" in header.params:
            return header.params.get("charset")
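
    # Illustrative behaviour:
    #   parse_content_type_charset("text/html; charset=utf-8") -> "utf-8"
    #   parse_content_type_charset("application/pdf")          -> None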
    @tracer.start_as_current_span("add_xissue_to_database")
    def add_xissue_into_database(self, xissue: IssueData) -> IssueData:
        xissue.journal = self.collection
        xissue.source = self.source_domain

        if xissue.year == "":
            raise ValueError("Failsafe: cannot insert issue without a year")

        xpub = create_publisherdata()
        xpub.name = self.publisher
        xissue.publisher = xpub
        xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()

        attempt = 1
        success = False

        while not success and attempt < 4:
            try:
                params = {"xissue": xissue, "use_body": False}
                cmd = addOrUpdateGDMLIssueXmlCmd(params)
                cmd.do()
                success = True
                self.logger.debug(f"Issue {xissue.pid} inserted in database")
                return xissue
            except SolrError:
                self.logger.warning(
                    f"Encountered SolrError while inserting issue {xissue.pid} in database"
                )
                attempt += 1
                self.logger.debug(f"Attempt {attempt}. Sleeping 10 seconds.")
                time.sleep(10)
            except Exception as e:
                self.logger.error(
                    f"Got exception while attempting to insert {xissue.pid} in database: {e}"
                )
                raise e

        if success is False:
            raise ConnectionRefusedError("Cannot connect to SolR")

        assert False, "Unreachable"
    def get_metadata_using_citation_meta(
        self,
        xarticle: ArticleData,
        xissue: IssueData,
        soup: BeautifulSoup,
        what: list[CitationLiteral] = [],
    ):
        """
        :param xarticle: the xarticle that will collect the metadata
        :param xissue: the xissue that will collect the publisher
        :param soup: the BeautifulSoup object of the article page
        :param what: list of citation_ items to collect.
        :return: None. The given article is modified
        """

        if "title" in what:
            # TITLE
            citation_title_node = soup.select_one("meta[name='citation_title']")
            if citation_title_node:
                title = citation_title_node.get("content")
                if isinstance(title, str):
                    xarticle.title_tex = title

        if "author" in what:
            # AUTHORS
            citation_author_nodes = soup.select("meta[name^='citation_author']")
            current_author: ContributorDict | None = None
            for citation_author_node in citation_author_nodes:
                if citation_author_node.get("name") == "citation_author":
                    text_author = citation_author_node.get("content")
                    if not isinstance(text_author, str):
                        raise ValueError("Cannot parse author")
                    if text_author == "":
                        current_author = None
                        continue
                    current_author = create_contributor(role="author", string_name=text_author)
                    xarticle.contributors.append(current_author)
                    continue
                if current_author is None:
                    self.logger.warning("Couldn't parse citation author")
                    continue
                if citation_author_node.get("name") == "citation_author_institution":
                    text_institution = citation_author_node.get("content")
                    if not isinstance(text_institution, str):
                        continue
                    current_author["addresses"].append(text_institution)
                # standard Highwire meta name
                if citation_author_node.get("name") == "citation_author_orcid":
                    text_orcid = citation_author_node.get("content")
                    if not isinstance(text_orcid, str):
                        continue
                    current_author["orcid"] = text_orcid

        if "pdf" in what:
            # PDF
            citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
            if citation_pdf_node:
                pdf_url = citation_pdf_node.get("content")
                if isinstance(pdf_url, str):
                    add_pdf_link_to_xarticle(xarticle, pdf_url)

        if "lang" in what:
            # LANG
            citation_lang_node = soup.select_one("meta[name='citation_language']")
            if citation_lang_node:
                # TODO: check other language code
                content_text = citation_lang_node.get("content")
                if isinstance(content_text, str):
                    xarticle.lang = standardize_tag(content_text)

        if "abstract" in what:
            # ABSTRACT
            abstract_node = soup.select_one("meta[name='citation_abstract']")
            if abstract_node is not None:
                abstract = abstract_node.get("content")
                if not isinstance(abstract, str):
                    raise ValueError("Couldn't parse abstract from meta")
                abstract = BeautifulSoup(abstract, "html.parser").text
                lang = abstract_node.get("lang")
                if not isinstance(lang, str):
                    lang = self.detect_language(abstract, xarticle)
                xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))

        if "page" in what:
            # PAGES
            citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
            if citation_fpage_node:
                page = citation_fpage_node.get("content")
                if isinstance(page, str):
                    page = page.split("(")[0]
                    if len(page) < 32:
                        xarticle.fpage = page

            citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
            if citation_lpage_node:
                page = citation_lpage_node.get("content")
                if isinstance(page, str):
                    page = page.split("(")[0]
                    if len(page) < 32:
                        xarticle.lpage = page

        if "doi" in what:
            # DOI
            citation_doi_node = soup.select_one("meta[name='citation_doi']")
            if citation_doi_node:
                doi = citation_doi_node.get("content")
                if isinstance(doi, str):
                    doi = doi.strip()
                    pos = doi.find("10.")
                    if pos > 0:
                        doi = doi[pos:]
                    xarticle.doi = doi

        if "mr" in what:
            # MR
            citation_mr_node = soup.select_one("meta[name='citation_mr']")
            if citation_mr_node:
                mr = citation_mr_node.get("content")
                if isinstance(mr, str):
                    mr = mr.strip()
                    if mr.find("MR") == 0:
                        mr = mr[2:]
                        xarticle.extids.append(("mr-item-id", mr))

        if "zbl" in what:
            # ZBL
            citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
            if citation_zbl_node:
                zbl = citation_zbl_node.get("content")
                if isinstance(zbl, str):
                    zbl = zbl.strip()
                    if zbl.find("Zbl") == 0:
                        zbl = zbl[3:].strip()
                        xarticle.extids.append(("zbl-item-id", zbl))

        if "publisher" in what:
            # PUBLISHER
            citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
            if citation_publisher_node:
                pub = citation_publisher_node.get("content")
                if isinstance(pub, str):
                    pub = pub.strip()
                    if pub != "":
                        xpub = create_publisherdata()
                        xpub.name = pub
                        xissue.publisher = xpub

        if "keywords" in what:
            # KEYWORDS
            citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
            for kwd_node in citation_kwd_nodes:
                kwds = kwd_node.get("content")
                if isinstance(kwds, str):
                    kwds = kwds.split(",")
                    for kwd in kwds:
                        if kwd == "":
                            continue
                        kwd = kwd.strip()
                        xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})

        if "references" in what:
            citation_references = soup.select("meta[name='citation_reference']")
            for index, tag in enumerate(citation_references):
                content = tag.get("content")
                if not isinstance(content, str):
                    raise ValueError("Cannot parse citation_reference meta")
                label = str(index + 1)
                if regex.match(r"^\[\d+\].*", content):
                    label = None
                xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label))
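
    # Typical call from a concrete crawler's parse_article_content (illustrative):
    #   soup = BeautifulSoup(content, "html.parser")
    #   self.get_metadata_using_citation_meta(
    #       xarticle, xissue, soup, ["title", "author", "pdf", "abstract", "page", "doi"]
    #   )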
    def get_metadata_using_dcterms(
        self,
        xarticle: ArticleData,
        soup: "Tag",
        what: "Iterable[Literal['abstract', 'keywords', 'date_published', 'article_type']]",
    ):
        if "abstract" in what:
            abstract_tag = soup.select_one("meta[name='DCTERMS.abstract']")
            if abstract_tag:
                abstract_text = self.get_str_attr(abstract_tag, "content")
                xarticle.abstracts.append(
                    create_abstract(lang="en", value_tex=cleanup_str(abstract_text))
                )

        if "keywords" in what:
            keyword_tags = soup.select("meta[name='DC.subject']")
            for tag in keyword_tags:
                kwd_text = tag.get("content")
                if not isinstance(kwd_text, str) or len(kwd_text) == 0:
                    continue
                kwd = create_subj(value=kwd_text)
                xarticle.kwds.append(kwd)

        if "date_published" in what:
            published_tag = soup.select_one("meta[name='DC.Date.created']")
            if published_tag:
                published_text = self.get_str_attr(published_tag, "content")
                xarticle.date_published = published_text

        if "article_type" in what:
            type_tag = soup.select_one("meta[name='DC.Type.articleType']")
            if type_tag:
                type_text = self.get_str_attr(type_tag, "content")
                xarticle.atype = type_text
    def create_xissue(
        self,
        url: str | None,
        year: str,
        volume_number: str | None,
        issue_number: str | None = None,
        vseries: str | None = None,
    ):
        if url is not None and url.endswith("/"):
            url = url[:-1]
        xissue = create_issuedata()
        xissue.url = url

        xissue.pid = self.get_issue_pid(
            self.collection_id, year, volume_number, issue_number, vseries
        )

        xissue.year = year

        if volume_number is not None:
            xissue.volume = regex.sub(r"[^\w-]+", "_", volume_number)

        if issue_number is not None:
            xissue.number = issue_number.replace(",", "-")

        if vseries is not None:
            xissue.vseries = vseries
        return xissue
    def detect_language(self, text: str, article: ArticleData | None = None):
        if article and article.lang is not None and article.lang != "und":
            return article.lang

        language = self.language_detector.detect_language_of(text)

        if not language:
            return "und"
        return language.iso_code_639_1.name.lower()
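
    # Illustrative behaviour (assuming lingua recognises the text):
    #   detect_language("Ceci est le résumé de l'article.") -> "fr"
    # and "und" when nothing can be detected.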
    def get_str_attr(self, tag: "Tag", attr: str):
        """Equivalent of `tag.get(attr)`, but ensures the return value is a string"""
        node_attr = tag.get(attr)
        if isinstance(node_attr, list):
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : html tag has multiple {attr} attributes."
            )
        if node_attr is None:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : html tag doesn't have any {attr} attribute"
            )
        return node_attr
    def create_trans_title(
        self,
        resource_type: str,
        title_str: str,
        lang: str,
        xresource_lang: str,
        title_type: str = "main",
    ):
        tag = "trans-title" if resource_type == "article" else "issue-title"

        ckeditor_data = build_jats_data_from_html_field(
            title_str,
            tag=tag,
            text_lang=lang,
            resource_lang=xresource_lang,
            delimiter_inline=self.delimiter_inline_formula,
            delimiter_disp=self.delimiter_disp_formula,
        )

        titledata = create_titledata(
            lang=lang,
            type=title_type,
            title_html=ckeditor_data["value_html"],
            title_xml=ckeditor_data["value_xml"],
        )

        return titledata
    references_mapping = {
        "citation_title": get_article_title_xml,
        "citation_journal_title": get_source_xml,
        "citation_publication_date": get_year_xml,
        "citation_firstpage": get_fpage_xml,
        "citation_lastpage": get_lpage_xml,
    }
    @classmethod
    def __parse_meta_citation_reference(cls, content: str, label=None):
        categories = content.split(";")

        if len(categories) == 1:
            return JatsBase.bake_ref(content, label=label)

        citation_data = [c.split("=") for c in categories if "=" in c]
        del categories

        xml_string = ""
        authors_parsed = False
        authors_strings = []
        for data in citation_data:
            key = data[0].strip()
            citation_content = data[1]
            if key == "citation_author":
                authors_strings.append(get_author_xml(template_str=citation_content))
                continue
            elif not authors_parsed:
                xml_string += ", ".join(authors_strings)
                authors_parsed = True

            if key in cls.references_mapping:
                xml_string += " " + cls.references_mapping[key](citation_content)

        return JatsBase.bake_ref(xml_string, label=label)
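
    # The citation_reference meta content is either a plain string, baked as-is,
    # or "key=value; key=value" pairs, e.g. (illustrative):
    #   "citation_author=J. Doe; citation_title=On examples; citation_publication_date=1999"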
    @classmethod
    def get_or_create_source(cls):
        source, created = Source.objects.get_or_create(
            domain=cls.source_domain,
            defaults={
                "name": cls.source_name,
                "website": cls.source_website,
                "view_id": cls.get_view_id(),
            },
        )
        if created:
            source.save()
        return source
    @staticmethod
    def get_issue_pid(
        collection_id: str,
        year: str,
        volume_number: str | None = None,
        issue_number: str | None = None,
        series: str | None = None,
    ):
        # Replace any non-word character with an underscore
        pid = f"{collection_id}_{year}"
        if series is not None:
            pid += f"_{series}"
        if volume_number is not None:
            pid += f"_{volume_number}"
        if issue_number is not None:
            pid += f"_{issue_number}"
        pid = regex.sub(r"[^\w-]+", "_", cleanup_str(pid))
        return pid
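
    # Illustrative outputs:
    #   get_issue_pid("AMBP", "1999", "6")          -> "AMBP_1999_6"
    #   get_issue_pid("AMBP", "1999-2000", "6 bis") -> "AMBP_1999-2000_6_bis"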
    @staticmethod
    def set_pages(article: ArticleData, pages: str, separator: str = "-"):
        pages_split = pages.split(separator)
        if len(pages_split) == 0:
            article.page_range = pages
        if len(pages_split) > 0:
            if pages[0].isnumeric():
                article.fpage = pages_split[0]
                if (
                    len(pages_split) > 1
                    and pages_split[0] != pages_split[1]
                    and pages_split[1].isnumeric()
                ):
                    article.lpage = pages_split[1]
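
    # Illustrative: set_pages(xarticle, "123-145") sets fpage="123" and lpage="145";
    # set_pages(xarticle, "123") sets only fpage. Pass separator to handle other range styles.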
    @staticmethod
    def _process_pdf_header(chunk: bytes, response: requests.Response | aiohttp.ClientResponse):
        content_type = response.headers.get("Content-Type")
        if regex.match(rb"^%PDF-\d\.\d", chunk):
            if content_type and "application/pdf" in content_type:
                # The file is unmistakably a pdf
                return [
                    True,
                    response,
                    {
                        "status": ExtlinkChecked.Status.OK,
                        "message": "",
                    },
                ]
            # The file is a pdf, but the content type advertised by the server is wrong
            return [
                True,
                response,
                {
                    "status": ExtlinkChecked.Status.WARNING,
                    "message": f"Content-Type header: {content_type}",
                },
            ]

        # Reaching here means we couldn't find the pdf.
        if not content_type or "application/pdf" not in content_type:
            return [
                False,
                response,
                {
                    "status": ExtlinkChecked.Status.ERROR,
                    "message": f"Content-Type header: {content_type}; PDF Header not found: got {chunk}",
                },
            ]

        return [
            False,
            response,
            {
                "status": ExtlinkChecked.Status.ERROR,
                "message": f"PDF Header not found: got {chunk}",
            },
        ]
    @classmethod
    async def a_check_pdf_link_validity(
        cls, url: str, verify=True
    ) -> tuple[bool, aiohttp.ClientResponse, dict]:
        """
        Check the validity of the PDF links.
        """
        CHUNK_SIZE = 10  # Number of bytes to fetch
        header = {
            "Range": f"bytes=0-{CHUNK_SIZE}",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
        }
        async with cls.async_session.get(
            url, headers=header, allow_redirects=True, ssl=verify
        ) as response:
            try:
                chunk = await response.content.read(CHUNK_SIZE)
                return BaseCollectionCrawler._process_pdf_header(chunk, response)
            except StopIteration:
                return [
                    False,
                    response,
                    {
                        "status": ExtlinkChecked.Status.ERROR,
                        "message": "Error reading PDF header",
                    },
                ]
    @classmethod
    def check_pdf_link_validity(
        cls, url: str, verify=True
    ) -> tuple[bool, requests.Response | None, dict]:
        """
        Check the validity of the PDF links.
        """
        CHUNK_SIZE = 10  # Number of bytes to fetch
        header = {
            "Range": f"bytes=0-{CHUNK_SIZE}",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
        }
        with cls.session.get(
            url, headers=header, allow_redirects=True, verify=verify, stream=True
        ) as response:
            try:
                chunk = next(response.iter_content(CHUNK_SIZE))
                return BaseCollectionCrawler._process_pdf_header(chunk, response)
            except StopIteration:
                return [
                    False,
                    response,
                    {
                        "status": ExtlinkChecked.Status.ERROR,
                        "message": "Error reading PDF header",
                    },
                ]
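
    # Usage sketch (illustrative):
    #   is_pdf, response, details = SomeCrawler.check_pdf_link_validity("https://example.org/a.pdf")
    #   # details["status"] is ExtlinkChecked.Status.OK when both the %PDF header
    #   # and the Content-Type advertise a pdf.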
    @classmethod
    async def check_extlink_validity(cls, extlink: "ExtLink"):
        """
        Method used by rot_monitoring to check if links have expired
        """
        defaults: dict = {"date": datetime.now(), "status": ExtlinkChecked.Status.OK}
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0"
        }
        verify = True
        if not cls.verify:
            verify = False
        try:
            if extlink.rel == "article-pdf":
                isok, response, message = await cls.a_check_pdf_link_validity(
                    extlink.location, verify
                )
                defaults.update(message)
                defaults["http_status"] = response.status
            else:
                async with cls.async_session.get(
                    url=extlink.location,
                    headers=header,
                    allow_redirects=True,
                    ssl=verify,
                ) as response:
                    defaults["http_status"] = response.status
                    if response.status not in (200, 206):
                        defaults["status"] = ExtlinkChecked.Status.ERROR

        except aiohttp.ClientSSLError:
            cls.logger.error("SSL error for the url: %s", extlink.location)
            defaults["status"] = ExtlinkChecked.Status.ERROR
            defaults["message"] = "SSL error"
        except aiohttp.ClientConnectionError:
            cls.logger.error("Connection error for the url: %s", extlink.location)
            defaults["status"] = ExtlinkChecked.Status.ERROR
            defaults["message"] = "Connection error"
        except TimeoutError:
            cls.logger.error("Timeout error for the url: %s", extlink.location)
            defaults["status"] = ExtlinkChecked.Status.ERROR
            defaults["message"] = "Timeout error"
        finally:
            try:
                await ExtlinkChecked.objects.aupdate_or_create(extlink=extlink, defaults=defaults)
                cls.logger.info(
                    "DB Update, source: %s, url: %s", cls.source_domain, extlink.location
                )
            except IntegrityError:
                cls.logger.error(
                    "Extlink was deleted, source: %s, url: %s", cls.source_domain, extlink.location
                )