Coverage for src/crawler/base_crawler.py: 70%
493 statements
coverage.py v7.9.0, created at 2025-09-16 12:41 +0000
1import logging
2import time
3from concurrent.futures import (
4 Executor,
5 ThreadPoolExecutor,
6)
7from datetime import datetime, timedelta
8from email.policy import EmailPolicy
10import regex
11import requests
12from bs4 import BeautifulSoup
13from django.conf import settings
14from django.contrib.auth.models import User
15from django.utils import timezone
16from langcodes import standardize_tag
17from lingua import LanguageDetectorBuilder
18from opentelemetry import trace
19from ptf.cmds.xml.ckeditor.utils import (
20 build_jats_data_from_html_field,
21)
22from ptf.cmds.xml.jats.builder.references import (
23 get_article_title_xml,
24 get_author_xml,
25 get_fpage_xml,
26 get_lpage_xml,
27 get_source_xml,
28 get_year_xml,
29)
30from ptf.cmds.xml.jats.jats_parser import JatsBase
31from ptf.model_data import (
32 ArticleData,
33 ContributorDict,
34 IssueData,
35 ResourceData,
36 TitleDict,
37 create_abstract,
38 create_contributor,
39 create_extlink,
40 create_issuedata,
41 create_publisherdata,
42 create_titledata,
43)
44from ptf.model_data_converter import update_data_for_jats
45from pylatexenc.latex2text import LatexNodes2Text
46from pymongo.errors import DocumentTooLarge
47from pysolr import SolrError
48from requests.adapters import HTTPAdapter
49from requests_cache import CachedSession, MongoCache
50from urllib3 import Retry
52from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd
53from crawler.models import Source
54from crawler.types import CitationLiteral
55from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection
57# TODO: pass a class factory instead of a dependency to a site
61class CrawlerTitleDict(TitleDict):
62 title_tex: str | None
65class BaseCollectionCrawler:
66 """
67 Base class for the collection crawlers. (An illustrative subclass sketch appears at the end of this module.)
68 To create a crawler:
69 1) derive a class from BaseCollectionCrawler and name it XXXCrawler
70 2) override the functions parse_collection_content, parse_issue_content and parse_article_content
71 3) update factory.py so that crawler_factory can return your new crawler
72 """
74 logger = logging.getLogger(__name__)
75 tracer = trace.get_tracer(__name__)
77 source_name = ""
78 source_domain = ""
79 source_website = ""
81 issue_href = ""
83 collection = None
84 source = None
85 user = None
86 session: requests.Session | CachedSession
88 verify = True
89 headers = {
90 "accept_encoding": "utf-8",
91 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
92 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
93 }
95 next_allowed_request: float = time.time()
97 # seconds to wait between two http requests
98 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
99 # seconds to wait before aborting the connection (if no bytes are received)
100 requests_timeout = 10
102 latext_parser = LatexNodes2Text()
104 # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
105 # do not use the "$" to surround tex formulas
106 delimiter_inline_formula = "$"
107 delimiter_disp_formula = "$"
109 # HACK : Workaround for tests (monkeypatching)
110 # We store the class here, so we can monkeypatch it when running tests
111 # subCrawlers = {
112 # LofplCrawler: None
113 # }
114 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}
116 language_detector = LanguageDetectorBuilder.from_all_languages().build()
118 force_refresh = False
120 # Whether to include headers in the requests cache key
121 match_headers = False
122 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"
124 # Set this to False on a Crawler-basis to allow inserting articles without PDFs
125 ignore_missing_pdf = True
127 database_executor: Executor
129 @classmethod
130 def get_view_id(cls):
131 return cls.source_domain
133 def __init__(
134 self,
135 *args,
136 username: str,
137 collection_id: str,
138 collection_url: str,
139 test_mode: bool = False,
140 publisher: str = "mathdoc",
141 force_refresh=False,
142 ):
143 for CrawlerClass in self.subCrawlers:  # 143 ↛ 144: the loop on line 143 never started
144 self.subCrawlers[CrawlerClass] = CrawlerClass(
145 *args,
146 username=username,
147 collection_id=collection_id,
148 collection_url=collection_url,
149 test_mode=test_mode,
150 publisher=publisher,
151 )
152 self.logger = logging.getLogger(__name__ + "." + self.source_domain)
154 self.username = username
156 self.collection_id = collection_id
157 self.collection_url = (
158 collection_url # url of the collection. Ex: https://eudml.org/journal/10098
159 )
161 self.test_mode = test_mode
162 self.publisher = publisher
164 self.session = requests.session()
166 # Skipped when running tests
167 self.initialize()
168 self.session.verify = self.verify
169 self.force_refresh = force_refresh
171 # We implemented custom retry behaviour, so we don't want to make extra requests here
172 retries = Retry(
173 total=0,
174 )
175 self.session.mount("https://", HTTPAdapter(max_retries=retries))
176 self.session.mount("http://", HTTPAdapter(max_retries=retries))
178 self.database_executor = ThreadPoolExecutor(
179 max_workers=2, thread_name_prefix="crawler_database_thread"
180 )
182 def initialize(self):
183 """
184 Acts as a "second" init function to skip model accesses during test data generation
185 """
186 self.collection = get_or_create_collection(self.collection_id)
187 self.source = self.get_or_create_source()
188 self.user = User.objects.get(username=self.username)
189 self.session = CachedSession(
190 match_headers=self.match_headers,
191 headers=self.headers,
192 backend=MongoCache(
193 host=getattr(settings, "MONGO_HOSTNAME", "localhost"),
194 ),
195 expire_after=timedelta(days=30),
196 )
198 @classmethod
199 def can_crawl(cls, pid: str) -> bool:
200 return True
202 def parse_collection_content(self, content: str) -> list[IssueData]:
203 """
204 Parse the HTML content with BeautifulSoup
205 returns a list of xissues.
206 Override this function in a derived class
207 """
208 return []
210 def parse_issue_content(self, content: str, xissue: IssueData):
211 """
212 Parse the HTML content with BeautifulSoup
213 Fills the xissue.articles
214 Override this function in a derived class.
216 Caveat: you are supposed to create the articles here. Please assign a PID to each article.
217 The PID can be `a` + article_index, e.g. `a0`, `a21`
218 """
220 def parse_article_content(
221 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
222 ) -> ArticleData | None:
223 """
224 Parse the HTML content with BeautifulSoup
225 returns the xarticle.
226 Override this function in a derived class.
227 The xissue is passed to the function in case the article page has issue information (ex: publisher)
228 The article url is also passed as a parameter
230 Caveat: you are supposed to assign the article PIDs again here
231 """
232 return xarticle
234 @tracer.start_as_current_span("crawl_collection")
235 def crawl_collection(self):
236 # TODO: Comments, filter
237 """
238 Crawl an entire collection. ptf.models.Container objects are created.
239 - get the HTML content of the collection_url
240 - parse the HTML content with beautifulsoup to extract the list of issues
241 - merge the xissues (some Source can have multiple pages for 1 volume/issue. We create only 1 container)
242 - each issue is then crawled separately via crawl_issue (unless col_only is set)
243 - Returns the issues as a dict keyed by pid.
244 It is a dict {pid: xissue}
245 The key is the pid of the (possibly merged) issue.
246 Ex: the source may have Volume 6 (2000) and Volume 6 (1999);
247 the pid is then made with 1999-2000__6_
248 """
250 if self.source is None:
251 raise RuntimeError("ERROR: the source is not set")
253 content = self.download_file(self.collection_url)
254 xissues = self.parse_collection_content(content)
256 """
257 Some collections split the same volumes in different pages
258 Ex: Volume 6 (2000) and Volume 6 (1999)
259 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
260 """
261 # merged_xissues = self.merge_xissues(xissues)
263 xissues_dict = {str(i.pid): i for i in xissues}
265 return xissues_dict
267 @tracer.start_as_current_span("crawl_issue")
268 def crawl_issue(self, xissue: IssueData):
269 """
270 Crawl one web page of an issue.
271 - get the HTML content of the issue
272 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata
273 - crawl each article
274 """
276 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.
277 # The list of articles then comes directly from the collection HTML page: the xissue has no url attribute.
279 issue_url = xissue.url
280 if issue_url is not None:
281 if issue_url.endswith(".pdf"):
282 add_pdf_link_to_xarticle(xissue, issue_url)
283 xissue.url = None
284 else:
285 content = self.download_file(issue_url)
286 with self.tracer.start_as_current_span("parse_issue_content"):
287 self.parse_issue_content(content, xissue)
289 xarticles = xissue.articles
291 parsed_xarticles = []
293 for xarticle in xarticles:
294 parsed_xarticle = self.crawl_article(xarticle, xissue)
295 if parsed_xarticle is not None:
296 parsed_xarticles.append(parsed_xarticle)
298 xissue.articles = parsed_xarticles
300 article_has_pdf = self.article_has_pdf(xissue)
302 if self.ignore_missing_pdf:
303 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)]
305 if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf):
306 self.process_resource_metadata(xissue, resource_type="issue")
307 self.database_executor.submit(self.add_xissue_into_database, xissue)
309 @staticmethod
310 def article_has_source(art: ArticleData | IssueData):
311 return (
312 next(
313 (e_link for e_link in art.ext_links if e_link["rel"] == "source"),
314 None,
315 )
316 is not None
317 )
319 @staticmethod
320 def article_has_pdf(art: ArticleData | IssueData):
321 return (
322 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None)
323 is not None
324 )
326 def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
327 # ARTICLE URL as an ExtLink (to display the link on the article page)
328 if xarticle.url is None:
329 if not self.article_has_source(xarticle):  # 329 ↛ 339: the condition on line 329 was always true
330 if xissue.url:
331 article_source = xissue.url
332 else:
333 article_source = self.collection_url
334 ext_link = create_extlink()
335 ext_link["rel"] = "source"
336 ext_link["location"] = article_source
337 ext_link["metadata"] = self.source_domain
338 xarticle.ext_links.append(ext_link)
339 return self.process_article_metadata(xarticle)
341 content = self.download_file(xarticle.url)
342 xarticle.pid = f"{xissue.pid}_{xarticle.pid}"
344 try:
345 with self.tracer.start_as_current_span("parse_article_content"):
346 parsed_xarticle = self.parse_article_content(
347 content, xissue, xarticle, xarticle.url
348 )
349 except ValueError as e:
350 self.logger.warning(e)
351 self.logger.warning("Retrying while invalidating cache")
352 content = self.download_file(xarticle.url, force_refresh=True)
353 with self.tracer.start_as_current_span("parse_article_content"):
354 parsed_xarticle = self.parse_article_content(
355 content, xissue, xarticle, xarticle.url
356 )
358 if parsed_xarticle is None:  # 358 ↛ 359: the condition on line 358 was never true
359 return None
361 if parsed_xarticle.doi:
362 parsed_xarticle.pid = (
363 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
364 )
366 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
367 ext_link = create_extlink()
368 ext_link["rel"] = "source"
369 ext_link["location"] = parsed_xarticle.url
370 ext_link["metadata"] = self.source_domain
371 parsed_xarticle.ext_links.append(ext_link)
373 # The article title may have formulas surrounded with '$'
374 return self.process_article_metadata(parsed_xarticle)
376 def process_resource_metadata(self, xresource: ResourceData, resource_type="article"):
377 tag = "article-title" if resource_type == "article" else "issue-title"
379 # Process title tex
380 ckeditor_data = build_jats_data_from_html_field(
381 xresource.title_tex,
382 tag=tag,
383 text_lang=xresource.lang,
384 delimiter_inline=self.delimiter_inline_formula,
385 delimiter_disp=self.delimiter_disp_formula,
386 )
388 xresource.title_html = ckeditor_data["value_html"]
389 # xresource.title_tex = ckeditor_data["value_tex"]
390 xresource.title_xml = ckeditor_data["value_xml"]
392 # Process trans_title tex
393 if xresource.trans_title_tex:  # 393 ↛ 394: the condition on line 393 was never true
394 self.logger.warning(
395 "Deprecation Notice : prefer using xresource.title directly instead of xresource.trans_title_tex"
396 )
397 trans_title = self.create_trans_title(
398 xresource_lang=xresource.lang,
399 resource_type=resource_type,
400 title_tex=xresource.trans_title_tex,
401 lang=xresource.trans_lang,
402 )
403 xresource.titles.append(trans_title)
405 abstracts_to_parse = [
406 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract"
407 ]
408 # abstract may have formulas surrounded with '$'
409 if len(abstracts_to_parse) > 0:
410 for xabstract in abstracts_to_parse:
411 ckeditor_data = build_jats_data_from_html_field(
412 xabstract["value_tex"],
413 tag="abstract",
414 text_lang=xabstract["lang"],
415 resource_lang=xresource.lang,
416 field_type="abstract",
417 delimiter_inline=self.delimiter_inline_formula,
418 delimiter_disp=self.delimiter_disp_formula,
419 )
421 xabstract["value_html"] = ckeditor_data["value_html"]
422 # xabstract["value_tex"] = ckeditor_data["value_tex"]
423 xabstract["value_xml"] = ckeditor_data["value_xml"]
425 return xresource
427 def process_article_metadata(self, xarticle: ArticleData):
428 self.process_resource_metadata(xarticle)
429 for bibitem in xarticle.bibitems:
430 bibitem.type = "unknown"
431 update_data_for_jats(xarticle, with_label=False)
433 return xarticle
435 def _wait_download_delay(self, url: str, force_refresh=False):
436 # If the response is already cached, we can skip the delay
437 if isinstance(self.session, CachedSession):
438 if self.session.cache.contains(url=url) and not force_refresh:
439 return
441 delta = self.next_allowed_request - time.time()
442 if delta > 0:
443 self.logger.info(f"Waiting {int(delta)}s before making another request")
444 time.sleep(delta)
445 self.next_allowed_request = time.time() + self.requests_interval
447 def _get(self, url: str, force_refresh=False, headers={}) -> requests.Response:
448 """
449 Wrapper around requests.get with delay based on the crawler class instance
450 """
452 self._wait_download_delay(url, force_refresh)
454 kwargs = {}
455 # self.session.cache.delete(urls=[url])
456 if isinstance(self.session, CachedSession):
457 kwargs["force_refresh"] = force_refresh
459 try:
460 response = self.session.get(
461 url,
462 headers={**self.headers, **headers},
463 timeout=self.requests_timeout,
464 **kwargs,
465 )
466 except DocumentTooLarge as e:
467 self.logger.error(e)
468 response = requests.get(
469 url, headers={**self.headers, **headers}, timeout=self.requests_timeout
470 )
472 if not response.ok:
473 raise requests.exceptions.HTTPError(
474 f"Endpoint answered with code {response.status_code} : {url}",
475 response=response,
476 )
478 return response
480 def download_file(self, url: str, force_refresh=False, headers={}):
481 """
482 Downloads a page and returns its content (decoded string).
483 This function handles retries and decoding
484 """
485 attempts = 0
486 while True:
487 try:
488 if attempts > 0:
489 force_refresh = True
490 response = self._get(
491 url, force_refresh=force_refresh or self.force_refresh, headers=headers
492 )
493 content = self.decode_response(response)
494 if content == "" or not content:
495 raise requests.exceptions.HTTPError(response)
496 if isinstance(self.session, CachedSession):
497 if "Expires" in response.headers:
498 del response.headers["Expires"]
499 del response.headers["Cache-Control"]
500 try:
501 self.session.cache.save_response(response)
502 except DocumentTooLarge as e:
503 self.logger.warning(e)
504 return content
505 except (
506 requests.ConnectionError,
507 requests.ConnectTimeout,
508 requests.exceptions.HTTPError,
509 ) as e:
510 if attempts > 3:
511 raise e
512 self.logger.debug(f"Caught error : {e}", extra={"url": url})
513 attempts += 1
514 # retry delays: 15, 30, 45, then 60 minutes
515 delay_minutes = attempts * 15
516 self.logger.debug(
517 f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})",
518 extra={"url": url},
519 )
520 time.sleep(delay_minutes * 60)
522 def decode_response(self, response: requests.Response, encoding: str | None = None):
523 """Override this if the content-type headers from the sources are advertising something else than the actual content
524 SASA needs this"""
525 # Force
526 if encoding:
527 response.encoding = encoding
528 return response.text
530 # Attempt to get encoding using HTTP headers
531 content_type_tag = response.headers.get("Content-Type", None)
533 if content_type_tag:  # 533 ↛ 540: the condition on line 533 was always true
534 charset = self.parse_content_type_charset(content_type_tag)
535 if charset:
536 response.encoding = charset
537 return response.text
539 # Attempt to get encoding using HTML meta charset tag
540 soup = BeautifulSoup(response.text, "html5lib")
541 charset = soup.select_one("meta[charset]")
542 if charset:
543 htmlencoding = charset.get("charset")
544 if isinstance(htmlencoding, str):  # 544 ↛ 549: the condition on line 544 was always true
545 response.encoding = htmlencoding
546 return response.text
548 # Attempt to get encoding using HTML meta content type tag
549 content_type_tag = soup.select_one('meta[http-equiv="Content-Type"]')
550 if content_type_tag:
551 content_type = content_type_tag.get("content")
552 if isinstance(content_type, str):  # 552 ↛ 558: the condition on line 552 was always true
553 charset = self.parse_content_type_charset(content_type)
554 if charset:  # 554 ↛ 558: the condition on line 554 was always true
555 response.encoding = charset
556 return response.text
558 return response.text
560 @staticmethod
561 def parse_content_type_charset(content_type: str):
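# Illustrative example (hypothetical input): parse_content_type_charset("text/html; charset=utf-8") -> "utf-8"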
562 header = EmailPolicy.header_factory("content-type", content_type)
563 if "charset" in header.params:
564 return header.params.get("charset")
566 @tracer.start_as_current_span("add_xissue_to_database")
567 def add_xissue_into_database(self, xissue: IssueData):
568 xissue.journal = self.collection
569 xissue.source = self.source_domain
571 if xissue.year == "":
572 raise ValueError("Failsafe : Cannot insert issue without a year")
574 xpub = create_publisherdata()
575 xpub.name = self.publisher
576 xissue.publisher = xpub
577 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()
579 attempt = 1
580 success = False
582 while not success and attempt < 4:
583 try:
584 params = {"xissue": xissue, "use_body": False}
585 cmd = addOrUpdateGDMLIssueXmlCmd(params)
586 cmd.do()
587 success = True
588 self.logger.debug(f"Issue {xissue.pid} inserted in database")
589 except SolrError:
590 self.logger.warning(
591 f"Encoutered SolrError while inserting issue {xissue.pid} in database"
592 )
593 attempt += 1
594 self.logger.debug(f"Attempt {attempt}. sleeping 10 seconds.")
595 time.sleep(10)
597 if success is False:
598 raise ConnectionRefusedError("Cannot connect to SolR")
600 def get_metadata_using_citation_meta(
601 self,
602 xarticle: ArticleData,
603 xissue: IssueData,
604 soup: BeautifulSoup,
605 what: list[CitationLiteral] = [],
606 ):
607 """
608 :param xarticle: the xarticle that will collect the metadata
609 :param xissue: the xissue that will collect the publisher
610 :param soup: the BeautifulSoup object of the article page
611 :param what: list of citation_* items to collect.
612 :return: None. The given xarticle is modified in place.
613 """
615 if "title" in what:
616 # TITLE
617 citation_title_node = soup.select_one("meta[name='citation_title']")
618 if citation_title_node:  # 618 ↛ 623: the condition on line 618 was always true
619 title = citation_title_node.get("content")
620 if isinstance(title, str):  # 620 ↛ 623: the condition on line 620 was always true
621 xarticle.title_tex = title
623 if "author" in what: 623 ↛ 652line 623 didn't jump to line 652 because the condition on line 623 was always true
624 # AUTHORS
625 citation_author_nodes = soup.select("meta[name^='citation_author']")
626 current_author: ContributorDict | None = None
627 for citation_author_node in citation_author_nodes:
628 if citation_author_node.get("name") == "citation_author":
629 text_author = citation_author_node.get("content")
630 if not isinstance(text_author, str):  # 630 ↛ 631: the condition on line 630 was never true
631 raise ValueError("Cannot parse author")
632 if text_author == "":  # 632 ↛ 633: the condition on line 632 was never true
633 current_author = None
634 continue
635 current_author = create_contributor(role="author", string_name=text_author)
636 xarticle.contributors.append(current_author)
637 continue
638 if current_author is None:  # 638 ↛ 639: the condition on line 638 was never true
639 self.logger.warning("Couldn't parse citation author")
640 continue
641 if citation_author_node.get("name") == "citation_author_institution":
642 text_institution = citation_author_node.get("content")
643 if not isinstance(text_institution, str):  # 643 ↛ 644: the condition on line 643 was never true
644 continue
645 current_author["addresses"].append(text_institution)
646 if citation_author_node.get("name") == "citation_author_orcid":  # 646 ↛ 647: never true (meta name was misspelled "citation_author_ocrid")
647 text_orcid = citation_author_node.get("content")
648 if not isinstance(text_orcid, str):
649 continue
650 current_author["orcid"] = text_orcid
652 if "pdf" in what:
653 # PDF
654 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
655 if citation_pdf_node:
656 pdf_url = citation_pdf_node.get("content")
657 if isinstance(pdf_url, str):  # 657 ↛ 660: the condition on line 657 was always true
658 add_pdf_link_to_xarticle(xarticle, pdf_url)
660 if "lang" in what:
661 # LANG
662 citation_lang_node = soup.select_one("meta[name='citation_language']")
663 if citation_lang_node:  # 663 ↛ 669: the condition on line 663 was always true
664 # TODO: check other language code
665 content_text = citation_lang_node.get("content")
666 if isinstance(content_text, str):  # 666 ↛ 669: the condition on line 666 was always true
667 xarticle.lang = standardize_tag(content_text)
669 if "abstract" in what:
670 # ABSTRACT
671 abstract_node = soup.select_one("meta[name='citation_abstract']")
672 if abstract_node is not None:
673 abstract = abstract_node.get("content")
674 if not isinstance(abstract, str):  # 674 ↛ 675: the condition on line 674 was never true
675 raise ValueError("Couldn't parse abstract from meta")
676 abstract = BeautifulSoup(abstract, "html.parser").text
677 lang = abstract_node.get("lang")
678 if not isinstance(lang, str):
679 lang = self.detect_language(abstract, xarticle)
680 xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))
682 if "page" in what:
683 # PAGES
684 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
685 if citation_fpage_node:
686 page = citation_fpage_node.get("content")
687 if isinstance(page, str):  # 687 ↛ 692: the condition on line 687 was always true
688 page = page.split("(")[0]
689 if len(page) < 32:  # 689 ↛ 692: the condition on line 689 was always true
690 xarticle.fpage = page
692 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
693 if citation_lpage_node:
694 page = citation_lpage_node.get("content")
695 if isinstance(page, str):  # 695 ↛ 700: the condition on line 695 was always true
696 page = page.split("(")[0]
697 if len(page) < 32:  # 697 ↛ 700: the condition on line 697 was always true
698 xarticle.lpage = page
700 if "doi" in what:
701 # DOI
702 citation_doi_node = soup.select_one("meta[name='citation_doi']")
703 if citation_doi_node:
704 doi = citation_doi_node.get("content")
705 if isinstance(doi, str):  # 705 ↛ 712: the condition on line 705 was always true
706 doi = doi.strip()
707 pos = doi.find("10.")
708 if pos > 0:
709 doi = doi[pos:]
710 xarticle.doi = doi
712 if "mr" in what:
713 # MR
714 citation_mr_node = soup.select_one("meta[name='citation_mr']")
715 if citation_mr_node:
716 mr = citation_mr_node.get("content")
717 if isinstance(mr, str):  # 717 ↛ 723: the condition on line 717 was always true
718 mr = mr.strip()
719 if mr.find("MR") == 0:  # 719 ↛ 723: the condition on line 719 was always true
720 mr = mr[2:]
721 xarticle.extids.append(("mr-item-id", mr))
723 if "zbl" in what:
724 # ZBL
725 citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
726 if citation_zbl_node:
727 zbl = citation_zbl_node.get("content")
728 if isinstance(zbl, str):  # 728 ↛ 734: the condition on line 728 was always true
729 zbl = zbl.strip()
730 if zbl.find("Zbl") == 0:  # 730 ↛ 734: the condition on line 730 was always true
731 zbl = zbl[3:].strip()
732 xarticle.extids.append(("zbl-item-id", zbl))
734 if "publisher" in what:
735 # PUBLISHER
736 citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
737 if citation_publisher_node:
738 pub = citation_publisher_node.get("content")
739 if isinstance(pub, str):  # 739 ↛ 746: the condition on line 739 was always true
740 pub = pub.strip()
741 if pub != "":  # 741 ↛ 746: the condition on line 741 was always true
742 xpub = create_publisherdata()
743 xpub.name = pub
744 xissue.publisher = xpub
746 if "keywords" in what:
747 # KEYWORDS
748 citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
749 for kwd_node in citation_kwd_nodes:
750 kwds = kwd_node.get("content")
751 if isinstance(kwds, str):  # 751 ↛ 749: the condition on line 751 was always true
752 kwds = kwds.split(",")
753 for kwd in kwds:
754 if kwd == "":
755 continue
756 kwd = kwd.strip()
757 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})
759 if "references" in what:
760 citation_references = soup.select("meta[name='citation_reference']")
761 for index, tag in enumerate(citation_references):
762 content = tag.get("content")
763 if not isinstance(content, str):  # 763 ↛ 764: the condition on line 763 was never true
764 raise ValueError("Cannot parse citation_reference meta")
765 label = str(index + 1)
766 if regex.match(r"^\[\d+\].*", content):  # 766 ↛ 767: the condition on line 766 was never true
767 label = None
768 xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label))
770 def create_xissue(
771 self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1"
772 ):
773 if url is not None and url.endswith("/"):
774 url = url[:-1]
775 xissue = create_issuedata()
776 xissue.url = url
778 xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number)
780 xissue.year = year
782 if volume_number is not None:
783 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number)
785 if issue_number is not None:
786 xissue.number = issue_number.replace(",", "-")
787 return xissue
789 def detect_language(self, text: str, article: ArticleData | None = None):
790 if article and article.lang is not None and article.lang != "und":
791 return article.lang
793 language = self.language_detector.detect_language_of(text)
795 if not language:  # 795 ↛ 796: the condition on line 795 was never true
796 return "und"
797 return language.iso_code_639_1.name.lower()
799 def create_trans_title(
800 self,
801 resource_type: str,
802 title_tex: str,
803 lang: str,
804 xresource_lang: str,
805 title_type: str = "main",
806 ):
807 tag = "trans-article" if resource_type == "article" else "issue-title"
809 ckeditor_data = build_jats_data_from_html_field(
810 title_tex,
811 tag=tag,
812 text_lang=lang,
813 resource_lang=xresource_lang,
814 delimiter_inline=self.delimiter_inline_formula,
815 delimiter_disp=self.delimiter_disp_formula,
816 )
818 titledata = create_titledata(
819 lang=lang,
820 type="main",
821 title_html=ckeditor_data["value_html"],
822 title_xml=ckeditor_data["value_xml"],
823 )
825 return titledata
827 references_mapping = {
828 "citation_title": get_article_title_xml,
829 "citation_journal_title": get_source_xml,
830 "citation_publication_date": get_year_xml,
831 "citation_firstpage": get_fpage_xml,
832 "citation_lastpage": get_lpage_xml,
833 }
835 @classmethod
836 def __parse_meta_citation_reference(cls, content: str, label=None):
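# Illustrative note: a Highwire-style citation_reference value is typically a
# semicolon-separated list of key=value pairs, e.g. (hypothetical values)
#   "citation_author=Doe, J.; citation_title=Some title; citation_journal_title=Some journal; citation_publication_date=2001; citation_firstpage=1; citation_lastpage=10"
# whereas a plain free-text reference contains no "=" pairs and is baked as-is below.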
837 categories = content.split(";")
839 if len(categories) == 1:
840 return JatsBase.bake_ref(content, label=label)
842 citation_data = [c.split("=") for c in categories if "=" in c]
843 del categories
845 xml_string = ""
846 authors_parsed = False
847 authors_strings = []
848 for data in citation_data:
849 key = data[0].strip()
850 citation_content = data[1]
851 if key == "citation_author":
852 authors_strings.append(get_author_xml(template_str=citation_content))
853 continue
854 elif not authors_parsed:
855 xml_string += ", ".join(authors_strings)
856 authors_parsed = True
858 if key in cls.references_mapping:
859 xml_string += " " + cls.references_mapping[key](citation_content)
861 return JatsBase.bake_ref(xml_string, label=label)
863 @classmethod
864 def get_or_create_source(cls):
865 source, created = Source.objects.get_or_create(
866 domain=cls.source_domain,
867 defaults={
868 "name": cls.source_name,
869 "website": cls.source_website,
870 },
871 )
872 if created:  # 872 ↛ 873: the condition on line 872 was never true
873 source.save()
874 return source
876 @staticmethod
877 def get_issue_pid(
878 collection_id: str,
879 year: str,
880 volume_number: str | None = None,
881 issue_number: str | None = None,
882 ):
883 # Replace any character that is not alphanumeric or a hyphen with an underscore
884 pid = f"{collection_id}_{year}"
885 if volume_number is not None:
886 pid += f"_{volume_number}"
887 if issue_number is not None:
888 pid += f"_{issue_number}"
889 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid))
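# Hypothetical example: get_issue_pid("EXAMPLE", "1999-2000", "6", None) -> "EXAMPLE_1999-2000_6"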
890 return pid
892 @staticmethod
893 def set_pages(article: ArticleData, pages: str, separator: str = "-"):
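# Illustrative example (hypothetical values): set_pages(article, "12-34") sets fpage="12" and lpage="34"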
894 pages_split = pages.split(separator)
895 if len(pages_split) == 0:  # 895 ↛ 896: the condition on line 895 was never true
896 article.page_range = pages
897 if len(pages_split) > 0:  # 897 ↛ exit: the condition on line 897 was always true
898 if pages[0].isnumeric():  # 898 ↛ exit: the condition on line 898 was always true
899 article.fpage = pages_split[0]
900 if (
901 len(pages_split) > 1
902 and pages_split[0] != pages_split[1]
903 and pages_split[1].isnumeric()
904 ):
905 article.lpage = pages_split[1]
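# ---------------------------------------------------------------------------
# Illustrative sketch (editorial addition, not part of the measured module):
# a minimal concrete crawler following the three steps described in the
# BaseCollectionCrawler docstring. The class name, the source_* values, the
# CSS selectors and the create_articledata helper are assumptions made for
# this example only; a real crawler must match its source's markup and be
# registered in factory.py so that crawler_factory can return it.

from ptf.model_data import create_articledata  # assumed factory, analogous to create_issuedata
# (this import would normally sit with the other imports at the top of the module)


class ExampleCrawler(BaseCollectionCrawler):
    source_name = "Example source"          # hypothetical
    source_domain = "EXAMPLE"               # hypothetical
    source_website = "https://example.org"  # hypothetical

    def parse_collection_content(self, content: str) -> list[IssueData]:
        # Build one xissue per issue link found on the collection page.
        soup = BeautifulSoup(content, "html.parser")
        xissues: list[IssueData] = []
        for link in soup.select("a.issue-link"):  # assumed markup
            href = link.get("href")
            year = link.get("data-year")
            if isinstance(href, str) and isinstance(year, str):
                xissues.append(self.create_xissue(href, year, volume_number=None))
        return xissues

    def parse_issue_content(self, content: str, xissue: IssueData):
        # Create the articles here and give each one a PID ("a" + index).
        soup = BeautifulSoup(content, "html.parser")
        for index, link in enumerate(soup.select("a.article-link")):  # assumed markup
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            href = link.get("href")
            if isinstance(href, str):
                xarticle.url = href
            xissue.articles.append(xarticle)

    def parse_article_content(
        self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
    ) -> ArticleData | None:
        # Let the base class harvest the Highwire citation_* meta tags.
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["title", "author", "abstract", "pdf", "doi"]
        )
        return xarticle

# Such a crawler would typically be instantiated through crawler_factory with a
# username, collection_id and collection_url, then driven via crawl_collection()
# and crawl_issue() (see those methods above).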