Coverage for src/crawler/base_crawler.py: 73%
430 statements
coverage.py v7.9.0, created at 2025-07-30 09:47 +0000
1import logging
2import time
3from datetime import datetime, timedelta
5import regex
6import requests
7from bs4 import BeautifulSoup
8from django.conf import settings
9from django.contrib.auth.models import User
10from django.utils import timezone
11from langcodes import standardize_tag
12from lingua import LanguageDetectorBuilder
13from opentelemetry import trace
14from ptf.cmds.xml.ckeditor.utils import (
15 build_jats_data_from_html_field,
16)
17from ptf.cmds.xml.jats.builder.citation import (
18 get_article_title_xml,
19 get_author_xml,
20 get_fpage_xml,
21 get_lpage_xml,
22 get_source_xml,
23 get_year_xml,
24)
25from ptf.cmds.xml.jats.jats_parser import JatsBase
26from ptf.model_data import (
27 ArticleData,
28 ContributorDict,
29 IssueData,
30 ResourceData,
31 TitleDict,
32 create_contributor,
33 create_extlink,
34 create_issuedata,
35 create_publisherdata,
36 create_titledata,
37)
38from ptf.model_data_converter import update_data_for_jats
39from pylatexenc.latex2text import LatexNodes2Text
40from pysolr import SolrError
41from requests_cache import CachedSession, MongoCache
43from crawler.cmds.xml_cmds import addOrUpdateGDMLIssueXmlCmd
44from crawler.models import Source
45from crawler.types import CitationLiteral
46from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection
48# TODO: pass a class factory instead of a dependency to a site
52class CrawlerTitleDict(TitleDict):
53 title_tex: str | None
56class BaseCollectionCrawler:
57 """
58 Base collection for the crawlers.
59 To create a crawler:
60 1) derive a class from BaseCollectionCrawler and name it XXXCrawler
61 2) override the functions parse_collection_content, parse_issue_content and parse_article_content
62 3) update factory.py so that crawler_factory can return your new crawler
63 """
65 logger = logging.getLogger(__name__)
66 tracer = trace.get_tracer(__name__)
68 source_name = ""
69 source_domain = ""
70 source_website = ""
72 issue_href = ""
74 collection = None
75 source = None
76 user = None
77 session: requests.Session | CachedSession
79 verify = True
80 headers = {
81 "accept_encoding": "utf-8",
82 "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", "Mathdoc/1.0.0"),
83 "From": getattr(settings, "REQUESTS_EMAIL", "accueil@listes.mathdoc.fr"),
84 }
86 next_allowed_request: float = time.time()
88 # seconds to wait between two http requests
89 requests_interval = getattr(settings, "REQUESTS_INTERVAL", 90)
91 latext_parser = LatexNodes2Text()
93 # Override the values in your concrete crawler if the formulas in text (titles, abstracts)
94 # do not use the "$" to surround tex formulas
95 delimiter_inline_formula = "$"
96 delimiter_disp_formula = "$"
98 # HACK : Workaround for tests (monkeypatching)
99 # We store the class here, so we can monkeypatch it when running tests
100 # subCrawlers = {
101 # LofplCrawler: None
102 # }
103 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}
105 language_detector = LanguageDetectorBuilder.from_all_languages().build()
107 force_refresh = False
109 # Whether to include headers in the requests cache key
110 match_headers = False
111 orcid_re = r"https\:\/\/orcid\.org\/(?P<orcid>\d{4}-\d{4}-\d{4}-\d{4})"
113 # Set this to False on a per-crawler basis to allow inserting articles without PDFs
114 ignore_missing_pdf = True
116 def __init__(
117 self,
118 *args,
119 username: str,
120 collection_id: str,
121 collection_url: str,
122 test_mode: bool = False,
123 publisher: str = "mathdoc",
124 force_refresh=False,
125 ):
126 for CrawlerClass in self.subCrawlers:  # 126 ↛ 127: the loop never started
127 self.subCrawlers[CrawlerClass] = CrawlerClass(
128 *args,
129 username=username,
130 collection_id=collection_id,
131 collection_url=collection_url,
132 test_mode=test_mode,
133 publisher=publisher,
134 )
135 self.logger = logging.getLogger(__name__ + "." + self.source_domain)
137 self.username = username
139 self.collection_id = collection_id
140 self.collection_url = (
141 collection_url # url of the collection. Ex: https://eudml.org/journal/10098
142 )
144 self.test_mode = test_mode
145 self.publisher = publisher
147 self.session = requests.session()
149 # Skipped when running tests
150 self.initialize()
152 self.force_refresh = force_refresh
154 def initialize(self):
155 """
156 Acts as a "second" init function to skip model accesses during test data generation
157 """
158 self.collection = get_or_create_collection(self.collection_id)
159 self.source = self.get_or_create_source()
160 self.user = User.objects.get(username=self.username)
161 self.session = CachedSession(
162 match_headers=self.match_headers,
163 headers=self.headers,
164 backend=MongoCache(
165 host=getattr(settings, "MONGO_HOSTNAME", "localhost"),
166 ),
167 expire_after=timedelta(days=30),
168 )
170 @classmethod
171 def can_crawl(cls, pid: str) -> bool:
172 return True
174 def parse_collection_content(self, content: str) -> list[IssueData]:
175 """
176 Parse the HTML content with BeautifulSoup
177 returns a list of xissue.
178 Override this function in a derived class
179 """
180 return []
182 def parse_issue_content(self, content: str, xissue: IssueData):
183 """
184 Parse the HTML content with BeautifulSoup
185 Fills the xissue.articles
186 Override this function in a derived class.
188 Caveat: you are supposed to create the articles here. Please assign a PID to each article.
189 The PID can be `a` + article_index, e.g. `a0`, `a21`.
190 """
192 def parse_article_content(
193 self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
194 ) -> ArticleData | None:
195 """
196 Parse the HTML content with BeautifulSoup
197 returns the xarticle.
198 Override this function in a derived class.
199 The xissue is passed to the function in case the article page has issue information (ex: publisher)
200 The article url is also passed as a parameter
202 Caveat: you are supposed to assign the article pid again here
203 """
204 return xarticle
206 @tracer.start_as_current_span("crawl_collection")
207 def crawl_collection(self):
208 # TODO: Comments, filter
209 """
210 Crawl an entire collection. ptf.models.Container objects are created.
211 - get the HTML content of the collection_url
212 - parse the HTML content with beautifulsoup to extract the list of issues
213 - merge the xissues (some sources can have multiple pages for one volume/issue; we create only one container)
214 - crawl each issue if col_only is False
215 - Returns the issues.
216 It is a dict {pid: xissue}.
217 The key is the pid of the merged issues.
218 Ex: the source may have Volume 6 (2000) and Volume 6 (1999);
219 the pid is then made with 1999-2000__6_
220 """
222 if self.source is None:
223 raise RuntimeError("ERROR: the source is not set")
225 content = self.download_file(self.collection_url)
226 xissues = self.parse_collection_content(content)
228 """
229 Some collections split the same volumes in different pages
230 Ex: Volume 6 (2000) and Volume 6 (1999)
231 We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
232 """
233 # merged_xissues = self.merge_xissues(xissues)
235 xissues_dict = {str(i.pid): i for i in xissues}
237 return xissues_dict
239 @tracer.start_as_current_span("crawl_issue")
240 def crawl_issue(self, xissue: IssueData):
241 """
242 Crawl one web page of an issue.
243 - get the HTML content of the issue
244 - parse the HTML content with beautifulsoup to extract the list of articles and/or the issue metadata
245 - crawl each article
246 """
248 # Some sources, like EuDML, do not have a separate HTML page for an issue's table of contents.
249 # The list of articles comes directly from the collection HTML page: the xissue has no url attribute.
251 issue_url = xissue.url
252 if issue_url is not None:
253 if issue_url.endswith(".pdf"):
254 add_pdf_link_to_xarticle(xissue, issue_url)
255 xissue.url = None
256 else:
257 content = self.download_file(issue_url)
258 with self.tracer.start_as_current_span("parse_issue_content"):
259 self.parse_issue_content(content, xissue)
261 xarticles = xissue.articles
263 parsed_xarticles = []
265 for xarticle in xarticles:
266 parsed_xarticle = self.crawl_article(xarticle, xissue)
267 if parsed_xarticle is not None:
268 parsed_xarticles.append(parsed_xarticle)
270 xissue.articles = parsed_xarticles
272 article_has_pdf = self.article_has_pdf(xissue)
274 if self.ignore_missing_pdf:
275 xissue.articles = [a for a in xissue.articles if self.article_has_pdf(a)]
277 if not self.test_mode and (len(xissue.articles) > 0 or article_has_pdf):
278 self.process_resource_metadata(xissue, resource_type="issue")
279 self.add_xissue_into_database(xissue)
281 @staticmethod
282 def article_has_source(art: ArticleData | IssueData):
283 return (
284 next(
285 (e_link for e_link in art.ext_links if e_link["rel"] == "source"),
286 None,
287 )
288 is not None
289 )
291 @staticmethod
292 def article_has_pdf(art: ArticleData | IssueData):
293 return (
294 next((link for link in art.ext_links if link["rel"] == "article-pdf"), None)
295 is not None
296 )
298 def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
299 # ARTICLE URL as an ExtLink (to display the link in the article page)
300 if xarticle.url is None:
301 if not self.article_has_source(xarticle):  # 301 ↛ 311: the condition was always true
302 if xissue.url:
303 article_source = xissue.url
304 else:
305 article_source = self.collection_url
306 ext_link = create_extlink()
307 ext_link["rel"] = "source"
308 ext_link["location"] = article_source
309 ext_link["metadata"] = self.source_domain
310 xarticle.ext_links.append(ext_link)
311 return self.process_article_metadata(xarticle)
313 content = self.download_file(xarticle.url)
315 xarticle.pid = f"{xissue.pid}_{xarticle.pid}"
317 with self.tracer.start_as_current_span("parse_article_content"):
318 parsed_xarticle = self.parse_article_content(content, xissue, xarticle, xarticle.url)
319 if parsed_xarticle is None:  # 319 ↛ 320: the condition was never true
320 return None
322 if parsed_xarticle.doi:
323 parsed_xarticle.pid = (
324 parsed_xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
325 )
327 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
328 ext_link = create_extlink()
329 ext_link["rel"] = "source"
330 ext_link["location"] = parsed_xarticle.url
331 ext_link["metadata"] = self.source_domain
332 parsed_xarticle.ext_links.append(ext_link)
334 # The article title may have formulas surrounded with '$'
335 return self.process_article_metadata(parsed_xarticle)
337 def process_resource_metadata(self, xresource: ResourceData, resource_type="article"):
338 tag = "article-title" if resource_type == "article" else "issue-title"
340 # Process title tex
341 ckeditor_data = build_jats_data_from_html_field(
342 xresource.title_tex,
343 tag=tag,
344 text_lang=xresource.lang,
345 delimiter_inline=self.delimiter_inline_formula,
346 delimiter_disp=self.delimiter_disp_formula,
347 )
349 xresource.title_html = ckeditor_data["value_html"]
350 # xresource.title_tex = ckeditor_data["value_tex"]
351 xresource.title_xml = ckeditor_data["value_xml"]
353 # Process trans_title tex
354 if xresource.trans_title_tex:  # 354 ↛ 355: the condition was never true
355 self.logger.warning(
356 "Deprecation Notice : prefer using xresource.title directly instead of xresource.trans_title_tex"
357 )
358 trans_title = self.create_trans_title(
359 xresource_lang=xresource.lang,
360 resource_type=resource_type,
361 title_tex=xresource.trans_title_tex,
362 lang=xresource.trans_lang,
363 )
364 xresource.titles.append(trans_title)
366 abstracts_to_parse = [
367 xabstract for xabstract in xresource.abstracts if xabstract["tag"] == "abstract"
368 ]
369 # abstract may have formulas surrounded with '$'
370 if len(abstracts_to_parse) > 0:
371 for xabstract in abstracts_to_parse:
372 ckeditor_data = build_jats_data_from_html_field(
373 xabstract["value_tex"],
374 tag="abstract",
375 text_lang=xabstract["lang"],
376 resource_lang=xresource.lang,
377 field_type="abstract",
378 delimiter_inline=self.delimiter_inline_formula,
379 delimiter_disp=self.delimiter_disp_formula,
380 )
382 xabstract["value_html"] = ckeditor_data["value_html"]
383 # xabstract["value_tex"] = ckeditor_data["value_tex"]
384 xabstract["value_xml"] = ckeditor_data["value_xml"]
386 return xresource
388 def process_article_metadata(self, xresource: ResourceData):
389 self.process_resource_metadata(xresource)
390 update_data_for_jats(xresource)
392 return xresource
394 def get(self, url: str, force_refresh=False, headers={}):
395 attempt = 0
396 response = None
398 while attempt < 3:  # 398 ↛ 432: the condition was always true
399 # If the response is already cached, we can skip the rate-limit delay
400 if isinstance(self.session, CachedSession):  # 400 ↛ 401: the condition was never true
401 if not self.session.cache.contains(url=url) or force_refresh:
402 delta = self.next_allowed_request - time.time()
403 if delta > 0:
404 self.logger.debug(f"Waiting {int(delta)}s before making another request")
405 time.sleep(delta)
406 self.next_allowed_request = time.time() + self.requests_interval
407 try:
408 # self.session.cache.delete(urls=[url])
409 if isinstance(self.session, CachedSession):  # 409 ↛ 410: the condition was never true
410 response = self.session.get(
411 url,
412 headers={**self.headers, **headers},
413 verify=self.verify,
414 force_refresh=force_refresh,
415 )
416 else:
417 response = self.session.get(
418 url, headers={**self.headers, **headers}, verify=self.verify
419 )
420 if not response.ok:  # 420 ↛ 421: the condition was never true
421 raise requests.exceptions.HTTPError(
422 f"Endpoint answered with code {response.status_code} : {url}",
423 response=response,
424 )
425 return response
426 except (
427 requests.ConnectionError,
428 requests.ConnectTimeout,
429 requests.exceptions.HTTPError,
430 ):
431 attempt += 1
432 raise requests.exceptions.HTTPError(f"Unable to download {url}")
434 def download_file(self, url: str, force_refresh=False, headers={}):
435 """
436 Downloads a URL and returns its decoded content, retrying with increasing delays on HTTP errors.
437 """
438 attempts = 0
439 while True:
440 try:
441 response = self.get(
442 url, force_refresh=force_refresh or self.force_refresh, headers=headers
443 )
444 content = self.decode_response(response)
445 if content == "" or not content:  # 445 ↛ 446: the condition was never true
446 raise requests.exceptions.HTTPError(response)
447 return content
448 except requests.exceptions.HTTPError as e:
449 self.logger.debug(f"Caught error : {e}", extra={"url": url})
450 attempts += 1
451 # 15 mins, 30 mins, 45 mins
452 delay_minutes = attempts * 15
453 self.logger.debug(
454 f"Retrying in {delay_minutes}mins ({(datetime.now() + timedelta(minutes=delay_minutes)).time()})",
455 extra={"url": url},
456 )
457 time.sleep(delay_minutes * 60)
458 if attempts > 3:
459 raise e
461 def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
462 """Override this if the content-type headers from the sources are advertising something else than the actual content
463 SASA needs this"""
464 response.encoding = encoding
465 return response.text
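# Illustrative override for a source whose pages are not really UTF-8
# (the windows-1250 value is hypothetical):
#
#     def decode_response(self, response, encoding="windows-1250"):
#         return super().decode_response(response, encoding)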
467 @tracer.start_as_current_span("add_xissue_to_database")
468 def add_xissue_into_database(self, xissue: IssueData):
469 xissue.journal = self.collection
470 xissue.source = self.source_domain
472 if xissue.year == "":
473 raise ValueError("Failsafe : Cannot insert issue without a year")
475 xpub = create_publisherdata()
476 xpub.name = self.publisher
477 xissue.publisher = xpub
478 xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()
480 attempt = 1
481 success = False
483 while not success and attempt < 4:
484 try:
485 params = {"xissue": xissue, "use_body": False}
486 cmd = addOrUpdateGDMLIssueXmlCmd(params)
487 cmd.do()
488 success = True
489 self.logger.debug(f"Issue {xissue.pid} inserted in database")
490 except SolrError:
491 self.logger.warning(
492 f"Encountered SolrError while inserting issue {xissue.pid} in database"
493 )
494 attempt += 1
495 self.logger.debug(f"Attempt {attempt}. sleeping 10 seconds.")
496 time.sleep(10)
498 if success is False:
499 raise ConnectionRefusedError("Cannot connect to SolR")
501 def get_metadata_using_citation_meta(
502 self,
503 xarticle: ArticleData,
504 xissue: IssueData,
505 soup: BeautifulSoup,
506 what: list[CitationLiteral] = [],
507 ):
508 """
509 If parsing references using this method, use `xarticle.abstracts.append(JatsBase.compile_refs(xarticle.bibitems))` afterwards to append the references to the article
512 :param xarticle: the xarticle that will collect the metadata
513 :param xissue: the xissue that will collect the publisher
514 :param soup: the BeautifulSoup object of the article page
515 :param what: list of citation_* meta items to collect.
516 :return: None. The given article is modified
517 """
519 if "title" in what:
520 # TITLE
521 citation_title_node = soup.select_one("meta[name='citation_title']")
522 if citation_title_node:  # 522 ↛ 527: the condition was always true
523 title = citation_title_node.get("content")
524 if isinstance(title, str):  # 524 ↛ 527: the condition was always true
525 xarticle.title_tex = title
527 if "author" in what: 527 ↛ 556line 527 didn't jump to line 556 because the condition on line 527 was always true
528 # AUTHORS
529 citation_author_nodes = soup.select("meta[name^='citation_author']")
530 current_author: ContributorDict | None = None
531 for citation_author_node in citation_author_nodes:
532 if citation_author_node.get("name") == "citation_author":
533 text_author = citation_author_node.get("content")
534 if not isinstance(text_author, str):  # 534 ↛ 535: the condition was never true
535 raise ValueError("Cannot parse author")
536 if text_author == "":  # 536 ↛ 537: the condition was never true
537 current_author = None
538 continue
539 current_author = create_contributor(role="author", string_name=text_author)
540 xarticle.contributors.append(current_author)
541 continue
542 if current_author is None:  # 542 ↛ 543: the condition was never true
543 self.logger.warning("Couldn't parse citation author")
544 continue
545 if citation_author_node.get("name") == "citation_author_institution":
546 text_institution = citation_author_node.get("content")
547 if not isinstance(text_institution, str):  # 547 ↛ 548: the condition was never true
548 continue
549 current_author["addresses"].append(text_institution)
550 if citation_author_node.get("name") == "citation_author_ocrid":  # 550 ↛ 551: the condition was never true
551 text_orcid = citation_author_node.get("content")
552 if not isinstance(text_orcid, str):
553 continue
554 current_author["orcid"] = text_orcid
556 if "pdf" in what:
557 # PDF
558 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
559 if citation_pdf_node:
560 pdf_url = citation_pdf_node.get("content")
561 if isinstance(pdf_url, str):  # 561 ↛ 564: the condition was always true
562 add_pdf_link_to_xarticle(xarticle, pdf_url)
564 if "lang" in what:
565 # LANG
566 citation_lang_node = soup.select_one("meta[name='citation_language']")
567 if citation_lang_node:  # 567 ↛ 573: the condition was always true
568 # TODO: check other language code
569 content_text = citation_lang_node.get("content")
570 if isinstance(content_text, str):  # 570 ↛ 573: the condition was always true
571 xarticle.lang = standardize_tag(content_text)
573 if "abstract" in what:
574 # ABSTRACT
575 abstract_node = soup.select_one("meta[name='citation_abstract']")
576 if abstract_node is not None:
577 abstract = abstract_node.get("content")
578 if not isinstance(abstract, str):  # 578 ↛ 579: the condition was never true
579 raise ValueError("Couldn't parse abstract from meta")
580 abstract = BeautifulSoup(abstract, "html.parser").text
581 lang = abstract_node.get("lang")
582 if not isinstance(lang, str):
583 lang = self.detect_language(abstract, xarticle)
584 xarticle.abstracts.append(
585 {
586 "tag": "abstract",
587 "value_html": "",
588 "value_tex": abstract,
589 "value_xml": "",
590 "lang": lang,
591 }
592 )
594 if "page" in what:
595 # PAGES
596 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
597 if citation_fpage_node:
598 page = citation_fpage_node.get("content")
599 if isinstance(page, str):  # 599 ↛ 604: the condition was always true
600 page = page.split("(")[0]
601 if len(page) < 32:  # 601 ↛ 604: the condition was always true
602 xarticle.fpage = page
604 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
605 if citation_lpage_node:
606 page = citation_lpage_node.get("content")
607 if isinstance(page, str):  # 607 ↛ 612: the condition was always true
608 page = page.split("(")[0]
609 if len(page) < 32:  # 609 ↛ 612: the condition was always true
610 xarticle.lpage = page
612 if "doi" in what:
613 # DOI
614 citation_doi_node = soup.select_one("meta[name='citation_doi']")
615 if citation_doi_node:
616 doi = citation_doi_node.get("content")
617 if isinstance(doi, str):  # 617 ↛ 624: the condition was always true
618 doi = doi.strip()
619 pos = doi.find("10.")
620 if pos > 0:
621 doi = doi[pos:]
622 xarticle.doi = doi
624 if "mr" in what:
625 # MR
626 citation_mr_node = soup.select_one("meta[name='citation_mr']")
627 if citation_mr_node:
628 mr = citation_mr_node.get("content")
629 if isinstance(mr, str):  # 629 ↛ 635: the condition was always true
630 mr = mr.strip()
631 if mr.find("MR") == 0:  # 631 ↛ 635: the condition was always true
632 mr = mr[2:]
633 xarticle.extids.append(("mr-item-id", mr))
635 if "zbl" in what:
636 # ZBL
637 citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
638 if citation_zbl_node:
639 zbl = citation_zbl_node.get("content")
640 if isinstance(zbl, str):  # 640 ↛ 646: the condition was always true
641 zbl = zbl.strip()
642 if zbl.find("Zbl") == 0:  # 642 ↛ 646: the condition was always true
643 zbl = zbl[3:].strip()
644 xarticle.extids.append(("zbl-item-id", zbl))
646 if "publisher" in what:
647 # PUBLISHER
648 citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
649 if citation_publisher_node:
650 pub = citation_publisher_node.get("content")
651 if isinstance(pub, str):  # 651 ↛ 658: the condition was always true
652 pub = pub.strip()
653 if pub != "":  # 653 ↛ 658: the condition was always true
654 xpub = create_publisherdata()
655 xpub.name = pub
656 xissue.publisher = xpub
658 if "keywords" in what:
659 # KEYWORDS
660 citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
661 for kwd_node in citation_kwd_nodes:
662 kwds = kwd_node.get("content")
663 if isinstance(kwds, str):  # 663 ↛ 661: the condition was always true
664 kwds = kwds.split(",")
665 for kwd in kwds:
666 if kwd == "":
667 continue
668 kwd = kwd.strip()
669 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})
671 if "references" in what:
672 citation_references = soup.select("meta[name='citation_reference']")
673 for index, tag in enumerate(citation_references):
674 content = tag.get("content")
675 if not isinstance(content, str):  # 675 ↛ 676: the condition was never true
676 raise ValueError("Cannot parse citation_reference meta")
677 label = str(index + 1)
678 if regex.match(r"^\[\d+\].*", content):  # 678 ↛ 679: the condition was never true
679 label = None
680 xarticle.bibitems.append(self.__parse_meta_citation_reference(content, label))
682 def create_xissue(
683 self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1"
684 ):
685 if url is not None and url.endswith("/"):
686 url = url[:-1]
687 xissue = create_issuedata()
688 xissue.url = url
690 xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number)
692 xissue.year = year
694 if volume_number is not None:
695 xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number)
697 if issue_number is not None:
698 xissue.number = issue_number.replace(",", "-")
699 return xissue
701 def detect_language(self, text: str, article: ArticleData | None = None):
702 if article and article.lang is not None and article.lang != "und":
703 return article.lang
705 language = self.language_detector.detect_language_of(text)
707 if not language:  # 707 ↛ 708: the condition was never true
708 return "und"
709 return language.iso_code_639_1.name.lower()
711 def create_trans_title(
712 self,
713 resource_type: str,
714 title_tex: str,
715 lang: str,
716 xresource_lang: str,
717 title_type: str = "main",
718 ):
719 tag = "trans-article" if resource_type == "article" else "issue-title"
721 ckeditor_data = build_jats_data_from_html_field(
722 title_tex,
723 tag=tag,
724 text_lang=lang,
725 resource_lang=xresource_lang,
726 delimiter_inline=self.delimiter_inline_formula,
727 delimiter_disp=self.delimiter_disp_formula,
728 )
730 titledata = create_titledata(
731 lang=lang,
732 type="main",
733 title_html=ckeditor_data["value_html"],
734 title_xml=ckeditor_data["value_xml"],
735 )
737 return titledata
739 references_mapping = {
740 "citation_title": get_article_title_xml,
741 "citation_journal_title": get_source_xml,
742 "citation_publication_date": get_year_xml,
743 "citation_firstpage": get_fpage_xml,
744 "citation_lastpage": get_lpage_xml,
745 }
747 @classmethod
748 def __parse_meta_citation_reference(cls, content: str, label=None):
749 categories = content.split(";")
751 if len(categories) == 1:
752 return JatsBase.bake_ref(content, label=label)
754 citation_data = [c.split("=") for c in categories if "=" in c]
755 del categories
757 xml_string = ""
758 authors_parsed = False
759 authors_strings = []
760 for data in citation_data:
761 key = data[0].strip()
762 citation_content = data[1]
763 if key == "citation_author":
764 authors_strings.append(get_author_xml(template_str=citation_content))
765 continue
766 elif not authors_parsed:
767 xml_string += ", ".join(authors_strings)
768 authors_parsed = True
770 if key in cls.references_mapping:
771 xml_string += " " + cls.references_mapping[key](citation_content)
773 return JatsBase.bake_ref(xml_string, label=label)
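# Illustrative citation_reference content (hypothetical reference data): the string
# is split on ";" into "key=value" pairs; author entries are collected and joined,
# then the remaining keys are rendered through references_mapping:
#
#     "citation_author=Doe, J.; citation_title=On examples; "
#     "citation_journal_title=J. Ex.; citation_publication_date=2001; "
#     "citation_firstpage=1; citation_lastpage=10"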
775 @classmethod
776 def get_or_create_source(cls):
777 source, created = Source.objects.get_or_create(
778 domain=cls.source_domain,
779 defaults={
780 "name": cls.source_name,
781 "website": cls.source_website,
782 },
783 )
784 if created:  # 784 ↛ 785: the condition was never true
785 source.save()
786 return source
788 @staticmethod
789 def get_issue_pid(
790 collection_id: str,
791 year: str,
792 volume_number: str | None = None,
793 issue_number: str | None = None,
794 ):
795 # Replace any run of characters other than letters, digits or hyphens with an underscore
796 pid = f"{collection_id}_{year}"
797 if volume_number is not None:
798 pid += f"_{volume_number}"
799 if issue_number is not None:
800 pid += f"_{issue_number}"
801 pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid))
802 return pid
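# Example (hypothetical identifiers): get_issue_pid("MYCOL", "2001", "13", "2")
# returns "MYCOL_2001_13_2"; with volume_number and issue_number left as None it
# returns "MYCOL_2001".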
804 @staticmethod
805 def set_pages(article: ArticleData, pages: str, separator: str = "-"):
806 pages_split = pages.split(separator)
807 if len(pages_split) == 0:  # 807 ↛ 808: the condition was never true
808 article.page_range = pages
809 if len(pages_split) > 0:  # 809 ↛ exit: the condition was always true
810 if pages[0].isnumeric():  # 810 ↛ exit: the condition was always true
811 article.fpage = pages_split[0]
812 if (
813 len(pages_split) > 1
814 and pages_split[0] != pages_split[1]
815 and pages_split[1].isnumeric()
816 ):
817 article.lpage = pages_split[1]
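# Example (hypothetical values): set_pages(xarticle, "123-145") sets xarticle.fpage
# to "123" and xarticle.lpage to "145"; set_pages(xarticle, "7") sets only fpage.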