Coverage for src/crawler/base_crawler.py: 75%
360 statements
import time
from collections.abc import Sequence
from datetime import timedelta

import regex
import requests
from bs4 import BeautifulSoup
from django.conf import settings
from django.contrib.auth.models import User
from django.utils import timezone
from langcodes import standardize_tag
from lingua import LanguageDetector, LanguageDetectorBuilder
from ptf.cmds import xml_cmds
from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas
from ptf.cmds.xml.jats.builder.issue import get_title_xml
from ptf.cmds.xml.jats.jats_parser import check_bibitem_xml
from ptf.display.resolver import extids_formats, resolve_id
from ptf.model_data import (
    ArticleData,
    IssueData,
    RefData,
    create_abstract,
    create_contributor,
    create_extlink,
    create_issuedata,
    create_publisherdata,
)
from ptf.model_data_converter import update_data_for_jats
from pylatexenc.latex2text import LatexNodes2Text
from pysolr import SolrError
from requests_cache import CachedSession, FileCache

from crawler.models import Periode, Source
from crawler.types import CitationLiteral
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection

# TODO: pass a class factory instead of a dependency to a site


class BaseCollectionCrawler:
    """
    Base class for the collection crawlers.
    To create a crawler:
    1) derive a class from BaseCollectionCrawler and name it XXXCrawler
    2) override the functions parse_collection_content, parse_issue_content and parse_article_content
    3) update factory.py so that crawler_factory can return your new crawler
    """
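
    # A minimal derived crawler could look like the sketch below (illustrative only:
    # the class name, domain, website and CSS selector are placeholders, not a real source):
    #
    #   class XXXCrawler(BaseCollectionCrawler):
    #       source_name = "Example Digital Library"
    #       source_domain = "XXX"
    #       source_website = "https://example.org"
    #
    #       def parse_collection_content(self, content: str) -> list[IssueData]:
    #           soup = BeautifulSoup(content, "html.parser")
    #           return [
    #               self.create_xissue(link.get("href"), "1999", "6")
    #               for link in soup.select("a.issue")
    #           ]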

    source_name = ""
    source_domain = ""
    source_website = ""

    periode_begin: int = 0
    periode_end: int = 9999

    issue_href = ""

    source = None
    session: requests.Session | CachedSession

    next_allowed_request: float = time.time()

    latext_parser = LatexNodes2Text()

    # Override these values in your concrete crawler if the formulas in text (titles, abstracts)
    # do not use "$" to surround the TeX formulas
    delimiter_inline_formula = "$"
    delimiter_disp_formula = "$"
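
    # For example, a (hypothetical) source that wraps displayed formulas in "$$" while
    # keeping "$" for inline formulas would set:
    #
    #   delimiter_inline_formula = "$"
    #   delimiter_disp_formula = "$$"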

    # HACK: workaround for tests (monkeypatching)
    # We store the class here, so we can monkeypatch it when running tests
    # subCrawlers = {
    #     LofplCrawler: None
    # }
    subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {}

    language_detector: LanguageDetector

    def __init__(
        self,
        *args,
        username: str,
        collection_id: str,
        collection_url: str,
        test_mode: bool = False,
        publisher: str = "mathdoc",
        start_pid: str | None = None,
    ):
        for CrawlerClass in self.subCrawlers:
            self.subCrawlers[CrawlerClass] = CrawlerClass(
                *args,
                username=username,
                collection_id=collection_id,
                collection_url=collection_url,
                test_mode=test_mode,
                publisher=publisher,
                start_pid=start_pid,
            )

        self.username = username
        self.user = User.objects.get(username=self.username)

        self.collection_id = collection_id
        self.collection_url = (
            collection_url  # url of the collection. Ex: https://eudml.org/journal/10098
        )
        self.collection = get_or_create_collection(self.collection_id)

        self.test_mode = test_mode
        self.publisher = publisher

        # EUDML sets or creates the Periode based on the <meta name="citation_year"> found in the journal page
        # AMP sets or creates the Periode during the __init__
        # TODO: see with other sources when to create the Periode
        self.periode = None
        self.periode_first_issue = None
        self.periode_last_issue = None

        self.start_pid = start_pid

        # Some sources have multiple pages for one issue. We need to merge the content.
        self.build_language_detector()

        self.session = CachedSession(
            backend=FileCache(
                getattr(settings, "REQUESTS_CACHE_LOCATION", None) or "/tmp/ptf_requests_cache",
                decode_content=False,
            ),
            headers={
                "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", None) or "Mathdoc/1.0.0",
                "From": getattr(settings, "REQUESTS_EMAIL", None) or "accueil@listes.mathdoc.fr",
            },
            expire_after=timedelta(days=30),
        )

        self.source = self.get_or_create_source()
        self.periode = self.get_or_create_periode()

    def build_language_detector(self):
        self.language_detector = LanguageDetectorBuilder.from_all_languages().build()

    def parse_collection_content(self, content: str) -> list[IssueData]:
        """
        Parse the HTML content with BeautifulSoup.
        Returns a list of xissues.
        Override this function in a derived class.
        """
        return []

    def parse_issue_content(self, content: str, xissue: IssueData):
        """
        Parse the HTML content with BeautifulSoup.
        Fills xissue.articles.
        Override this function in a derived class.

        CAVEAT: you are supposed to create the articles here. Please assign a PID to each article.
        The PID can be `a + article_index`, like this: `a0`, `a21`.
        """
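        # A typical override might look like the sketch below. The CSS selector is a
        # placeholder, and create_articledata is assumed to be the ptf.model_data helper
        # used elsewhere in the crawlers (adjust to how articles are built in your code):
        #
        #   soup = BeautifulSoup(content, "html.parser")
        #   for index, link in enumerate(soup.select("a.article")):
        #       xarticle = create_articledata()
        #       xarticle.pid = f"a{index}"
        #       xarticle.url = link.get("href")
        #       xissue.articles.append(xarticle)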

    def parse_article_content(
        self, content: str, xissue: IssueData, xarticle: ArticleData, url: str, pid: str
    ):
        """
        Parse the HTML content with BeautifulSoup.
        Returns the xarticle.
        Override this function in a derived class.
        The xissue is passed to the function in case the article page has issue information (ex: publisher).
        The article url is also passed as a parameter.

        CAVEAT: you are supposed to assign the article pid again here.
        """
        xarticle.pid = pid
        return xarticle

    def crawl_collection(self):
        # TODO: Comments, filter
        """
        Crawl an entire collection. ptf.models.Container objects are created.
        - get the HTML content of the collection_url
        - parse the HTML content with BeautifulSoup to extract the list of issues
        - merge the xissues (some sources have multiple pages for 1 volume/issue; we create only 1 container)
        - return the issues to crawl as a dict {pid: xissue}, keyed by the pid of the merged issues.
        Ex: the source may have Volume 6 (2000) and Volume 6 (1999);
        the pid of the merged issue is then made with 1999-2000__6_
        """

        if self.source is None:
            raise RuntimeError("ERROR: the source is not set")

        content = self.download_file(self.collection_url)
        xissues = self.parse_collection_content(content)

        # xissues = [
        #     issue
        #     for issue in xissues
        #     if int(issue.year) >= self.periode_begin and int(issue.year) <= self.periode_end
        # ]

        """
        Some collections split the same volumes in different pages
        Ex: Volume 6 (2000) and Volume 6 (1999)
        We merge the 2 xissues with the same volume number => Volume 6 (1999-2000)
        """
        # merged_xissues = self.merge_xissues(xissues)

        xissues_dict = {str(i.pid): i for i in xissues}

        filtered_xissues = xissues_dict
        # Filter the issues to crawl if start_pid was set in the constructor
        if self.start_pid is not None:
            filtered_xissues = {}
            start = False
            for pid in sorted(xissues_dict):
                if pid == self.start_pid:
                    start = True
                if start:
                    filtered_xissues[pid] = xissues_dict[pid]

        return filtered_xissues
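
    # Typical (hypothetical) driver code, assuming XXXCrawler is a concrete subclass
    # and the ids/URL below are placeholders:
    #
    #   crawler = XXXCrawler(
    #       username="crawler_bot", collection_id="XXX", collection_url="https://example.org/journal"
    #   )
    #   for pid, xissue in crawler.crawl_collection().items():
    #       crawler.crawl_issue(xissue)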

    def crawl_issue(self, xissue: IssueData):
        """
        Crawl one web page of an issue.
        - get the HTML content of the issue
        - parse the HTML content with BeautifulSoup to extract the list of articles and/or the issue metadata
        - crawl each article
        """

        # Some sources, like EuDML, do not have separate HTML pages for an issue's table of contents.
        # The list of articles comes directly from the collection HTML page: the xissue has no url attribute.
        if hasattr(xissue, "url") and xissue.url:
            content = self.download_file(xissue.url)
            self.parse_issue_content(content, xissue)

        xarticles = xissue.articles

        parsed_xarticles = []

        for xarticle in xarticles:
            parsed_xarticle = self.crawl_article(xarticle, xissue)
            if parsed_xarticle is not None:
                parsed_xarticles.append(parsed_xarticle)

        xissue.articles = parsed_xarticles

        if not self.test_mode and len(xissue.articles) > 0:
            self.add_xissue_into_database(xissue)

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
        # TODO: set the pid in xarticle here instead of passing it to `parse_article_content`
        parsed_xarticle = xarticle
        if hasattr(xarticle, "url") and xarticle.url:
            url = xarticle.url

            content = self.download_file(xarticle.url)
            pid = f"{xissue.pid}_{xarticle.pid}"

            parsed_xarticle = self.parse_article_content(
                content, xissue, xarticle, xarticle.url, pid
            )
            if parsed_xarticle.url is not None:
                url = parsed_xarticle.url
            # ARTICLE URL as an ExtLink (to display the link in the article page)
            ext_link = create_extlink()
            ext_link["rel"] = "source"
            ext_link["location"] = url
            ext_link["metadata"] = self.source_domain
            parsed_xarticle.ext_links.append(ext_link)

        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle)

    def process_article_metadata(self, xarticle: ArticleData):
        html, xml = get_html_and_xml_from_text_with_formulas(
            xarticle.title_tex,
            delimiter_inline=self.delimiter_inline_formula,
            delimiter_disp=self.delimiter_disp_formula,
        )
        xml = get_title_xml(xml, with_tex_values=False)
        xarticle.title_html = html
        xarticle.title_xml = xml

        abstracts_to_parse = [
            xabstract for xabstract in xarticle.abstracts if xabstract["tag"] == "abstract"
        ]
        # The abstract may have formulas surrounded with '$'
        if len(abstracts_to_parse) > 0:
            for xabstract in abstracts_to_parse:
                html, xml = get_html_and_xml_from_text_with_formulas(
                    xabstract["value_tex"],
                    delimiter_inline=self.delimiter_inline_formula,
                    delimiter_disp=self.delimiter_disp_formula,
                )
                xabstract["value_html"] = html
                lang = xabstract["lang"]
                if lang == xarticle.lang:
                    xabstract["value_xml"] = f'<abstract xml:lang="{lang}">{xml}</abstract>'
                else:
                    xabstract["value_xml"] = (
                        f'<trans-abstract xml:lang="{lang}">{xml}</trans-abstract>'
                    )

        update_data_for_jats(xarticle)

        return xarticle

    def get(self, url: str):
        attempt = 0
        response = None

        while attempt < 3:
            # If the URL is already in the cache, we can skip the rate-limiting delay
            if isinstance(self.session, CachedSession):
                if not self.session.cache.contains(url=url):
                    delta = self.next_allowed_request - time.time()
                    if delta > 0:
                        time.sleep(delta)
                    self.next_allowed_request = time.time() + 5
            try:
                headers = {"accept_encoding": "utf-8"}
                # For SSL errors, use the verify=False kwarg
                verify = True
                if url.startswith("https://hdml.di.ionio.gr/"):
                    verify = False
                    # self.session.cache.delete(urls=[url])
                response = self.session.get(url, headers=headers, verify=verify)
                if not response.ok:
                    raise requests.exceptions.HTTPError(
                        f"Endpoint answered with code {response.status_code} : {url}",
                        response=response,
                    )
                return response
            except (
                requests.ConnectionError,
                requests.ConnectTimeout,
                requests.exceptions.HTTPError,
            ):
                attempt += 1
        raise requests.exceptions.HTTPError(f"Unable to download {url}")

    def download_file(self, url: str):
        """
        Downloads a URL and returns the decoded content of the response.
        """
        response = self.get(url)
        content = self.decode_response(response)
        if content == "" or not content:
            raise requests.exceptions.HTTPError(response)
        return content

    def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
        """Override this if the content-type headers from the sources advertise something
        else than the actual content encoding (SASA needs this)."""
        response.encoding = encoding
        return response.text

    def add_xissue_into_database(self, xissue: IssueData):
        xissue.journal = self.collection

        xpub = create_publisherdata()
        xpub.name = self.publisher
        xissue.publisher = xpub
        xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()

        attempt = 1
        success = False

        while not success and attempt < 4:
            try:
                params = {"xissue": xissue, "use_body": False}
                cmd = xml_cmds.addOrUpdateIssueXmlCmd(params)
                cmd.do()
                success = True
            except SolrError:
                attempt += 1
                time.sleep(10)

    def get_metadata_using_citation_meta(
        self,
        xarticle: ArticleData,
        xissue: IssueData,
        soup: BeautifulSoup,
        what: list[CitationLiteral] = [],
    ):
        """
        :param xarticle: the xarticle that will collect the metadata
        :param xissue: the xissue that will collect the publisher
        :param soup: the BeautifulSoup object of the article page
        :param what: list of citation_* meta items to collect
        :return: None. The given article is modified
        """
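
        # A parse_article_content override would typically call this with the citation_*
        # items the source actually provides, e.g. (illustrative call):
        #
        #   self.get_metadata_using_citation_meta(
        #       xarticle, xissue, soup, ["title", "author", "pdf", "lang", "abstract"]
        #   )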

        if "title" in what:
            # TITLE
            citation_title_node = soup.select_one("meta[name='citation_title']")
            if citation_title_node:
                title = citation_title_node.get("content")
                if isinstance(title, str):
                    xarticle.title_tex = title

        if "author" in what:
            # AUTHORS
            citation_author_nodes = soup.select("meta[name='citation_author']")
            for citation_author_node in citation_author_nodes:
                text_author = citation_author_node.get("content")
                if not isinstance(text_author, str):
                    continue
                author = create_contributor()
                author["role"] = "author"
                author["string_name"] = text_author

                xarticle.contributors.append(author)

        if "pdf" in what:
            # PDF
            citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
            if citation_pdf_node:
                pdf_url = citation_pdf_node.get("content")
                if isinstance(pdf_url, str):
                    add_pdf_link_to_xarticle(xarticle, pdf_url)

        if "lang" in what:
            # LANG
            citation_lang_node = soup.select_one("meta[name='citation_language']")
            if citation_lang_node:
                # TODO: check other language code
                content_text = citation_lang_node.get("content")
                if isinstance(content_text, str):
                    xarticle.lang = standardize_tag(content_text)

        if "abstract" in what:
            # ABSTRACT
            abstract_node = soup.select_one("div.entry-content")
            if abstract_node is not None:
                abstract_section_node = abstract_node.select_one("p")
                if abstract_section_node:
                    abstract = str(abstract_section_node)
                    xarticle.abstracts.append(
                        {
                            "tag": "abstract",
                            "value_html": "",
                            "value_tex": abstract,
                            "value_xml": "",
                            "lang": self.detect_language(abstract, xarticle),
                        }
                    )

        if "page" in what:
            # PAGES
            citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
            if citation_fpage_node:
                page = citation_fpage_node.get("content")
                if isinstance(page, str):
                    page = page.split("(")[0]
                    if len(page) < 32:
                        xarticle.fpage = page

            citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
            if citation_lpage_node:
                page = citation_lpage_node.get("content")
                if isinstance(page, str):
                    page = page.split("(")[0]
                    if len(page) < 32:
                        xarticle.lpage = page

        if "doi" in what:
            # DOI
            citation_doi_node = soup.select_one("meta[name='citation_doi']")
            if citation_doi_node:
                doi = citation_doi_node.get("content")
                if isinstance(doi, str):
                    doi = doi.strip()
                    pos = doi.find("10.")
                    if pos > 0:
                        doi = doi[pos:]
                    xarticle.doi = doi
                    xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")

        if "mr" in what:
            # MR
            citation_mr_node = soup.select_one("meta[name='citation_mr']")
            if citation_mr_node:
                mr = citation_mr_node.get("content")
                if isinstance(mr, str):
                    mr = mr.strip()
                    if mr.find("MR") == 0:
                        mr = mr[2:]
                        xarticle.extids.append(("mr-item-id", mr))

        if "zbl" in what:
            # ZBL
            citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
            if citation_zbl_node:
                zbl = citation_zbl_node.get("content")
                if isinstance(zbl, str):
                    zbl = zbl.strip()
                    if zbl.find("Zbl") == 0:
                        zbl = zbl[3:].strip()
                        xarticle.extids.append(("zbl-item-id", zbl))

        if "publisher" in what:
            # PUBLISHER
            citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
            if citation_publisher_node:
                pub = citation_publisher_node.get("content")
                if isinstance(pub, str):
                    pub = pub.strip()
                    if pub != "":
                        xpub = create_publisherdata()
                        xpub.name = pub
                        xissue.publisher = xpub

        if "keywords" in what:
            # KEYWORDS
            citation_kwd_node = soup.select_one("meta[name='citation_keywords']")
            if citation_kwd_node:
                kwds = citation_kwd_node.get("content")
                if isinstance(kwds, str):
                    kwds = kwds.split(",")
                    for kwd in kwds:
                        if kwd == "":
                            continue
                        kwd = kwd.strip()
                        xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})

    def create_xissue(
        self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1"
    ):
        if url is not None and url.endswith("/"):
            url = url[:-1]
        xissue = create_issuedata()
        xissue.url = url

        xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number)

        xissue.year = year

        if volume_number is not None:
            xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number)

        if issue_number is not None:
            xissue.number = issue_number.replace(",", "-")
        return xissue
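
    # Example with illustrative values: create_xissue("https://example.org/vol6/2/", "1999", "6", "2")
    # returns an IssueData with the trailing "/" stripped from the url, year/volume/number set,
    # and a pid built by get_issue_pid (e.g. XXX_1999_6_2 if the collection id is "XXX").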

    def detect_language(self, text: str, article: ArticleData | None = None):
        if article and article.lang is not None and article.lang != "und":
            return article.lang

        language = self.language_detector.detect_language_of(text)

        if not language:
            return "und"
        return language.iso_code_639_1.name.lower()
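
    # Example (illustrative): for an article whose language is still unknown,
    # detect_language("Sur la théorie des groupes de Lie") should return "fr";
    # if article.lang is already set (and not "und"), that value is returned unchanged.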

    def get_or_create_periode(self):
        if self.periode is not None:
            return self.periode

        if self.collection is None or self.source is None:
            raise ValueError("You need to set a collection or a source before creating a periode")

        qs = Periode.objects.filter(collection=self.collection, source=self.source)
        if qs.exists():
            periode = qs.first()
        else:
            periode = Periode(
                collection=self.collection,
                source=self.source,
                title=self.collection.title_tex,
                issue_href=self.issue_href,
                collection_href=self.collection_url,
                doi_href="",
                published=False,
                begin=self.periode_begin,
                end=self.periode_end,
                first_issue=self.periode_first_issue,
                last_issue=self.periode_last_issue,
            )
            periode.save()

        return periode

    @classmethod
    def get_or_create_source(cls):
        source, created = Source.objects.get_or_create(
            domain=cls.source_domain,
            defaults={
                "name": cls.source_name,
                "website": cls.source_website,
                "create_xissue": True,
                "periode_href": "",
                "article_href": "",
                "pdf_href": "",
            },
        )
        if created:
            source.save()
        return source

    @staticmethod
    def create_crawled_bibitem(value_xml: str):
        xref = RefData(lang="en")
        # xref.citation_tex = "".join([e["value_tex"] for e in elements])

        value_xml = f'<mixed-citation xml:space="preserve">{value_xml}</mixed-citation>'
        xref.citation_xml = value_xml
        xref = check_bibitem_xml(xref)

        # Bakes extlink badges into the bibliography html.
        # Maybe we should put this into another file (jats_parser?)
        for extid in xref.extids:
            href = resolve_id(extid[0], extid[1])
            if (not href) or (not xref.citation_html):
                continue
            str_format = extid[0]
            if str_format in extids_formats:
                str_format = extids_formats[str_format]
            xref.citation_html += f" | <a href={href} class='badge bg-secondary rounded-pill ref-badge extid-badge'>{str_format}</a>"

        return xref

    @staticmethod
    def create_bibliography(bibitems: Sequence[RefData]):
        xml_str = "<ref-list>\n"
        html_str = "<div>\n"

        for item in bibitems:
            xml_str += f"\t{item.citation_xml}\n"
            html_str += f"\t<p>{item.citation_html}</p>\n"
        xml_str += "</ref-list>"

        html_str += "</div>"

        tex_str = "<div>\n"
        for item in bibitems:
            tex_str += f"\t<p>{item.citation_tex}</p>\n"
        tex_str += "</div>"

        biblio_dict = create_abstract(
            tag="biblio",
            value_html=html_str,
            value_tex=tex_str,
            value_xml=xml_str,
            lang="en",
        )

        return biblio_dict
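
    # One plausible way to combine the two helpers above in a crawler (the citation
    # strings are illustrative, and how the result is attached to the article may differ):
    #
    #   bibitems = [
    #       BaseCollectionCrawler.create_crawled_bibitem("A. Author, Some paper, 2000."),
    #       BaseCollectionCrawler.create_crawled_bibitem("B. Author, Another paper, 2001."),
    #   ]
    #   biblio = BaseCollectionCrawler.create_bibliography(bibitems)
    #   xarticle.abstracts.append(biblio)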

    @staticmethod
    def get_issue_pid(
        collection_id: str,
        year: str,
        volume_number: str | None = None,
        issue_number: str | None = None,
    ):
        # Replace any non-word character with an underscore
        pid = f"{collection_id}_{year}"
        if volume_number is not None:
            pid += f"_{volume_number}"
        if issue_number is not None:
            pid += f"_{issue_number}"
        pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid))
        return pid
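
    # Example with an illustrative collection id: get_issue_pid("XXX", "1999", "6", "2")
    # returns "XXX_1999_6_2"; any character that is not alphanumeric or "-" is collapsed to "_".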

    @staticmethod
    def set_pages(article: ArticleData, pages: str, separator: str = "-"):
        pages_split = pages.split(separator)
        if len(pages_split) == 0:
            article.page_range = pages
        if len(pages_split) > 0:
            if pages[0].isnumeric():
                article.fpage = pages_split[0]
                if (
                    len(pages_split) > 1
                    and pages_split[0] != pages_split[1]
                    and pages_split[1].isnumeric()
                ):
                    article.lpage = pages_split[1]
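
    # Example (illustrative values): set_pages(article, "12-34") sets article.fpage = "12"
    # and article.lpage = "34"; set_pages(article, "12-12") only sets article.fpage.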