Coverage for src/crawler/ 75%

360 statements  

« prev     ^ index     » next v7.6.4, created at 2025-01-15 14:09 +0000

1import time 

2from import Sequence 

3from datetime import timedelta 


5import regex 

6import requests 

7from bs4 import BeautifulSoup 

8from django.conf import settings 

9from django.contrib.auth.models import User 

10from django.utils import timezone 

11from langcodes import standardize_tag 

12from lingua import LanguageDetector, LanguageDetectorBuilder 

13from ptf.cmds import xml_cmds 

14from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas 

15from ptf.cmds.xml.jats.builder.issue import get_title_xml 

16from ptf.cmds.xml.jats.jats_parser import check_bibitem_xml 

17from ptf.display.resolver import extids_formats, resolve_id 

18from ptf.model_data import ( 

19 ArticleData, 

20 IssueData, 

21 RefData, 

22 create_abstract, 

23 create_contributor, 

24 create_extlink, 

25 create_issuedata, 

26 create_publisherdata, 


28from ptf.model_data_converter import update_data_for_jats 

29from pylatexenc.latex2text import LatexNodes2Text 

30from pysolr import SolrError 

31from requests_cache import CachedSession, FileCache 


33from crawler.models import Periode, Source 

34from crawler.types import CitationLiteral 

35from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_or_create_collection 


37# TODO: pass a class factory instead of a dependency to a site 

38# TODO: pass a class factory instead of a dependency to a site 



41class BaseCollectionCrawler: 

42 """ 

43 Base collection for the crawlers. 

44 To create a crawler: 

45 1) derive a class from BaseCollectionCrawler and name it XXXCrawler 

46 2) override the functions parse_collection_content, parse_issue_content and parse_article_content 

47 3) register your crawler so that crawler_factory can return it 

48 """ 


50 source_name = "" 

51 source_domain = "" 

52 source_website = "" 


54 periode_begin: int = 0 

55 periode_end: int = 9999 


57 issue_href = "" 


59 source = None 

60 session: requests.Session | CachedSession 


62 next_allowed_request: float = time.time() 


64 latext_parser = LatexNodes2Text() 


66 # Override the values in your concrete crawler if the formulas in text (titles, abstracts) 

67 # do not use the "$" to surround tex formulas 

68 delimiter_inline_formula = "$" 

69 delimiter_disp_formula = "$" 


71 # HACK : Workaround for tests (monkeypatching) 

72 # We store the class here, so we can monkeypatch it when running tests 

73 # subCrawlers = { 

74 # LofplCrawler: None 

75 # } 

76 subCrawlers: dict[type["BaseCollectionCrawler"], "BaseCollectionCrawler | None"] = {} 


78 language_detector: LanguageDetector 


def __init__(
    self,
    *args,
    username: str,
    collection_id: str,
    collection_url: str,
    test_mode: bool = False,
    publisher: str = "mathdoc",
    start_pid: str | None = None,
):
    """
    Set up a crawler for one collection.

    :param args: extra positional arguments, forwarded unchanged to every registered sub-crawler
    :param username: username of the Django user the crawl runs as; it is looked up immediately
    :param collection_id: id handed to get_or_create_collection
    :param collection_url: url of the collection page
    :param test_mode: when True, crawl_issue does not write the crawled issues into the database
    :param publisher: publisher name stored on the issues added to the database
    :param start_pid: if set, crawl_collection only returns the issues from this pid onwards
    """
    # Instantiate every registered sub-crawler with the same arguments
    # (subCrawlers is the monkeypatching hook described on the class attribute).
    for CrawlerClass in self.subCrawlers:
        self.subCrawlers[CrawlerClass] = CrawlerClass(
            *args,
            username=username,
            collection_id=collection_id,
            collection_url=collection_url,
            test_mode=test_mode,
            publisher=publisher,
            start_pid=start_pid,
        )

    self.username = username
    self.user = User.objects.get(username=self.username)

    self.collection_id = collection_id
    self.collection_url = collection_url
    self.collection = get_or_create_collection(self.collection_id)

    self.test_mode = test_mode
    self.publisher = publisher

    # EUDML sets or creates the Periode based on the <meta name="citation_year"> found in the journal page
    # AMP sets or creates the Periode during the __init__
    # TODO: see with other sources when to create the Periode
    self.periode = None
    self.periode_first_issue = None
    self.periode_last_issue = None

    self.start_pid = start_pid

    self.build_language_detector()

    # HTTP responses are cached on disk for 30 days; the location, user agent and contact
    # e-mail come from the Django settings when they are defined there.
    self.session = CachedSession(
        backend=FileCache(
            getattr(settings, "REQUESTS_CACHE_LOCATION", None) or "/tmp/ptf_requests_cache",
            decode_content=False,
        ),
        headers={
            "User-Agent": getattr(settings, "REQUESTS_USER_AGENT", None) or "Mathdoc/1.0.0",
            "From": getattr(settings, "REQUESTS_EMAIL", None) or "",
        },
        expire_after=timedelta(days=30),
    )

    # The Periode lookup needs both the collection and the source, so the source comes first.
    self.source = self.get_or_create_source()
    self.periode = self.get_or_create_periode()


def build_language_detector(self):
    """
    Build the language detector used by detect_language and store it on the instance.

    The detector is built from all the languages known to lingua.
    """
    self.language_detector = LanguageDetectorBuilder.from_all_languages().build()


def parse_collection_content(self, content: str) -> list[IssueData]:
    """
    Parse the HTML content of the collection page and return the list of xissues.

    Override this function in a derived class. The base implementation finds no issue.

    :param content: the downloaded content of the collection page
    :return: the list of xissues found in the page (always empty here)
    """
    return []


def parse_issue_content(self, content: str, xissue: IssueData):
    """
    Parse the HTML content of an issue page and fill xissue.articles.

    Override this function in a derived class. The base implementation does nothing.

    CAVEAT: you are supposed to create the articles there. Please assign a PID to each article.
    The PID can be `a + article_index`, like this : `a0` `a21`

    :param content: the downloaded content of the issue page
    :param xissue: the xissue that collects the articles
    """


def parse_article_content(
    self, content: str, xissue: IssueData, xarticle: ArticleData, url: str, pid: str
):
    """
    Parse the HTML content of an article page and return the xarticle.

    Override this function in a derived class. The base implementation only assigns the pid.

    CAVEAT: you are supposed to assign the article pid again here.

    :param content: the downloaded content of the article page
    :param xissue: passed in case the article page has issue information (ex: publisher)
    :param xarticle: the xarticle to fill
    :param url: the url of the article page
    :param pid: the pid to assign to the article
    :return: the xarticle
    """
    # NOTE(review): the assignment target was missing from the reviewed copy ("= pid");
    # xarticle.pid is the reading the docstring and the parameter name point to.
    xarticle.pid = pid
    return xarticle


def crawl_collection(self):
    # TODO: Comments, filter
    """
    Download the collection page and list the issues to crawl.

    - get the HTML content of the collection_url
    - parse it with parse_collection_content to extract the list of issues
    - index the issues by pid
    - if start_pid was set in the constructor, keep only the issues from start_pid onwards,
      the pids being taken in sorted order

    The issues themselves are not crawled here.

    :return: a dict {pid: xissue}. It is empty when start_pid matches no issue.
    :raises RuntimeError: if the source is not set
    """
    if self.source is None:
        raise RuntimeError("ERROR: the source is not set")

    content = self.download_file(self.collection_url)
    xissues = self.parse_collection_content(content)

    # Some collections split the same volume in different pages
    # (Ex: Volume 6 (2000) and Volume 6 (1999)). Merging such xissues into one
    # (=> Volume 6 (1999-2000)) and filtering them on periode_begin / periode_end
    # are both disabled for now.

    # NOTE(review): this expression was truncated in the reviewed copy; indexing by the
    # xissue pid is what the start_pid filter below requires.
    xissues_dict = {str(xissue.pid): xissue for xissue in xissues}

    if self.start_pid is None:
        return xissues_dict

    # Filter the issues to crawl: skip everything sorted before start_pid
    filtered_xissues = {}
    started = False
    for pid in sorted(xissues_dict):
        started = started or pid == self.start_pid
        if started:
            filtered_xissues[pid] = xissues_dict[pid]

    return filtered_xissues


def crawl_issue(self, xissue: IssueData):
    """
    Crawl 1 web page of an issue.

    - get the HTML content of the issue (when the xissue has a url)
    - parse the HTML content to extract the list of articles and/or the issue metadata
    - crawl each article, dropping the ones for which crawl_article returns None
    - add the issue into the database, unless in test mode or no article is left

    :param xissue: the issue to crawl. Its articles are replaced by the crawled ones.
    """
    # Some source, like EuDML do not have a separate HTML pages for an issue's table of content.
    # The list of articles directly come from the collection HTML page: the xissue has no url attribute
    issue_url = getattr(xissue, "url", None)
    if issue_url:
        content = self.download_file(issue_url)
        self.parse_issue_content(content, xissue)

    crawled = (self.crawl_article(xarticle, xissue) for xarticle in xissue.articles)
    xissue.articles = [xarticle for xarticle in crawled if xarticle is not None]

    if not self.test_mode and len(xissue.articles) > 0:
        self.add_xissue_into_database(xissue)


def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
    """
    Crawl 1 article.

    When the xarticle has a url, its page is downloaded and parsed with parse_article_content,
    and the article url is recorded as an ExtLink. The title and the abstracts are then
    processed by process_article_metadata.

    :param xarticle: the article to crawl
    :param xissue: the issue of the article
    :return: the crawled xarticle, or None if parse_article_content returned None
    """
    # TODO : set pid in xarticle here instead of passing it to `parse_article_content`
    parsed_xarticle = xarticle
    if getattr(xarticle, "url", None):
        url = xarticle.url
        content = self.download_file(url)

        # NOTE(review): both placeholders were empty in the reviewed copy (f"{}_{}");
        # "<issue pid>_<article pid>" is the assumed original - confirm.
        pid = f"{xissue.pid}_{xarticle.pid}"

        parsed_xarticle = self.parse_article_content(content, xissue, xarticle, url, pid)
        # crawl_issue drops the articles for which None is returned, so a parser
        # that rejects an article must not crash here.
        if parsed_xarticle is None:
            return None

        # The parser may have replaced the article url
        if parsed_xarticle.url is not None:
            url = parsed_xarticle.url

        # ARTICLE URL as an ExtLink (to display the link in the article page)
        ext_link = create_extlink()
        ext_link["rel"] = "source"
        ext_link["location"] = url
        ext_link["metadata"] = self.source_domain
        parsed_xarticle.ext_links.append(ext_link)

    # The article title may have formulas surrounded with '$'
    return self.process_article_metadata(parsed_xarticle)


def process_article_metadata(self, xarticle: ArticleData):
    """
    Convert the tex title and abstracts of an article into their html and xml forms.

    Formulas are located with delimiter_inline_formula and delimiter_disp_formula.
    Only the abstracts whose tag is "abstract" are converted. An abstract in the language
    of the article becomes an <abstract>, any other one a <trans-abstract>.

    :param xarticle: the article to process. It is modified in place.
    :return: the xarticle
    """
    html, xml = get_html_and_xml_from_text_with_formulas(
        xarticle.title_tex,
        delimiter_inline=self.delimiter_inline_formula,
        delimiter_disp=self.delimiter_disp_formula,
    )
    xarticle.title_html = html
    xarticle.title_xml = get_title_xml(xml, with_tex_values=False)

    # abstract may have formulas surrounded with '$'
    for xabstract in xarticle.abstracts:
        if xabstract["tag"] != "abstract":
            continue

        html, xml = get_html_and_xml_from_text_with_formulas(
            xabstract["value_tex"],
            delimiter_inline=self.delimiter_inline_formula,
            delimiter_disp=self.delimiter_disp_formula,
        )
        xabstract["value_html"] = html

        lang = xabstract["lang"]
        if lang == xarticle.lang:
            xabstract["value_xml"] = f'<abstract xml:lang="{lang}">{xml}</abstract>'
        else:
            xabstract["value_xml"] = f'<trans-abstract xml:lang="{lang}">{xml}</trans-abstract>'

    update_data_for_jats(xarticle)

    return xarticle


def get(self, url: str):
    """
    Fetch a url with the crawler session, with at most 3 attempts.

    When the session is a CachedSession and the url is not in its cache yet, the call waits
    until next_allowed_request, then pushes it 5 seconds into the future.

    :param url: the url to fetch
    :return: the response, whose status is always ok
    :raises requests.exceptions.HTTPError: if the 3 attempts failed. The last underlying
        error is chained as the cause.
    """
    # For SSL Errors, verification can be turned off per host.
    # NOTE(review): the host prefix was blank in the reviewed copy, and startswith("")
    # matches every url, which disabled certificate verification for all requests.
    # The hosts are now read from an optional `insecure_url_prefixes` attribute
    # (empty by default): restore the original host there.
    insecure_prefixes = tuple(
        prefix for prefix in getattr(self, "insecure_url_prefixes", ()) if prefix
    )
    verify = not url.startswith(insecure_prefixes)
    headers = {"accept_encoding": "utf-8"}

    last_error = None
    for _ in range(3):
        # If we already have a key, we can skip the timeout
        if isinstance(self.session, CachedSession) and not self.session.cache.contains(url=url):
            delta = self.next_allowed_request - time.time()
            if delta > 0:
                time.sleep(delta)
            self.next_allowed_request = time.time() + 5

        try:
            response = self.session.get(url, headers=headers, verify=verify)
            if not response.ok:
                raise requests.exceptions.HTTPError(
                    f"Endpoint answered with code {response.status_code} : {url}",
                    response=response,
                )
            return response
        except (
            requests.ConnectionError,
            requests.ConnectTimeout,
            requests.exceptions.HTTPError,
        ) as error:
            last_error = error

    raise requests.exceptions.HTTPError(f"Unable to download {url}") from last_error


def download_file(self, url: str):
    """
    Download a URL and return its decoded content.

    :param url: the url to download
    :return: the content, as decoded by decode_response
    :raises requests.exceptions.HTTPError: if the download failed or the content is empty
    """
    response = self.get(url)
    content = self.decode_response(response)
    if not content:
        # The response goes in the `response` keyword so that the message stays readable
        raise requests.exceptions.HTTPError(f"Empty content for {url}", response=response)
    return content


def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
    """
    Return the text of a response, decoded with the given encoding.

    Override this if the content-type headers from the sources are advertising something
    else than the actual content. SASA needs this.

    :param response: the response to decode. Its encoding attribute is overwritten.
    :param encoding: the encoding to force on the response
    :return: the decoded text
    """
    response.encoding = encoding
    return response.text


def add_xissue_into_database(self, xissue: IssueData):
    """
    Attach the collection and the publisher to an issue, then add or update it in the database.

    The database command is attempted at most 3 times, 10 seconds apart, when Solr fails.

    :param xissue: the issue to store. It is modified in place.
    """
    xissue.journal = self.collection

    # NOTE(review): the "xpub.name" target and the timestamp value below were missing from
    # the reviewed copy; they are restored from the attribute name and the file's
    # django.utils.timezone import - confirm.
    xpub = create_publisherdata()
    xpub.name = self.publisher
    xissue.publisher = xpub
    xissue.last_modified_iso_8601_date_str = timezone.now().isoformat()

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            params = {"xissue": xissue, "use_body": False}
            cmd = xml_cmds.addOrUpdateIssueXmlCmd(params)
            # NOTE(review): the line between the command creation and the success flag was
            # missing from the reviewed copy; executing the command is the assumed original.
            cmd.do()
            return
        except SolrError:
            # Wait before the next attempt; pointless once the attempts are used up.
            # NOTE(review): as before, the failure is dropped silently after the last attempt.
            if attempt < max_attempts:
                time.sleep(10)


def get_metadata_using_citation_meta(
    self,
    xarticle: ArticleData,
    xissue: IssueData,
    soup: BeautifulSoup,
    what: list[CitationLiteral] | None = None,
):
    """
    Collect article metadata from the <meta name="citation_..."> tags of an article page.

    :param xarticle: the xarticle that will collect the metadata
    :param xissue: the xissue that will collect the publisher
    :param soup: the BeautifulSoup object of the article page
    :param what: list of citation_ items to collect. Nothing is collected when omitted.
    :return: None. The given article is modified
    """
    wanted = what if what is not None else []

    def meta_content(name: str) -> str | None:
        """Return the content of the first <meta name=...> tag, or None if it is not a string."""
        node = soup.select_one(f"meta[name='{name}']")
        if not node:
            return None
        content = node.get("content")
        return content if isinstance(content, str) else None

    if "title" in wanted:
        # TITLE
        title = meta_content("citation_title")
        if title is not None:
            xarticle.title_tex = title

    if "author" in wanted:
        # AUTHORS
        for citation_author_node in soup.select("meta[name='citation_author']"):
            text_author = citation_author_node.get("content")
            if not isinstance(text_author, str):
                continue
            author = create_contributor()
            author["role"] = "author"
            author["string_name"] = text_author
            xarticle.contributors.append(author)

    if "pdf" in wanted:
        # PDF
        pdf_url = meta_content("citation_pdf_url")
        if pdf_url is not None:
            add_pdf_link_to_xarticle(xarticle, pdf_url)

    if "lang" in wanted:
        # LANG
        # TODO: check other language code
        content_text = meta_content("citation_language")
        if content_text is not None:
            xarticle.lang = standardize_tag(content_text)

    if "abstract" in wanted:
        # ABSTRACT: the first paragraph of the "entry-content" div, markup included
        abstract_node = soup.select_one("div.entry-content")
        if abstract_node is not None:
            abstract_section_node = abstract_node.select_one("p")
            if abstract_section_node:
                abstract = str(abstract_section_node)
                xarticle.abstracts.append(
                    {
                        "tag": "abstract",
                        "value_html": "",
                        "value_tex": abstract,
                        "value_xml": "",
                        "lang": self.detect_language(abstract, xarticle),
                    }
                )

    if "page" in wanted:
        # PAGES
        for meta_name, attribute in (
            ("citation_firstpage", "fpage"),
            ("citation_lastpage", "lpage"),
        ):
            page = meta_content(meta_name)
            if page is None:
                continue
            # Keep what comes before an opening parenthesis; overlong values are ignored
            page = page.split("(")[0]
            if len(page) < 32:
                setattr(xarticle, attribute, page)

    if "doi" in wanted:
        # DOI
        doi = meta_content("citation_doi")
        if doi is not None:
            doi = doi.strip()
            # Drop whatever precedes the "10." of the DOI (a resolver url for example)
            pos = doi.find("10.")
            if pos > 0:
                doi = doi[pos:]
            xarticle.doi = doi
            # NOTE(review): the assignment target was missing from the reviewed copy;
            # the article pid is the assumed original - confirm.
            xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")

    if "mr" in wanted:
        # MR
        mr = meta_content("citation_mr")
        if mr is not None:
            mr = mr.strip()
            # NOTE(review): nesting assumed to mirror the ZBL branch below
            # (the id is only stored when the prefix is present) - confirm.
            if mr.find("MR") == 0:
                mr = mr[2:]
                xarticle.extids.append(("mr-item-id", mr))

    if "zbl" in wanted:
        # ZBL
        zbl = meta_content("citation_zbl")
        if zbl is not None:
            zbl = zbl.strip()
            if zbl.find("Zbl") == 0:
                zbl = zbl[3:].strip()
                xarticle.extids.append(("zbl-item-id", zbl))

    if "publisher" in wanted:
        # PUBLISHER
        pub = meta_content("citation_publisher")
        if pub is not None:
            pub = pub.strip()
            if pub != "":
                xpub = create_publisherdata()
                # NOTE(review): the assignment target was missing from the reviewed copy;
                # the publisher name is the assumed original - confirm.
                xpub.name = pub
                xissue.publisher = xpub

    if "keywords" in wanted:
        # KEYWORDS
        kwds = meta_content("citation_keywords")
        if kwds is not None:
            for kwd in kwds.split(","):
                # Strip before testing so that whitespace-only entries are skipped too
                kwd = kwd.strip()
                if kwd == "":
                    continue
                xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})


def create_xissue(
    self, url: str | None, year: str, volume_number: str | None, issue_number: str | None = "1"
):
    """
    Create an xissue for the collection of the crawler.

    :param url: the url of the issue page, if any. One trailing "/" is removed.
    :param year: the year of the issue
    :param volume_number: the volume number, if any. Characters other than letters, digits
        and "-" are replaced with "_" in the volume of the issue.
    :param issue_number: the issue number, if any. Commas are replaced with "-".
    :return: the new xissue
    """
    if url is not None and url.endswith("/"):
        url = url[:-1]
    xissue = create_issuedata()
    xissue.url = url

    # NOTE(review): the assignment target was missing from the reviewed copy;
    # the issue pid is the assumed original - confirm.
    xissue.pid = self.get_issue_pid(self.collection_id, year, volume_number, issue_number)

    xissue.year = year

    if volume_number is not None:
        xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number)

    if issue_number is not None:
        xissue.number = issue_number.replace(",", "-")
    return xissue


def detect_language(self, text: str, article: ArticleData | None = None):
    """
    Return the language code to use for a text.

    The language of the article wins when it is known (set and different from "und").
    Otherwise the language is detected from the text.

    :param text: the text to analyse
    :param article: the article the text belongs to, if any
    :return: a language code, or "und" when the language could not be detected
    """
    if article and article.lang is not None and article.lang != "und":
        return article.lang

    language = self.language_detector.detect_language_of(text)

    if not language:
        return "und"
    # NOTE(review): the returned expression was missing from the reviewed copy (bare
    # "return"); the lowercase ISO 639-1 code of the detected language is the assumed
    # original - confirm.
    return language.iso_code_639_1.name.lower()


def get_or_create_periode(self):
    """
    Return the Periode of the crawler, creating it in the database if needed.

    The Periode already set on the crawler is returned as is. Otherwise the first Periode
    found for the collection and the source is returned, and a new one is created when
    there is none.

    :return: the Periode
    :raises ValueError: if the collection or the source is not set
    """
    if self.periode is not None:
        return self.periode

    if self.collection is None or self.source is None:
        raise ValueError("You need to set a collection or a source before creating a periode")

    periode = Periode.objects.filter(collection=self.collection, source=self.source).first()
    if periode is None:
        periode = Periode(
            collection=self.collection,
            source=self.source,
            title=self.collection.title_tex,
            issue_href=self.issue_href,
            collection_href=self.collection_url,
            doi_href="",
            published=False,
            begin=self.periode_begin,
            end=self.periode_end,
            first_issue=self.periode_first_issue,
            last_issue=self.periode_last_issue,
        )
        # NOTE(review): the line following the constructor was missing from the reviewed
        # copy; saving the new Periode is the assumed original - confirm.
        periode.save()

    return periode


@classmethod
def get_or_create_source(cls):
    """
    Return the Source whose domain is source_domain, creating it if needed.

    A new Source takes its name and website from the class attributes of the crawler.

    :return: the Source
    """
    source, created = Source.objects.get_or_create(
        domain=cls.source_domain,
        defaults={
            "name": cls.source_name,
            "website": cls.source_website,
            "create_xissue": True,
            "periode_href": "",
            "article_href": "",
            "pdf_href": "",
        },
    )
    if created:
        # NOTE(review): the body of this branch was missing from the reviewed copy.
        # An explicit save is the assumed original; it looks redundant, since
        # get_or_create already stores the object it creates - confirm before removing.
        source.save()
    return source


@staticmethod
def create_crawled_bibitem(value_xml: str):
    """
    Create a bibliography item from the xml of a citation.

    The xml is wrapped in a <mixed-citation>, checked with check_bibitem_xml, and a badge
    linking to each resolvable external id is appended to the html of the citation.

    :param value_xml: the xml content of the citation
    :return: the bibliography item
    """
    from html import escape

    xref = RefData(lang="en")
    # xref.citation_tex = "".join([e["value_tex"] for e in elements])

    xref.citation_xml = f'<mixed-citation xml:space="preserve">{value_xml}</mixed-citation>'
    xref = check_bibitem_xml(xref)

    # Bakes extlink badges into the bibliography html
    # Maybe we should put this into another file (jats_parser ?)
    for extid in xref.extids:
        href = resolve_id(extid[0], extid[1])
        if (not href) or (not xref.citation_html):
            continue
        str_format = extid[0]
        if str_format in extids_formats:
            str_format = extids_formats[str_format]
        # The url is quoted and escaped: an unquoted attribute value breaks as soon as
        # the url contains a space, a quote or "=" (query strings).
        xref.citation_html += (
            f' | <a href="{escape(href, quote=True)}"'
            f" class='badge bg-secondary rounded-pill ref-badge extid-badge'>{str_format}</a>"
        )

    return xref


@staticmethod
def create_bibliography(bibitems: Sequence[RefData]):
    """
    Build the "biblio" abstract of an article from its bibliography items.

    :param bibitems: the bibliography items, in display order
    :return: the value returned by create_abstract, holding the html, tex and xml forms of
        the bibliography (one item per line)
    """
    xml_lines = "".join(f"\t{item.citation_xml}\n" for item in bibitems)
    html_lines = "".join(f"\t<p>{item.citation_html}</p>\n" for item in bibitems)
    tex_lines = "".join(f"\t<p>{item.citation_tex}</p>\n" for item in bibitems)

    return create_abstract(
        tag="biblio",
        value_html=f"<div>\n{html_lines}</div>",
        value_tex=f"<div>\n{tex_lines}</div>",
        value_xml=f"<ref-list>\n{xml_lines}</ref-list>",
        lang="en",
    )


@staticmethod
def get_issue_pid(
    collection_id: str,
    year: str,
    volume_number: str | None = None,
    issue_number: str | None = None,
):
    """
    Build the pid of an issue.

    The parts that are set are joined with "_", in this order: collection id, year, volume
    number, issue number. The result goes through cleanup_str, then every run of characters
    other than ASCII letters, digits and "-" is replaced with one underscore.

    :param collection_id: the id of the collection
    :param year: the year of the issue
    :param volume_number: the volume number, if any
    :param issue_number: the issue number, if any
    :return: the pid
    """
    parts = [collection_id, year]
    if volume_number is not None:
        parts.append(volume_number)
    if issue_number is not None:
        parts.append(issue_number)
    pid = "_".join(f"{part}" for part in parts)
    return regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid))


672 @staticmethod 

673 def set_pages(article: ArticleData, pages: str, separator: str = "-"): 

674 pages_split = pages.split(separator) 

675 if len(pages_split) == 0: 675 ↛ 676line 675 didn't jump to line 676 because the condition on line 675 was never true

676 article.page_range = pages 

677 if len(pages_split) > 0: 677 ↛ exitline 677 didn't return from function 'set_pages' because the condition on line 677 was always true

678 if pages[0].isnumeric(): 

679 article.fpage = pages_split[0] 

680 if ( 

681 len(pages_split) > 1 

682 and pages_split[0] != pages_split[1] 

683 and pages_split[1].isnumeric() 

684 ): 

685 article.lpage = pages_split[1]