Coverage for src/crawler/by_source/edpsci_crawler.py: 85% (54 statements)
coverage.py v7.8.2, created at 2025-06-03 13:39 +0000
from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import create_articledata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str


class EdpsciCrawler(BaseCollectionCrawler):
    source_name = "EDP Sciences"
    source_domain = "EDPSCI"
    source_website = "https://www.edpsciences.org/"

    issue_re = r"Vol\. (?P<volume>\d+)(?: \/ (?P<number>[\d \w]+))? \- (?:\w+ )?(?P<year>\d+)"
    pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"
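    # Illustrative examples (editor's note; the sample strings are invented,
    # only the patterns come from this class): issue_re should match labels
    # like "Vol. 58 / 2 - June 2024" (volume="58", number="2", year="2024")
    # or "Vol. 58 - 2024" (number absent), and pages_re should match
    # "Pages: 101–120" (fpage="101", lpage="120").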

    def parse_collection_content(self, content):
        """Parse the collection page and return one xissue per linked issue."""
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("#issues > .year > .bloc > .volume > .issues-url > .ico.free + a")
        for issue in issues:
            issue_url = issue.get("href")
            if not isinstance(issue_url, str):  # coverage: condition never true in tests
                raise ValueError("Couldn't find issue url")
            issue_search = regex.search(self.issue_re, cleanup_str(issue.text))
            if not issue_search:  # coverage: condition never true in tests
                raise ValueError("Couldn't parse volume title")
            issue_dict = issue_search.groupdict()
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issue_url),
                    issue_dict["year"],
                    issue_dict["volume"],
                    issue_dict.get("number", None),
                )
            )
        return xissues
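
    # The collection selector above assumes issue listings shaped roughly like
    # this (hypothetical markup reconstructed from the selector, not copied
    # from the site):
    #
    #   <div id="issues"><div class="year"><div class="bloc"><div class="volume">
    #     <div class="issues-url">
    #       <span class="ico free"></span>
    #       <a href="/issue-url">Vol. 58 / 2 - June 2024</a>
    #     </div>
    #   </div></div></div></div>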

    def parse_issue_content(self, content, xissue):
        """Parse an issue page and append one xarticle stub per article link."""
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select(".article_title")
        if not xissue.url:  # coverage: condition never true in tests
            raise ValueError("EdpsciCrawler needs issue url to parse issue content")

        for index, article_tag in enumerate(articles):
            article_url = article_tag.get("href")
            if not isinstance(article_url, str):  # coverage: condition never true in tests
                raise ValueError("Couldn't parse article url")
            xarticle = create_articledata()
            xarticle.url = urljoin(xissue.url, article_url)
            xarticle.pid = "a" + str(index)

            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Parse an article page, filling xarticle from its citation meta tags."""
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "title",
                "publisher",
                "author",
                "doi",
                "pdf",
                "lang",
                "keywords",
                "abstract",
                "references",
            ],
        )
        if len(xarticle.bibitems) > 0:
            xarticle.abstracts.append(JatsBase.compile_refs(xarticle.bibitems))
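
        # The two selectors below assume a metadata table shaped roughly like
        # this (hypothetical markup reconstructed from the selectors):
        #
        #   <table class="summary">
        #     <tr><th>Article Number</th><td>12</td></tr>
        #     <tr><th>Page(s)</th><td>101 - 120</td></tr>
        #   </table>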

        # Article number
        article_number = soup.select_one(
            ".summary tr th:-soup-contains-own('Article Number') ~ td:last-of-type"
        )
        if article_number:  # coverage: condition never true in tests
            xarticle.article_number = article_number.text

        # Pages
        pages = soup.select_one(".summary tr th:-soup-contains-own('Page(s)') ~ td:last-of-type")
        if pages:  # coverage: condition always true in tests
            pages_splitted = pages.text.split(" - ")
            xarticle.fpage = pages_splitted[0]
            xarticle.lpage = pages_splitted[1]
        return xarticle
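

# ---------------------------------------------------------------------------
# Editor's illustration (not part of the crawled module, kept as comments so
# the coverage listing above stays intact): a minimal sketch of how issue_re
# decomposes an issue label. The sample string is invented; only the pattern
# comes from the class above.
#
#   import regex
#
#   m = regex.search(EdpsciCrawler.issue_re, "Vol. 58 / 2 - June 2024")
#   assert m is not None
#   assert m.groupdict() == {"volume": "58", "number": "2", "year": "2024"}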