Coverage for src/crawler/by_source/edpsci_crawler.py: 85% (53 statements)
coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str


class EdpsciCrawler(BaseCollectionCrawler):
    source_name = "EDP Sciences"
    source_domain = "EDPSCI"
    source_website = "https://www.edpsciences.org/"

    issue_re = r"Vol\. (?P<volume>\d+)(?: \/ (?P<number>[\d \w]+))? \- (?:\w+ )?(?P<year>\d+)"
    pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"
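
    # Illustrative inputs these patterns are intended to match (assumed formats,
    # not sampled from the live site): issue_re for issue headings such as
    # "Vol. 57 / 2 - June 2021" or "Vol. 12 - 2001", pages_re for summary text
    # such as "Pages: 101–125" (note the en dash between page numbers).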

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("#issues > .year > .bloc > .volume > .issues-url > .ico.free + a")
        for issue in issues:
            issue_url = issue.get("href")
            if not isinstance(issue_url, str):  # coverage: never true during tests
                raise ValueError("Couldn't find issue url")
            issue_search = regex.search(self.issue_re, cleanup_str(issue.text))
            if not issue_search:  # coverage: never true during tests
                raise ValueError("Couldn't parse volume title")
            issue_dict = issue_search.groupdict()
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issue_url),
                    issue_dict["year"],
                    issue_dict["volume"],
                    issue_dict.get("number", None),
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select(".article_title")
        if not xissue.url:  # coverage: never true during tests
            raise ValueError("EdpsciCrawler needs issue url to parse issue content")

        for index, article_tag in enumerate(articles):
            article_url = article_tag.get("href")
            if not isinstance(article_url, str):  # coverage: never true during tests
                raise ValueError("Couldn't parse article url")
            xarticle = create_articledata()
            xarticle.url = urljoin(xissue.url, article_url)
            xarticle.pid = "a" + str(index)

            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "title",
                "publisher",
                "author",
                "doi",
                "pdf",
                "lang",
                "keywords",
                "abstract",
                "references",
            ],
        )
        if len(xarticle.bibitems) > 0:
            xarticle.abstracts.append(self.create_bibliography(xarticle.bibitems))

        # Article number
        article_number = soup.select_one(
            ".summary tr th:-soup-contains-own('Article Number') ~ td:last-of-type"
        )
        if article_number:  # coverage: never true during tests
            xarticle.article_number = article_number.text

        # Pages
        pages = soup.select_one(".summary tr th:-soup-contains-own('Page(s)') ~ td:last-of-type")
        if pages:  # coverage: always true during tests
            pages_splitted = pages.text.split(" - ")
            xarticle.fpage = pages_splitted[0]
            xarticle.lpage = pages_splitted[1]
        return xarticle
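

# A quick, self-contained sanity check of the patterns and selectors used above
# (a sketch: the sample heading, pages string, and HTML snippet below are
# illustrative assumptions, not taken from the EDP Sciences site).
if __name__ == "__main__":
    # Issue headings like "Vol. 57 / 2 - June 2021" yield volume/number/year groups.
    print(regex.search(EdpsciCrawler.issue_re, "Vol. 57 / 2 - June 2021").groupdict())
    # -> {'volume': '57', 'number': '2', 'year': '2021'}

    # pages_re expects an en dash between the first and last page numbers.
    print(regex.search(EdpsciCrawler.pages_re, "Pages: 101–125").groupdict())
    # -> {'fpage': '101', 'lpage': '125'}

    # The ":-soup-contains-own" pseudo-class (soupsieve) selects the <td> that
    # follows the matching <th> in an article summary table.
    html = "<table class='summary'><tr><th>Page(s)</th><td>101 - 125</td></tr></table>"
    demo_soup = BeautifulSoup(html, "html.parser")
    cell = demo_soup.select_one(".summary tr th:-soup-contains-own('Page(s)') ~ td:last-of-type")
    print(cell.text.split(" - "))
    # -> ['101', '125']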