Coverage for src/crawler/by_source/edpsci_crawler.py: 85%
47 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-08-29 13:43 +0000
1from urllib.parse import urljoin
3from bs4 import BeautifulSoup
4from ptf.model_data import create_articledata
6from crawler.base_crawler import BaseCollectionCrawler
7from crawler.utils import cleanup_str, regex_to_dict
class EdpsciCrawler(BaseCollectionCrawler):
    """Crawler for collections hosted by EDP Sciences.

    Parses the publisher's collection page into issues, each issue page into
    article stubs, and each article page into full article metadata via the
    citation ``<meta>`` tags plus a summary table scrape.
    """

    source_name = "EDP Sciences"
    source_domain = "EDPSCI"
    source_website = "https://www.edpsciences.org/"

    # Issue title, e.g. "Vol. 12 / 3 - March 2020" (the "/ number" and the
    # month word are both optional).
    issue_re = r"Vol\. (?P<volume>\d+)(?: \/ (?P<number>[\d \w]+))? \- (?:\w+ )?(?P<year>\d+)"
    pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"

    def parse_collection_content(self, content):
        """Extract the list of issues from the collection page HTML.

        Only freely-accessible issues are picked up (the selector requires an
        ``.ico.free`` sibling before the link). Raises ValueError when a link
        has no href or its title does not match ``issue_re``.
        """
        page = BeautifulSoup(content, "html.parser")
        issue_links = page.select("#issues > .year > .bloc > .volume > .issues-url > .ico.free + a")

        result = []
        for link in issue_links:
            href = link.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't find issue url")

            # Pull volume / number / year out of the link text.
            parsed = regex_to_dict(
                self.issue_re, cleanup_str(link.text), error_msg="Couldn't parse volume title"
            )
            xissue = self.create_xissue(
                urljoin(self.collection_url, href),
                parsed["year"],
                parsed["volume"],
                parsed.get("number", None),
            )
            result.append(xissue)
        return result

    def parse_issue_content(self, content, xissue):
        """Populate ``xissue.articles`` with stubs found on the issue page.

        Each stub gets a positional pid ("a0", "a1", …) and an absolute URL
        resolved against the issue URL. Raises ValueError when the issue has
        no URL or an article link has no href.
        """
        page = BeautifulSoup(content, "html.parser")
        article_links = page.select(".article_title")
        if not xissue.url:
            raise ValueError("EdpsciCrawler needs issue url to parse issue content")

        for position, link in enumerate(article_links):
            href = link.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article url")

            xarticle = create_articledata()
            xarticle.url = urljoin(xissue.url, href)
            xarticle.pid = "a" + str(position)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Fill in article metadata from the article page and return it.

        Core metadata comes from the citation ``<meta>`` tags; the article
        number and page range are scraped from the summary table.
        """
        page = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            page,
            [
                "title",
                "publisher",
                "author",
                "doi",
                "pdf",
                "lang",
                "keywords",
                "abstract",
                "references",
            ],
        )

        # Article number, from the summary table (may be absent).
        number_cell = page.select_one(
            ".summary tr th:-soup-contains-own('Article Number') ~ td:last-of-type"
        )
        if number_cell:
            xarticle.article_number = number_cell.text

        # Page range, from the summary table, formatted "fpage - lpage".
        pages_cell = page.select_one(".summary tr th:-soup-contains-own('Page(s)') ~ td:last-of-type")
        if pages_cell:
            first, last = pages_cell.text.split(" - ")[0], pages_cell.text.split(" - ")[1]
            xarticle.fpage = first
            xarticle.lpage = last
        return xarticle