Coverage for src/crawler/by_source/edpsci_crawler.py: 85%

47 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-08-29 13:43 +0000

1from urllib.parse import urljoin 

2 

3from bs4 import BeautifulSoup 

4from ptf.model_data import create_articledata 

5 

6from crawler.base_crawler import BaseCollectionCrawler 

7from crawler.utils import cleanup_str, regex_to_dict 

8 

9 

class EdpsciCrawler(BaseCollectionCrawler):
    """Crawler for journals hosted on EDP Sciences (https://www.edpsciences.org/)."""

    source_name = "EDP Sciences"
    source_domain = "EDPSCI"
    source_website = "https://www.edpsciences.org/"

    # Issue title, e.g. "Vol. 12 / 3 - June 2020" — the "/ number" part and the
    # month word are both optional.
    issue_re = r"Vol\. (?P<volume>\d+)(?: \/ (?P<number>[\d \w]+))? \- (?:\w+ )?(?P<year>\d+)"
    # Page range with an en dash, e.g. "Pages: 1–20".
    pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"

    def parse_collection_content(self, content):
        """Parse a collection (journal) page and return its issues.

        content: HTML of the collection page.
        Returns a list of xissue objects, one per freely-available issue.
        Raises ValueError when an issue link has no href, or when its title
        does not match ``issue_re``.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        # Only open-access issues: the <a> immediately following the ".ico.free" marker.
        issues = soup.select("#issues > .year > .bloc > .volume > .issues-url > .ico.free + a")
        for issue in issues:
            issue_url = issue.get("href")
            if not isinstance(issue_url, str):
                raise ValueError("Couldn't find issue url")
            issue_dict = regex_to_dict(
                self.issue_re, cleanup_str(issue.text), error_msg="Couldn't parse volume title"
            )
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issue_url),
                    issue_dict["year"],
                    issue_dict["volume"],
                    # "number" group is optional in issue_re.
                    issue_dict.get("number", None),
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        """Parse an issue page and append one article stub per article to xissue.

        content: HTML of the issue page.
        xissue: issue object to populate; must carry the issue url so relative
            article links can be resolved.
        Raises ValueError when xissue has no url or an article link has no href.
        """
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select(".article_title")
        if not xissue.url:
            raise ValueError("EdpsciCrawler needs issue url to parse issue content")

        for index, article_tag in enumerate(articles):
            article_url = article_tag.get("href")
            if not isinstance(article_url, str):
                raise ValueError("Couldn't parse article url")
            xarticle = create_articledata()
            xarticle.url = urljoin(xissue.url, article_url)
            # Provisional pid from the article's position within the issue.
            xarticle.pid = f"a{index}"

            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Parse an article page into xarticle and return it.

        Pulls the citation_* meta tags via the base-class helper, then scrapes
        the summary table for the article number and the page range.
        """
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "title",
                "publisher",
                "author",
                "doi",
                "pdf",
                "lang",
                "keywords",
                "abstract",
                "references",
            ],
        )

        # Article number
        article_number = soup.select_one(
            ".summary tr th:-soup-contains-own('Article Number') ~ td:last-of-type"
        )
        if article_number:
            xarticle.article_number = article_number.text

        # Pages: usually "<fpage> - <lpage>", but one-page articles list a
        # single number with no " - " separator.
        pages = soup.select_one(".summary tr th:-soup-contains-own('Page(s)') ~ td:last-of-type")
        if pages:
            pages_splitted = pages.text.split(" - ")
            xarticle.fpage = pages_splitted[0]
            # Fix: the original indexed [1] unconditionally and raised
            # IndexError on single-page articles; only set lpage when present.
            if len(pages_splitted) > 1:
                xarticle.lpage = pages_splitted[1]
        return xarticle