Coverage for src/crawler/by_source/edpsci_crawler.py: 86%

49 statements  

coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

from urllib.parse import urljoin

from bs4 import BeautifulSoup
from ptf.model_data import create_articledata
from ptf.utils import settings

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str, regex_to_dict


class EdpsciCrawler(BaseCollectionCrawler):
    source_name = "EDP Sciences"
    source_domain = "EDPSCI"
    source_website = "https://www.edpsciences.org/"

    issue_re = r"Vol\. (?P<volume>\d+)(?: \/ (?P<number>[\d \w]+))? \- (?:\w+ )?(?P<year>\d+)"
    pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"
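
    # Illustrative inputs (assumed examples, not taken from the live site):
    #   issue_re matches "Vol. 59 / 2 - June 2023" -> volume="59", number="2", year="2023"
    #   pages_re matches "Pages: 1–15" -> fpage="1", lpage="15"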


    requests_interval = max(getattr(settings, "REQUESTS_INTERVAL", 90), 30)

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("#issues > .year > .bloc > .volume > .issues-url > .ico.free + a")
        for issue in issues:
            issue_url = issue.get("href")
            if not isinstance(issue_url, str):  # coverage: branch never taken (condition was never true)
                raise ValueError("Couldn't find issue url")
            issue_dict = regex_to_dict(
                self.issue_re, cleanup_str(issue.text), error_msg="Couldn't parse volume title"
            )
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issue_url),
                    issue_dict["year"],
                    issue_dict["volume"],
                    issue_dict.get("number", None),
                )
            )
        return xissues


    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select(".article_title")
        if not xissue.url:  # coverage: branch never taken (condition was never true)
            raise ValueError("EdpsciCrawler needs issue url to parse issue content")

        for index, article_tag in enumerate(articles):
            article_url = article_tag.get("href")
            if not isinstance(article_url, str):  # coverage: branch never taken (condition was never true)
                raise ValueError("Couldn't parse article url")
            xarticle = create_articledata()
            xarticle.url = urljoin(xissue.url, article_url)
            xarticle.pid = "a" + str(index)

            xissue.articles.append(xarticle)
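
    # Illustration (assumed markup): for the second <a class="article_title" href="/articles/a123">
    # on an issue page, the stub above gets pid "a1" and url urljoin(xissue.url, "/articles/a123");
    # the remaining metadata is filled in by parse_article_content below.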


    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "title",
                "publisher",
                "author",
                "doi",
                "pdf",
                "lang",
                "keywords",
                "abstract",
                "references",
            ],
        )

        # Article number
        article_number = soup.select_one(
            ".summary tr th:-soup-contains-own('Article Number') ~ td:last-of-type"
        )
        if article_number:  # coverage: branch never taken (condition was never true)
            xarticle.article_number = article_number.text

        # Pages
        pages = soup.select_one(".summary tr th:-soup-contains-own('Page(s)') ~ td:last-of-type")
        if pages:  # coverage: only this branch taken (condition was always true)
            pages_splitted = pages.text.split(" - ")
            xarticle.fpage = pages_splitted[0]
            xarticle.lpage = pages_splitted[1]
        return xarticle
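
A minimal sketch of how the collection-page scraping fits together, using hypothetical markup that mirrors the CSS selector in parse_collection_content. The page structure, issue URL and volume label are assumptions, and .strip() stands in for cleanup_str:

import re

from bs4 import BeautifulSoup

# Hypothetical issue listing reproducing only the nesting the selector expects.
html = """
<div id="issues"><div class="year"><div class="bloc"><div class="volume">
  <div class="issues-url"><span class="ico free"></span>
    <a href="/issue/vol59-2">Vol. 59 / 2 - June 2023</a></div>
</div></div></div></div>
"""
soup = BeautifulSoup(html, "html.parser")
link = soup.select_one("#issues > .year > .bloc > .volume > .issues-url > .ico.free + a")
issue_re = r"Vol\. (?P<volume>\d+)(?: \/ (?P<number>[\d \w]+))? \- (?:\w+ )?(?P<year>\d+)"
match = re.search(issue_re, link.text.strip())
print(link.get("href"))    # /issue/vol59-2
print(match.groupdict())   # {'volume': '59', 'number': '2', 'year': '2023'}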