Coverage for src/crawler/by_source/edpsci_crawler.py: 86%

44 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str


class EdpsciCrawler(BaseCollectionCrawler):
    source_name = "EDP Sciences"
    source_domain = "EDPSCI"
    source_website = "https://www.edpsciences.org/"

    issue_re = r"Vol\. (?P<volume>\d+)(?: \/ (?P<number>[\d \w]+))? \- (?:\w+ )?(?P<year>\d+)"
    pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"
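
    # Hypothetical samples these patterns appear built to match, inferred from
    # the named groups (illustrative only, not captured from the site):
    #   "Vol. 12 / 3 - June 2021" -> volume="12", number="3", year="2021"
    #   "Vol. 7 - 2005"           -> volume="7", number=None, year="2005"
    #   "Pages: 101–115"          -> fpage="101", lpage="115"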

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("#issues > .year > .bloc > .volume > .issues-url > .ico.free + a")
        for issue in issues:
            issue_url = issue.get("href")
            if not isinstance(issue_url, str):  # coverage: never true in the recorded run
                raise ValueError("Couldn't find issue url")
            issue_search = regex.search(self.issue_re, cleanup_str(issue.text))
            if not issue_search:  # coverage: never true in the recorded run
                raise ValueError("Couldn't parse volume title")
            issue_dict = issue_search.groupdict()
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issue_url),
                    issue_dict["year"],
                    issue_dict["volume"],
                    issue_dict.get("number", None),
                )
            )
        return xissues
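
    # Collection pages are presumably shaped roughly like this (hypothetical
    # sketch reconstructed from the selector above, not captured markup):
    #   <div id="issues">
    #     <div class="year"><div class="bloc"><div class="volume">
    #       <div class="issues-url">
    #         <span class="ico free"></span><a href="/issue/123">Vol. 12 / 3 - June 2021</a>
    #       </div>
    #     </div></div></div>
    #   </div>
    # ".ico.free + a" matches only anchors directly preceded by the free-access
    # icon, so paywalled issues are skipped.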

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select(".article_title")
        if not xissue.url:  # coverage: never true in the recorded run
            raise ValueError("EdpsciCrawler needs issue url to parse issue content")

        for index, article_tag in enumerate(articles):
            article_url = article_tag.get("href")
            if not isinstance(article_url, str):  # coverage: never true in the recorded run
                raise ValueError("Couldn't parse article url")
            xarticle = create_articledata()
            xarticle.url = urljoin(xissue.url, article_url)
            xarticle.pid = "a" + str(index)

            xissue.articles.append(xarticle)
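
    # Hypothetical issue-page markup this method expects (only the class name
    # comes from the selector above):
    #   <a class="article_title" href="articles/xyz/abs">Some article title</a>
    # Relative hrefs are resolved against xissue.url, and articles get
    # positional pids ("a0", "a1", ...).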

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "title",
                "publisher",
                "author",
                "doi",
                "pdf",
                "lang",
                "keywords",
                "abstract",
                "page",
                "references",
            ],
        )
        xarticle.abstracts.append(self.create_bibliography(xarticle.bibitems))
        return super().parse_article_content(content, xissue, xarticle, url, pid)