Coverage for src/crawler/by_source/edpsci_crawler.py: 85%

54 statements  

coverage.py v7.8.2, created at 2025-06-03 13:39 +0000

 1  from urllib.parse import urljoin
 2
 3  import regex
 4  from bs4 import BeautifulSoup
 5  from ptf.cmds.xml.jats.jats_parser import JatsBase
 6  from ptf.model_data import create_articledata
 7
 8  from crawler.base_crawler import BaseCollectionCrawler
 9  from crawler.utils import cleanup_str
10
11
12  class EdpsciCrawler(BaseCollectionCrawler):
13      source_name = "EDP Sciences"
14      source_domain = "EDPSCI"
15      source_website = "https://www.edpsciences.org/"
16
17      issue_re = r"Vol\. (?P<volume>\d+)(?: \/ (?P<number>[\d \w]+))? \- (?:\w+ )?(?P<year>\d+)"
18      pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"
19
20      def parse_collection_content(self, content):
21          xissues = []
22          soup = BeautifulSoup(content, "html.parser")
23          issues = soup.select("#issues > .year > .bloc > .volume > .issues-url > .ico.free + a")
24          for issue in issues:
25              issue_url = issue.get("href")
26              if not isinstance(issue_url, str):  [26 ↛ 27: condition on line 26 was never true]
27                  raise ValueError("Couldn't find issue url")
28              issue_search = regex.search(self.issue_re, cleanup_str(issue.text))
29              if not issue_search:  [29 ↛ 30: condition on line 29 was never true]
30                  raise ValueError("Couldn't parse volume title")
31              issue_dict = issue_search.groupdict()
32              xissues.append(
33                  self.create_xissue(
34                      urljoin(self.collection_url, issue_url),
35                      issue_dict["year"],
36                      issue_dict["volume"],
37                      issue_dict.get("number", None),
38                  )
39              )
40          return xissues
41
42      def parse_issue_content(self, content, xissue):
43          soup = BeautifulSoup(content, "html.parser")
44          articles = soup.select(".article_title")
45          if not xissue.url:  [45 ↛ 46: condition on line 45 was never true]
46              raise ValueError("EdpsciCrawler needs issue url to parse issue content")
47
48          for index, article_tag in enumerate(articles):
49              article_url = article_tag.get("href")
50              if not isinstance(article_url, str):  [50 ↛ 51: condition on line 50 was never true]
51                  raise ValueError("Couldn't parse article url")
52              xarticle = create_articledata()
53              xarticle.url = urljoin(xissue.url, article_url)
54              xarticle.pid = "a" + str(index)
55
56              xissue.articles.append(xarticle)
57
58      def parse_article_content(self, content, xissue, xarticle, url):
59          soup = BeautifulSoup(content, "html.parser")
60          self.get_metadata_using_citation_meta(
61              xarticle,
62              xissue,
63              soup,
64              [
65                  "title",
66                  "publisher",
67                  "author",
68                  "doi",
69                  "pdf",
70                  "lang",
71                  "keywords",
72                  "abstract",
73                  "references",
74              ],
75          )
76          if len(xarticle.bibitems) > 0:
77              xarticle.abstracts.append(JatsBase.compile_refs(xarticle.bibitems))
78
79          # Article number
80          article_number = soup.select_one(
81              ".summary tr th:-soup-contains-own('Article Number') ~ td:last-of-type"
82          )
83          if article_number:  [83 ↛ 84: condition on line 83 was never true]
84              xarticle.article_number = article_number.text
85
86          # Pages
87          pages = soup.select_one(".summary tr th:-soup-contains-own('Page(s)') ~ td:last-of-type")
88          if pages:  [88 ↛ 92: condition on line 88 was always true]
89              pages_splitted = pages.text.split(" - ")
90              xarticle.fpage = pages_splitted[0]
91              xarticle.lpage = pages_splitted[1]
92          return xarticle
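
The two regular expressions on lines 17-18 and the ":-soup-contains-own" selectors on lines 81 and 87 carry most of the parsing logic. Below is a minimal sketch of how they behave; the issue label, pages label, and summary-table fragment are hypothetical stand-ins, and the real EDP Sciences markup may differ.

import regex
from bs4 import BeautifulSoup

issue_re = r"Vol\. (?P<volume>\d+)(?: \/ (?P<number>[\d \w]+))? \- (?:\w+ )?(?P<year>\d+)"
pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"

# Hypothetical issue label as it might appear on a collection page.
print(regex.search(issue_re, "Vol. 57 / 2 - June 2023").groupdict())
# {'volume': '57', 'number': '2', 'year': '2023'}

# Hypothetical pages label; pages_re expects an en dash between the page numbers.
print(regex.search(pages_re, "Pages: 123–145").groupdict())
# {'fpage': '123', 'lpage': '145'}

# Hypothetical summary table; ':-soup-contains-own' (a soupsieve pseudo-class)
# matches on the th's own text, and '~ td:last-of-type' then picks the value cell.
html = "<table class='summary'><tr><th>Page(s)</th><td>123 - 145</td></tr></table>"
soup = BeautifulSoup(html, "html.parser")
pages = soup.select_one(".summary tr th:-soup-contains-own('Page(s)') ~ td:last-of-type")
print(pages.text.split(" - "))
# ['123', '145']

The named groups of issue_re line up with what parse_collection_content passes to create_xissue (year, volume, number); the Page(s) cell itself is split on " - " in parse_article_content rather than matched with pages_re.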