Coverage for src/crawler/by_source/edpsci_crawler.py: 86%

50 statements  

coverage.py v7.8.2, created at 2025-06-16 07:44 +0000

 1  from urllib.parse import urljoin
 2
 3  from bs4 import BeautifulSoup
 4  from ptf.cmds.xml.jats.jats_parser import JatsBase
 5  from ptf.model_data import create_articledata
 6
 7  from crawler.base_crawler import BaseCollectionCrawler
 8  from crawler.utils import cleanup_str, regex_to_dict
 9
10
11  class EdpsciCrawler(BaseCollectionCrawler):
12      source_name = "EDP Sciences"
13      source_domain = "EDPSCI"
14      source_website = "https://www.edpsciences.org/"
15
16      issue_re = r"Vol\. (?P<volume>\d+)(?: \/ (?P<number>[\d \w]+))? \- (?:\w+ )?(?P<year>\d+)"
17      pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"
18
19      def parse_collection_content(self, content):
20          xissues = []
21          soup = BeautifulSoup(content, "html.parser")
22          issues = soup.select("#issues > .year > .bloc > .volume > .issues-url > .ico.free + a")
23          for issue in issues:
24              issue_url = issue.get("href")
25              if not isinstance(issue_url, str):  [25 ↛ 26: the condition on line 25 was never true]
26                  raise ValueError("Couldn't find issue url")
27              issue_dict = regex_to_dict(
28                  self.issue_re, cleanup_str(issue.text), error_msg="Couldn't parse volume title"
29              )
30              xissues.append(
31                  self.create_xissue(
32                      urljoin(self.collection_url, issue_url),
33                      issue_dict["year"],
34                      issue_dict["volume"],
35                      issue_dict.get("number", None),
36                  )
37              )
38          return xissues
39
40      def parse_issue_content(self, content, xissue):
41          soup = BeautifulSoup(content, "html.parser")
42          articles = soup.select(".article_title")
43          if not xissue.url:  [43 ↛ 44: the condition on line 43 was never true]
44              raise ValueError("EdpsciCrawler needs issue url to parse issue content")
45
46          for index, article_tag in enumerate(articles):
47              article_url = article_tag.get("href")
48              if not isinstance(article_url, str):  [48 ↛ 49: the condition on line 48 was never true]
49                  raise ValueError("Couldn't parse article url")
50              xarticle = create_articledata()
51              xarticle.url = urljoin(xissue.url, article_url)
52              xarticle.pid = "a" + str(index)
53
54              xissue.articles.append(xarticle)
55
56      def parse_article_content(self, content, xissue, xarticle, url):
57          soup = BeautifulSoup(content, "html.parser")
58          self.get_metadata_using_citation_meta(
59              xarticle,
60              xissue,
61              soup,
62              [
63                  "title",
64                  "publisher",
65                  "author",
66                  "doi",
67                  "pdf",
68                  "lang",
69                  "keywords",
70                  "abstract",
71                  "references",
72              ],
73          )
74          if len(xarticle.bibitems) > 0:
75              xarticle.abstracts.append(JatsBase.compile_refs(xarticle.bibitems))
76
77          # Article number
78          article_number = soup.select_one(
79              ".summary tr th:-soup-contains-own('Article Number') ~ td:last-of-type"
80          )
81          if article_number:  [81 ↛ 82: the condition on line 81 was never true]
82              xarticle.article_number = article_number.text
83
84          # Pages
85          pages = soup.select_one(".summary tr th:-soup-contains-own('Page(s)') ~ td:last-of-type")
86          if pages:  [86 ↛ 90: the condition on line 86 was always true]
87              pages_splitted = pages.text.split(" - ")
88              xarticle.fpage = pages_splitted[0]
89              xarticle.lpage = pages_splitted[1]
90          return xarticle
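
The issue_re and pages_re patterns (lines 16-17) drive the volume/number/year and page-range extraction. A minimal sketch of what they capture, using hypothetical title strings shaped to match the patterns (not taken from the EDP Sciences site):

    import re

    # Patterns copied from EdpsciCrawler above; the sample strings are hypothetical.
    issue_re = r"Vol\. (?P<volume>\d+)(?: \/ (?P<number>[\d \w]+))? \- (?:\w+ )?(?P<year>\d+)"
    pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"

    print(re.search(issue_re, "Vol. 57 / 2 - June 2023").groupdict())
    # {'volume': '57', 'number': '2', 'year': '2023'}
    print(re.search(issue_re, "Vol. 12 - 1998").groupdict())
    # {'volume': '12', 'number': None, 'year': '1998'}
    print(re.search(pages_re, "Pages: 101–125").groupdict())  # note the en dash
    # {'fpage': '101', 'lpage': '125'}

Note that pages_re expects an en dash ("–") between the page numbers, whereas the "Page(s)" cell handled on line 87 is split on a spaced ASCII hyphen (" - ").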
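
parse_article_content locates the article number and page range through soupsieve's :-soup-contains-own() pseudo-class on the summary table. A small sketch against a hypothetical HTML fragment (the real EDP Sciences markup may differ):

    from bs4 import BeautifulSoup

    # Hypothetical fragment shaped like the "summary" table the selector targets.
    html = """
    <table class="summary">
      <tr><th>Page(s)</th><td>101 - 125</td></tr>
    </table>
    """
    soup = BeautifulSoup(html, "html.parser")
    # :-soup-contains-own() is a soupsieve extension matching an element's own text.
    pages = soup.select_one(".summary tr th:-soup-contains-own('Page(s)') ~ td:last-of-type")
    fpage, lpage = pages.text.split(" - ")
    print(fpage, lpage)  # 101 125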
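
Several guard branches are reported as never taken (25 ↛ 26, 43 ↛ 44, 48 ↛ 49, 81 ↛ 82), so their raise/assignment lines stay uncovered. A hedged sketch of a test that would exercise the 43 ↛ 44 branch, assuming a plain stand-in object is acceptable where an issue data object is expected and that the class can be instantiated without running __init__ (the real suite may need a proper ptf.model_data issue):

    # Hypothetical test sketch: drives parse_issue_content down the currently
    # untaken 43 ↛ 44 branch (an issue without a url). SimpleNamespace is a
    # stand-in; the real suite may need an actual ptf.model_data issue object.
    from types import SimpleNamespace

    import pytest

    from crawler.by_source.edpsci_crawler import EdpsciCrawler


    def test_parse_issue_content_requires_issue_url():
        # Bypass __init__ because its signature is not shown in this report;
        # parse_issue_content touches no instance state before the url check.
        crawler = EdpsciCrawler.__new__(EdpsciCrawler)
        xissue = SimpleNamespace(url=None, articles=[])
        with pytest.raises(ValueError):
            crawler.parse_issue_content("<html></html>", xissue)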