Coverage for src/crawler/by_source/ems_crawler.py: 90%

54 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup
from ptf.model_data import create_abstract, create_articledata, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str


class EmsCrawler(BaseCollectionCrawler):
    source_name = "EMS Press"
    source_domain = "EMS"
    source_website = "https://ems.press/"

    issue_re = r"Vol\. (?P<volume>\d+), No\. (?P<number>\d+), pp\. (?P<fpage>\d+)–(?P<lpage>\d+)"
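    # Illustration of what issue_re is meant to capture. The sample string is an
    # assumption about how EMS Press renders issue links, not taken from the site:
    #   regex.search(issue_re, "Vol. 12, No. 3, pp. 1–20").groupdict()
    #   -> {"volume": "12", "number": "3", "fpage": "1", "lpage": "20"}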

    def parse_collection_content(self, content):
        """Parse the HTML of a collection page and return the list of issues found."""
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("a.issue-title")
        for issue in issues:
            # The year is carried by the enclosing volume heading, not the issue link.
            volume_year = (
                issue.parent.find_previous_sibling("h2", {"class": "volume-title"})
                .select_one(".volume-year")
                .text
            )
            issue_search = regex.search(self.issue_re, issue.text)
            if not issue_search:  # coverage: condition never true, the raise never ran
                raise ValueError("Couldn't parse issue data")
            issue_group = issue_search.groupdict()
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):  # coverage: condition never true
                raise ValueError("Couldn't parse issue url")

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    volume_year,
                    issue_group["volume"],
                    issue_group["number"],
                )
            )

        return xissues


    def parse_issue_content(self, content, xissue):
        """Parse the HTML of an issue page and register its articles on xissue."""
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("article > a.unstyled")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):  # coverage: condition never true
                raise ValueError("Couldn't parse article href")
            xarticle.url = urljoin(self.source_website, article_href)
            xissue.articles.append(xarticle)


    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """Parse the HTML of an article page and fill in the article's metadata."""
        soup = BeautifulSoup(content, "html.parser")

        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "doi", "title", "pdf", "page"]
        )

        # Abstract
        abstract_tag = soup.select_one("div.formatted-text > p")

        if abstract_tag:  # coverage: condition always true in the tests
            abstract_text = cleanup_str(abstract_tag.text)
            abstract = create_abstract(lang="en", tag="abstract", value_tex=abstract_text)
            xarticle.abstracts.append(abstract)

        # Keywords: MSC codes and plain keywords share the same list markup; the
        # preceding sibling heading tells them apart.
        keywords_tag = soup.select("ul.keywords > li")
        for k_tag in keywords_tag:
            kwd_type = ""
            if k_tag.parent.previous_sibling.text == "Mathematics Subject Classification":
                kwd_type = "msc"
            keyword = create_subj(lang="en", value=k_tag.text, type=kwd_type)
            xarticle.kwds.append(keyword)

        xarticle.pid = pid
        return xarticle
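
For reference, a minimal runnable sketch of the volume/issue traversal used in parse_collection_content. The sample markup is an assumption reverse-engineered from the selectors above, not captured from ems.press:

from bs4 import BeautifulSoup

# Hypothetical markup, shaped only to satisfy the selectors the crawler uses.
sample = """
<h2 class="volume-title">Volume 12 <span class="volume-year">2024</span></h2>
<div><a class="issue-title" href="/journals/x/issues/1">Vol. 12, No. 3, pp. 1–20</a></div>
"""
soup = BeautifulSoup(sample, "html.parser")
link = soup.select_one("a.issue-title")
# Walk up to the enclosing element, then back to the preceding volume heading,
# exactly as parse_collection_content does.
year = (
    link.parent.find_previous_sibling("h2", {"class": "volume-title"})
    .select_one(".volume-year")
    .text
)
print(year, link.get("href"))  # 2024 /journals/x/issues/1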