Coverage for src/crawler/by_source/ems_crawler.py: 91%

51 statements  

coverage.py v7.8.2, created at 2025-06-16 07:44 +0000

from urllib.parse import urljoin

from bs4 import BeautifulSoup
from ptf.model_data import create_abstract, create_articledata, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str, regex_to_dict


class EmsCrawler(BaseCollectionCrawler):
    source_name = "EMS Press"
    source_domain = "EMS"
    source_website = "https://ems.press/"

    issue_re = r"Vol\. (?P<volume>\d+), No\. (?P<number>\d+),pp\. (?P<fpage>\d+)–(?P<lpage>\d+)"
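    # Illustrative issue label (invented sample; note the pattern expects no
    # space before "pp."):
    #   "Vol. 12, No. 3,pp. 1–20"
    #   -> volume="12", number="3", fpage="1", lpage="20"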

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("a.issue-title")
        for issue in issues:
            volume_year = (
                issue.parent.find_previous_sibling("h2", {"class": "volume-title"})
                .select_one(".volume-year")
                .text
            )
            issue_group = regex_to_dict(
                self.issue_re, issue.text, error_msg="Couldn't parse issue data"
            )

            issue_href = issue.get("href")
            if not isinstance(issue_href, str):  # coverage: partial branch; never true in the recorded run
                raise ValueError("Couldn't parse issue url")

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    volume_year,
                    issue_group["volume"],
                    issue_group["number"],
                )
            )

        return xissues
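
    # Assumed collection-page markup, inferred from the selectors above (not
    # copied from the live site):
    #   <h2 class="volume-title">Volume 12 <span class="volume-year">(2024)</span></h2>
    #   <div><a class="issue-title" href="/journals/...">Vol. 12, No. 3,pp. 1–20</a></div>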

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("article > a.unstyled")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):  # coverage: partial branch; never true in the recorded run
                raise ValueError("Couldn't parse article href")
            xarticle.url = urljoin(self.source_website, article_href)
            xissue.articles.append(xarticle)
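
    # Assumed issue-page markup, inferred from the selector above:
    #   <article><a class="unstyled" href="/journals/.../articles/...">Article title</a></article>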

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")

        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "doi", "title", "pdf", "page"]
        )

        # Abstract
        abstract_tag = soup.select_one("div.formatted-text > p")

        if abstract_tag:  # coverage: partial branch; always true in the recorded run
            abstract_text = cleanup_str(abstract_tag.text)
            abstract = create_abstract(lang="en", tag="abstract", value_tex=abstract_text)
            xarticle.abstracts.append(abstract)

        # Keywords
        keywords_tag = soup.select("ul.keywords > li")
        for k_tag in keywords_tag:
            kwd_type = ""
            if k_tag.parent.previous_sibling.text == "Mathematics Subject Classification":
                kwd_type = "msc"
            kwd_text = cleanup_str(k_tag.text)
            if kwd_text != "":  # coverage: partial branch; always true in the recorded run
                keyword = create_subj(value=kwd_text, type=kwd_type)
                xarticle.kwds.append(keyword)

        return xarticle
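
A minimal standalone sketch (separate from the file above) of how the issue
regex and the article selector behave; the sample label and markup are
invented for illustration, not taken from ems.press:

import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# Same pattern as EmsCrawler.issue_re; the label below is invented.
issue_re = r"Vol\. (?P<volume>\d+), No\. (?P<number>\d+),pp\. (?P<fpage>\d+)–(?P<lpage>\d+)"
match = re.search(issue_re, "Vol. 12, No. 3,pp. 1–20")
print(match.groupdict())
# {'volume': '12', 'number': '3', 'fpage': '1', 'lpage': '20'}

# Invented issue-page markup matching the "article > a.unstyled" selector.
sample_issue_page = """
<article><a class="unstyled" href="/journals/example/1">First article</a></article>
<article><a class="unstyled" href="/journals/example/2">Second article</a></article>
"""
soup = BeautifulSoup(sample_issue_page, "html.parser")
for tag in soup.select("article > a.unstyled"):
    print(urljoin("https://ems.press/", tag["href"]))
# https://ems.press/journals/example/1
# https://ems.press/journals/example/2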