Coverage for src/crawler/by_source/ems_crawler.py: 91%

51 statements  

coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

from urllib.parse import urljoin

from bs4 import BeautifulSoup
from ptf.model_data import create_abstract, create_articledata, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str, regex_to_dict


class EmsCrawler(BaseCollectionCrawler):
    source_name = "EMS Press"
    source_domain = "EMS"
    source_website = "https://ems.press/"

    issue_re = (
        r"Vol\. (?P<volume>\d+),(?: No\. (?P<number>\d+),)?pp\. (?P<fpage>\d+)–(?P<lpage>\d+)"
    )
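
    # The pattern expects "pp." immediately after the preceding comma (no
    # space), and the issue number is optional. A minimal sketch, assuming
    # regex_to_dict wraps re.search and returns the named groups (the sample
    # titles below are hypothetical, not actual EMS Press markup):
    #
    #   >>> import re
    #   >>> re.search(EmsCrawler.issue_re, "Vol. 12, No. 3,pp. 1–20").groupdict()
    #   {'volume': '12', 'number': '3', 'fpage': '1', 'lpage': '20'}
    #   >>> re.search(EmsCrawler.issue_re, "Vol. 12,pp. 1–20").groupdict()
    #   {'volume': '12', 'number': None, 'fpage': '1', 'lpage': '20'}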

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("a.issue-title")
        for issue in issues:
            volume_year = (
                issue.parent.find_previous_sibling("h2", {"class": "volume-title"})
                .select_one(".volume-year")
                .text
            )
            issue_group = regex_to_dict(
                self.issue_re, issue.text, error_msg="Couldn't parse issue data"
            )

            issue_href = issue.get("href")
            if not isinstance(issue_href, str):  # coverage: branch never taken
                raise ValueError("Couldn't parse issue url")

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    volume_year,
                    issue_group["volume"],
                    issue_group.get("number"),
                )
            )

        return xissues
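
    # Sketch of the volume-year lookup above, on hypothetical markup where the
    # issue link's parent element directly follows the volume heading (an
    # assumed structure for illustration; real EMS Press pages may differ):
    #
    #   >>> from bs4 import BeautifulSoup
    #   >>> html = ('<h2 class="volume-title">Vol. 12 <span class="volume-year">2024</span></h2>'
    #   ...         '<p><a class="issue-title" href="/issues/7">Vol. 12, No. 3,pp. 1–20</a></p>')
    #   >>> soup = BeautifulSoup(html, "html.parser")
    #   >>> issue = soup.select_one("a.issue-title")
    #   >>> issue.parent.find_previous_sibling("h2", {"class": "volume-title"}).select_one(".volume-year").text
    #   '2024'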

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("article > a.unstyled")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):  # coverage: branch never taken
                raise ValueError("Couldn't parse article href")
            xarticle.url = urljoin(self.source_website, article_href)
            xissue.articles.append(xarticle)
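
    # The scraped hrefs are resolved against source_website, so root-relative
    # links work; hypothetical path for illustration:
    #
    #   >>> from urllib.parse import urljoin
    #   >>> urljoin("https://ems.press/", "/journals/jems/issues/123")
    #   'https://ems.press/journals/jems/issues/123'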

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")

        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "doi", "title", "pdf", "page"]
        )

        # Abstract
        abstract_tag = soup.select_one("div.formatted-text > p")

        if abstract_tag:  # coverage: condition always true
            abstract_text = cleanup_str(abstract_tag.text)
            abstract = create_abstract(lang="en", tag="abstract", value_tex=abstract_text)
            xarticle.abstracts.append(abstract)

        # Keywords
        keywords_tag = soup.select("ul.keywords > li")
        for k_tag in keywords_tag:
            kwd_type = ""
            if k_tag.parent.previous_sibling.text == "Mathematics Subject Classification":
                kwd_type = "msc"
            kwd_text = cleanup_str(k_tag.text)
            if kwd_text != "":  # coverage: condition always true
                keyword = create_subj(value=kwd_text, type=kwd_type)
                xarticle.kwds.append(keyword)

        return xarticle
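
# The keyword loop relies on .previous_sibling, which returns the node
# immediately before the <ul> (including bare text), not the previous element;
# the comparison therefore assumes the heading sits directly against the list.
# A sketch on hypothetical, whitespace-free markup:
#
#   >>> from bs4 import BeautifulSoup
#   >>> html = ('<h3>Mathematics Subject Classification</h3>'
#   ...         '<ul class="keywords"><li>35J60</li><li>53C21</li></ul>')
#   >>> soup = BeautifulSoup(html, "html.parser")
#   >>> [li.parent.previous_sibling.text for li in soup.select("ul.keywords > li")]
#   ['Mathematics Subject Classification', 'Mathematics Subject Classification']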