Coverage for src / crawler / by_source / ems_crawler.py: 89%

64 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-04-08 09:35 +0000

1from urllib.parse import urljoin 

2 

3from bs4 import BeautifulSoup 

4from ptf.model_data import create_abstract, create_articledata, create_subj 

5 

6from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler 

7from crawler.utils import cleanup_str, regex_to_dict 

8 

9 

class EmsCrawler(BaseCollectionCrawler):
    """Collection crawler for journals hosted on EMS Press (https://ems.press/).

    Parses a collection page into issues, an issue page into article stubs,
    and an article page into full article metadata (citation meta tags,
    abstract, keywords/MSC, contributor ORCIDs).
    """

    source_name = "EMS Press"
    source_domain = "EMS"
    source_website = "https://ems.press/"

    # Single-issue header, e.g. "Vol. 12, No. 3,pp. 1–100" (issue number optional).
    # NOTE: the en dash (–) between pages is intentional — EMS uses it.
    issue_re_1 = (
        r"Vol\. (?P<volume>\d+),(?: No\. (?P<number>\d+),)?pp\. (?P<fpage>\d+)–(?P<lpage>\d+)"
    )
    # Double-issue header, e.g. "Vol. 12, No. 3/4,pp. 1–100".
    issue_re_2 = r"Vol\. (?P<volume>\d+),(?: No\. (?P<number_1>\d+)/(?P<number_2>\d+),)?pp\. (?P<fpage>\d+)–(?P<lpage>\d+)"

    def parse_collection_content(self, content):
        """Parse a collection (journal) page and return the list of issues.

        Args:
            content: HTML of the collection page.

        Returns:
            list of xissue objects, one per ``a.issue-title`` link found.

        Raises:
            ValueError: if an issue title matches neither regex, or an
                issue link has no usable ``href``.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("a.issue-title")
        for issue in issues:
            # The year lives in the volume heading preceding this issue's group.
            volume_year = (
                issue.parent.find_previous_sibling("h2", {"class": "volume-title"})
                .select_one(".volume-year")
                .text
            )

            # Try the single-number pattern first, fall back to the
            # double-issue ("No. 3/4") pattern.
            try:
                issue_group = regex_to_dict(
                    self.issue_re_1, issue.text, error_msg="Couldn't parse issue data"
                )
            except ValueError:
                issue_group = regex_to_dict(
                    self.issue_re_2, issue.text, error_msg="Couldn't parse issue data"
                )

            issue_href = issue.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue url")

            # FIX: double issues ("No. 3/4") previously lost their number —
            # issue_re_2's number_1/number_2 captures were never consumed.
            # Both captures sit in one optional group, so they are present
            # together or absent together.
            issue_number = issue_group.get("number")
            if issue_number is None and issue_group.get("number_1"):
                issue_number = (
                    issue_group["number_1"] + "-" + issue_group["number_2"]
                )

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    volume_year,
                    issue_group["volume"],
                    issue_number,
                )
            )

        return xissues

    def parse_issue_content(self, content, xissue):
        """Parse an issue page, appending one article stub per article link.

        Args:
            content: HTML of the issue page.
            xissue: issue object whose ``articles`` list is filled in place.

        Raises:
            ValueError: if an article link has no usable ``href``.
        """
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("article > a.unstyled")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            # Positional pid within the issue ("a0", "a1", ...).
            xarticle.pid = "a" + str(index)
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article href")
            xarticle.url = urljoin(self.source_website, article_href)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Parse an article page into full metadata.

        Args:
            content: HTML of the article page.
            xissue: enclosing issue object.
            xarticle: article stub to enrich in place.
            url: article URL (unused here; part of the crawler interface).

        Returns:
            The enriched xarticle.
        """
        soup = BeautifulSoup(content, "html.parser")

        # FIX: "title" was listed twice in the field list.
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "doi", "title", "pdf", "page"]
        )

        # Abstract — first paragraph of the formatted-text block.
        abstract_tag = soup.select_one("div.formatted-text > p")
        if abstract_tag:
            abstract_text = cleanup_str(abstract_tag.text)
            xarticle.abstracts.append(create_abstract(lang="en", value_tex=abstract_text))

        # Keywords — plain keywords and MSC codes share the same markup;
        # the list's preceding sibling heading disambiguates them.
        keywords_tag = soup.select("ul.keywords > li")
        for k_tag in keywords_tag:
            kwd_type = ""
            if k_tag.parent.previous_sibling.text == "Mathematics Subject Classification":
                kwd_type = "msc"
            kwd_text = cleanup_str(k_tag.text)
            if kwd_text != "":
                keyword = create_subj(value=kwd_text, type=kwd_type)
                xarticle.kwds.append(keyword)

        # Contributors ORCID — only matched positionally, so require the
        # page's person list and the citation-meta authors to line up 1:1.
        contributors_tag = soup.find("div", class_="person-group")
        if contributors_tag:
            contributor_list = contributors_tag.find_all("section", class_="person")
            if len(contributor_list) == len(xarticle.contributors):
                for i in range(len(contributor_list)):
                    orcid_tag = contributor_list[i].find("a", string="ORCID")
                    orcid_url = orcid_tag["href"] if orcid_tag else None
                    if orcid_url:
                        # ORCID id is the last path segment of the URL.
                        orcid = orcid_url.split("/")[-1]
                        xarticle.contributors[i]["orcid"] = orcid
        return xarticle