Coverage for src / crawler / by_source / ems_crawler.py: 89%
64 statements
« prev ^ index » next — coverage.py v7.13.1, created at 2026-04-08 09:35 +0000
1from urllib.parse import urljoin
3from bs4 import BeautifulSoup
4from ptf.model_data import create_abstract, create_articledata, create_subj
6from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler
7from crawler.utils import cleanup_str, regex_to_dict
class EmsCrawler(BaseCollectionCrawler):
    """Crawler for journals hosted on EMS Press (https://ems.press/).

    Parses a journal's collection page into issues, each issue page into
    articles, and each article page into full metadata (abstract, keywords,
    MSC classes, contributor ORCIDs).
    """

    source_name = "EMS Press"
    source_domain = "EMS"
    source_website = "https://ems.press/"

    # Issue label such as "Vol. 12, No. 3, pp. 1–99" (the "No. …" part is
    # optional).  Note the en dash (–) between pages, as rendered by the site.
    issue_re_1 = (
        r"Vol\. (?P<volume>\d+),(?: No\. (?P<number>\d+),)?pp\. (?P<fpage>\d+)–(?P<lpage>\d+)"
    )
    # Variant for double issues, e.g. "Vol. 12, No. 3/4, pp. 1–99".
    issue_re_2 = r"Vol\. (?P<volume>\d+),(?: No\. (?P<number_1>\d+)/(?P<number_2>\d+),)?pp\. (?P<fpage>\d+)–(?P<lpage>\d+)"

    def parse_collection_content(self, content):
        """Parse a journal page and return the list of issues it links to.

        content: HTML string of the collection (journal) page.
        Returns a list of xissue objects built via ``self.create_xissue``.
        Raises ValueError when an issue link has no usable ``href`` or when
        its label matches neither issue regex.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        for issue in soup.select("a.issue-title"):
            # The publication year sits in the nearest preceding volume heading.
            volume_year = (
                issue.parent.find_previous_sibling("h2", {"class": "volume-title"})
                .select_one(".volume-year")
                .text
            )

            try:
                issue_group = regex_to_dict(
                    self.issue_re_1, issue.text, error_msg="Couldn't parse issue data"
                )
            except ValueError:
                # Fall back to the double-issue format ("No. 3/4").
                issue_group = regex_to_dict(
                    self.issue_re_2, issue.text, error_msg="Couldn't parse issue data"
                )

            issue_href = issue.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue url")

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    volume_year,
                    issue_group["volume"],
                    # NOTE(review): when issue_re_2 matched there is no "number"
                    # group, so this is None and number_1/number_2 are dropped —
                    # TODO confirm whether "number_1-number_2" should be passed.
                    issue_group.get("number"),
                )
            )

        return xissues

    def parse_issue_content(self, content, xissue):
        """Parse an issue page and append one stub xarticle per article link.

        content: HTML string of the issue page.
        xissue: issue object whose ``articles`` list is populated in place.
        Raises ValueError when an article link has no usable ``href``.
        """
        soup = BeautifulSoup(content, "html.parser")
        for index, article_tag in enumerate(soup.select("article > a.unstyled")):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article href")
            xarticle.url = urljoin(self.source_website, article_href)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Parse an article page and enrich ``xarticle`` with its metadata.

        content: HTML string of the article page.
        url: article URL (unused here; part of the crawler interface).
        Returns the enriched xarticle.
        """
        soup = BeautifulSoup(content, "html.parser")

        # Bulk metadata from <meta name="citation_*"> tags.
        # (Fixed: "title" was listed twice in this key list.)
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "doi", "title", "pdf", "page"]
        )

        # Abstract: first paragraph of the formatted-text block.
        abstract_tag = soup.select_one("div.formatted-text > p")
        if abstract_tag:
            abstract_text = cleanup_str(abstract_tag.text)
            xarticle.abstracts.append(create_abstract(lang="en", value_tex=abstract_text))

        # Keywords and MSC classes share the same list markup; the heading
        # just before the list tells them apart.
        for k_tag in soup.select("ul.keywords > li"):
            kwd_type = ""
            if k_tag.parent.previous_sibling.text == "Mathematics Subject Classification":
                kwd_type = "msc"
            kwd_text = cleanup_str(k_tag.text)
            if kwd_text != "":
                xarticle.kwds.append(create_subj(value=kwd_text, type=kwd_type))

        # Contributor ORCIDs: only map them when the page's person list lines
        # up one-to-one with the contributors found in the citation meta.
        contributors_tag = soup.find("div", class_="person-group")
        if contributors_tag:
            contributor_list = contributors_tag.find_all("section", class_="person")
            if len(contributor_list) == len(xarticle.contributors):
                for i in range(len(contributor_list)):
                    orcid_tag = contributor_list[i].find("a", string="ORCID")
                    orcid_url = orcid_tag["href"] if orcid_tag else None
                    if orcid_url:
                        # ORCID id is the last path segment of the profile URL.
                        xarticle.contributors[i]["orcid"] = orcid_url.split("/")[-1]
        return xarticle