Coverage for src/crawler/by_source/ems_crawler.py: 91%
51 statements
coverage.py v7.8.2, created at 2025-06-16 07:44 +0000
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from ptf.model_data import create_abstract, create_articledata, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str, regex_to_dict


class EmsCrawler(BaseCollectionCrawler):
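    """Crawler for journal collections hosted on EMS Press (https://ems.press/)."""
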
    source_name = "EMS Press"
    source_domain = "EMS"
    source_website = "https://ems.press/"

    issue_re = r"Vol\. (?P<volume>\d+), No\. (?P<number>\d+), pp\. (?P<fpage>\d+)–(?P<lpage>\d+)"
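    # Illustrative, inferred from the named groups: this pattern should match an
    # issue title such as "Vol. 12, No. 3, pp. 1–20" (note the en dash in the
    # page range).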

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("a.issue-title")
        for issue in issues:
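            # The issue's year is read from the nearest preceding
            # <h2 class="volume-title"> heading on the collection page.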
            volume_year = (
                issue.parent.find_previous_sibling("h2", {"class": "volume-title"})
                .select_one(".volume-year")
                .text
            )
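            # regex_to_dict presumably raises with error_msg when issue_re does
            # not match, so a change in the site's title format fails loudly here.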
            issue_group = regex_to_dict(
                self.issue_re, issue.text, error_msg="Couldn't parse issue data"
            )

            issue_href = issue.get("href")
            # coverage: the raise below was never reached in the test run
            # (issue_href was always a str).
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue url")

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    volume_year,
                    issue_group["volume"],
                    issue_group["number"],
                )
            )

        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("article > a.unstyled")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
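            # Article pids are positional ("a0", "a1", ...), so they depend on the
            # order in which articles appear on the issue page.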
            article_href = article_tag.get("href")
            # coverage: the raise below was never reached in the test run
            # (article_href was always a str).
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article href")
            xarticle.url = urljoin(self.source_website, article_href)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
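        # get_metadata_using_citation_meta (inherited from BaseCollectionCrawler)
        # presumably harvests the <meta name="citation_*"> tags (citation_author,
        # citation_doi, ...) embedded in the article page.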

        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "doi", "title", "pdf", "page"]
        )

        # Abstract
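        # The first <p> of the formatted-text block is assumed to hold the
        # abstract; the language is hardcoded to English below.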
        abstract_tag = soup.select_one("div.formatted-text > p")

        # coverage: this condition was always true in the test run (an abstract
        # paragraph was always found).
        if abstract_tag:
            abstract_text = cleanup_str(abstract_tag.text)
            abstract = create_abstract(lang="en", tag="abstract", value_tex=abstract_text)
            xarticle.abstracts.append(abstract)

        # Keywords
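        # Entries listed under a "Mathematics Subject Classification" heading are
        # tagged as MSC codes (type="msc"); all other list items are stored as
        # plain keywords.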
        keywords_tag = soup.select("ul.keywords > li")
        for k_tag in keywords_tag:
            kwd_type = ""
            if k_tag.parent.previous_sibling.text == "Mathematics Subject Classification":
                kwd_type = "msc"
            kwd_text = cleanup_str(k_tag.text)
            # coverage: this condition was always true in the test run (no empty
            # keyword entries were encountered).
            if kwd_text != "":
                keyword = create_subj(value=kwd_text, type=kwd_type)
                xarticle.kwds.append(keyword)

        return xarticle
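
# Illustrative only, not part of the crawler: a minimal sketch of how the
# selectors and regex above behave on an invented HTML fragment. The real
# EMS Press markup may differ, and running this still requires the project's
# dependencies (bs4, ptf, crawler) to be importable.
if __name__ == "__main__":
    import re

    sample = """
    <h2 class="volume-title">Volume 12 <span class="volume-year">2024</span></h2>
    <p><a class="issue-title" href="/journals/x/issues/1">Vol. 12, No. 3, pp. 1–20</a></p>
    """
    demo = BeautifulSoup(sample, "html.parser")
    issue = demo.select_one("a.issue-title")

    # The volume year is read from the preceding <h2 class="volume-title">.
    heading = issue.parent.find_previous_sibling("h2", {"class": "volume-title"})
    print(heading.select_one(".volume-year").text)  # -> 2024

    # The issue title parses into the regex's named groups.
    match = re.match(EmsCrawler.issue_re, issue.text)
    print(match.groupdict())
    # -> {'volume': '12', 'number': '3', 'fpage': '1', 'lpage': '20'}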