Coverage for src/crawler/by_source/ems_crawler.py: 91% (51 statements)
coverage.py v7.9.0, created at 2025-07-30 09:47 +0000
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from ptf.model_data import create_abstract, create_articledata, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str, regex_to_dict

class EmsCrawler(BaseCollectionCrawler):
    source_name = "EMS Press"
    source_domain = "EMS"
    source_website = "https://ems.press/"

    issue_re = (
        r"Vol\. (?P<volume>\d+),(?: No\. (?P<number>\d+),)?pp\. (?P<fpage>\d+)–(?P<lpage>\d+)"
    )
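    # Illustration (sample string, not copied from the live site): a heading
    # shaped like "Vol. 12, No. 3,pp. 1–20" yields volume="12", number="3",
    # fpage="1", lpage="20". The "No." group is optional, the page range uses
    # an en dash, and the pattern as written expects no space before "pp.".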

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("a.issue-title")
        for issue in issues:
            volume_year = (
                issue.parent.find_previous_sibling("h2", {"class": "volume-title"})
                .select_one(".volume-year")
                .text
            )
            issue_group = regex_to_dict(
                self.issue_re, issue.text, error_msg="Couldn't parse issue data"
            )

            issue_href = issue.get("href")
            if not isinstance(issue_href, str):
                # coverage: this raise was never reached in the measured run
                raise ValueError("Couldn't parse issue url")

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    volume_year,
                    issue_group["volume"],
                    issue_group.get("number"),
                )
            )

        return xissues
    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("article > a.unstyled")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):
                # coverage: this raise was never reached in the measured run
                raise ValueError("Couldn't parse article href")
            xarticle.url = urljoin(self.source_website, article_href)
            xissue.articles.append(xarticle)
    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")

        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "doi", "title", "pdf", "page"]
        )

        # Abstract
        abstract_tag = soup.select_one("div.formatted-text > p")

        if abstract_tag:  # coverage: always true in the measured run
            abstract_text = cleanup_str(abstract_tag.text)
            abstract = create_abstract(lang="en", tag="abstract", value_tex=abstract_text)
            xarticle.abstracts.append(abstract)

        # Keywords
        keywords_tag = soup.select("ul.keywords > li")
        for k_tag in keywords_tag:
            kwd_type = ""
            if k_tag.parent.previous_sibling.text == "Mathematics Subject Classification":
                kwd_type = "msc"
            kwd_text = cleanup_str(k_tag.text)
            if kwd_text != "":  # coverage: always true in the measured run
                keyword = create_subj(value=kwd_text, type=kwd_type)
                xarticle.kwds.append(keyword)

        return xarticle
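
The two ValueError guards are the branches the report flags as never taken. A
minimal pytest sketch that would exercise them, assuming the module imports as
crawler.by_source.ems_crawler (inferred from the file path above) and that
bypassing __init__ via __new__ is safe here, since the methods under test only
read the class-level attributes shown in this listing:

import pytest

from crawler.by_source.ems_crawler import EmsCrawler


def _bare_crawler():
    # Skip BaseCollectionCrawler.__init__ (its signature is not part of this
    # listing); parse_collection_content and parse_issue_content only touch
    # class attributes in the cases below.
    return EmsCrawler.__new__(EmsCrawler)


def test_issue_link_without_href_raises():
    # Hypothetical markup: the heading text matches issue_re as written, and
    # the issue link has no href attribute, so the isinstance guard fires.
    html = (
        '<h2 class="volume-title"><span class="volume-year">2024</span></h2>'
        '<div><a class="issue-title">Vol. 1, No. 2,pp. 1–10</a></div>'
    )
    with pytest.raises(ValueError, match="issue url"):
        _bare_crawler().parse_collection_content(html)


def test_article_link_without_href_raises():
    # An article link with no href attribute trips the other guard; xissue is
    # never touched before the raise, so None suffices as a stand-in.
    html = '<article><a class="unstyled">Some article</a></article>'
    with pytest.raises(ValueError, match="article href"):
        _bare_crawler().parse_issue_content(html, xissue=None)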