Coverage for src/crawler/by_source/ems_crawler.py: 90% (54 statements; coverage.py v7.6.4, created at 2025-02-14 14:36 +0000)
from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup
from ptf.model_data import create_abstract, create_articledata, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str


class EmsCrawler(BaseCollectionCrawler):
    source_name = "EMS Press"
    source_domain = "EMS"
    source_website = "https://ems.press/"

    issue_re = r"Vol\. (?P<volume>\d+), No\. (?P<number>\d+), pp\. (?P<fpage>\d+)–(?P<lpage>\d+)"
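    # Example of the issue heading this pattern is meant to match (assumed
    # format, inferred from the named groups): "Vol. 12, No. 3, pp. 1–250"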

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("a.issue-title")
        for issue in issues:
            # The year is read from the closest preceding volume heading.
            volume_year = (
                issue.parent.find_previous_sibling("h2", {"class": "volume-title"})
                .select_one(".volume-year")
                .text
            )
            issue_search = regex.search(self.issue_re, issue.text)
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_group = issue_search.groupdict()
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue url")

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    volume_year,
                    issue_group["volume"],
                    issue_group["number"],
                )
            )

        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("article > a.unstyled")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article href")
            xarticle.url = urljoin(self.source_website, article_href)
            xissue.articles.append(xarticle)
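        # The issue is filled in place: each parsed article is appended to
        # xissue.articles, so nothing needs to be returned here.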

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        soup = BeautifulSoup(content, "html.parser")

        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "doi", "title", "pdf", "page"]
        )
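        # The keys above select Google Scholar-style <meta name="citation_*">
        # tags (citation_author, citation_doi, citation_title, citation_pdf_url,
        # citation_firstpage/citation_lastpage), assuming the base helper maps
        # them that way.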

        # Abstract
        abstract_tag = soup.select_one("div.formatted-text > p")

        if abstract_tag:
            abstract_text = cleanup_str(abstract_tag.text)
            abstract = create_abstract(lang="en", tag="abstract", value_tex=abstract_text)
            xarticle.abstracts.append(abstract)

        # Keywords
        keywords_tag = soup.select("ul.keywords > li")
        for k_tag in keywords_tag:
            kwd_type = ""
            # Plain keywords and MSC codes share the same list markup; the
            # heading just before the list tells them apart.
            if k_tag.parent.previous_sibling.text == "Mathematics Subject Classification":
                kwd_type = "msc"
            keyword = create_subj(lang="en", value=k_tag.text, type=kwd_type)
            xarticle.kwds.append(keyword)

        xarticle.pid = pid
        return xarticle
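

# A minimal usage sketch, not part of the module: it assumes the inherited
# BaseCollectionCrawler drives the downloads and that `collection_html` and
# `issue_html` hold already-fetched pages (both names are hypothetical):
#
#     crawler = EmsCrawler(...)
#     xissues = crawler.parse_collection_content(collection_html)
#     for xissue in xissues:
#         crawler.parse_issue_content(issue_html, xissue)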