Coverage for src/crawler/by_source/emis_am_crawler.py: 14% (90 statements)
coverage.py v7.12.0, created at 2025-12-11 14:57 +0000
from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_contributor, create_issuedata, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_utils import get_issue_pid
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class Emis_amCrawler(BaseCollectionCrawler):
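    # Crawler for a journal hosted on EMIS (https://www.emis.de). The collection
    # index lists one link per issue; each issue page lists its articles, and each
    # article page provides the title, authors and, when present, keywords,
    # MSC codes and a PDF link.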
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_AM"
    source_website = "https://www.emis.de"
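
    # Issue titles collapse to "Annals of MathematicsVol. <volume>, No. <number>, <year>"
    # after cleanup; the named groups capture volume, number and year.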
    issue_re = regex.compile(
        r"Annals of MathematicsVol\. (?P<volume>\d+), No\. (?P<number>\d+), (?P<year>\d{4})"
    )
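
    # Disable TLS certificate verification for requests to this source.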
    requests_verify = False

    @classmethod
    def get_view_id(cls):
        return "EMIS"

    def parse_collection_content(self, content):
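        # Build one IssueData per issue link found on the collection index page.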
        xissues = []

        soup = BeautifulSoup(content, "html5lib")
        b_tag = soup.select(
            "body table:nth-of-type(2) td:nth-of-type(1) b a[href]:-soup-contains-own('No. ')"
        )
        if not b_tag:
            raise ValueError("Couldn't parse page")
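        # Each link's href identifies one issue; keep it as a provisional pid and
        # resolve it against the collection URL.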
        for a_tag in b_tag:
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue link")
            xissue = create_issuedata()
            xissue.pid = href
            xissue.url = urljoin(self.collection_url, href)
            xissues.append(xissue)
        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html5lib")
        if not xissue.url:
            raise ValueError("Issue URL must be set")

        # Parse issue year, volume and number
        title_tag = soup.select_one("table:nth-of-type(1) tr > td:nth-of-type(3) h2")
        if not title_tag:
            raise ValueError("Couldn't find issue title")
        title_str = cleanup_str(title_tag.text)
        issue_search = self.issue_re.search(title_str)
        if not issue_search:
            raise ValueError("Couldn't parse issue title")
        issue_data = issue_search.groupdict()
        xissue.volume = issue_data["volume"]
        xissue.number = issue_data["number"]
        xissue.year = issue_data["year"]
        xissue.pid = get_issue_pid(
            self.collection_id,
            issue_data["year"],
            issue_data["volume"],
            issue_data["number"],
        )

        # Parse issue article list
        article_tags = soup.select("table:nth-of-type(2) tr > td:nth-of-type(3) > p > a[href]")
        for index, a_tag in enumerate(article_tags):
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article link")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(xissue.url, href)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        if not xarticle.url:
            raise ValueError("Article must have a URL")

        soup = BeautifulSoup(content, "html5lib")
        article_content = soup.select_one("table:nth-of-type(2) tr > td:nth-of-type(3)")
        if not article_content:
            raise ValueError("Couldn't parse article")
        title_tag = article_content.select_one("h2")
        if not title_tag:
            raise ValueError("Couldn't find title")
        xarticle.title_tex = cleanup_str(title_tag.text)
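
        # Authors are listed in an <h3>, separated by commas and/or " and ".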
        authors_tag = article_content.select_one("h3")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        authors_str = cleanup_str(authors_tag.text).replace(" and ", ", ").split(", ")
        for author in authors_str:
            xarticle.contributors.append(create_contributor(string_name=author, role="author"))
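
        # Optional "Keywords:" paragraph, with entries separated by semicolons.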
        keyword_tag = article_content.select_one("p:-soup-contains('Keywords:')")
        if keyword_tag:
            keywords_str = cleanup_str(keyword_tag.text).removeprefix("Keywords: ")
            for kwd in keywords_str.split("; "):
                xarticle.kwds.append(create_subj(value=kwd))
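
        # Optional MSC 2000 classification codes, space-separated.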
        msc_tag = article_content.select_one("p:-soup-contains('Classification (MSC2000):')")
        if msc_tag:
            msc_str = cleanup_str(msc_tag.text).removeprefix("Classification (MSC2000): ")
            for kwd in msc_str.split(" "):
                xarticle.kwds.append(create_subj(value=kwd, type="msc"))
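
        # Full-text PDF link, resolved relative to the article URL.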
        pdf_tag = article_content.select_one("a:-soup-contains('PDF file')")
        if pdf_tag:
            pdf_href = pdf_tag.get("href")
            if not isinstance(pdf_href, str):
                raise ValueError("Couldn't parse pdf href")
            add_pdf_link_to_xarticle(xarticle, urljoin(xarticle.url, pdf_href))
        return xarticle