Coverage for src/crawler/by_source/emis_am_crawler.py: 13%
89 statements
coverage.py v7.9.0, created at 2025-09-16 12:41 +0000
from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_contributor, create_issuedata, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class Emis_amCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_AM"
    source_website = "https://www.emis.de"

    issue_re = regex.compile(
        r"Annals of MathematicsVol\. (?P<volume>\d+), No\. (?P<number>\d+), (?P<year>\d{4})"
    )

    # TLS certificate verification is turned off for this source, presumably
    # because the EMIS host does not serve a valid certificate.
    requests_verify = False

    @classmethod
    def get_view_id(cls):
        return "EMIS"
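
    # Collect the issue links from the collection's table of contents page.
    # Each anchor whose own text contains "No. " is treated as one issue; its
    # href doubles as a provisional pid until parse_issue_content replaces it
    # with the canonical one.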
    def parse_collection_content(self, content):
        xissues = []

        soup = BeautifulSoup(content, "html5lib")
        issue_links = soup.select(
            "body table:nth-of-type(2) td:nth-of-type(1) b a[href]:-soup-contains-own('No. ')"
        )
        if not issue_links:
            raise ValueError("Couldn't parse page")
        for a_tag in issue_links:
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue link")
            xissue = create_issuedata()
            xissue.pid = href
            xissue.url = urljoin(self.collection_url, href)
            xissues.append(xissue)
        return xissues
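
    # Read an issue page: the <h2> header yields year/volume/number (via
    # issue_re), and every link in the third column of the second table
    # becomes an article stub ("a0", "a1", ...) to be crawled later.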
    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html5lib")
        if not xissue.url:
            raise ValueError("Issue URL must be set")

        # Parse issue year, volume and number
        title_tag = soup.select_one("table:nth-of-type(1) tr > td:nth-of-type(3) h2")
        if not title_tag:
            raise ValueError("Couldn't find issue title")
        title_str = cleanup_str(title_tag.text)
        issue_search = self.issue_re.search(title_str)
        if not issue_search:
            raise ValueError("Couldn't parse issue title")
        issue_data = issue_search.groupdict()
        xissue.volume = issue_data["volume"]
        xissue.number = issue_data["number"]
        xissue.year = issue_data["year"]
        xissue.pid = self.get_issue_pid(
            self.collection_id,
            issue_data["year"],
            issue_data["volume"],
            issue_data["number"],
        )

        # Parse issue article list
        article_tags = soup.select("table:nth-of-type(2) tr > td:nth-of-type(3) > p > a[href]")
        for index, a_tag in enumerate(article_tags):
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article link")
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            xarticle.url = urljoin(xissue.url, href)
            xissue.articles.append(xarticle)
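
    # Extract one article's metadata from its abstract page: title (<h2>),
    # authors (<h3>, comma/"and"-separated), optional keywords and MSC2000
    # classes, and the "PDF file" download link when present.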
    def parse_article_content(self, content, xissue, xarticle, url):
        if not xarticle.url:
            raise ValueError("Article must have a URL")

        soup = BeautifulSoup(content, "html5lib")
        article_content = soup.select_one("table:nth-of-type(2) tr > td:nth-of-type(3)")
        if not article_content:
            raise ValueError("Couldn't parse article")
        title_tag = article_content.select_one("h2")
        if not title_tag:
            raise ValueError("Couldn't find title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        authors_tag = article_content.select_one("h3")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        authors = cleanup_str(authors_tag.text).replace(" and ", ", ").split(", ")
        for author in authors:
            xarticle.contributors.append(create_contributor(string_name=author, role="author"))

        keyword_tag = article_content.select_one("p:-soup-contains('Keywords:')")
        if keyword_tag:
            keywords_str = cleanup_str(keyword_tag.text).removeprefix("Keywords: ")
            for kwd in keywords_str.split("; "):
                xarticle.kwds.append(create_subj(value=kwd))

        msc_tag = article_content.select_one("p:-soup-contains('Classification (MSC2000):')")
        if msc_tag:
            msc_str = cleanup_str(msc_tag.text).removeprefix("Classification (MSC2000): ")
            for kwd in msc_str.split(" "):
                xarticle.kwds.append(create_subj(value=kwd, type="msc"))

        pdf_tag = article_content.select_one("a:-soup-contains('PDF file')")
        if pdf_tag:
            pdf_href = pdf_tag.get("href")
            if not isinstance(pdf_href, str):
                raise ValueError("Couldn't parse pdf href")
            add_pdf_link_to_xarticle(xarticle, urljoin(xarticle.url, pdf_href))
        return xarticle
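
A minimal standalone sketch of what issue_re extracts from an issue title; the sample title below is a made-up assumption modelled on the pattern, not text taken from a real EMIS page:

import regex

issue_re = regex.compile(
    r"Annals of MathematicsVol\. (?P<volume>\d+), No\. (?P<number>\d+), (?P<year>\d{4})"
)
match = issue_re.search("Annals of MathematicsVol. 4, No. 2, 2001")  # hypothetical sample title
assert match is not None
print(match.groupdict())  # {'volume': '4', 'number': '2', 'year': '2001'}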