Coverage for src/crawler/by_source/emis_am_crawler.py: 12%
86 statements
coverage.py v7.9.0, created at 2025-07-30 09:47 +0000
from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_contributor, create_issuedata, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
class Emis_amCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_AM"
    source_website = "https://www.emis.de"

    issue_re = regex.compile(
        r"Annals of MathematicsVol\. (?P<volume>\d+), No\. (?P<number>\d+), (?P<year>\d{4})"
    )
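
    # Example of a heading this pattern is meant to match (hypothetical text,
    # shaped to fit the expression above; real pages may differ):
    #   issue_re.search("Annals of MathematicsVol. 12, No. 3, 2005").groupdict()
    #   -> {"volume": "12", "number": "3", "year": "2005"}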

    requests_verify = False

    def parse_collection_content(self, content):
        xissues = []

        soup = BeautifulSoup(content, "html5lib")
        b_tag = soup.select(
            "body table:nth-of-type(2) td:nth-of-type(1) b a[href]:-soup-contains-own('No. ')"
        )
        if not b_tag:
            raise ValueError("Couldn't parse page")
        for a_tag in b_tag:
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue link")
            xissue = create_issuedata()
            xissue.pid = href
            xissue.url = urljoin(self.collection_url, href)
            xissues.append(xissue)
        return xissues
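
    # The selector above assumes a contents page shaped roughly like this
    # hypothetical fragment, with one link per issue inside the second table's
    # first cell:
    #   <table>...masthead...</table>
    #   <table><tr><td><b><a href="15_2.html">Vol. 15, No. 2</a></b></td></tr></table>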

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html5lib")
        if not xissue.url:
            raise ValueError("Issue URL must be set")

        # Parse issue year, volume and number
        title_tag = soup.select_one("table:nth-of-type(1) tr > td:nth-of-type(3) h2")
        if not title_tag:
            raise ValueError("Couldn't find issue title")
        title_str = cleanup_str(title_tag.text)
        issue_search = self.issue_re.search(title_str)
        if not issue_search:
            raise ValueError("Couldn't parse issue title")
        issue_data = issue_search.groupdict()
        xissue.volume = issue_data["volume"]
        xissue.number = issue_data["number"]
        xissue.year = issue_data["year"]
        xissue.pid = self.get_issue_pid(
            self.collection_id,
            issue_data["year"],
            issue_data["volume"],
            issue_data["number"],
        )

        # Parse issue article list
        article_tags = soup.select("table:nth-of-type(2) tr > td:nth-of-type(3) > p > a[href]")
        for index, a_tag in enumerate(article_tags):
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article link")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(xissue.url, href)
            xissue.articles.append(xarticle)
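
    # Article pids assigned above are positional within the issue ("a0", "a1",
    # ...), and each article URL is resolved relative to the issue URL.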

    def parse_article_content(self, content, xissue, xarticle, url):
        if not xarticle.url:
            raise ValueError("Article must have a URL")

        soup = BeautifulSoup(content, "html5lib")
        article_content = soup.select_one("table:nth-of-type(2) tr > td:nth-of-type(3)")
        if not article_content:
            raise ValueError("Couldn't parse article")
        title_tag = article_content.select_one("h2")
        if not title_tag:
            raise ValueError("Couldn't find title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        authors_tag = article_content.select_one("h3")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        author_names = cleanup_str(authors_tag.text).replace(" and ", ", ").split(", ")
        for author in author_names:
            xarticle.contributors.append(create_contributor(string_name=author, role="author"))
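        # The " and " substitution plus ", " split above turn a heading such as
        # "A. Author, B. Author and C. Author" into
        # ["A. Author", "B. Author", "C. Author"].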

        keyword_tag = article_content.select_one("p:-soup-contains('Keywords:')")
        if keyword_tag:
            keywords_str = cleanup_str(keyword_tag.text).removeprefix("Keywords: ")
            for kwd in keywords_str.split("; "):
                xarticle.kwds.append(create_subj(value=kwd))
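        # The prefix strip and "; " split above turn a paragraph such as
        # "Keywords: moduli spaces; stable curves" into two keyword subjects
        # (hypothetical values for illustration).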

        msc_tag = article_content.select_one("p:-soup-contains('Classification (MSC2000):')")
        if msc_tag:
            msc_str = cleanup_str(msc_tag.text).removeprefix("Classification (MSC2000): ")
            for kwd in msc_str.split(" "):
                xarticle.kwds.append(create_subj(value=kwd, type="msc"))
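        # MSC codes are assumed space-separated, so a paragraph such as
        # "Classification (MSC2000): 14H10 14D20" yields two "msc" subjects.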

        pdf_tag = article_content.select_one("a:-soup-contains('PDF file')")
        if pdf_tag:
            pdf_href = pdf_tag.get("href")
            if not isinstance(pdf_href, str):
                raise ValueError("Couldn't parse pdf href")
            add_pdf_link_to_xarticle(xarticle, urljoin(xarticle.url, pdf_href))
        return xarticle
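

# A minimal selector smoke test (a sketch, not part of the crawler): it runs
# the collection-page CSS selector against a hypothetical HTML fragment shaped
# like the contents page sketched above. The fragment and the expected output
# are illustrative assumptions, not captured EMIS markup.
if __name__ == "__main__":
    sample = (
        "<body><table><tr><td>masthead</td></tr></table>"
        "<table><tr><td><b><a href='15_2.html'>Vol. 15, No. 2</a></b>"
        "</td></tr></table></body>"
    )
    demo_soup = BeautifulSoup(sample, "html5lib")
    found = demo_soup.select(
        "body table:nth-of-type(2) td:nth-of-type(1) b a[href]:-soup-contains-own('No. ')"
    )
    print([a.get("href") for a in found])  # expected: ['15_2.html']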