Coverage for src/crawler/by_source/emis_am_crawler.py: 11%
85 statements
coverage.py v7.8.0, created at 2025-04-24 10:35 +0000

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_contributor, create_issuedata, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class Emis_amCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_AM"
    source_website = "https://www.emis.de"

    issue_re = regex.compile(
        r"Annals of MathematicsVol\. (?P<volume>\d+), No\. (?P<number>\d+), (?P<year>\d{4})"
    )
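    # Hypothetical example of what the compiled pattern matches, assuming
    # cleanup_str collapses the page header so that "Annals of Mathematics"
    # runs directly into "Vol." with no space (as the pattern requires):
    #   issue_re.search("Annals of MathematicsVol. 5, No. 2, 2004")
    #   -> groupdict() == {"volume": "5", "number": "2", "year": "2004"}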

    def parse_collection_content(self, content):
        xissues = []

        soup = BeautifulSoup(content, "html5lib")
        issue_links = soup.select(
            "body table:nth-of-type(2) td:nth-of-type(1) b a[href]:-soup-contains-own('No. ')"
        )
        if not issue_links:
            raise ValueError("Couldn't parse page")
        for a_tag in issue_links:
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue link")
            xissue = create_issuedata()
            xissue.pid = href
            xissue.url = urljoin(self.collection_url, href)
            xissues.append(xissue)
        return xissues
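    # The CSS selector above targets a contents page shaped roughly like this
    # (hypothetical sketch; the real EMIS page may differ in detail):
    #   <table>...header...</table>
    #   <table><tr><td><b>
    #     <a href="05-1/index.html">Vol. 5, No. 1</a>
    #     <a href="05-2/index.html">Vol. 5, No. 2</a>
    #   </b></td></tr></table>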

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html5lib")
        if not xissue.url:
            raise ValueError("Issue URL must be set")

        # Parse issue year, volume and number
        title_tag = soup.select_one("table:nth-of-type(1) tr > td:nth-of-type(3) h2")
        if not title_tag:
            raise ValueError("Couldn't find issue title")
        title_str = cleanup_str(title_tag.text)
        issue_search = self.issue_re.search(title_str)
        if not issue_search:
            raise ValueError("Couldn't parse issue title")
        issue_data = issue_search.groupdict()
        xissue.volume = issue_data["volume"]
        xissue.number = issue_data["number"]
        xissue.year = issue_data["year"]
        xissue.pid = self.get_issue_pid(
            self.collection_id,
            issue_data["year"],
            issue_data["volume"],
            issue_data["number"],
        )

        # Parse issue article list
        article_tags = soup.select("table:nth-of-type(2) tr > td:nth-of-type(3) > p > a[href]")
        for index, a_tag in enumerate(article_tags):
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article link")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(xissue.url, href)
            xissue.articles.append(xarticle)
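    # urljoin resolves each relative href against the issue page URL, e.g.
    # (hypothetical values):
    #   urljoin("https://www.emis.de/journals/AM/05-2/", "art01.html")
    #   -> "https://www.emis.de/journals/AM/05-2/art01.html"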

    def parse_article_content(self, content, xissue, xarticle, url):
        if not xarticle.url:
            raise ValueError("Article must have a URL")

        soup = BeautifulSoup(content, "html5lib")
        article_content = soup.select_one("table:nth-of-type(2) tr > td:nth-of-type(3)")
        if not article_content:
            raise ValueError("Couldn't parse article")
        title_tag = article_content.select_one("h2")
        if not title_tag:
            raise ValueError("Couldn't find title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        authors_tag = article_content.select_one("h3")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        author_names = cleanup_str(authors_tag.text).replace(" and ", ", ").split(", ")
        for author in author_names:
            xarticle.contributors.append(create_contributor(string_name=author, role="author"))
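        # The normalisation above turns, e.g. (hypothetical string),
        # "A. Author, B. Builder and C. Coder" into
        # ["A. Author", "B. Builder", "C. Coder"].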

        keyword_tag = article_content.select_one("p:-soup-contains('Keywords:')")
        if keyword_tag:
            keywords_str = cleanup_str(keyword_tag.text).removeprefix("Keywords: ")
            for kwd in keywords_str.split("; "):
                xarticle.kwds.append(create_subj(value=kwd))

        msc_tag = article_content.select_one("p:-soup-contains('Classification (MSC2000):')")
        if msc_tag:
            msc_str = cleanup_str(msc_tag.text).removeprefix("Classification (MSC2000): ")
            for kwd in msc_str.split(" "):
                xarticle.kwds.append(create_subj(value=kwd, type="msc"))

        pdf_tag = article_content.select_one("a:-soup-contains('PDF file')")
        if pdf_tag:
            pdf_href = pdf_tag.get("href")
            if not isinstance(pdf_href, str):
                raise ValueError("Couldn't parse pdf href")
            add_pdf_link_to_xarticle(xarticle, urljoin(xarticle.url, pdf_href))
        return xarticle
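    # For reference, the selectors in parse_article_content expect an article
    # page shaped roughly like this (hypothetical sketch; the real page may
    # differ):
    #   <table>...header...</table>
    #   <table><tr><td></td><td></td><td>
    #     <h2>Article title</h2>
    #     <h3>A. Author and B. Builder</h3>
    #     <p>Keywords: widgets; gadgets</p>
    #     <p>Classification (MSC2000): 14H20 32S25</p>
    #     <a href="paper.pdf">Full text: PDF file</a>
    #   </td></tr></table>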