Coverage for src/crawler/by_source/emis_aas_crawler.py: 20%
79 statements
coverage.py v7.9.0, created at 2025-09-16 12:41 +0000
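For reference, a report like this one is typically produced with coverage.py's own command-line interface (a minimal sketch, assuming the project's tests run under pytest):

    coverage run -m pytest
    coverage html    # writes the HTML report, including this page, to htmlcov/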
from urllib.parse import urljoin

import regex
import requests
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_contributor, create_subj
from requests.exceptions import ConnectionError

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

# https://www.emis.de/journals/APPS/
# TODO python ./src/manage.py crawl -u besnierp -source EMIS_AAS -pid AAS


class Emis_aasCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_AAS"
    source_website = "https://www.emis.de"

    @classmethod
    def get_view_id(cls):
        return "EMIS"

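    # Expected issue headings: "Volume 12 (2007)" or "Volume 12 (2007), Issue 2".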
    issue_re = regex.compile(
        r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
    )
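    # Expected classification lines: "AMS Classification: 34B15, 47H10".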
    classification_re = regex.compile(r"AMS Classification:\s*([\w\d]+(?:,\s*[\w\d]+)*)")
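    # Expected page statements: "Pp. 123-145".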
    page_re = regex.compile(r"Pp.\s*([\w\d]+-[\w\d]+)")
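
    # Presumably read by BaseCollectionCrawler to skip TLS certificate verification for this source.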
    verify = False

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
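        # the issue links are the anchors nested in the <ul><b><li> structure of the collection page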
        issues = soup.select("ul b li a")
        for issue in issues:
            issue_search = regex.search(self.issue_re, issue.text)
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_dict = issue_search.groupdict()

            issue_href = issue.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue url")
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    None,
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        """As the article data is on the xissue.url page, we don't need the parse_article_content method.
        If the link to an article doesn't work, we don't create the article object.
        """
        soup = BeautifulSoup(content, "html.parser")
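        # each entry of the ordered list on the issue page describes one article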
        articles = soup.select("ol li")
        for index, article_tag in enumerate(articles):
            try:
                article_href = article_tag.select_one("a").get("href")
            except AttributeError:
                raise ValueError("Couldn't parse article data")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article data")
            # article_url = urljoin(self.collection_url, article_href)
            article_url = urljoin(xissue.url, article_href)
            try:
                # here we check that the link leads to the pdf of the article
                resp = requests.get(article_url, stream=True, verify=False)
                status_code = resp.status_code
                # release the connection whether or not the link works
                resp.close()
                if status_code != 200:
                    self.logger.warning(
                        "HTTP response for article %s: %s", article_url, status_code
                    )
                    continue
                # no error, let's fetch the article
                xarticle = create_articledata()
                # set the pid from the xissue pid because parse_article_content is not called
                xarticle.pid = xissue.pid + "_a" + str(index)
                # no need to set xarticle.url because the pdf link is in the issue page
                add_pdf_link_to_xarticle(xarticle, article_url)

                title_tag = article_tag.select_one("i")
                xarticle.title_tex = cleanup_str(title_tag.text[:-1])

                authors_tag = article_tag.select_one("b font")
                authors_str = cleanup_str(authors_tag.text)
                authors_str = authors_str.replace(" and ", ", ")
                for author in authors_str.split(", "):
                    if cleanup_str(author) == "":
                        raise ValueError("Invalid author")
                    xarticle.contributors.append(
                        create_contributor(role="author", string_name=author)
                    )
                # add the classification
                matches = regex.search(self.classification_re, article_tag.text)
                if matches:
                    match = matches[0]
                    classification_list = match.split(": ")[1].split(", ")[:-1]
                    for kwd in classification_list:
                        xarticle.kwds.append(create_subj(value=kwd, type="msc"))
                # add the pages
                matches = regex.search(self.page_re, article_tag.text)
                if matches:
                    match = matches[0]
                    try:
                        page_range = match.split(".")[1].replace(" ", "")
                        xarticle.page_range = page_range
                    except IndexError:
                        self.logger.debug(
                            "Page issue for article: %s",
                            xarticle.title_tex,
                            extra={"pid": xissue.pid},
                        )
                xissue.articles.append(xarticle)
            except ConnectionError as e:
                self.logger.warning(e, extra={"pid": xissue.pid})