Coverage for src/crawler/by_source/emis_aas_crawler.py: 18% (76 statements)
from urllib.parse import urljoin

import regex
import requests
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_contributor, create_subj
from requests.exceptions import ConnectionError

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

# https://www.emis.de/journals/APPS/
# TODO python ./src/manage.py crawl -u besnierp -source EMIS_AAS -pid AAS

class Emis_aasCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_AAS"
    source_website = "https://www.emis.de"

    issue_re = regex.compile(
        r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
    )
    classification_re = regex.compile(r"AMS Classification:\s*([\w\d]+(?:,\s*[\w\d]+)*)")
    page_re = regex.compile(r"Pp.\s*([\w\d]+-[\w\d]+)")
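    # Illustrative strings the patterns above are meant to match (examples inferred
    # from the patterns themselves, not taken from a live EMIS page):
    #   issue_re:          "Volume 12 (2004), Issue 3"
    #   classification_re: "AMS Classification: 46E30, 46E35"
    #   page_re:           "Pp. 123-145"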

    verify = False

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("ul b li a")
        for issue in issues:
            issue_search = regex.search(self.issue_re, issue.text)
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_dict = issue_search.groupdict()

            issue_href = issue.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue url")
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    None,
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        """The article data is already on the xissue.url page, so the parse_article_content
        method is not needed. If the link to an article does not work, no article object
        is created.
        """
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("ol li")
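        # Assumed shape of one article <li> entry, inferred from the selectors and
        # regexes used below (not verified against a live issue page):
        #   <li><b><font>A. Author and B. Author</font></b>, <i>Some title.</i>
        #       AMS Classification: 46E30, 46E35. Pp. 123-145.
        #       <a href="paper.pdf">PDF</a></li>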
        for index, article_tag in enumerate(articles):
            try:
                article_href = article_tag.select_one("a").get("href")
            except AttributeError:
                raise ValueError("Couldn't parse article data")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article data")
            # article_url = urljoin(self.collection_url, article_href)
            article_url = urljoin(xissue.url, article_href)
            try:
                # check that the link actually leads to the article PDF
                resp = requests.get(article_url, stream=True, verify=False)
                if resp.status_code != 200:
                    self.logger.warning(
                        "http response for the article %s : %s", article_url, resp.status_code
                    )
                    continue
                resp.close()
                # the link works, so build the article
                xarticle = create_articledata()
                # build the pid from the xissue because parse_article is not called
                xarticle.pid = xissue.pid + "_a" + str(index)
                # no need for xarticle.url because the pdf link is on the issue page
                add_pdf_link_to_xarticle(xarticle, article_url)

                title_tag = article_tag.select_one("i")
                xarticle.title_tex = cleanup_str(title_tag.text[:-1])

                authors_tag = article_tag.select_one("b font")
                authors_str = cleanup_str(authors_tag.text)
                authors_str = authors_str.replace(" and ", ", ")
                for author in authors_str.split(", "):
                    if cleanup_str(author) == "":
                        raise ValueError("Invalid author")
                    xarticle.contributors.append(
                        create_contributor(role="author", string_name=author)
                    )
                # add the classification
                matches = regex.search(self.classification_re, article_tag.text)
                if matches:
                    match = matches[0]
                    classification_list = match.split(": ")[1].split(", ")[:-1]
                    for kwd in classification_list:
                        xarticle.kwds.append(create_subj(value=kwd, type="msc"))
                # add the pages
                matches = regex.search(self.page_re, article_tag.text)
                if matches:
                    match = matches[0]
                    try:
                        page_range = match.split(".")[1].replace(" ", "")
                        xarticle.page_range = page_range
                    except IndexError:
                        self.logger.debug(
                            "PAGE ISSUE FOR ARTICLE: %s",
                            xarticle.title_tex,
                            extra={"pid": xissue.pid},
                        )
                xissue.articles.append(xarticle)
            except ConnectionError as e:
                self.logger.warning(e, extra={"pid": xissue.pid})
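
A minimal, standalone sketch to sanity-check the regular expressions used above outside the Django management command. The sample strings are illustrative assumptions derived from the patterns themselves, not real EMIS page content:

import regex

issue_re = regex.compile(
    r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
)
page_re = regex.compile(r"Pp.\s*([\w\d]+-[\w\d]+)")

# Example issue heading, assumed to follow the "Volume X (YYYY), Issue N" form
match = issue_re.search("Volume 12 (2004), Issue 3")
assert match is not None
print(match.groupdict())  # {'volume': '12', 'year': '2004', 'number': '3'}

# Example page range, assumed to follow the "Pp. X-Y" form, processed the same
# way as in parse_issue_content
match = page_re.search("Pp. 123-145")
assert match is not None
print(match[0].split(".")[1].replace(" ", ""))  # 123-145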