Coverage for src/crawler/by_source/emis_aas_crawler.py: 18% (76 statements)
from urllib.parse import urljoin

import regex
import requests
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_contributor, create_subj
from requests.exceptions import ConnectionError

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

# https://www.emis.de/journals/APPS/
# TODO python ./src/manage.py crawl -u besnierp -source EMIS_AAS -pid AAS

class Emis_aasCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_AAS"
    source_website = "https://www.emis.de"

    issue_re = regex.compile(
        r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
    )
    classification_re = regex.compile(r"AMS Classification:\s*([\w\d]+(?:,\s*[\w\d]+)*)")
    page_re = regex.compile(r"Pp.\s*([\w\d]+-[\w\d]+)")
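    # Illustrative strings the patterns above are meant to match (examples inferred
    # from the patterns themselves, not taken from a live EMIS page):
    #   issue_re:          "Volume 12 (2004), Issue 3"
    #   classification_re: "AMS Classification: 46E30, 46E35"
    #   page_re:           "Pp. 123-145"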

    verify = False

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("ul b li a")
        for issue in issues:
            issue_search = regex.search(self.issue_re, issue.text)
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_dict = issue_search.groupdict()

            issue_href = issue.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue url")
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    None,
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        """The article data is already on the xissue.url page, so the parse_article_content
        method is not needed. If the link to an article does not work, no article object
        is created.
        """
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("ol li")
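        # Assumed shape of one article <li> entry, inferred from the selectors and
        # regexes used below (not verified against a live issue page):
        #   <li><b><font>A. Author and B. Author</font></b>, <i>Some title.</i>
        #       AMS Classification: 46E30, 46E35. Pp. 123-145.
        #       <a href="paper.pdf">PDF</a></li>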
        for index, article_tag in enumerate(articles):
            try:
                article_href = article_tag.select_one("a").get("href")
            except AttributeError:
                raise ValueError("Couldn't parse article data")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article data")
            # article_url = urljoin(self.collection_url, article_href)
            article_url = urljoin(xissue.url, article_href)
            try:
                # check that the link actually leads to the article PDF
                resp = requests.get(article_url, stream=True, verify=False)
                if resp.status_code != 200:
                    self.logger.warning(
                        "http response for the article %s : %s", article_url, resp.status_code
                    )
                    continue
                resp.close()
                # the link works, so build the article
                xarticle = create_articledata()
                # build the pid from the xissue because parse_article is not called
                xarticle.pid = xissue.pid + "_a" + str(index)
                # no need for xarticle.url because the pdf link is on the issue page
                add_pdf_link_to_xarticle(xarticle, article_url)

                title_tag = article_tag.select_one("i")
                xarticle.title_tex = cleanup_str(title_tag.text[:-1])

                authors_tag = article_tag.select_one("b font")
                authors_str = cleanup_str(authors_tag.text)
                authors_str = authors_str.replace(" and ", ", ")
                for author in authors_str.split(", "):
                    if cleanup_str(author) == "":
                        raise ValueError("Invalid author")
                    xarticle.contributors.append(
                        create_contributor(role="author", string_name=author)
                    )
                # add the classification
                matches = regex.search(self.classification_re, article_tag.text)
                if matches:
                    match = matches[0]
                    classification_list = match.split(": ")[1].split(", ")[:-1]
                    for kwd in classification_list:
                        xarticle.kwds.append(create_subj(value=kwd, type="msc"))
                # add the pages
                matches = regex.search(self.page_re, article_tag.text)
                if matches:
                    match = matches[0]
                    try:
                        page_range = match.split(".")[1].replace(" ", "")
                        xarticle.page_range = page_range
                    except IndexError:
                        self.logger.debug(
                            "PAGE ISSUE FOR ARTICLE: %s",
                            xarticle.title_tex,
                            extra={"pid": xissue.pid},
                        )
                xissue.articles.append(xarticle)
            except ConnectionError as e:
                self.logger.warning(e, extra={"pid": xissue.pid})
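
A minimal, standalone sketch to sanity-check the regular expressions used above outside the Django management command. The sample strings are illustrative assumptions derived from the patterns themselves, not real EMIS page content:

import regex

issue_re = regex.compile(
    r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
)
page_re = regex.compile(r"Pp.\s*([\w\d]+-[\w\d]+)")

# Example issue heading, assumed to follow the "Volume X (YYYY), Issue N" form
match = issue_re.search("Volume 12 (2004), Issue 3")
assert match is not None
print(match.groupdict())  # {'volume': '12', 'year': '2004', 'number': '3'}

# Example page range, assumed to follow the "Pp. X-Y" form, processed the same
# way as in parse_issue_content
match = page_re.search("Pp. 123-145")
assert match is not None
print(match[0].split(".")[1].replace(" ", ""))  # 123-145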