Coverage for src/crawler/by_source/emis_aas_crawler.py: 20%

79 statements  

coverage.py v7.9.0, created at 2025-09-16 12:41 +0000

from urllib.parse import urljoin

import regex
import requests
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_contributor, create_subj
from requests.exceptions import ConnectionError

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

# https://www.emis.de/journals/APPS/
# TODO python ./src/manage.py crawl -u besnierp -source EMIS_AAS -pid AAS


class Emis_aasCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_AAS"
    source_website = "https://www.emis.de"

    @classmethod
    def get_view_id(cls):
        return "EMIS"


    issue_re = regex.compile(
        r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
    )
    classification_re = regex.compile(r"AMS Classification:\s*([\w\d]+(?:,\s*[\w\d]+)*)")
    page_re = regex.compile(r"Pp.\s*([\w\d]+-[\w\d]+)")
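    # Illustrative inputs these patterns are assumed to match on the EMIS issue
    # pages (hypothetical examples, not copied from the live site):
    #   issue_re          -> "Volume 31 (2021), Issue 2"
    #   classification_re -> "AMS Classification: 11A25, 30C45"
    #   page_re           -> "Pp. 123-145"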


    verify = False

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("ul b li a")
        for issue in issues:
            issue_search = regex.search(self.issue_re, issue.text)
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_dict = issue_search.groupdict()

            issue_href = issue.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue url")
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    None,
                )
            )
        return xissues


    def parse_issue_content(self, content, xissue):
        """Parse the articles directly from the issue page (xissue.url), so the
        parse_article_content method is not needed. If the link to an article
        does not work, no article object is created for it.
        """

        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("ol li")
        for index, article_tag in enumerate(articles):
            try:
                article_href = article_tag.select_one("a").get("href")
            except AttributeError:
                raise ValueError("Couldn't parse article data")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article data")
            # article_url = urljoin(self.collection_url, article_href)
            article_url = urljoin(xissue.url, article_href)

            try:
                # check that the link actually resolves to the article PDF
                resp = requests.get(article_url, stream=True, verify=False)
                if resp.status_code != 200:
                    self.logger.warning(
                        "http response for the article %s: %s", article_url, resp.status_code
                    )
                    resp.close()
                    continue
                resp.close()
                # no error: create the article
                xarticle = create_articledata()
                # build the pid from the xissue because parse_article is not called
                xarticle.pid = xissue.pid + "_a" + str(index)
                # no xarticle.url needed: the PDF link is on the issue page
                add_pdf_link_to_xarticle(xarticle, article_url)


                title_tag = article_tag.select_one("i")
                xarticle.title_tex = cleanup_str(title_tag.text[:-1])

                authors_tag = article_tag.select_one("b font")
                authors_str = cleanup_str(authors_tag.text)
                authors_str = authors_str.replace(" and ", ", ")
                for author in authors_str.split(", "):
                    if cleanup_str(author) == "":
                        raise ValueError("Invalid author")
                    xarticle.contributors.append(
                        create_contributor(role="author", string_name=author)
                    )
                # add the classification
                matches = regex.search(self.classification_re, article_tag.text)
                if matches:
                    match = matches[0]
                    classification_list = match.split(": ")[1].split(", ")[:-1]
                    for kwd in classification_list:
                        xarticle.kwds.append(create_subj(value=kwd, type="msc"))

                # add the pages
                matches = regex.search(self.page_re, article_tag.text)
                if matches:
                    match = matches[0]
                    try:
                        page_range = match.split(".")[1].replace(" ", "")
                        xarticle.page_range = page_range
                    except IndexError:
                        self.logger.debug(
                            "PAGE ISSUE FOR ARTICLE: %s",
                            xarticle.title_tex,
                            extra={"pid": xissue.pid},
                        )
                xissue.articles.append(xarticle)
            except ConnectionError as e:
                self.logger.warning(e, extra={"pid": xissue.pid})
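
# Hedged usage sketch (not part of the original module): a minimal, offline
# illustration of the markup parse_collection_content expects. The HTML snippet
# is a guess at the EMIS table-of-contents structure ("ul b li a" anchors whose
# text looks like "Volume N (YYYY)"), not a copy of the live page; it exercises
# only the CSS selector and issue_re, with no network access and no
# BaseCollectionCrawler setup.
if __name__ == "__main__":
    sample_html = """
    <ul><b>
      <li><a href="vol31-2.html">Volume 31 (2021), Issue 2</a></li>
      <li><a href="vol5.html">Volume 5 (1995)</a></li>
    </b></ul>
    """
    soup = BeautifulSoup(sample_html, "html.parser")
    for link in soup.select("ul b li a"):
        found = regex.search(Emis_aasCrawler.issue_re, link.text)
        print(link.get("href"), "->", found.groupdict() if found else None)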