Coverage for src/crawler/by_source/emis_aas_crawler.py: 18%

76 statements  

coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

from urllib.parse import urljoin

import regex
import requests
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_contributor, create_subj
from requests.exceptions import ConnectionError

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

# https://www.emis.de/journals/APPS/
# TODO python ./src/manage.py crawl -u besnierp -source EMIS_AAS -pid AAS


class Emis_aasCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_AAS"
    source_website = "https://www.emis.de"

    issue_re = regex.compile(
        r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
    )
    classification_re = regex.compile(r"AMS Classification:\s*([\w\d]+(?:,\s*[\w\d]+)*)")
    page_re = regex.compile(r"Pp.\s*([\w\d]+-[\w\d]+)")
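    # Example strings the patterns above are written against (inferred from the
    # regexes themselves, not verified against the live EMIS pages):
    #   issue_re:          "Volume 12 (2003), Issue 1"
    #   classification_re: "AMS Classification: 47H10, 54H25"
    #   page_re:           "Pp. 123-456"

    # Presumably read by BaseCollectionCrawler: disable SSL certificate verification
    # when fetching pages from the source website.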
    verify = False

    def parse_collection_content(self, content):
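        """Build one xissue per issue link listed on the collection page.

        Expected markup, inferred from the "ul b li a" selector and issue_re
        (not taken from a live EMIS page):
            <ul><b><li><a href="vol12">Volume 12 (2003), Issue 1</a></li></b></ul>
        """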

        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("ul b li a")
        for issue in issues:
            issue_search = regex.search(self.issue_re, issue.text)
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_dict = issue_search.groupdict()

            issue_href = issue.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue url")
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    None,
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        """The article data is on the issue page (xissue.url), so the parse_article_content
        method is not needed. If the link to an article does not work, the article object
        is not created.
        """
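        # Expected shape of one article <li>, inferred from the selectors and regexes
        # used below (not taken from a live EMIS page): an <a> whose href points to the
        # article pdf, an <i> holding the title, a <b><font> holding the author list
        # ("A. Author and B. Author"), plus text containing "AMS Classification: ..."
        # and "Pp. <start>-<end>".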

        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("ol li")
        for index, article_tag in enumerate(articles):
            try:
                article_href = article_tag.select_one("a").get("href")
            except AttributeError:
                raise ValueError("Couldn't parse article data")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article data")
            # article_url = urljoin(self.collection_url, article_href)
            article_url = urljoin(xissue.url, article_href)
            try:
                # check that the link leads to the article pdf; only the status code
                # is needed, so the response is closed right away
                resp = requests.get(article_url, stream=True, verify=False)
                resp.close()
                if resp.status_code != 200:
                    self.logger.warning(
                        "HTTP response for the article %s: %s", article_url, resp.status_code
                    )
                    continue
                # no error: create the article data
                xarticle = create_articledata()
                # build the pid here because parse_article_content is not called
                xarticle.pid = xissue.pid + "_a" + str(index)
                # no need for xarticle.url: the pdf link is on the issue page
                add_pdf_link_to_xarticle(xarticle, article_url)

                title_tag = article_tag.select_one("i")
                xarticle.title_tex = cleanup_str(title_tag.text[:-1])

                authors_tag = article_tag.select_one("b font")
                authors_str = cleanup_str(authors_tag.text)
                authors_str = authors_str.replace(" and ", ", ")
                for author in authors_str.split(", "):
                    if cleanup_str(author) == "":
                        raise ValueError("Invalid author")
                    xarticle.contributors.append(
                        create_contributor(role="author", string_name=author)
                    )
                # add the classification: matches[1] holds the comma-separated MSC codes
                matches = regex.search(self.classification_re, article_tag.text)
                if matches:
                    classification_list = matches[1].split(", ")
                    for kwd in classification_list:
                        xarticle.kwds.append(create_subj(value=kwd, type="msc"))
                # add the pages
                matches = regex.search(self.page_re, article_tag.text)
                if matches:
                    match = matches[0]
                    try:
                        page_range = match.split(".")[1].replace(" ", "")
                        xarticle.page_range = page_range
                    except IndexError:
                        self.logger.debug(
                            "Page range issue for article %s",
                            xarticle.title_tex,
                            extra={"pid": xissue.pid},
                        )
                xissue.articles.append(xarticle)
            except ConnectionError as e:
                self.logger.warning(e, extra={"pid": xissue.pid})