Coverage for src/crawler/by_source/amp_crawler.py: 92%

97 statements  

coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

from bs4 import BeautifulSoup
from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_types import CitationLiteral

from ptf.model_data import create_articledata, create_issuedata


class AmpCrawler(BaseCollectionCrawler):
    source_name = "Annals of Mathematics Princeton University"
    source_domain = "AMP"
    source_website = "https://annals.math.princeton.edu"

    periode_begin = 2003
    periode_end = 2017


    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: create a cols.csv that supersedes cols_eudml.csv with the entire collection catalogue.
        # self.collection_id = "AM"
        # self.collection_url = "https://annals.math.princeton.edu"

        self.source = self.get_or_create_source()
        self.periode = self.get_or_create_periode()


    def parse_collection_content(self, content):
        """
        Parse the HTML page of Annals of Math and return a list of xissues.
        Each xissue carries its volume/number/year metadata and its URL.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        # Extract the list of issues
        issue_nodes = soup.find_all("div", {"class": "cat-item-2"})

        for issue_node in issue_nodes:
            issue_link_node = issue_node.find("a")
            if issue_link_node:  # coverage: partial branch (condition always true)
                url = issue_link_node.get("href")
                xissue = self.create_xissue(url)
                if xissue:
                    xissues.append(xissue)

        return xissues
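
    # For reference, the collection page is assumed to mark up each issue as
    #   <div class="cat-item-2"><a href=".../2016/183-2">Volume 183, Issue 2</a></div>
    # (illustrative; only the class name and the <a href> are relied on above).
    # Each link becomes one xissue via create_xissue() below.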


    def create_xissue(self, url):
        if url.endswith("/"):
            url = url[:-1]
        parts = url.split("/")

        last_part = parts[-1]
        exceptions = last_part.split("-")
        if len(exceptions) > 2:
            # The last path segment itself carries more than two dash-separated fields
            year = exceptions[0]
            volume = exceptions[0]
            number = exceptions[1]
        else:
            year = parts[-2]
            if len(year) < 4:
                # The links are different with volumes before 2015
                year = parts[-3]

            volume_number = parts[-1]
            volume_number_parts = volume_number.split("-")
            volume = volume_number_parts[0]
            number = volume_number_parts[1]

        year_int = int(year)
        if self.periode_begin <= year_int <= self.periode_end:
            xissue = create_issuedata()
            xissue.pid = f"{self.collection_id}_{year}__{volume}_{number}"
            xissue.year = year
            xissue.volume = volume
            xissue.number = number
            xissue.url = url
        else:
            xissue = None

        return xissue
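
    # Worked examples for the URL shapes handled above. The paths are
    # illustrative assumptions inferred from the parsing logic, not verified
    # against the live site:
    #
    #   ".../2016/183-2"    -> year="2016", volume="183", number="2"
    #   ".../16/183-2"      -> parts[-2] is shorter than 4 characters (pre-2015
    #                          style), so the year is read from parts[-3]
    #   ".../2003-158-1"    -> the last segment has more than two dash-separated
    #                          fields, so year="2003", volume="2003", number="158"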


    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("h2", {"class": "entry-title"})

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:  # coverage: partial branch (condition always true)
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = url
                xissue.articles.append(xarticle)
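
    # The issue pages are assumed to list each article as
    #   <h2 class="entry-title"><a href=".../2016/183-2/p01">Title</a></h2>
    # (illustrative; only the class name and the <a href> are relied on), so
    # the loop above assigns pids "a0", "a1", ... in page order.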


    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData
        """
        # Note: the incoming xarticle is discarded and rebuilt from scratch.
        xarticle = create_articledata()
        xarticle.pid = pid
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")

        what: list[CitationLiteral] = ["author", "pdf", "abstract", "page"]

        title_node = soup.find("h1", {"class": "entry-title"})
        if title_node:  # coverage: partial branch (condition always true)
            what.append("title")

        if url != "https://annals.math.princeton.edu/2010/172-3/p06":  # coverage: partial branch (condition always true)
            # Exception with Annals of Math: 2 articles have the same DOI!
            # https://annals.math.princeton.edu/2010/172-3/p06 and https://annals.math.princeton.edu/2011/173-1/p14
            # We ignore DOI/ZBMATH/MR for the first one.
            what.append("doi")

        self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)

        # ZBMATH
        metadata_header_nodes = soup.find_all("div", {"class": "metadata-headers"})
        for metadata_header_node in metadata_header_nodes:
            text = metadata_header_node.get_text()

            if text == "zbMATH":
                link_node = metadata_header_node.parent.find("a")
                if link_node:  # coverage: partial branch (condition always true)
                    zblid = link_node.get("href")
                    pos = zblid.find("?q=an:")
                    if pos > 0:  # coverage: partial branch (condition always true)
                        zblid = zblid[pos + 6 :]
                    xarticle.extids.append(("zbl-item-id", zblid))
            elif text == "MR":
                link_node = metadata_header_node.parent.find("a")
                if link_node:  # coverage: partial branch (condition always true)
                    mrid = link_node.get("href")
                    pos = mrid.find("?mr=")
                    if pos > 0:  # coverage: partial branch (condition always true)
                        mrid = mrid[pos + 4 :]
                    xarticle.extids.append(("mr-item-id", mrid))
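
        # Assumed href formats behind the slicing above; the identifiers are
        # illustrative, and the offsets 6 and 4 are the lengths of the query
        # prefixes "?q=an:" and "?mr=":
        #   zbMATH: "https://zbmath.org/?q=an:1234.56789"  -> zblid = "1234.56789"
        #   MR:     ".../mathscinet-getitem?mr=1234567"    -> mrid  = "1234567"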


        return xarticle
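
# Minimal usage sketch. This is hypothetical driver code: the zero-argument
# constructor and the download_file() helper are assumptions, since
# BaseCollectionCrawler is not shown in this file.
#
#     crawler = AmpCrawler()
#     html = crawler.download_file(crawler.source_website)  # hypothetical helper
#     for xissue in crawler.parse_collection_content(html):
#         issue_html = crawler.download_file(xissue.url)
#         crawler.parse_issue_content(issue_html, xissue)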