Coverage for src/crawler/by_source/amp_crawler.py: 93%

97 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_issuedata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.types import CitationLiteral


class AmpCrawler(BaseCollectionCrawler):
    source_name = "Annals of Mathematics website"
    source_domain = "AMP"
    source_website = "https://annals.math.princeton.edu"

    # Only issues whose year lies in [periode_begin, periode_end] are kept
    # (see the range check in create_amp_xissue).
    periode_begin = 2003
    periode_end = 2017

    def parse_collection_content(self, content):
        """
        Parse the HTML page of Annals of Math and return a list of xissues.
        Each xissue has its volume/number/year metadata + its url.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        # Extract the list of issues
        issue_nodes = soup.find_all("div", {"class": "cat-item-2"})

        for issue_node in issue_nodes:
            issue_link_node = issue_node.find("a")
            if issue_link_node:  # coverage: 29 ↛ 27 never taken (condition always true)
                url = issue_link_node.get("href")
                xissue = self.create_amp_xissue(url)
                if xissue:
                    xissues.append(xissue)

        return xissues
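    # Sketch of the markup parse_collection_content expects, assumed from the
    # selectors above (illustrative, not captured from the live site):
    #   <div class="cat-item-2">
    #       <a href="https://annals.math.princeton.edu/2016/183-2">Volume 183, Issue 2</a>
    #   </div>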

    def create_amp_xissue(self, url):
        if url.endswith("/"):
            url = url[:-1]
        parts = url.split("/")

        last_part = parts[-1]
        exceptions = last_part.split("-")
        if len(exceptions) > 2:
            year = exceptions[0]
            volume = exceptions[0]
            number = exceptions[1]
        else:
            year = parts[-2]
            if len(year) < 4:
                # The links are different for volumes before 2015
                year = parts[-3]

            volume_number = parts[-1]
            volume_number_parts = volume_number.split("-")
            volume = volume_number_parts[0]
            number = volume_number_parts[1]

        year_int = int(year)
        if self.periode_begin <= year_int <= self.periode_end:
            xissue = create_issuedata()
            xissue.pid = f"{self.collection_id}_{year}__{volume}_{number}"
            xissue.year = year
            xissue.volume = volume
            xissue.number = number
            xissue.url = url
        else:
            xissue = None

        return xissue
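    # Worked example for the common branch (illustrative URL, shaped to match
    # the parsing above rather than captured from the site):
    #   "https://annals.math.princeton.edu/2016/183-2"
    #     parts[-1] = "183-2" -> volume "183", number "2"
    #     parts[-2] = "2016"  -> year, so pid becomes "<collection_id>_2016__183_2"
    # When the last path segment contains more than one hyphen, the
    # `exceptions` branch reads the metadata from that segment alone.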

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("h2", {"class": "entry-title"})

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:  # coverage: 78 ↛ 76 never taken (condition always true)
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = url
                xissue.articles.append(xarticle)
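    # Markup assumption for the issue page, based on the selectors above: each
    # article is listed as
    #   <h2 class="entry-title"><a href="...article url...">Article title</a></h2>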

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData
        """
        xarticle.pid = pid
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")

        what: list[CitationLiteral] = ["author", "pdf", "abstract", "page"]

        title_node = soup.find("h1", {"class": "entry-title"})
        if title_node:  # coverage: 97 ↛ 100 never taken (condition always true)
            what.append("title")

        if url != "https://annals.math.princeton.edu/2010/172-3/p06":  # coverage: 100 ↛ 106 never taken (condition always true)
            # Exception with Annals of Math: 2 articles have the same DOI!
            # https://annals.math.princeton.edu/2010/172-3/p06 and https://annals.math.princeton.edu/2011/173-1/p14
            # We ignore DOI/ZBMATH/MR for the first one.
            what.append("doi")

        self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)
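        # Note: get_metadata_using_citation_meta is inherited from
        # BaseCollectionCrawler; given the CitationLiteral keys in `what`, it
        # presumably fills the article from the Highwire Press-style
        # <meta name="citation_author">, <meta name="citation_pdf_url">, etc.
        # tags in the page head.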

        abstract_node = soup.select_one("div.entry-content")
        if abstract_node is not None:  # coverage: 109 ↛ 124 never taken (condition always true)
            abstract_section_node = abstract_node.select_one("p")
            if abstract_section_node:  # coverage: 111 ↛ 124 never taken (condition always true)
                abstract = str(abstract_section_node)
                xarticle.abstracts.append(
                    {
                        "tag": "abstract",
                        "value_html": "",
                        "value_tex": abstract,
                        "value_xml": "",
                        "lang": self.detect_language(abstract, xarticle),
                    }
                )

        # ZBMATH
        metadata_header_nodes = soup.find_all("div", {"class": "metadata-headers"})
        for metadata_header_node in metadata_header_nodes:
            text = metadata_header_node.get_text()

            if text == "zbMATH":
                link_node = metadata_header_node.parent.find("a")
                if link_node:  # coverage: 130 ↛ 125 never taken (condition always true)
                    zblid = link_node.get("href")
                    pos = zblid.find("?q=an:")
                    if pos > 0:  # coverage: 133 ↛ 135 never taken (condition always true)
                        zblid = zblid[pos + 6 :]
                    xarticle.extids.append(("zbl-item-id", zblid))
            elif text == "MR":
                link_node = metadata_header_node.parent.find("a")
                if link_node:  # coverage: 138 ↛ 125 never taken (condition always true)
                    mrid = link_node.get("href")
                    pos = mrid.find("?mr=")
                    if pos > 0:  # coverage: 141 ↛ 143 never taken (condition always true)
                        mrid = mrid[pos + 4 :]
                    xarticle.extids.append(("mr-item-id", mrid))
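        # Illustrative href -> id extraction for the loop above (made-up ids):
        #   "https://zbmath.org/?q=an:1213.53086" -> zbl-item-id "1213.53086"
        #   ".../mathscinet-getitem?mr=2680400"   -> mr-item-id "2680400"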

        return xarticle