Coverage for src/crawler/by_source/amp_crawler.py: 87%

import lingua
import requests
from bs4 import BeautifulSoup
from lingua import LanguageDetectorBuilder
from ptf.model_data import IssueData, create_abstract, create_articledata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.types import CitationLiteral


class AmpCrawler(BaseCollectionCrawler):
    source_name = "Annals of Mathematics website"
    source_domain = "AMP"
    source_website = "https://annals.math.princeton.edu"

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH,
        lingua.Language.FRENCH,
    ).build()
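    # Usage note for the detector above (a sketch of the lingua-py API, not
    # code from this module): detection is restricted to English and French,
    # so e.g.
    #     AmpCrawler.language_detector.detect_language_of("Sur la topologie")
    # can only return Language.FRENCH, Language.ENGLISH, or None.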

    def crawl_issue(self, xissue: IssueData):
        try:
            super().crawl_issue(xissue)
        except requests.exceptions.HTTPError:
            self.logger.warning("Got HTTPError while crawling issue. Skipping")

    def parse_collection_content(self, content):
        """
        Parse the HTML page of Annals of Math and return a list of xissues.
        Each xissue has its volume/number/year metadata and its URL.
        """

        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        # Extract the list of issues
        issue_nodes = soup.find_all("div", {"class": "cat-item-2"})

        for issue_node in issue_nodes:
            issue_link_node = issue_node.find("a")
            if issue_link_node:
                url = issue_link_node.get("href")
                xissue = self.create_amp_xissue(url)
                if xissue:
                    xissues.append(xissue)

        return xissues

    def create_amp_xissue(self, url):
        if url.endswith("/"):
            url = url[:-1]
        parts = url.split("/")

        last_part = parts[-1]
        exceptions = last_part.split("-")
        if len(exceptions) > 2:
            year = exceptions[0]
            volume = exceptions[0]
            number = exceptions[1]
        else:
            year = parts[-2]
            if len(year) < 4:
                # The links are different for volumes before 2015
                year = parts[-3]

            volume_number = parts[-1]
            volume_number_parts = volume_number.split("-")
            volume = volume_number_parts[0]
            number = volume_number_parts[1]

        return self.create_xissue(url, year, volume, number)
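    # Worked example for create_amp_xissue (issue URL inferred from the
    # article URLs referenced later in this file; it may not match every
    # era of the site):
    #   "https://annals.math.princeton.edu/2010/172-3"
    #   -> last_part = "172-3", two dash-parts, so the else branch runs:
    #      year = "2010", volume = "172", number = "3"
    # (In the >2-part branch, year and volume both take exceptions[0]
    # as written.)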

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select("h2.entry-title")

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.select_one("a")
            if article_link_node:
                href = article_link_node.get("href")
                if not isinstance(href, str):
                    raise ValueError("Couldn't parse issue: href is not a string")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = href
                xissue.articles.append(xarticle)
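    # Issue-page markup assumed by the selectors above (a reconstruction
    # from this code, not captured from the live site):
    #   <h2 class="entry-title">
    #     <a href="https://annals.math.princeton.edu/2010/172-3/p06">Article title</a>
    #   </h2>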

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")

        what: list[CitationLiteral] = ["author", "abstract", "page"]

        doi_tag = soup.select_one("meta[name='citation_doi']")
        if doi_tag and not doi_tag.text.startswith("https://doi.org/"):
            what.append("pdf")
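        # Observation, not a behavior change: BeautifulSoup's .text on a
        # <meta> tag is normally empty (the DOI sits in the tag's "content"
        # attribute), so the startswith() guard above effectively always
        # passes whenever the tag exists.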

        title_node = soup.find("h1", {"class": "entry-title"})
        if title_node:
            what.append("title")

        if url != "https://annals.math.princeton.edu/2010/172-3/p06":
            # Exception with Annals of Math: two articles share the same DOI!
            # https://annals.math.princeton.edu/2010/172-3/p06 and https://annals.math.princeton.edu/2011/173-1/p14
            # We ignore DOI/ZBMATH/MR for the first one.
            what.append("doi")

        self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)

        abstract_node = soup.select_one("div.entry-content")
        if abstract_node is not None:
            abstract_section_node = abstract_node.select_one("p")
            if abstract_section_node:
                abstract = str(abstract_section_node)
                xarticle.abstracts.append(
                    create_abstract(
                        value_tex=abstract, lang=self.detect_language(abstract, xarticle)
                    )
                )
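                # Assumption encoded by the selectors above: the first <p>
                # inside div.entry-content holds the abstract; it is kept as
                # raw HTML (value_tex) so inline markup survives.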

        # ZBMATH / MR external identifiers
        metadata_header_nodes = soup.find_all("div", {"class": "metadata-headers"})
        for metadata_header_node in metadata_header_nodes:
            text = metadata_header_node.get_text()

            if text == "zbMATH":
                link_node = metadata_header_node.parent.find("a")
                if link_node:
                    zblid = link_node.get("href")
                    pos = zblid.find("?q=an:")
                    if pos > 0:
                        zblid = zblid[pos + 6 :]
                    xarticle.extids.append(("zbl-item-id", zblid))
            elif text == "MR":
                link_node = metadata_header_node.parent.find("a")
                if link_node:
                    mrid = link_node.get("href")
                    pos = mrid.find("?mr=")
                    if pos > 0:
                        mrid = mrid[pos + 4 :]
                    xarticle.extids.append(("mr-item-id", mrid))
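                    # Worked example of the href slicing above (illustrative
                    # identifiers; the query formats match zbMATH's and
                    # MathSciNet's public URLs):
                    #   "https://zbmath.org/?q=an:1213.37007" -> "1213.37007"
                    #   "...mathscinet-getitem?mr=2680426"    -> "2680426"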

        return xarticle
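

# Minimal usage sketch (assumptions: BaseCollectionCrawler, only imported
# here, can be constructed without arguments and provides create_xissue,
# detect_language, logger, and get_metadata_using_citation_meta as used
# above):
#
#     crawler = AmpCrawler()
#     html = requests.get(AmpCrawler.source_website, timeout=30).text
#     for xissue in crawler.parse_collection_content(html):
#         crawler.crawl_issue(xissue)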