Coverage for src/crawler/by_source/amp_crawler.py: 91%

97 statements  

coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

  1  import lingua
  2  from bs4 import BeautifulSoup
  3  from lingua import LanguageDetectorBuilder
  4  from ptf.model_data import create_articledata, create_issuedata
  5
  6  from crawler.base_crawler import BaseCollectionCrawler
  7  from crawler.types import CitationLiteral
  8
  9
 10  class AmpCrawler(BaseCollectionCrawler):
 11      source_name = "Annals of Mathematics website"
 12      source_domain = "AMP"
 13      source_website = "https://annals.math.princeton.edu"
 14
 15      language_detector = LanguageDetectorBuilder.from_languages(
 16          lingua.Language.ENGLISH,
 17          lingua.Language.FRENCH,
 18      ).build()
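The class-level detector above restricts lingua to English and French. As a point of reference, here is a minimal standalone sketch of how such a detector classifies a snippet; the sample strings are made up, and the crawler's own detect_language() wrapper (called further down) is defined elsewhere, presumably on BaseCollectionCrawler, so it is not shown in this file.

# Standalone sketch, assuming the lingua-language-detector package.
import lingua
from lingua import LanguageDetectorBuilder

detector = LanguageDetectorBuilder.from_languages(
    lingua.Language.ENGLISH,
    lingua.Language.FRENCH,
).build()

print(detector.detect_language_of("Sur les groupes de Galois"))        # expected: Language.FRENCH
print(detector.detect_language_of("On the growth of Betti numbers"))   # expected: Language.ENGLISH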

 19
 20      def parse_collection_content(self, content):
 21          """
 22          Parse the HTML page of Annals of Math and return a list of xissues.
 23          Each xissue has its volume/number/year metadata + its url.
 24          """
 25          soup = BeautifulSoup(content, "html.parser")
 26          xissues = []
 27
 28          # Extract the list of issues
 29          issue_nodes = soup.find_all("div", {"class": "cat-item-2"})
 30
 31          for issue_node in issue_nodes:
 32              issue_link_node = issue_node.find("a")
 33              if issue_link_node:  # 33 ↛ 31: didn't jump to line 31 (condition was always true)
 34                  url = issue_link_node.get("href")
 35                  xissue = self.create_amp_xissue(url)
 36                  if xissue:  # 36 ↛ 31: didn't jump to line 31 (condition was always true)
 37                      xissues.append(xissue)
 38
 39          return xissues
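For reference, a hypothetical collection-page fragment showing the markup shape parse_collection_content assumes (a div.cat-item-2 wrapping an issue link); the URLs are invented and only illustrate the find_all / find lookups above.

# Hypothetical fragment; not taken from the actual Annals of Math page.
from bs4 import BeautifulSoup

html = """
<ul>
  <li><div class="cat-item-2"><a href="https://annals.math.princeton.edu/2015/181-1">Volume 181, Issue 1</a></div></li>
  <li><div class="cat-item-2"><a href="https://annals.math.princeton.edu/2015/181-2">Volume 181, Issue 2</a></div></li>
</ul>
"""

soup = BeautifulSoup(html, "html.parser")
for issue_node in soup.find_all("div", {"class": "cat-item-2"}):
    link = issue_node.find("a")
    if link:
        print(link.get("href"))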

 40
 41      def create_amp_xissue(self, url):
 42          if url.endswith("/"):
 43              url = url[:-1]
 44          parts = url.split("/")
 45
 46          last_part = parts[-1]
 47          exceptions = last_part.split("-")
 48          if len(exceptions) > 2:
 49              year = exceptions[0]
 50              volume = exceptions[0]
 51              number = exceptions[1]
 52          else:
 53              year = parts[-2]
 54              if len(year) < 4:
 55                  # The links are different with volumes before 2015
 56                  year = parts[-3]
 57
 58              volume_number = parts[-1]
 59              volume_number_parts = volume_number.split("-")
 60              volume = volume_number_parts[0]
 61              number = volume_number_parts[1]
 62
 63          xissue = create_issuedata()
 64          xissue.pid = f"{self.collection_id}_{year}__{volume}_{number}"
 65          xissue.year = year
 66          xissue.volume = volume
 67          xissue.number = number
 68          xissue.url = url
 69
 70          return xissue
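To make the common URL shape handled above concrete, here is a standalone sketch of the same splitting for the regular case (a year segment followed by a volume-number segment). The issue URL and the "AMP" prefix are placeholders; the pre-2015 and hyphenated exception branches are not covered here.

# Sketch only; regular branch of create_amp_xissue with hypothetical inputs.
def split_issue_url(url: str) -> tuple[str, str, str]:
    url = url.rstrip("/")
    parts = url.split("/")
    year = parts[-2]
    volume, number = parts[-1].split("-")
    return year, volume, number

year, volume, number = split_issue_url("https://annals.math.princeton.edu/2015/181-1/")
print(f"AMP_{year}__{volume}_{number}")  # AMP_2015__181_1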

 71
 72      def parse_issue_content(self, content, xissue):
 73          soup = BeautifulSoup(content, "html.parser")
 74          article_nodes = soup.find_all("h2", {"class": "entry-title"})
 75
 76          for index_article, article_node in enumerate(article_nodes):
 77              article_link_node = article_node.find("a")
 78              if article_link_node:  # 78 ↛ 76: didn't jump to line 76 (condition was always true)
 79                  url = article_link_node.get("href")
 80                  xarticle = create_articledata()
 81                  xarticle.pid = "a" + str(index_article)
 82                  xarticle.url = url
 83                  xissue.articles.append(xarticle)
 84
 85      def parse_article_content(self, content, xissue, xarticle, url):
 86          """
 87          Parse the content with BeautifulSoup and return an ArticleData.
 88          """
 89          xarticle.lang = "en"
 90
 91          soup = BeautifulSoup(content, "html.parser")
 92
 93          what: list[CitationLiteral] = ["author", "abstract", "page"]
 94
 95          doi_tag = soup.select_one("meta[name='citation_doi']")
 96          if not doi_tag.text.startswith("https://doi.org/"):  # 96 ↛ 99: didn't jump to line 99 (condition was always true)
 97              what.append("pdf")
 98
 99          title_node = soup.find("h1", {"class": "entry-title"})
100          if title_node:  # 100 ↛ 103: didn't jump to line 103 (condition was always true)
101              what.append("title")
102
103          if url != "https://annals.math.princeton.edu/2010/172-3/p06":  # 103 ↛ 109: didn't jump to line 109 (condition was always true)
104              # Exception with Annals of Math: 2 articles have the same DOI!
105              # https://annals.math.princeton.edu/2010/172-3/p06 and https://annals.math.princeton.edu/2011/173-1/p14
106              # we ignore DOI/ZBMATH/MR for the first one
107              what.append("doi")
108
109          self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)
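get_metadata_using_citation_meta is inherited from BaseCollectionCrawler and not shown here; the sketch below assumes it reads Highwire-style citation_* meta tags, and the tag values are invented. It also shows that BeautifulSoup exposes a meta tag's value through its content attribute rather than .text, which is consistent with the report marking the branch on line 96 as always taken.

# Hypothetical article <head> fragment with citation_* meta tags.
from bs4 import BeautifulSoup

head = """
<meta name="citation_title" content="An example title">
<meta name="citation_author" content="Doe, Jane">
<meta name="citation_doi" content="10.0000/example.doi">
"""

soup = BeautifulSoup(head, "html.parser")
doi_tag = soup.select_one("meta[name='citation_doi']")
print(doi_tag.get("content"))  # the DOI value lives in the content attribute
print(repr(doi_tag.text))      # '' - a meta tag has no text content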

110
111          abstract_node = soup.select_one("div.entry-content")
112          if abstract_node is not None:  # 112 ↛ 127: didn't jump to line 127 (condition was always true)
113              abstract_section_node = abstract_node.select_one("p")
114              if abstract_section_node:  # 114 ↛ 127: didn't jump to line 127 (condition was always true)
115                  abstract = str(abstract_section_node)
116                  xarticle.abstracts.append(
117                      {
118                          "tag": "abstract",
119                          "value_html": "",
120                          "value_tex": abstract,
121                          "value_xml": "",
122                          "lang": self.detect_language(abstract, xarticle),
123                      }
124                  )
125
126          # ZBMATH
127          metadata_header_nodes = soup.find_all("div", {"class": "metadata-headers"})
128          for metadata_header_node in metadata_header_nodes:
129              text = metadata_header_node.get_text()
130
131              if text == "zbMATH":
132                  link_node = metadata_header_node.parent.find("a")
133                  if link_node:  # 133 ↛ 128: didn't jump to line 128 (condition was always true)
134                      zblid = link_node.get("href")
135                      pos = zblid.find("?q=an:")
136                      if pos > 0:  # 136 ↛ 138: didn't jump to line 138 (condition was always true)
137                          zblid = zblid[pos + 6 :]
138                      xarticle.extids.append(("zbl-item-id", zblid))
139              elif text == "MR":
140                  link_node = metadata_header_node.parent.find("a")
141                  if link_node:  # 141 ↛ 128: didn't jump to line 128 (condition was always true)
142                      mrid = link_node.get("href")
143                      pos = mrid.find("?mr=")
144                      if pos > 0:  # 144 ↛ 146: didn't jump to line 146 (condition was always true)
145                          mrid = mrid[pos + 4 :]
146                      xarticle.extids.append(("mr-item-id", mrid))
147
148          return xarticle
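Finally, a hypothetical article-page fragment showing how the zbMATH/MR loop above slices identifiers out of link hrefs; the markup and identifier values are invented and only illustrate the "?q=an:" / "?mr=" offsets.

# Sketch with made-up markup and identifiers.
from bs4 import BeautifulSoup

html = """
<div><div class="metadata-headers">zbMATH</div>
     <a href="https://zbmath.org/?q=an:0000.00000">zbMATH review</a></div>
<div><div class="metadata-headers">MR</div>
     <a href="https://mathscinet.ams.org/mathscinet-getitem?mr=0000000">MathSciNet</a></div>
"""

soup = BeautifulSoup(html, "html.parser")
for header in soup.find_all("div", {"class": "metadata-headers"}):
    href = header.parent.find("a").get("href")
    if header.get_text() == "zbMATH":
        print(("zbl-item-id", href[href.find("?q=an:") + 6:]))   # ('zbl-item-id', '0000.00000')
    elif header.get_text() == "MR":
        print(("mr-item-id", href[href.find("?mr=") + 4:]))      # ('mr-item-id', '0000000')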