Coverage for src/crawler/by_source/cambridge_crawler.py: 10%

120 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-09-16 12:41 +0000

1from urllib.parse import urljoin 

2 

3from bs4 import BeautifulSoup, Tag 

4from dateutil import parser 

5from ptf.cmds.xml.xml_utils import escape 

6from ptf.model_data import create_abstract, create_articledata 

7 

8from crawler.base_crawler import BaseCollectionCrawler 

9from crawler.cmds.mixed_citation import ( 

10 ExtLinkXml, 

11 GenericRefElement, 

12 MixedCitation, 

13) 

14from crawler.utils import cleanup_str 

15 

16 

class CambridgeCrawler(BaseCollectionCrawler):
    """Crawler for journals hosted by Cambridge University Press.

    Parses the collection "all issues" listing, each issue's table of
    contents (following "Next »" pagination), individual article pages,
    and the structured HTML reference list into JATS-style mixed citations.
    """

    source_name = "Cambridge University Press"
    source_domain = "CAMBRIDGE"
    source_website = "https://www.cambridge.org/"

    # Display formulas on Cambridge article pages are delimited by $$ ... $$.
    delimiter_disp_formula = "$$"

    def parse_collection_content(self, content):
        """Parse the collection "all issues" page into a list of issue data.

        Args:
            content: HTML of the collection's all-issues page.

        Returns:
            A list of issue objects built by ``self.create_xissue``.

        Raises:
            ValueError: if an entry lacks its link, volume label or date.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        for item in soup.select(".journal-all-issues .item"):
            href = item.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue")
            href = urljoin(self.collection_url, href)

            volume_tag = item.select_one(".issue")
            if not volume_tag:
                raise ValueError("Couldn't parse issue number")
            # Listing labels look like "Volume 42"; keep only the number part.
            volume_number = cleanup_str(volume_tag.text).removeprefix("Volume ")

            year_tag = item.select_one(".date")
            if not year_tag:
                raise ValueError("Couldn't parse issue year")
            year = parser.parse(year_tag.text).year

            xissues.append(
                self.create_xissue(
                    href, volume_number=volume_number, year=str(year), issue_number=None
                )
            )

        return xissues

    def parse_issue_content(self, content, xissue):
        """Collect article stubs (pid + url) from an issue's table of contents.

        Follows the "Next »" pagination link recursively, accumulating
        articles onto ``xissue.articles``.

        Args:
            content: HTML of one page of the issue's table of contents.
            xissue: the issue object to append articles to; must have a url.

        Raises:
            ValueError: if the issue has no url or an article link is missing.
        """
        if not xissue.url:
            raise ValueError("Issue must have an url")
        soup = BeautifulSoup(content, "html.parser")

        # BUGFIX: offset pids by the number of articles gathered on previous
        # pagination pages. Enumerating from 0 on every page made the
        # recursive call below reassign "a0", "a1", ... and collide with the
        # pids of the first page's articles.
        pid_offset = len(xissue.articles)
        for index_article, article_node in enumerate(soup.select(".journal-reader .part-link")):
            url = article_node.get("href")
            if not isinstance(url, str):
                raise ValueError("Couldn't find article href")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(pid_offset + index_article)
            xarticle.url = urljoin(xissue.url, url)

            xissue.articles.append(xarticle)

        has_pagination = soup.select_one("ul.pagination a:-soup-contains-own('Next »')")
        if has_pagination:
            pagination_link = has_pagination.get("href")
            if isinstance(pagination_link, str):
                page_url = urljoin(xissue.url, pagination_link)
                content = self.download_file(page_url)

                self.parse_issue_content(content, xissue)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Parse an article page with BeautifulSoup and return the ArticleData.

        Fills in language, title, abstract (when present), citation metadata
        (pdf link, authors, doi) and the bibliography.
        """
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html5lib")
        # Pull pdf url, authors and doi from the <meta name="citation_*"> tags.
        self.get_metadata_using_citation_meta(xarticle, xissue, soup, ["pdf", "author", "doi"])

        title_tag = soup.select_one("#maincontent hgroup")
        if not title_tag:
            raise ValueError("Couldn't find title tag")
        xarticle.title_tex = cleanup_str(title_tag.text)

        abstract_header = soup.select_one("h2:-soup-contains-own('Abstract')")
        if abstract_header:
            abstract_parent = abstract_header.parent
            # Strip the "Abstract" heading and any ".no-content" placeholder
            # so only the abstract text itself remains in the container.
            abstract_header.decompose()
            no_content = abstract_parent.select_one(".no-content")
            if no_content:
                no_content.decompose()

            xarticle.abstracts.append(
                create_abstract(
                    lang="en", tag="abstract", value_tex=cleanup_str(abstract_parent.text)
                )
            )

        references_list = soup.select_one("#references-list")
        if references_list:
            xarticle.bibitems = self.parse_cambridge_references(references_list)
        return xarticle

    def parse_cambridge_references(self, soup: Tag):
        """Convert the article's "#references-list" element into JATS bibitems.

        Args:
            soup: the "#references-list" tag.

        Returns:
            A list of JATS reference strings, one per citation item.
        """
        bibitems = []
        for item in soup.select(".circle-list__item"):
            citation_builder = MixedCitation()

            label_tag = item.select_one(".circle-list__item__number")
            if label_tag:
                citation_builder.label = escape(cleanup_str(label_tag.text))

            citation_content = item.select_one(".circle-list__item__grouped__content")
            if citation_content:
                self.parse_cambridge_ref_nodes(citation_content, citation_builder)

            # Group all string-name elements into one person-group element.
            # NOTE(review): any non-name element sitting between the first and
            # last string-name gets swallowed into the group as well — the
            # slice is contiguous by design; confirm this matches the markup.
            persongroup_builder = GenericRefElement()
            persongroup_builder.name = "person-group"
            name_indexes = [
                index
                for index, element in enumerate(citation_builder.elements)
                if isinstance(element, GenericRefElement) and element.name == "string-name"
            ]
            if len(name_indexes) > 0:
                first, last = name_indexes[0], name_indexes[-1]
                persongroup_builder.elements = citation_builder.elements[first : last + 1]
                del citation_builder.elements[first : last + 1]
                citation_builder.elements.insert(first, persongroup_builder)

            bibitems.append(citation_builder.get_jats_ref())
        return bibitems

    def parse_cambridge_ref_nodes(
        self,
        current_tag: Tag,
        current_builder: GenericRefElement,
    ):
        """Recursively translate a citation's HTML children into ref elements.

        Text nodes are escaped and appended verbatim; ``<a>`` tags become
        ext-links; tags whose first CSS class matches a known JATS element
        name become nested ``GenericRefElement`` children. Unknown classed
        tags are logged and their text content kept.
        """
        for element in current_tag.children:
            if isinstance(element, str):
                current_builder.elements.append(escape(element))
                continue
            if isinstance(element, Tag):
                # bs4 returns the class attribute as a list; keep the first entry.
                tag_class = element.get("class")
                if isinstance(tag_class, list):
                    tag_class = tag_class[0] if len(tag_class) > 0 else None

                # NOTE(review): class-less tags (including bare <a>) are
                # dropped here — confirm no citation text is lost by this.
                if not tag_class:
                    continue
                # MathJax wrappers and lazy-load images carry no citation text.
                if tag_class in ("mathjax-tex-wrapper", "aop-lazy-load-image"):
                    continue
                if element.name == "a":
                    href = element.get("href")
                    if isinstance(href, str):
                        current_builder.elements.append(
                            ExtLinkXml(escape(href), escape(element.text))
                        )
                    continue

                if tag_class in [
                    "surname",
                    "given-names",
                    "string-name",
                    "person-group",
                    "publisher-name",
                    "source",
                    "volume",
                    "year",
                    "fpage",
                    "lpage",
                    "article-title",
                    "issue",
                    "chapter-title",
                    "inline-formula",
                    "collab",
                    "alternatives",
                    "italic",
                    "publisher-loc",
                    "roman",
                    "edition",
                    "suffix",
                ]:
                    refnode_builder = GenericRefElement()
                    refnode_builder.name = tag_class
                    current_builder.elements.append(refnode_builder)
                    self.parse_cambridge_ref_nodes(element, refnode_builder)
                    continue

                self.logger.warning(f"Couldn't insert tag into mixed citation : {tag_class}")
                current_builder.elements.append(escape(element.text))
196 current_builder.elements.append(escape(element.text))