Coverage for src/crawler/by_source/cambridge_crawler.py: 11%

117 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-08-29 13:43 +0000

1from urllib.parse import urljoin 

2 

3from bs4 import BeautifulSoup, Tag 

4from dateutil import parser 

5from ptf.cmds.xml.xml_utils import escape 

6from ptf.model_data import create_abstract, create_articledata 

7 

8from crawler.base_crawler import BaseCollectionCrawler 

9from crawler.cmds.mixed_citation import ( 

10 ExtLinkXml, 

11 GenericRefElement, 

12 MixedCitation, 

13) 

14from crawler.utils import cleanup_str 

15 

16 

class CambridgeCrawler(BaseCollectionCrawler):
    """Crawler for journal collections hosted on the Cambridge University Press site."""

    source_name = "Cambridge University Press"
    source_domain = "CAMBRIDGE"
    source_website = "https://www.cambridge.org/"

    # Delimiter wrapped around display (block) formulas.
    delimiter_disp_formula = "$$"

24 def parse_collection_content(self, content): 

25 xissues = [] 

26 soup = BeautifulSoup(content, "html.parser") 

27 items = soup.select(".journal-all-issues .item") 

28 for item in items: 

29 href = item.get("href") 

30 if not isinstance(href, str): 

31 raise ValueError("Couldn't parse issue") 

32 href = urljoin(self.collection_url, href) 

33 

34 volume_tag = item.select_one(".issue") 

35 if not volume_tag: 

36 raise ValueError("Couldn't parse issue number") 

37 volume_number = cleanup_str(volume_tag.text).removeprefix("Volume ") 

38 

39 year_tag = item.select_one(".date") 

40 if not year_tag: 

41 raise ValueError("Couldn't parse issue year") 

42 year = parser.parse(year_tag.text).year 

43 

44 xissue = self.create_xissue( 

45 href, volume_number=volume_number, year=str(year), issue_number=None 

46 ) 

47 xissues.append(xissue) 

48 

49 return xissues 

50 

51 def parse_issue_content(self, content, xissue): 

52 if not xissue.url: 

53 raise ValueError("Issue must have an url") 

54 soup = BeautifulSoup(content, "html.parser") 

55 article_tag = soup.select(".journal-reader .part-link") 

56 

57 for index_article, article_node in enumerate(article_tag): 

58 url = article_node.get("href") 

59 if not isinstance(url, str): 

60 raise ValueError("Couldn't find article href") 

61 xarticle = create_articledata() 

62 xarticle.pid = "a" + str(index_article) 

63 xarticle.url = urljoin(xissue.url, url) 

64 

65 xissue.articles.append(xarticle) 

66 

67 has_pagination = soup.select_one("ul.pagination a:-soup-contains-own('Next »')") 

68 if has_pagination: 

69 pagination_link = has_pagination.get("href") 

70 if isinstance(pagination_link, str): 

71 page_url = urljoin(xissue.url, pagination_link) 

72 content = self.download_file(page_url) 

73 

74 self.parse_issue_content(content, xissue) 

75 

76 def parse_article_content(self, content, xissue, xarticle, url): 

77 """ 

78 Parse the content with Beautifulsoup and returns an ArticleData 

79 """ 

80 

81 xarticle.lang = "en" 

82 

83 soup = BeautifulSoup(content, "html.parser") 

84 self.get_metadata_using_citation_meta(xarticle, xissue, soup, ["pdf", "author", "doi"]) 

85 

86 xarticle.title_tex = cleanup_str(soup.select_one("#maincontent").select_one("hgroup").text) 

87 

88 abstract_header = soup.select_one("h2:-soup-contains-own('Abstract')") 

89 if abstract_header: 

90 abstract_parent = abstract_header.parent 

91 abstract_header.decompose() 

92 

93 no_content = abstract_parent.select_one(".no-content") 

94 if no_content: 

95 no_content.decompose() 

96 

97 xarticle.abstracts.append( 

98 create_abstract( 

99 lang="en", tag="abstract", value_tex=cleanup_str(abstract_parent.text) 

100 ) 

101 ) 

102 references_list = soup.select_one("#references-list") 

103 if references_list: 

104 xarticle.bibitems = self.parse_cambridge_references(references_list) 

105 return xarticle 

106 

107 def parse_cambridge_references(self, soup: Tag): 

108 bibitems = [] 

109 for item in soup.select(".circle-list__item"): 

110 citation_builder = MixedCitation() 

111 label_tag = item.select_one(".circle-list__item__number") 

112 if label_tag: 

113 citation_builder.label = escape(cleanup_str(label_tag.text)) 

114 citation_content = item.select_one(".circle-list__item__grouped__content") 

115 if citation_content: 

116 self.parse_cambridge_ref_nodes(citation_content, citation_builder) 

117 

118 # Group all StringNames into one PersonGroup object 

119 persongroup_builder = GenericRefElement() 

120 persongroup_builder.name = "person-group" 

121 # Index of StringNames objects 

122 i = [ 

123 index 

124 for index, element in enumerate(citation_builder.elements) 

125 if isinstance(element, GenericRefElement) and element.name == "string-name" 

126 ] 

127 if len(i) > 0: 

128 persongroup_builder.elements = citation_builder.elements[i[0] : i[-1] + 1] 

129 del citation_builder.elements[i[0] : i[-1] + 1] 

130 citation_builder.elements.insert(i[0], persongroup_builder) 

131 

132 bibitems.append(citation_builder.get_jats_ref()) 

133 return bibitems 

134 

135 def parse_cambridge_ref_nodes( 

136 self, 

137 current_tag: Tag, 

138 current_builder: GenericRefElement, 

139 ): 

140 for element in current_tag.children: 

141 if isinstance(element, str): 

142 current_builder.elements.append(escape(element)) 

143 continue 

144 if isinstance(element, Tag): 

145 tag_class = element.get("class") 

146 if isinstance(tag_class, list): 

147 if len(tag_class) > 0: 

148 tag_class = tag_class[0] 

149 else: 

150 tag_class = None 

151 

152 if not tag_class: 

153 continue 

154 if tag_class in ("mathjax-tex-wrapper", "aop-lazy-load-image"): 

155 continue 

156 if element.name == "a": 

157 href = element.get("href") 

158 if isinstance(href, str): 

159 current_builder.elements.append( 

160 ExtLinkXml(escape(href), escape(element.text)) 

161 ) 

162 continue 

163 

164 if tag_class in [ 

165 "surname", 

166 "given-names", 

167 "string-name", 

168 "person-group", 

169 "publisher-name", 

170 "source", 

171 "volume", 

172 "year", 

173 "fpage", 

174 "lpage", 

175 "article-title", 

176 "issue", 

177 "chapter-title", 

178 "inline-formula", 

179 "collab", 

180 "alternatives", 

181 "italic", 

182 "publisher-loc", 

183 "roman", 

184 "edition", 

185 "suffix", 

186 ]: 

187 refnode_builder = GenericRefElement() 

188 refnode_builder.name = tag_class 

189 current_builder.elements.append(refnode_builder) 

190 self.parse_cambridge_ref_nodes(element, refnode_builder) 

191 continue 

192 

193 self.logger.warning(f"Couldn't insert tag into mixed citation : {tag_class}") 

194 current_builder.elements.append(escape(element.text))