Coverage for src / crawler / by_source / cup_crawler.py: 10%

186 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-05-21 12:58 +0000

1import logging 

2import re 

3from urllib.parse import urljoin 

4 

5from bs4 import BeautifulSoup, Tag 

6from ptf.cmds.xml.xml_utils import escape 

7from ptf.model_data import create_abstract, create_articledata, create_contributor 

8 

9from crawler.abstract_crawlers.matching_crawler import MatchingCrawler 

10from crawler.cmds.mixed_citation import ( 

11 ExtLinkXml, 

12 GenericRefElement, 

13 MixedCitation, 

14) 

15from crawler.utils import cleanup_str, regex_to_dict 

16 

17logger = logging.getLogger(__name__) 

18 

19 

20class CupCrawler(MatchingCrawler): 

21 source_name = "Cambridge University Press" 

22 source_domain = "CUP" 

23 source_website = "https://www.cambridge.org/core/" 

24 

25 issue_re = r"Issue (?P<issue>\S+)" 

26 issue_error_re = r"Volume (?P<issue_nb>\d+)" 

27 volume_re = r"Volume (?P<volume>\d+)" 

28 archive_volume_re = r"Vol (?P<volume>\d+)" 

29 archive_year_re = r"Archive content \n\n\n (?P<year>\S+)" 

30 

31 pid_year_restrictions = { 

32 "GLMJ": 6, 

33 "CJM": 6, 

34 "CMB": 6, 

35 } 

36 

37 def parse_collection_content(self, content): 

38 xissues = [] 

39 soup = BeautifulSoup(content, "html.parser") 

40 

41 volumes_tag = soup.select( 

42 "div.journal-all-issues > ul > li > div.content > ul.accordion > li.accordion-navigation" 

43 ) 

44 for volume_tag in volumes_tag: 

45 issue_defaut_nb = "1" 

46 volume = volume_tag.select_one("a") 

47 if volume is None: 

48 raise ValueError("Couldn't parse volume tag") 

49 

50 try: 

51 volume_group = regex_to_dict( 

52 self.volume_re, volume.text, error_msg="Couldn't parse volume number" 

53 ) 

54 except ValueError: 

55 try: 

56 volume_group = regex_to_dict( 

57 self.archive_volume_re, 

58 volume.text, 

59 error_msg="Couldn't parse volume number", 

60 ) 

61 except ValueError: 

62 raise ValueError(f"Couldn't parse volume number from text: {volume.text}") 

63 

64 issues_tag = volume_tag.select("div > ul > li > ul > li > a") 

65 

66 ## If no issue listed : we consider the volume has only one issue 

67 if not issues_tag: 

68 issue_href = volume.get("href") 

69 year_span = volume.select_one("span.date") 

70 if not year_span: 

71 raise ValueError("Couldn't parse year for volume with no issue") 

72 year = year_span.text.split(" ")[-1] 

73 xissues.append( 

74 self.create_xissue( 

75 urljoin(self.source_website, issue_href), 

76 year, 

77 volume_group.get("volume"), 

78 "1", 

79 ) 

80 ) 

81 continue 

82 

83 # Get all the volume listed issues 

84 for issue_tag in issues_tag: 

85 issue_nb, issue_href, issue_year, issue_defaut_nb = self.get_issue_data( 

86 issue_tag, issue_defaut_nb 

87 ) 

88 # # Cambridge has declared articles younger than 5 not as open access 

89 # if issue_year < current_year: 

90 xissues.append( 

91 self.create_xissue( 

92 urljoin(self.source_website, issue_href), 

93 issue_year, 

94 volume_group.get("volume"), 

95 issue_nb, 

96 ) 

97 ) 

98 return xissues 

99 

100 def get_issue_data(self, issue_tag, default_issue_nb): 

101 """ 

102 Get issue number in classic case but also in the special case of volume 27 with no issue number (defaults to issue 1) 

103 """ 

104 year_span = issue_tag.select_one("span.date") 

105 if not year_span: 

106 raise ValueError("Couldn't parse year for issue") 

107 year = year_span.text.split(" ")[-1] 

108 

109 issue_href = issue_tag.get("href") 

110 if not isinstance(issue_href, str): 

111 raise ValueError("Couldn't parse issue href") 

112 

113 try: 

114 issue = regex_to_dict( 

115 self.issue_re, issue_tag.text, error_msg="Couldn't parse issue number" 

116 ) 

117 except ValueError: 

118 try: 

119 issue = regex_to_dict( 

120 self.issue_error_re, issue_tag.text, error_msg="Couldn't parse issue number" 

121 ) 

122 except ValueError: 

123 raise ValueError(f"Couldn't parse issue number from text: {issue_tag.text}") 

124 

125 issue_nb = issue.get("issue") 

126 return issue_nb, issue_href, year, default_issue_nb 

127 

128 def parse_issue_content(self, content, xissue): 

129 soup = BeautifulSoup(content, "html.parser") 

130 articles = soup.select("div.representation") 

131 article_number = 0 

132 for article in articles: 

133 xarticle = create_articledata() 

134 article_href = article.select_one("a.part-link").get("href") 

135 if not isinstance(article_href, str): 

136 raise ValueError("Couldn't parse article href") 

137 xarticle.url = urljoin(self.source_website, article_href) 

138 xarticle.pid = "a" + str(article_number) 

139 xissue.articles.append(xarticle) 

140 article_number += 1 

141 

142 has_pagination = soup.select_one("ul.pagination a:-soup-contains-own('Next »')") 

143 if has_pagination: 

144 pagination_link = has_pagination.get("href") 

145 if isinstance(pagination_link, str): 

146 page_url = urljoin(xissue.url, pagination_link) 

147 content = self.download_file(page_url) 

148 

149 self.parse_issue_content(content, xissue) 

150 

151 def parse_article_content(self, content, xissue, xarticle, url): 

152 soup = BeautifulSoup(content, "html.parser") 

153 

154 self.get_metadata_using_citation_meta( 

155 xarticle, 

156 xissue, 

157 soup, 

158 [ 

159 "pdf", 

160 "page", 

161 "doi", 

162 "publisher", 

163 "citation_keywords", 

164 "citation_reference", 

165 ], 

166 ) 

167 

168 ## Title 

169 title_tag = soup.select_one("hgroup > h1") 

170 if title_tag is None: 

171 raise ValueError(f"Couldn't parse article title for article with url: {xarticle.url}") 

172 xarticle.title_tex = cleanup_str(title_tag.text) 

173 

174 ## Abstract 

175 abstract_tag = soup.select_one("div.abstract") 

176 

177 if abstract_tag: 

178 abstract = cleanup_str(abstract_tag.text) 

179 xarticle.abstracts.append(create_abstract(value_tex=abstract, lang=xarticle.lang)) 

180 else: 

181 logger.info(f"No abstract found for article with url: {xarticle.url}") 

182 

183 ## keywords 

184 keywords_tag = soup.select_one("div.keywords") 

185 keywords = keywords_tag.select("span") if keywords_tag else [] 

186 for keyword in keywords: 

187 xarticle.kwds.append( 

188 {"type": "", "lang": xarticle.lang, "value": cleanup_str(keyword.text)} 

189 ) 

190 

191 ## Contributors name doi email 

192 self.parse_cup_contributors(soup, xarticle) 

193 

194 references_list = soup.select_one("#references-list") 

195 if references_list: 

196 xarticle.bibitems = self.parse_cambridge_references(references_list) 

197 return xarticle 

198 

199 def parse_cup_contributors(self, soup, xarticle): 

200 # Fetch ORCIDs [Name, ORCID] 

201 contributors = soup.select_one("div.contributors-details") 

202 if not contributors: 

203 raise ValueError("Couldn't parse contributors") 

204 

205 orcid_by_name = {} 

206 for orcid_link in contributors.find_all("a", {"data-test-orcid": True}): 

207 name = orcid_link["data-test-orcid"] 

208 href = orcid_link.get("href", "") 

209 orcid_id = href.rstrip("/").split("/")[-1] if href else None 

210 orcid_by_name[name] = orcid_id 

211 

212 # Fetch Emails [Name, Email] 

213 email_by_name = {} 

214 for corresp in contributors.find_all(class_="corresp"): 

215 mailto = corresp.find("a", href=re.compile(r"^mailto:")) 

216 if mailto: 

217 email = mailto["href"].replace("mailto:", "") 

218 # Le nom du correspondant est souvent juste avant dans le texte 

219 # On cherche dans les blocs .author le lien corresp 

220 email_by_name["__corresp__"] = email # sera affiné ci-dessous 

221 

222 # Fetch Authors 

223 for author_block in contributors.find_all(attrs={"data-test-author": True}): 

224 string_name = author_block["data-test-author"] 

225 

226 # Split name into first and last name 

227 parts = string_name.strip().split() 

228 if len(parts) >= 2: 

229 first_name = " ".join(parts[:-1]) 

230 last_name = parts[-1] 

231 else: 

232 first_name = "" 

233 last_name = string_name 

234 

235 # ORCID 

236 orcid = orcid_by_name.get(string_name) 

237 

238 # Email 

239 email = "" 

240 mailto_tag = author_block.find("a", href=re.compile(r"^mailto:")) 

241 if mailto_tag: 

242 email = mailto_tag["href"].replace("mailto:", "") 

243 

244 xarticle.contributors.append( 

245 create_contributor( 

246 role="author", 

247 string_name=string_name, 

248 first_name=first_name, 

249 last_name=last_name, 

250 orcid=orcid, 

251 email=email, 

252 ) 

253 ) 

254 return xarticle 

255 

256 def parse_cambridge_references(self, soup: Tag): 

257 bibitems = [] 

258 for item in soup.select(".circle-list__item"): 

259 citation_builder = MixedCitation() 

260 label_tag = item.select_one(".circle-list__item__number") 

261 if label_tag: 

262 citation_builder.label = escape(cleanup_str(label_tag.text)) 

263 citation_content = item.select_one(".circle-list__item__grouped__content") 

264 if citation_content: 

265 self.parse_cambridge_ref_nodes(citation_content, citation_builder) 

266 

267 # Group all StringNames into one PersonGroup object 

268 persongroup_builder = GenericRefElement() 

269 persongroup_builder.name = "person-group" 

270 # Index of StringNames objects 

271 i = [ 

272 index 

273 for index, element in enumerate(citation_builder.elements) 

274 if isinstance(element, GenericRefElement) and element.name == "string-name" 

275 ] 

276 if len(i) > 0: 

277 persongroup_builder.elements = citation_builder.elements[i[0] : i[-1] + 1] 

278 del citation_builder.elements[i[0] : i[-1] + 1] 

279 citation_builder.elements.insert(i[0], persongroup_builder) 

280 

281 bibitems.append(citation_builder.get_jats_ref()) 

282 return bibitems 

283 

284 def parse_cambridge_ref_nodes( 

285 self, 

286 current_tag: Tag, 

287 current_builder: GenericRefElement, 

288 ): 

289 "recursive function that parses references tags" 

290 for element in current_tag.children: 

291 if isinstance(element, str): 

292 current_builder.elements.append(escape(element)) 

293 continue 

294 if isinstance(element, Tag): 

295 tag_class = element.get("class") 

296 if isinstance(tag_class, list): 

297 if len(tag_class) > 0: 

298 tag_class = tag_class[0] 

299 else: 

300 tag_class = None 

301 

302 if not tag_class: 

303 continue 

304 if tag_class in ("mathjax-tex-wrapper", "aop-lazy-load-image"): 

305 continue 

306 if element.name == "a": 

307 href = element.get("href") 

308 if isinstance(href, str): 

309 current_builder.elements.append(" ") 

310 current_builder.elements.append( 

311 ExtLinkXml(escape(href), escape(element.text)) 

312 ) 

313 continue 

314 

315 if tag_class in [ 

316 "surname", 

317 "given-names", 

318 "string-name", 

319 "person-group", 

320 "publisher-name", 

321 "source", 

322 "volume", 

323 "year", 

324 "fpage", 

325 "lpage", 

326 "article-title", 

327 "issue", 

328 "chapter-title", 

329 "inline-formula", 

330 "collab", 

331 "alternatives", 

332 "italic", 

333 "publisher-loc", 

334 "roman", 

335 "edition", 

336 "suffix", 

337 ]: 

338 refnode_builder = GenericRefElement() 

339 refnode_builder.name = tag_class 

340 current_builder.elements.append(refnode_builder) 

341 self.parse_cambridge_ref_nodes(element, refnode_builder) 

342 continue 

343 

344 self.logger.warning(f"Couldn't insert tag into mixed citation : {tag_class}") 

345 current_builder.elements.append(escape(element.text))