Coverage for src/crawler/by_source/cup_crawler.py: 10%
185 statements
coverage.py v7.13.1, created at 2026-04-30 12:41 +0000

import logging
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import create_abstract, create_articledata, create_contributor

from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler
from crawler.cmds.mixed_citation import (
    ExtLinkXml,
    GenericRefElement,
    MixedCitation,
)
from crawler.utils import cleanup_str, regex_to_dict

logger = logging.getLogger(__name__)



class CupCrawler(BaseCollectionCrawler):
    source_name = "Cambridge University Press"
    source_domain = "CUP"
    source_website = "https://www.cambridge.org"

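    # Regexes for the volume/issue labels found in CUP's issue listings,
    # e.g. "Issue 2", "Volume 27" and the archive form "Vol 5".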

    issue_re = r"Issue (?P<issue>\S+)"
    issue_error_re = r"Volume (?P<issue_nb>\d+)"
    volume_re = r"Volume (?P<volume>\d+)"
    archive_volume_re = r"Vol (?P<volume>\d+)"
    archive_year_re = r"Archive content \n\n\n (?P<year>\S+)"


    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")

        volumes_tag = soup.select(
            "div.journal-all-issues > ul > li > div.content > ul.accordion > li.accordion-navigation"
        )
        for volume_tag in volumes_tag:
            issue_default_nb = "1"
            volume = volume_tag.select_one("a")
            if volume is None:
                raise ValueError("Couldn't parse volume tag")

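            # Current volumes are labelled "Volume N"; archive volumes use "Vol N",
            # so fall back to the archive pattern when the first one doesn't match.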

            try:
                volume_group = regex_to_dict(
                    self.volume_re, volume.text, error_msg="Couldn't parse volume number"
                )
            except ValueError:
                try:
                    volume_group = regex_to_dict(
                        self.archive_volume_re,
                        volume.text,
                        error_msg="Couldn't parse volume number",
                    )
                except ValueError:
                    raise ValueError(f"Couldn't parse volume number from text: {volume.text}")

            issues_tag = volume_tag.select("div > ul > li > ul > li > a")

            ## If no issues are listed, we assume the volume has a single issue
            if not issues_tag:
                issue_href = volume.get("href")
                year_span = volume.select_one("span.date")
                if not year_span:
                    raise ValueError("Couldn't parse year for volume with no issue")
                year = year_span.text.split(" ")[-1]
                xissues.append(
                    self.create_xissue(
                        urljoin(self.source_website, issue_href),
                        year,
                        volume_group.get("volume"),
                        "1",
                    )
                )
                continue

            # Get all the issues listed for the volume
            for issue_tag in issues_tag:
                issue_nb, issue_href, issue_year, issue_default_nb = self.get_issue_data(
                    issue_tag, issue_default_nb
                )
                xissues.append(
                    self.create_xissue(
                        urljoin(self.source_website, issue_href),
                        issue_year,
                        volume_group.get("volume"),
                        issue_nb,
                    )
                )
        return xissues


    def get_issue_data(self, issue_tag, default_issue_nb):
        """
        Get the issue number in the usual case, but also in the special case of
        volume 27, which has no issue number (defaults to issue 1).
        """
        year_span = issue_tag.select_one("span.date")
        if not year_span:
            raise ValueError("Couldn't parse year for issue")
        year = year_span.text.split(" ")[-1]


        issue_href = issue_tag.get("href")
        if not isinstance(issue_href, str):
            raise ValueError("Couldn't parse issue href")

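        # "Issue N" is the usual label; some entries only carry "Volume N"
        # (issue_error_re), in which case no explicit issue number is available.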

        try:
            issue = regex_to_dict(
                self.issue_re, issue_tag.text, error_msg="Couldn't parse issue number"
            )
        except ValueError:
            try:
                issue = regex_to_dict(
                    self.issue_error_re, issue_tag.text, error_msg="Couldn't parse issue number"
                )
            except ValueError:
                raise ValueError(f"Couldn't parse issue number from text: {issue_tag.text}")

        # Fall back to the default issue number when the text carried no explicit one
        issue_nb = issue.get("issue") or default_issue_nb
        return issue_nb, issue_href, year, default_issue_nb


    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("div.representation")
        article_number = 0
        for article in articles:
            xarticle = create_articledata()
            link_tag = article.select_one("a.part-link")
            article_href = link_tag.get("href") if link_tag else None
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article href")
            xarticle.url = urljoin(self.source_website, article_href)
            xarticle.pid = "a" + str(article_number)
            xissue.articles.append(xarticle)
            article_number += 1

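        # Issue listings can be paginated: follow the "Next »" link, if present,
        # and parse the next page recursively.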

        has_pagination = soup.select_one("ul.pagination a:-soup-contains-own('Next »')")
        if has_pagination:
            pagination_link = has_pagination.get("href")
            if isinstance(pagination_link, str):
                page_url = urljoin(xissue.url, pagination_link)
                content = self.download_file(page_url)
                self.parse_issue_content(content, xissue)


    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")

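        # Presumably fills the listed fields (PDF link, pages, DOI, publisher,
        # keywords, references) from the page's citation_* <meta> tags.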

        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "pdf",
                "page",
                "doi",
                "publisher",
                "citation_keywords",
                "citation_reference",
            ],
        )

        ## Title
        title_tag = soup.select_one("hgroup > h1")
        if title_tag is None:
            raise ValueError(f"Couldn't parse article title for article with url: {xarticle.url}")
        xarticle.title_tex = cleanup_str(title_tag.text)

        ## Abstract
        abstract_tag = soup.select_one("div.abstract")

        if abstract_tag:
            abstract = cleanup_str(abstract_tag.text)
            xarticle.abstracts.append(create_abstract(value_tex=abstract, lang=xarticle.lang))
        else:
            logger.info(f"No abstract found for article with url: {xarticle.url}")

        ## Keywords
        keywords_tag = soup.select_one("div.keywords")
        keywords = keywords_tag.select("span") if keywords_tag else []
        for keyword in keywords:
            xarticle.kwds.append(
                {"type": "", "lang": xarticle.lang, "value": cleanup_str(keyword.text)}
            )

        ## Contributors (name, ORCID, email)
        self.parse_cup_contributors(soup, xarticle)

        references_list = soup.select_one("#references-list")
        if references_list:
            xarticle.bibitems = self.parse_cambridge_references(references_list)
        return xarticle


    def parse_cup_contributors(self, soup, xarticle):
        # Fetch ORCIDs [Name, ORCID]
        contributors = soup.select_one("div.contributors-details")
        if not contributors:
            raise ValueError("Couldn't parse contributors")

        orcid_by_name = {}
        for orcid_link in contributors.find_all("a", {"data-test-orcid": True}):
            name = orcid_link["data-test-orcid"]
            href = orcid_link.get("href", "")
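            # The ORCID identifier is the last path segment of the orcid.org URL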

            orcid_id = href.rstrip("/").split("/")[-1] if href else None
            orcid_by_name[name] = orcid_id

        # Fetch emails [Name, Email]
        email_by_name = {}
        for corresp in contributors.find_all(class_="corresp"):
            mailto = corresp.find("a", href=re.compile(r"^mailto:"))
            if mailto:
                email = mailto["href"].replace("mailto:", "")
                # The corresponding author's name usually appears just before this
                # link in the text; we look for the corresp link inside the
                # .author blocks.
                email_by_name["__corresp__"] = email  # refined below


        # Fetch authors
        for author_block in contributors.find_all(attrs={"data-test-author": True}):
            string_name = author_block["data-test-author"]

            # Split the name into first and last name
            parts = string_name.strip().split()
            if len(parts) >= 2:
                first_name = " ".join(parts[:-1])
                last_name = parts[-1]
            else:
                first_name = ""
                last_name = string_name

            # ORCID
            orcid = orcid_by_name.get(string_name)

            # Email
            email = ""
            mailto_tag = author_block.find("a", href=re.compile(r"^mailto:"))
            if mailto_tag:
                email = mailto_tag["href"].replace("mailto:", "")

            xarticle.contributors.append(
                create_contributor(
                    role="author",
                    string_name=string_name,
                    first_name=first_name,
                    last_name=last_name,
                    orcid=orcid,
                    email=email,
                )
            )
        return xarticle


    def parse_cambridge_references(self, soup: Tag):
        bibitems = []
        for item in soup.select(".circle-list__item"):
            citation_builder = MixedCitation()
            label_tag = item.select_one(".circle-list__item__number")
            if label_tag:
                citation_builder.label = escape(cleanup_str(label_tag.text))
            citation_content = item.select_one(".circle-list__item__grouped__content")
            if citation_content:
                self.parse_cambridge_ref_nodes(citation_content, citation_builder)

            # Group all string-name elements into one person-group element
            persongroup_builder = GenericRefElement()
            persongroup_builder.name = "person-group"
            # Indices of the string-name elements
            i = [
                index
                for index, element in enumerate(citation_builder.elements)
                if isinstance(element, GenericRefElement) and element.name == "string-name"
            ]
            if len(i) > 0:
                persongroup_builder.elements = citation_builder.elements[i[0] : i[-1] + 1]
                del citation_builder.elements[i[0] : i[-1] + 1]
                citation_builder.elements.insert(i[0], persongroup_builder)

            bibitems.append(citation_builder.get_jats_ref())
        return bibitems


    def parse_cambridge_ref_nodes(
        self,
        current_tag: Tag,
        current_builder: GenericRefElement,
    ):
        """Recursively parse reference tags into the citation builder."""
        for element in current_tag.children:
            if isinstance(element, str):
                current_builder.elements.append(escape(element))
                continue
            if isinstance(element, Tag):
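                # bs4 returns a multi-valued class attribute as a list: keep the first class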

                tag_class = element.get("class")
                if isinstance(tag_class, list):
                    if len(tag_class) > 0:
                        tag_class = tag_class[0]
                    else:
                        tag_class = None

                if not tag_class:
                    continue
                if tag_class in ("mathjax-tex-wrapper", "aop-lazy-load-image"):
                    continue
                if element.name == "a":
                    href = element.get("href")
                    if isinstance(href, str):
                        current_builder.elements.append(" ")
                        current_builder.elements.append(
                            ExtLinkXml(escape(href), escape(element.text))
                        )
                    continue

                if tag_class in [
                    "surname",
                    "given-names",
                    "string-name",
                    "person-group",
                    "publisher-name",
                    "source",
                    "volume",
                    "year",
                    "fpage",
                    "lpage",
                    "article-title",
                    "issue",
                    "chapter-title",
                    "inline-formula",
                    "collab",
                    "alternatives",
                    "italic",
                    "publisher-loc",
                    "roman",
                    "edition",
                    "suffix",
                ]:
                    refnode_builder = GenericRefElement()
                    refnode_builder.name = tag_class
                    current_builder.elements.append(refnode_builder)
                    self.parse_cambridge_ref_nodes(element, refnode_builder)
                    continue


                logger.warning(f"Couldn't insert tag into mixed citation: {tag_class}")
                current_builder.elements.append(escape(element.text))
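

# Usage sketch (hypothetical): the constructor arguments and the fixture path
# below are illustrative only, not part of this module.
#
#     crawler = CupCrawler(...)
#     with open("tests/fixtures/cup_all_issues.html") as f:
#         xissues = crawler.parse_collection_content(f.read())
#     print(f"{len(xissues)} issues found")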