Coverage for src/crawler/by_source/cup_crawler.py

1import logging

2import re

3from urllib.parse import urljoin

5from bs4 import BeautifulSoup, Tag

6from ptf.cmds.xml.xml_utils import escape

7from ptf.model_data import create_abstract, create_articledata, create_contributor

9from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler

10from crawler.cmds.mixed_citation import (

11 ExtLinkXml,

12 GenericRefElement,

13 MixedCitation,

14)

15from crawler.utils import cleanup_str, regex_to_dict

17logger = logging.getLogger(__name__)

class CupCrawler(BaseCollectionCrawler):
    """Collection crawler for Cambridge University Press (https://www.cambridge.org)."""

    source_name = "Cambridge University Press"
    source_domain = "CUP"
    source_website = "https://www.cambridge.org"

    # Issue number in an issue link label, e.g. "Issue 3".
    issue_re = r"Issue (?P<issue>\S+)"
    # Fallback tried by get_issue_data when issue_re does not match the label.
    # NOTE(review): get_issue_data only checks that this pattern matches; the
    # value captured in the "issue_nb" group is never read.
    issue_error_re = r"Volume (?P<issue_nb>\d+)"
    # Volume number in a volume label, e.g. "Volume 12".
    volume_re = r"Volume (?P<volume>\d+)"
    # Fallback tried by parse_collection_content when volume_re does not match.
    archive_volume_re = r"Vol (?P<volume>\d+)"
    # NOTE(review): not referenced by any method visible in this class.
    archive_year_re = r"Archive content \n\n\n (?P<year>\S+)"

31 def parse_collection_content(self, content):

32 xissues = []

33 soup = BeautifulSoup(content, "html.parser")

35 volumes_tag = soup.select(

36 "div.journal-all-issues > ul > li > div.content > ul.accordion > li.accordion-navigation"

37 )

38 for volume_tag in volumes_tag:

39 issue_defaut_nb = "1"

40 volume = volume_tag.select_one("a")

41 if volume is None:

42 raise ValueError("Couldn't parse volume tag")

44 try:

45 volume_group = regex_to_dict(

46 self.volume_re, volume.text, error_msg="Couldn't parse volume number"

47 )

48 except ValueError:

49 try:

50 volume_group = regex_to_dict(

51 self.archive_volume_re,

52 volume.text,

53 error_msg="Couldn't parse volume number",

54 )

55 except ValueError:

56 raise ValueError(f"Couldn't parse volume number from text: {volume.text}")

58 issues_tag = volume_tag.select("div > ul > li > ul > li > a")

60 ## If no issue listed : we consider the volume has only one issue

61 if not issues_tag:

62 issue_href = volume.get("href")

63 year_span = volume.select_one("span.date")

64 if not year_span:

65 raise ValueError("Couldn't parse year for volume with no issue")

66 year = year_span.text.split(" ")[-1]

67 xissues.append(

68 self.create_xissue(

69 urljoin(self.source_website, issue_href),

70 year,

71 volume_group.get("volume"),

72 "1",

73 )

74 )

75 continue

77 # Get all the volume listed issues

78 for issue_tag in issues_tag:

79 issue_nb, issue_href, issue_year, issue_defaut_nb = self.get_issue_data(

80 issue_tag, issue_defaut_nb

81 )

82 xissues.append(

83 self.create_xissue(

84 urljoin(self.source_website, issue_href),

85 issue_year,

86 volume_group.get("volume"),

87 issue_nb,

88 )

89 )

90 return xissues

92 def get_issue_data(self, issue_tag, default_issue_nb):

93 """

94 Get issue number in classic case but also in the special case of volume 27 with no issue number (defaults to issue 1)

95 """

96 year_span = issue_tag.select_one("span.date")

97 if not year_span:

98 raise ValueError("Couldn't parse year for issue")

99 year = year_span.text.split(" ")[-1]

100

101 issue_href = issue_tag.get("href")

102 if not isinstance(issue_href, str):

103 raise ValueError("Couldn't parse issue href")

104

105 try:

106 issue = regex_to_dict(

107 self.issue_re, issue_tag.text, error_msg="Couldn't parse issue number"

108 )

109 except ValueError:

110 try:

111 issue = regex_to_dict(

112 self.issue_error_re, issue_tag.text, error_msg="Couldn't parse issue number"

113 )

114 except ValueError:

115 raise ValueError(f"Couldn't parse issue number from text: {issue_tag.text}")

116

117 issue_nb = issue.get("issue")

118 return issue_nb, issue_href, year, default_issue_nb

119

120 def parse_issue_content(self, content, xissue):

121 soup = BeautifulSoup(content, "html.parser")

122 articles = soup.select("div.representation")

123 article_number = 0

124 for article in articles:

125 xarticle = create_articledata()

126 article_href = article.select_one("a.part-link").get("href")

127 if not isinstance(article_href, str):

128 raise ValueError("Couldn't parse article href")

129 xarticle.url = urljoin(self.source_website, article_href)

130 xarticle.pid = "a" + str(article_number)

131 xissue.articles.append(xarticle)

132 article_number += 1

133

134 has_pagination = soup.select_one("ul.pagination a:-soup-contains-own('Next »')")

135 if has_pagination:

136 pagination_link = has_pagination.get("href")

137 if isinstance(pagination_link, str):

138 page_url = urljoin(xissue.url, pagination_link)

139 content = self.download_file(page_url)

140

141 self.parse_issue_content(content, xissue)

142

143 def parse_article_content(self, content, xissue, xarticle, url):

144 soup = BeautifulSoup(content, "html.parser")

145

146 self.get_metadata_using_citation_meta(

147 xarticle,

148 xissue,

149 soup,

150 [

151 "pdf",

152 "page",

153 "doi",

154 "publisher",

155 "citation_keywords",

156 "citation_reference",

157 ],

158 )

159

160 ## Title

161 title_tag = soup.select_one("hgroup > h1")

162 if title_tag is None:

163 raise ValueError(f"Couldn't parse article title for article with url: {xarticle.url}")

164 xarticle.title_tex = cleanup_str(title_tag.text)

165

166 ## Abstract

167 abstract_tag = soup.select_one("div.abstract")

168

169 if abstract_tag:

170 abstract = cleanup_str(abstract_tag.text)

171 xarticle.abstracts.append(create_abstract(value_tex=abstract, lang=xarticle.lang))

172 else:

173 logger.info(f"No abstract found for article with url: {xarticle.url}")

174

175 ## keywords

176 keywords_tag = soup.select_one("div.keywords")

177 keywords = keywords_tag.select("span") if keywords_tag else []

178 for keyword in keywords:

179 xarticle.kwds.append(

180 {"type": "", "lang": xarticle.lang, "value": cleanup_str(keyword.text)}

181 )

182

183 ## Contributors name doi email

184 self.parse_cup_contributors(soup, xarticle)

185

186 references_list = soup.select_one("#references-list")

187 if references_list:

188 xarticle.bibitems = self.parse_cambridge_references(references_list)

189 return xarticle

190

191 def parse_cup_contributors(self, soup, xarticle):

192 # Fetch ORCIDs [Name, ORCID]

193 contributors = soup.select_one("div.contributors-details")

194 if not contributors:

195 raise ValueError("Couldn't parse contributors")

196

197 orcid_by_name = {}

198 for orcid_link in contributors.find_all("a", {"data-test-orcid": True}):

199 name = orcid_link["data-test-orcid"]

200 href = orcid_link.get("href", "")

201 orcid_id = href.rstrip("/").split("/")[-1] if href else None

202 orcid_by_name[name] = orcid_id

203

204 # Fetch Emails [Name, Email]

205 email_by_name = {}

206 for corresp in contributors.find_all(class_="corresp"):

207 mailto = corresp.find("a", href=re.compile(r"^mailto:"))

208 if mailto:

209 email = mailto["href"].replace("mailto:", "")

210 # Le nom du correspondant est souvent juste avant dans le texte

211 # On cherche dans les blocs .author le lien corresp

212 email_by_name["__corresp__"] = email # sera affiné ci-dessous

213

214 # Fetch Authors

215 for author_block in contributors.find_all(attrs={"data-test-author": True}):

216 string_name = author_block["data-test-author"]

217

218 # Split name into first and last name

219 parts = string_name.strip().split()

220 if len(parts) >= 2:

221 first_name = " ".join(parts[:-1])

222 last_name = parts[-1]

223 else:

224 first_name = ""

225 last_name = string_name

226

227 # ORCID

228 orcid = orcid_by_name.get(string_name)

229

230 # Email

231 email = ""

232 mailto_tag = author_block.find("a", href=re.compile(r"^mailto:"))

233 if mailto_tag:

234 email = mailto_tag["href"].replace("mailto:", "")

235

236 xarticle.contributors.append(

237 create_contributor(

238 role="author",

239 string_name=string_name,

240 first_name=first_name,

241 last_name=last_name,

242 orcid=orcid,

243 email=email,

244 )

245 )

246 return xarticle

247

248 def parse_cambridge_references(self, soup: Tag):

249 bibitems = []

250 for item in soup.select(".circle-list__item"):

251 citation_builder = MixedCitation()

252 label_tag = item.select_one(".circle-list__item__number")

253 if label_tag:

254 citation_builder.label = escape(cleanup_str(label_tag.text))

255 citation_content = item.select_one(".circle-list__item__grouped__content")

256 if citation_content:

257 self.parse_cambridge_ref_nodes(citation_content, citation_builder)

258

259 # Group all StringNames into one PersonGroup object

260 persongroup_builder = GenericRefElement()

261 persongroup_builder.name = "person-group"

262 # Index of StringNames objects

263 i = [

264 index

265 for index, element in enumerate(citation_builder.elements)

266 if isinstance(element, GenericRefElement) and element.name == "string-name"

267 ]

268 if len(i) > 0:

269 persongroup_builder.elements = citation_builder.elements[i[0] : i[-1] + 1]

270 del citation_builder.elements[i[0] : i[-1] + 1]

271 citation_builder.elements.insert(i[0], persongroup_builder)

272

273 bibitems.append(citation_builder.get_jats_ref())

274 return bibitems

275

276 def parse_cambridge_ref_nodes(

277 self,

278 current_tag: Tag,

279 current_builder: GenericRefElement,

280 ):

281 "recursive function that parses references tags"

282 for element in current_tag.children:

283 if isinstance(element, str):

284 current_builder.elements.append(escape(element))

285 continue

286 if isinstance(element, Tag):

287 tag_class = element.get("class")

288 if isinstance(tag_class, list):

289 if len(tag_class) > 0:

290 tag_class = tag_class[0]

291 else:

292 tag_class = None

293

294 if not tag_class:

295 continue

296 if tag_class in ("mathjax-tex-wrapper", "aop-lazy-load-image"):

297 continue

298 if element.name == "a":

299 href = element.get("href")

300 if isinstance(href, str):

301 current_builder.elements.append(" ")

302 current_builder.elements.append(

303 ExtLinkXml(escape(href), escape(element.text))

304 )

305 continue

306

307 if tag_class in [

308 "surname",

309 "given-names",

310 "string-name",

311 "person-group",

312 "publisher-name",

313 "source",

314 "volume",

315 "year",

316 "fpage",

317 "lpage",

318 "article-title",

319 "issue",

320 "chapter-title",

321 "inline-formula",

322 "collab",

323 "alternatives",

324 "italic",

325 "publisher-loc",

326 "roman",

327 "edition",

328 "suffix",

329 ]:

330 refnode_builder = GenericRefElement()

331 refnode_builder.name = tag_class

332 current_builder.elements.append(refnode_builder)

333 self.parse_cambridge_ref_nodes(element, refnode_builder)

334 continue

335

336 self.logger.warning(f"Couldn't insert tag into mixed citation : {tag_class}")

337 current_builder.elements.append(escape(element.text))

Coverage for src / crawler / by_source / cup_crawler.py: 10%

185 statements