Coverage for src / crawler / by_source / cup_crawler.py: 10%
186 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-05-21 12:58 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-05-21 12:58 +0000
1import logging
2import re
3from urllib.parse import urljoin
5from bs4 import BeautifulSoup, Tag
6from ptf.cmds.xml.xml_utils import escape
7from ptf.model_data import create_abstract, create_articledata, create_contributor
9from crawler.abstract_crawlers.matching_crawler import MatchingCrawler
10from crawler.cmds.mixed_citation import (
11 ExtLinkXml,
12 GenericRefElement,
13 MixedCitation,
14)
15from crawler.utils import cleanup_str, regex_to_dict
17logger = logging.getLogger(__name__)
class CupCrawler(MatchingCrawler):
    """Crawler for journals published on Cambridge University Press (Cambridge Core).

    The class parses three kinds of pages:

    * the "all issues" page of a collection (``parse_collection_content``),
    * the table of contents of one issue (``parse_issue_content``),
    * the landing page of one article (``parse_article_content``).
    """

    source_name = "Cambridge University Press"
    source_domain = "CUP"
    source_website = "https://www.cambridge.org/core/"

    # Patterns applied to the link texts found on the "all issues" page.
    issue_re = r"Issue (?P<issue>\S+)"
    # Fallback for issue links that are labelled with a volume only.
    # It defines no "issue" group, so the issue number then falls back to the default.
    issue_error_re = r"Volume (?P<issue_nb>\d+)"
    volume_re = r"Volume (?P<volume>\d+)"
    archive_volume_re = r"Vol (?P<volume>\d+)"
    archive_year_re = r"Archive content \n\n\n (?P<year>\S+)"

    # NOTE(review): presumably a per-collection restriction (in years) consumed by
    # the base class; it is not read anywhere in this class - confirm.
    pid_year_restrictions = {
        "GLMJ": 6,
        "CJM": 6,
        "CMB": 6,
    }

    # CSS classes of reference nodes that are converted into an element of the same name.
    _CUP_REF_ELEMENT_CLASSES = frozenset(
        {
            "surname",
            "given-names",
            "string-name",
            "person-group",
            "publisher-name",
            "source",
            "volume",
            "year",
            "fpage",
            "lpage",
            "article-title",
            "issue",
            "chapter-title",
            "inline-formula",
            "collab",
            "alternatives",
            "italic",
            "publisher-loc",
            "roman",
            "edition",
            "suffix",
        }
    )
    # CSS classes of reference nodes that are dropped without any output.
    _CUP_IGNORED_REF_CLASSES = frozenset({"mathjax-tex-wrapper", "aop-lazy-load-image"})

    def _cup_regex_to_dict(self, patterns, text, error_msg):
        """Return the result of ``regex_to_dict`` for the first pattern that matches.

        Args:
            patterns: regular expressions to try, in order.
            text: the text to parse.
            error_msg: message prefix used for the errors.

        Raises:
            ValueError: if none of the patterns could be applied to ``text``.
        """
        for pattern in patterns:
            try:
                return regex_to_dict(pattern, text, error_msg=error_msg)
            except ValueError:
                continue
        raise ValueError(f"{error_msg} from text: {text}")

    def parse_collection_content(self, content):
        """Parse the "all issues" page of a collection.

        Args:
            content: HTML of the page listing every volume of the collection.

        Returns:
            The list of issues built with ``create_xissue``, one per listed issue.
            A volume that lists no issue yields a single issue numbered "1".

        Raises:
            ValueError: if a volume link, its number, its href or a year is missing.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")

        volume_tags = soup.select(
            "div.journal-all-issues > ul > li > div.content > ul.accordion > li.accordion-navigation"
        )
        for volume_tag in volume_tags:
            default_issue_nb = "1"
            volume = volume_tag.select_one("a")
            if volume is None:
                raise ValueError("Couldn't parse volume tag")

            # Recent volumes are labelled "Volume N", archived ones "Vol N".
            volume_group = self._cup_regex_to_dict(
                (self.volume_re, self.archive_volume_re),
                volume.text,
                "Couldn't parse volume number",
            )
            volume_nb = volume_group.get("volume")

            issue_tags = volume_tag.select("div > ul > li > ul > li > a")

            # If no issue is listed, we consider that the volume has only one issue.
            if not issue_tags:
                issue_href = volume.get("href")
                # Without this check urljoin() would silently return the site root.
                if not isinstance(issue_href, str):
                    raise ValueError("Couldn't parse volume href")
                year_span = volume.select_one("span.date")
                if year_span is None:
                    raise ValueError("Couldn't parse year for volume with no issue")
                # The year is the last space-separated token of the date text.
                year = year_span.text.split(" ")[-1]
                xissues.append(
                    self.create_xissue(
                        urljoin(self.source_website, issue_href),
                        year,
                        volume_nb,
                        default_issue_nb,
                    )
                )
                continue

            # Register every issue listed for the volume.
            # No publication-year filter is applied here.
            for issue_tag in issue_tags:
                issue_nb, issue_href, issue_year, default_issue_nb = self.get_issue_data(
                    issue_tag, default_issue_nb
                )
                xissues.append(
                    self.create_xissue(
                        urljoin(self.source_website, issue_href),
                        issue_year,
                        volume_nb,
                        issue_nb,
                    )
                )
        return xissues

    def get_issue_data(self, issue_tag, default_issue_nb):
        """Extract the number, href and year of an issue link.

        In the classic case the link text contains "Issue N". Some links only
        mention the volume (e.g. volume 27); the issue number then defaults to
        ``default_issue_nb``.

        Args:
            issue_tag: the ``<a>`` tag of the issue.
            default_issue_nb: issue number used when the link text has none.

        Returns:
            A tuple ``(issue_nb, issue_href, year, default_issue_nb)``.

        Raises:
            ValueError: if the year, the href or the issue label cannot be parsed.
        """
        year_span = issue_tag.select_one("span.date")
        if year_span is None:
            raise ValueError("Couldn't parse year for issue")
        # The year is the last space-separated token of the date text.
        year = year_span.text.split(" ")[-1]

        issue_href = issue_tag.get("href")
        if not isinstance(issue_href, str):
            raise ValueError("Couldn't parse issue href")

        issue = self._cup_regex_to_dict(
            (self.issue_re, self.issue_error_re),
            issue_tag.text,
            "Couldn't parse issue number",
        )

        # The fallback pattern has no "issue" group: use the default number
        # instead of returning None.
        issue_nb = issue.get("issue") or default_issue_nb
        return issue_nb, issue_href, year, default_issue_nb

    def parse_issue_content(self, content, xissue):
        """Collect the articles of an issue, following the "Next" pagination links.

        Every article found is appended to ``xissue.articles`` with its URL and a
        pid of the form ``a<N>``. ``N`` keeps increasing from one result page to
        the next so that pids stay unique within the issue.

        Args:
            content: HTML of the first page of the issue.
            xissue: the issue to fill; its ``url`` is used to resolve pagination links.

        Raises:
            ValueError: if an article has no usable link.
        """
        article_number = 0
        while True:
            soup = BeautifulSoup(content, "html.parser")
            for article in soup.select("div.representation"):
                link_tag = article.select_one("a.part-link")
                article_href = link_tag.get("href") if link_tag is not None else None
                if not isinstance(article_href, str):
                    raise ValueError("Couldn't parse article href")
                xarticle = create_articledata()
                xarticle.url = urljoin(self.source_website, article_href)
                xarticle.pid = "a" + str(article_number)
                xissue.articles.append(xarticle)
                article_number += 1

            next_link = soup.select_one("ul.pagination a:-soup-contains-own('Next »')")
            if not next_link:
                break
            pagination_link = next_link.get("href")
            if not isinstance(pagination_link, str):
                break
            content = self.download_file(urljoin(xissue.url, pagination_link))

    def parse_article_content(self, content, xissue, xarticle, url):
        """Fill ``xarticle`` with the metadata found on the article page.

        Args:
            content: HTML of the article page.
            xissue: the issue the article belongs to.
            xarticle: the article to complete.
            url: URL of the article page (not used by this implementation).

        Returns:
            The completed ``xarticle``.

        Raises:
            ValueError: if the title or the contributors block is missing.
        """
        soup = BeautifulSoup(content, "html.parser")

        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "pdf",
                "page",
                "doi",
                "publisher",
                "citation_keywords",
                "citation_reference",
            ],
        )

        # Title
        title_tag = soup.select_one("hgroup > h1")
        if title_tag is None:
            raise ValueError(f"Couldn't parse article title for article with url: {xarticle.url}")
        xarticle.title_tex = cleanup_str(title_tag.text)

        # Abstract (optional)
        abstract_tag = soup.select_one("div.abstract")
        if abstract_tag:
            abstract = cleanup_str(abstract_tag.text)
            xarticle.abstracts.append(create_abstract(value_tex=abstract, lang=xarticle.lang))
        else:
            logger.info(f"No abstract found for article with url: {xarticle.url}")

        # Keywords (optional)
        keywords_tag = soup.select_one("div.keywords")
        if keywords_tag:
            for keyword in keywords_tag.select("span"):
                xarticle.kwds.append(
                    {"type": "", "lang": xarticle.lang, "value": cleanup_str(keyword.text)}
                )

        # Contributors: name, ORCID, e-mail
        self.parse_cup_contributors(soup, xarticle)

        # References (optional)
        references_list = soup.select_one("#references-list")
        if references_list:
            xarticle.bibitems = self.parse_cambridge_references(references_list)
        return xarticle

    def parse_cup_contributors(self, soup, xarticle):
        """Append the authors of the article to ``xarticle.contributors``.

        Args:
            soup: parsed article page.
            xarticle: the article to complete.

        Returns:
            The completed ``xarticle``.

        Raises:
            ValueError: if the page has no ``div.contributors-details`` block.
        """
        contributors = soup.select_one("div.contributors-details")
        if not contributors:
            raise ValueError("Couldn't parse contributors")

        # ORCID identifiers, indexed by the author name carried by the ORCID link.
        orcid_by_name = {}
        for orcid_link in contributors.find_all("a", {"data-test-orcid": True}):
            href = orcid_link.get("href", "")
            # The identifier is the last path segment of the ORCID URL.
            orcid_by_name[orcid_link["data-test-orcid"]] = (
                href.rstrip("/").split("/")[-1] if href else None
            )

        mailto_re = re.compile(r"^mailto:")
        for author_block in contributors.find_all(attrs={"data-test-author": True}):
            string_name = author_block["data-test-author"]

            # Split the name: the last word is the last name, the rest the first name.
            parts = string_name.strip().split()
            if len(parts) >= 2:
                first_name = " ".join(parts[:-1])
                last_name = parts[-1]
            else:
                first_name = ""
                last_name = string_name

            # The e-mail, when available, is a mailto link inside the author block.
            mailto_tag = author_block.find("a", href=mailto_re)
            email = mailto_tag["href"].replace("mailto:", "") if mailto_tag else ""

            xarticle.contributors.append(
                create_contributor(
                    role="author",
                    string_name=string_name,
                    first_name=first_name,
                    last_name=last_name,
                    orcid=orcid_by_name.get(string_name),
                    email=email,
                )
            )
        return xarticle

    def parse_cambridge_references(self, soup: Tag):
        """Convert the references list of an article into JATS references.

        Args:
            soup: the tag containing the ``.circle-list__item`` reference entries.

        Returns:
            A list with the result of ``MixedCitation.get_jats_ref()`` for each entry.
        """
        bibitems = []
        for item in soup.select(".circle-list__item"):
            citation_builder = MixedCitation()

            label_tag = item.select_one(".circle-list__item__number")
            if label_tag:
                citation_builder.label = escape(cleanup_str(label_tag.text))

            citation_content = item.select_one(".circle-list__item__grouped__content")
            if citation_content:
                self.parse_cambridge_ref_nodes(citation_content, citation_builder)

            # Group all the string-name elements into one person-group element.
            name_indexes = [
                index
                for index, element in enumerate(citation_builder.elements)
                if isinstance(element, GenericRefElement) and element.name == "string-name"
            ]
            if name_indexes:
                first, last = name_indexes[0], name_indexes[-1]
                persongroup_builder = GenericRefElement()
                persongroup_builder.name = "person-group"
                # Everything between the first and the last name (separators
                # included) moves into the group, which takes their place.
                persongroup_builder.elements = citation_builder.elements[first : last + 1]
                del citation_builder.elements[first : last + 1]
                citation_builder.elements.insert(first, persongroup_builder)

            bibitems.append(citation_builder.get_jats_ref())
        return bibitems

    def parse_cambridge_ref_nodes(
        self,
        current_tag: Tag,
        current_builder: GenericRefElement,
    ):
        """Recursively convert the children of a reference tag into builder elements.

        Text nodes are escaped and appended as is. Tags are dispatched on their
        first CSS class: tags without a class or with an ignored class are
        skipped, links become ``ExtLinkXml`` elements, known classes become a
        nested ``GenericRefElement`` of the same name, and anything else is
        flattened to its escaped text after logging a warning.

        Args:
            current_tag: the tag whose children are parsed.
            current_builder: the element receiving the parsed children.
        """
        for element in current_tag.children:
            if isinstance(element, str):
                current_builder.elements.append(escape(element))
                continue
            if not isinstance(element, Tag):
                continue

            # Only the first CSS class of the tag is taken into account.
            tag_class = element.get("class")
            if isinstance(tag_class, list):
                tag_class = tag_class[0] if tag_class else None

            if not tag_class:
                continue
            if tag_class in self._CUP_IGNORED_REF_CLASSES:
                continue

            if element.name == "a":
                href = element.get("href")
                if isinstance(href, str):
                    current_builder.elements.append(" ")
                    current_builder.elements.append(
                        ExtLinkXml(escape(href), escape(element.text))
                    )
                    continue
                # A link without a usable href falls through so that its text is kept.

            if tag_class in self._CUP_REF_ELEMENT_CLASSES:
                refnode_builder = GenericRefElement()
                refnode_builder.name = tag_class
                current_builder.elements.append(refnode_builder)
                self.parse_cambridge_ref_nodes(element, refnode_builder)
                continue

            # NOTE(review): relies on a ``logger`` attribute provided by the base class - confirm.
            self.logger.warning(f"Couldn't insert tag into mixed citation : {tag_class}")
            current_builder.elements.append(escape(element.text))