Coverage for src / crawler / by_source / cup_crawler.py: 10%
185 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-04-30 12:41 +0000
1import logging
2import re
3from urllib.parse import urljoin
5from bs4 import BeautifulSoup, Tag
6from ptf.cmds.xml.xml_utils import escape
7from ptf.model_data import create_abstract, create_articledata, create_contributor
9from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler
10from crawler.cmds.mixed_citation import (
11 ExtLinkXml,
12 GenericRefElement,
13 MixedCitation,
14)
15from crawler.utils import cleanup_str, regex_to_dict
17logger = logging.getLogger(__name__)
class CupCrawler(BaseCollectionCrawler):
    """Crawler for journals hosted on Cambridge University Press (cambridge.org)."""

    source_name = "Cambridge University Press"
    source_domain = "CUP"
    source_website = "https://www.cambridge.org"

    # Regular issue label, e.g. "Issue 3".
    issue_re = r"Issue (?P<issue>\S+)"
    # Fallback for issue links labelled "Volume N" (see get_issue_data);
    # note the group is named "issue_nb", not "issue".
    issue_error_re = r"Volume (?P<issue_nb>\d+)"
    # Regular volume label, e.g. "Volume 12".
    volume_re = r"Volume (?P<volume>\d+)"
    # Archive pages abbreviate to "Vol 12".
    archive_volume_re = r"Vol (?P<volume>\d+)"
    # Year inside "Archive content" headings — assumes this exact whitespace
    # layout in the scraped text; TODO confirm against live markup.
    archive_year_re = r"Archive content \n\n\n (?P<year>\S+)"
31 def parse_collection_content(self, content):
32 xissues = []
33 soup = BeautifulSoup(content, "html.parser")
35 volumes_tag = soup.select(
36 "div.journal-all-issues > ul > li > div.content > ul.accordion > li.accordion-navigation"
37 )
38 for volume_tag in volumes_tag:
39 issue_defaut_nb = "1"
40 volume = volume_tag.select_one("a")
41 if volume is None:
42 raise ValueError("Couldn't parse volume tag")
44 try:
45 volume_group = regex_to_dict(
46 self.volume_re, volume.text, error_msg="Couldn't parse volume number"
47 )
48 except ValueError:
49 try:
50 volume_group = regex_to_dict(
51 self.archive_volume_re,
52 volume.text,
53 error_msg="Couldn't parse volume number",
54 )
55 except ValueError:
56 raise ValueError(f"Couldn't parse volume number from text: {volume.text}")
58 issues_tag = volume_tag.select("div > ul > li > ul > li > a")
60 ## If no issue listed : we consider the volume has only one issue
61 if not issues_tag:
62 issue_href = volume.get("href")
63 year_span = volume.select_one("span.date")
64 if not year_span:
65 raise ValueError("Couldn't parse year for volume with no issue")
66 year = year_span.text.split(" ")[-1]
67 xissues.append(
68 self.create_xissue(
69 urljoin(self.source_website, issue_href),
70 year,
71 volume_group.get("volume"),
72 "1",
73 )
74 )
75 continue
77 # Get all the volume listed issues
78 for issue_tag in issues_tag:
79 issue_nb, issue_href, issue_year, issue_defaut_nb = self.get_issue_data(
80 issue_tag, issue_defaut_nb
81 )
82 xissues.append(
83 self.create_xissue(
84 urljoin(self.source_website, issue_href),
85 issue_year,
86 volume_group.get("volume"),
87 issue_nb,
88 )
89 )
90 return xissues
92 def get_issue_data(self, issue_tag, default_issue_nb):
93 """
94 Get issue number in classic case but also in the special case of volume 27 with no issue number (defaults to issue 1)
95 """
96 year_span = issue_tag.select_one("span.date")
97 if not year_span:
98 raise ValueError("Couldn't parse year for issue")
99 year = year_span.text.split(" ")[-1]
101 issue_href = issue_tag.get("href")
102 if not isinstance(issue_href, str):
103 raise ValueError("Couldn't parse issue href")
105 try:
106 issue = regex_to_dict(
107 self.issue_re, issue_tag.text, error_msg="Couldn't parse issue number"
108 )
109 except ValueError:
110 try:
111 issue = regex_to_dict(
112 self.issue_error_re, issue_tag.text, error_msg="Couldn't parse issue number"
113 )
114 except ValueError:
115 raise ValueError(f"Couldn't parse issue number from text: {issue_tag.text}")
117 issue_nb = issue.get("issue")
118 return issue_nb, issue_href, year, default_issue_nb
120 def parse_issue_content(self, content, xissue):
121 soup = BeautifulSoup(content, "html.parser")
122 articles = soup.select("div.representation")
123 article_number = 0
124 for article in articles:
125 xarticle = create_articledata()
126 article_href = article.select_one("a.part-link").get("href")
127 if not isinstance(article_href, str):
128 raise ValueError("Couldn't parse article href")
129 xarticle.url = urljoin(self.source_website, article_href)
130 xarticle.pid = "a" + str(article_number)
131 xissue.articles.append(xarticle)
132 article_number += 1
134 has_pagination = soup.select_one("ul.pagination a:-soup-contains-own('Next »')")
135 if has_pagination:
136 pagination_link = has_pagination.get("href")
137 if isinstance(pagination_link, str):
138 page_url = urljoin(xissue.url, pagination_link)
139 content = self.download_file(page_url)
141 self.parse_issue_content(content, xissue)
143 def parse_article_content(self, content, xissue, xarticle, url):
144 soup = BeautifulSoup(content, "html.parser")
146 self.get_metadata_using_citation_meta(
147 xarticle,
148 xissue,
149 soup,
150 [
151 "pdf",
152 "page",
153 "doi",
154 "publisher",
155 "citation_keywords",
156 "citation_reference",
157 ],
158 )
160 ## Title
161 title_tag = soup.select_one("hgroup > h1")
162 if title_tag is None:
163 raise ValueError(f"Couldn't parse article title for article with url: {xarticle.url}")
164 xarticle.title_tex = cleanup_str(title_tag.text)
166 ## Abstract
167 abstract_tag = soup.select_one("div.abstract")
169 if abstract_tag:
170 abstract = cleanup_str(abstract_tag.text)
171 xarticle.abstracts.append(create_abstract(value_tex=abstract, lang=xarticle.lang))
172 else:
173 logger.info(f"No abstract found for article with url: {xarticle.url}")
175 ## keywords
176 keywords_tag = soup.select_one("div.keywords")
177 keywords = keywords_tag.select("span") if keywords_tag else []
178 for keyword in keywords:
179 xarticle.kwds.append(
180 {"type": "", "lang": xarticle.lang, "value": cleanup_str(keyword.text)}
181 )
183 ## Contributors name doi email
184 self.parse_cup_contributors(soup, xarticle)
186 references_list = soup.select_one("#references-list")
187 if references_list:
188 xarticle.bibitems = self.parse_cambridge_references(references_list)
189 return xarticle
191 def parse_cup_contributors(self, soup, xarticle):
192 # Fetch ORCIDs [Name, ORCID]
193 contributors = soup.select_one("div.contributors-details")
194 if not contributors:
195 raise ValueError("Couldn't parse contributors")
197 orcid_by_name = {}
198 for orcid_link in contributors.find_all("a", {"data-test-orcid": True}):
199 name = orcid_link["data-test-orcid"]
200 href = orcid_link.get("href", "")
201 orcid_id = href.rstrip("/").split("/")[-1] if href else None
202 orcid_by_name[name] = orcid_id
204 # Fetch Emails [Name, Email]
205 email_by_name = {}
206 for corresp in contributors.find_all(class_="corresp"):
207 mailto = corresp.find("a", href=re.compile(r"^mailto:"))
208 if mailto:
209 email = mailto["href"].replace("mailto:", "")
210 # Le nom du correspondant est souvent juste avant dans le texte
211 # On cherche dans les blocs .author le lien corresp
212 email_by_name["__corresp__"] = email # sera affiné ci-dessous
214 # Fetch Authors
215 for author_block in contributors.find_all(attrs={"data-test-author": True}):
216 string_name = author_block["data-test-author"]
218 # Split name into first and last name
219 parts = string_name.strip().split()
220 if len(parts) >= 2:
221 first_name = " ".join(parts[:-1])
222 last_name = parts[-1]
223 else:
224 first_name = ""
225 last_name = string_name
227 # ORCID
228 orcid = orcid_by_name.get(string_name)
230 # Email
231 email = ""
232 mailto_tag = author_block.find("a", href=re.compile(r"^mailto:"))
233 if mailto_tag:
234 email = mailto_tag["href"].replace("mailto:", "")
236 xarticle.contributors.append(
237 create_contributor(
238 role="author",
239 string_name=string_name,
240 first_name=first_name,
241 last_name=last_name,
242 orcid=orcid,
243 email=email,
244 )
245 )
246 return xarticle
    def parse_cambridge_references(self, soup: Tag):
        """Build JATS reference entries from the article's reference list.

        :param soup: the ``#references-list`` container tag.
        :return: list of JATS <ref> strings, one per ``.circle-list__item``.
        """
        bibitems = []
        for item in soup.select(".circle-list__item"):
            citation_builder = MixedCitation()
            # Optional numeric label displayed in front of the reference.
            label_tag = item.select_one(".circle-list__item__number")
            if label_tag:
                citation_builder.label = escape(cleanup_str(label_tag.text))
            # Recursively convert the citation HTML into builder elements.
            citation_content = item.select_one(".circle-list__item__grouped__content")
            if citation_content:
                self.parse_cambridge_ref_nodes(citation_content, citation_builder)

            # Group all StringNames into one PersonGroup object
            persongroup_builder = GenericRefElement()
            persongroup_builder.name = "person-group"
            # Index of StringNames objects
            i = [
                index
                for index, element in enumerate(citation_builder.elements)
                if isinstance(element, GenericRefElement) and element.name == "string-name"
            ]
            if len(i) > 0:
                # Move the contiguous span from the first to the last
                # string-name into the person-group. Note: any elements sitting
                # between two string-names (e.g. separator text) are carried
                # into the person-group as well, by construction of the slice.
                persongroup_builder.elements = citation_builder.elements[i[0] : i[-1] + 1]
                del citation_builder.elements[i[0] : i[-1] + 1]
                citation_builder.elements.insert(i[0], persongroup_builder)

            bibitems.append(citation_builder.get_jats_ref())
        return bibitems
    def parse_cambridge_ref_nodes(
        self,
        current_tag: Tag,
        current_builder: GenericRefElement,
    ):
        "recursive function that parses references tags"
        for element in current_tag.children:
            # Bare text nodes are appended (escaped) as-is.
            if isinstance(element, str):
                current_builder.elements.append(escape(element))
                continue
            if isinstance(element, Tag):
                # BeautifulSoup exposes the class attribute as a list; keep
                # only the first class name (None when the list is empty).
                tag_class = element.get("class")
                if isinstance(tag_class, list):
                    if len(tag_class) > 0:
                        tag_class = tag_class[0]
                    else:
                        tag_class = None

                # NOTE(review): class-less tags are dropped here, which also
                # skips any <a> without a class before the anchor branch below
                # — confirm CUP reference links always carry a class.
                if not tag_class:
                    continue
                # Presentation-only wrappers: ignore entirely.
                if tag_class in ("mathjax-tex-wrapper", "aop-lazy-load-image"):
                    continue
                # Hyperlinks become <ext-link> elements (preceded by a space).
                if element.name == "a":
                    href = element.get("href")
                    if isinstance(href, str):
                        current_builder.elements.append(" ")
                        current_builder.elements.append(
                            ExtLinkXml(escape(href), escape(element.text))
                        )
                    continue

                # Classes that map 1:1 onto JATS citation elements: recurse
                # into the tag with a child builder named after the class.
                if tag_class in [
                    "surname",
                    "given-names",
                    "string-name",
                    "person-group",
                    "publisher-name",
                    "source",
                    "volume",
                    "year",
                    "fpage",
                    "lpage",
                    "article-title",
                    "issue",
                    "chapter-title",
                    "inline-formula",
                    "collab",
                    "alternatives",
                    "italic",
                    "publisher-loc",
                    "roman",
                    "edition",
                    "suffix",
                ]:
                    refnode_builder = GenericRefElement()
                    refnode_builder.name = tag_class
                    current_builder.elements.append(refnode_builder)
                    self.parse_cambridge_ref_nodes(element, refnode_builder)
                    continue

                # Unknown class: log it and fall back to the tag's plain text.
                self.logger.warning(f"Couldn't insert tag into mixed citation : {tag_class}")
                current_builder.elements.append(escape(element.text))