Coverage for src / crawler / by_source / cup_crawler.py: 14%
115 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-04-02 13:20 +0000
1import logging
2import re
3from urllib.parse import urljoin
5from bs4 import BeautifulSoup
6from ptf.model_data import create_abstract, create_articledata, create_contributor
8from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler
9from crawler.utils import cleanup_str, regex_to_dict
11logger = logging.getLogger(__name__)
class CupCrawler(BaseCollectionCrawler):
    """Crawler for Cambridge University Press (CUP) journal collections.

    Parses the journal "all issues" listing, issue pages and article pages
    on cambridge.org into ptf ``model_data`` structures.
    """

    source_name = "Cambridge"
    source_domain = "CUP"
    source_website = "https://www.cambridge.org"

    # "Issue 3" -> issue number.
    issue_re = r"Issue (?P<issue>\S+)"
    # Fallback pattern when an issue link only carries a volume label.
    # NOTE(review): unused in this file — presumably consumed by the base class; confirm.
    issue_error_re = r"Volume (?P<issue_nb>\d+)"
    # "Volume 27" -> volume number.
    volume_re = r"Volume (?P<volume>\d+)"
    # Year inside the "Archive content" accordion header.
    archive_year_re = r"Archive content \n\n\n (?P<year>\S+)"

    def parse_collection_content(self, content):
        """Parse the collection page listing every volume and its issues.

        :param content: HTML of the journal "all issues" page.
        :returns: list of xissue objects, one per issue. A volume with no
            listed issues is treated as a single issue numbered "1".
        :raises ValueError: when the expected markup cannot be parsed.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")

        volumes_tag = soup.select(
            "div.journal-all-issues > ul > li > div.content > ul.accordion > li.accordion-navigation"
        )
        for volume_tag in volumes_tag:
            issue_defaut_nb = "1"
            volume = volume_tag.select_one("a")
            if volume is None:
                raise ValueError("Couldn't parse volume tag")

            try:
                volume_group = regex_to_dict(
                    self.volume_re, volume.text, error_msg="Couldn't parse volume number"
                )
            except ValueError as err:
                # BUGFIX: the message was missing its f-prefix and printed the
                # literal "{volume.text}" instead of the offending text.
                raise ValueError(
                    f"Couldn't parse volume number from text: '{volume.text}'"
                ) from err

            issues_tag = volume_tag.select("div > ul > li > ul > li > a")

            # If no issue is listed, consider the volume to have a single issue "1".
            if not issues_tag:
                issue_href = volume.get("href")
                year_span = volume.select_one("span.date")
                if not year_span:
                    raise ValueError("Couldn't parse year for volume with no issue")
                # The span text ends with the year (e.g. "December 2020").
                year = year_span.text.split(" ")[-1]
                xissues.append(
                    self.create_xissue(
                        urljoin(self.source_website, issue_href),
                        year,
                        volume_group.get("volume"),
                        "1",
                    )
                )
                continue

            # Collect every issue listed under this volume.
            for issue_tag in issues_tag:
                issue_nb, issue_href, issue_year, issue_defaut_nb = self.get_issue_data(
                    issue_tag, issue_defaut_nb
                )
                xissues.append(
                    self.create_xissue(
                        urljoin(self.source_website, issue_href),
                        issue_year,
                        volume_group.get("volume"),
                        issue_nb,
                    )
                )
        return xissues

    def get_issue_data(self, issue_tag, default_issue_nb):
        """Extract (issue number, href, year, default issue number) from one issue link.

        Get issue number in classic case but also in the special case of
        volume 27 with no issue number (defaults to issue 1).

        :param issue_tag: ``<a>`` tag of the issue.
        :param default_issue_nb: fallback issue number, returned unchanged.
        :raises ValueError: when the year, href or issue number cannot be parsed.
        """
        year_span = issue_tag.select_one("span.date")
        if not year_span:
            raise ValueError("Couldn't parse year for issue")
        # The date span ends with the year (e.g. "March 2019").
        year = year_span.text.split(" ")[-1]

        issue_href = issue_tag.get("href")
        if not isinstance(issue_href, str):
            raise ValueError("Couldn't parse issue href")

        text = issue_tag.text

        issue_nb = regex_to_dict(self.issue_re, text, error_msg="Couldn't parse issue number").get(
            "issue"
        )
        return issue_nb, issue_href, year, default_issue_nb

    def parse_issue_content(self, content, xissue):
        """Populate ``xissue.articles`` with one xarticle per article found on the issue page.

        Article pids are "a0", "a1", ... in page order. Mutates ``xissue`` in place.

        :raises ValueError: when an article link cannot be parsed.
        """
        soup = BeautifulSoup(content, "html.parser")
        for article_number, article in enumerate(soup.select("div.representation")):
            link_tag = article.select_one("a.part-link")
            # Robustness: select_one may return None; previously this crashed
            # with AttributeError instead of the intended descriptive error.
            article_href = link_tag.get("href") if link_tag else None
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article href")
            xarticle = create_articledata()
            xarticle.url = urljoin(self.source_website, article_href)
            xarticle.pid = "a" + str(article_number)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Parse an article page: citation meta, title, abstract, keywords, authors.

        :param content: HTML of the article page.
        :param url: article URL (unused here; kept for the crawler interface).
        :returns: the enriched ``xarticle``.
        :raises ValueError: when the title or the contributors block is missing.
        """
        soup = BeautifulSoup(content, "html.parser")

        # Standard <meta name="citation_*"> extraction handled by the base class.
        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "pdf",
                "page",
                "doi",
                "publisher",
                "citation_keywords",
                "citation_reference",
            ],
        )

        ## Title
        title_tag = soup.select_one("hgroup > h1")
        if title_tag is None:
            raise ValueError(f"Couldn't parse article title for article with url: {xarticle.url}")
        xarticle.title_tex = cleanup_str(title_tag.text)

        ## Abstract (optional — some articles have none)
        abstract_tag = soup.select_one("div.abstract")
        if abstract_tag:
            abstract = cleanup_str(abstract_tag.text)
            xarticle.abstracts.append(create_abstract(value_tex=abstract, lang=xarticle.lang))
        else:
            logger.info(f"No abstract found for article with url: {xarticle.url}")

        ## keywords
        keywords_tag = soup.select_one("div.keywords")
        keywords = keywords_tag.select("span") if keywords_tag else []
        for keyword in keywords:
            xarticle.kwds.append(
                {"type": "", "lang": xarticle.lang, "value": cleanup_str(keyword.text)}
            )

        ## Contributors name doi email
        self.parse_cup_contributors(soup, xarticle)

        return xarticle

    def parse_cup_contributors(self, soup, xarticle):
        """Extract authors (name, ORCID, email) and append them to ``xarticle.contributors``.

        ORCID ids are matched to authors by name via the ``data-test-orcid``
        attribute; emails are taken from mailto links inside each author block.
        The page's separate "corresp" footnotes are not tied to a specific
        author and are therefore not used.

        :raises ValueError: when the contributors block is missing.
        """
        contributors = soup.select_one("div.contributors-details")
        if not contributors:
            raise ValueError("Couldn't parse contributors")

        # Map author name -> ORCID id (last path segment of the ORCID URL).
        orcid_by_name = {}
        for orcid_link in contributors.find_all("a", {"data-test-orcid": True}):
            name = orcid_link["data-test-orcid"]
            href = orcid_link.get("href", "")
            orcid_by_name[name] = href.rstrip("/").split("/")[-1] if href else None

        # Fetch Authors
        for author_block in contributors.find_all(attrs={"data-test-author": True}):
            string_name = author_block["data-test-author"]

            # Split "First Middle Last" -> first_name="First Middle", last_name="Last".
            parts = string_name.strip().split()
            if len(parts) >= 2:
                first_name = " ".join(parts[:-1])
                last_name = parts[-1]
            else:
                first_name = ""
                last_name = string_name

            # Email: mailto link inside the author's own block, if any.
            email = ""
            mailto_tag = author_block.find("a", href=re.compile(r"^mailto:"))
            if mailto_tag:
                email = mailto_tag["href"].replace("mailto:", "")

            xarticle.contributors.append(
                create_contributor(
                    role="author",
                    string_name=string_name,
                    first_name=first_name,
                    last_name=last_name,
                    orcid=orcid_by_name.get(string_name),
                    email=email,
                )
            )
        return xarticle