Coverage for src / crawler / by_source / cup_crawler.py: 14%

115 statements  

Report generated by coverage.py v7.13.1, created at 2026-04-02 13:20 +0000

1import logging 

2import re 

3from urllib.parse import urljoin 

4 

5from bs4 import BeautifulSoup 

6from ptf.model_data import create_abstract, create_articledata, create_contributor 

7 

8from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler 

9from crawler.utils import cleanup_str, regex_to_dict 

10 

# Module-level logger named after this module, per standard logging convention.
logger = logging.getLogger(__name__)

12 

13 

class CupCrawler(BaseCollectionCrawler):
    """Crawler for journals hosted on the Cambridge University Press website.

    Parses the journal's "all issues" collection page into issues, each
    issue page into a list of articles, and each article page into
    metadata (title, abstract, keywords, contributors).
    """

    source_name = "Cambridge"
    source_domain = "CUP"
    source_website = "https://www.cambridge.org"

    # Patterns applied to link/heading text found on the collection page.
    issue_re = r"Issue (?P<issue>\S+)"
    issue_error_re = r"Volume (?P<issue_nb>\d+)"
    volume_re = r"Volume (?P<volume>\d+)"
    archive_year_re = r"Archive content \n\n\n (?P<year>\S+)"

    def parse_collection_content(self, content):
        """Parse the journal "all issues" page into a list of xissues.

        Args:
            content: raw HTML of the collection page.

        Returns:
            list of xissue objects built via ``self.create_xissue``.

        Raises:
            ValueError: when the expected volume/issue markup cannot be parsed.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")

        volumes_tag = soup.select(
            "div.journal-all-issues > ul > li > div.content > ul.accordion > li.accordion-navigation"
        )
        for volume_tag in volumes_tag:
            issue_defaut_nb = "1"
            volume = volume_tag.select_one("a")
            if volume is None:
                raise ValueError("Couldn't parse volume tag")

            try:
                volume_group = regex_to_dict(
                    self.volume_re, volume.text, error_msg="Couldn't parse volume number"
                )
            except ValueError as err:
                # BUGFIX: the original message was missing the f-prefix, so the
                # literal text "{volume.text}" was emitted instead of the value.
                raise ValueError(
                    f"Couldn't parse volume number from text: '{volume.text}'"
                ) from err

            issues_tag = volume_tag.select("div > ul > li > ul > li > a")

            ## If no issue listed : we consider the volume has only one issue
            if not issues_tag:
                issue_href = volume.get("href")
                year_span = volume.select_one("span.date")
                if not year_span:
                    raise ValueError("Couldn't parse year for volume with no issue")
                # The year is the last whitespace-separated token of the date span.
                year = year_span.text.split(" ")[-1]
                xissues.append(
                    self.create_xissue(
                        urljoin(self.source_website, issue_href),
                        year,
                        volume_group.get("volume"),
                        "1",
                    )
                )
                continue

            # Get all the volume listed issues
            for issue_tag in issues_tag:
                issue_nb, issue_href, issue_year, issue_defaut_nb = self.get_issue_data(
                    issue_tag, issue_defaut_nb
                )
                xissues.append(
                    self.create_xissue(
                        urljoin(self.source_website, issue_href),
                        issue_year,
                        volume_group.get("volume"),
                        issue_nb,
                    )
                )
        return xissues

    def get_issue_data(self, issue_tag, default_issue_nb):
        """Extract (issue number, href, year, default issue number) from an issue link.

        Get issue number in classic case but also in the special case of
        volume 27 with no issue number (defaults to issue 1).

        NOTE(review): as written, a missing issue number makes ``regex_to_dict``
        raise rather than fall back to ``default_issue_nb`` — confirm against
        the volume-27 page whether the documented default actually applies here.

        Raises:
            ValueError: when the year, href, or issue number cannot be parsed.
        """
        year_span = issue_tag.select_one("span.date")
        if not year_span:
            raise ValueError("Couldn't parse year for issue")
        # The year is the last whitespace-separated token of the date span.
        year = year_span.text.split(" ")[-1]

        issue_href = issue_tag.get("href")
        if not isinstance(issue_href, str):
            raise ValueError("Couldn't parse issue href")

        text = issue_tag.text

        issue_nb = regex_to_dict(self.issue_re, text, error_msg="Couldn't parse issue number").get(
            "issue"
        )
        return issue_nb, issue_href, year, default_issue_nb

    def parse_issue_content(self, content, xissue):
        """Parse an issue page and append one xarticle per listed article.

        Article pids are "a0", "a1", ... in page order; urls are absolutized
        against the source website.

        Raises:
            ValueError: when an article link cannot be parsed.
        """
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("div.representation")
        for article_number, article in enumerate(articles):
            xarticle = create_articledata()
            link_tag = article.select_one("a.part-link")
            # BUGFIX: select_one may return None; the original chained .get()
            # directly and would raise AttributeError instead of a clear error.
            if link_tag is None:
                raise ValueError("Couldn't parse article href")
            article_href = link_tag.get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article href")
            xarticle.url = urljoin(self.source_website, article_href)
            xarticle.pid = "a" + str(article_number)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Parse an article page: citation meta, title, abstract, keywords, authors.

        Returns the enriched ``xarticle``.

        Raises:
            ValueError: when the article title or contributors block is missing.
        """
        soup = BeautifulSoup(content, "html.parser")

        # Harvest the standard <meta name="citation_*"> tags first.
        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "pdf",
                "page",
                "doi",
                "publisher",
                "citation_keywords",
                "citation_reference",
            ],
        )

        ## Title
        title_tag = soup.select_one("hgroup > h1")
        if title_tag is None:
            raise ValueError(f"Couldn't parse article title for article with url: {xarticle.url}")
        xarticle.title_tex = cleanup_str(title_tag.text)

        ## Abstract (optional: some articles have none)
        abstract_tag = soup.select_one("div.abstract")

        if abstract_tag:
            abstract = cleanup_str(abstract_tag.text)
            xarticle.abstracts.append(create_abstract(value_tex=abstract, lang=xarticle.lang))
        else:
            logger.info(f"No abstract found for article with url: {xarticle.url}")

        ## keywords
        keywords_tag = soup.select_one("div.keywords")
        keywords = keywords_tag.select("span") if keywords_tag else []
        for keyword in keywords:
            xarticle.kwds.append(
                {"type": "", "lang": xarticle.lang, "value": cleanup_str(keyword.text)}
            )

        ## Contributors name doi email
        self.parse_cup_contributors(soup, xarticle)

        return xarticle

    def parse_cup_contributors(self, soup, xarticle):
        """Extract author contributors (name, ORCID, email) from an article page.

        Returns the enriched ``xarticle``.

        Raises:
            ValueError: when the contributors block is missing.
        """
        contributors = soup.select_one("div.contributors-details")
        if not contributors:
            raise ValueError("Couldn't parse contributors")

        # Map author display name -> ORCID id (last path segment of the URL).
        # NOTE(review): assumes the `data-test-orcid` attribute carries the
        # author's display name matching `data-test-author` below — confirm.
        orcid_by_name = {}
        for orcid_link in contributors.find_all("a", {"data-test-orcid": True}):
            name = orcid_link["data-test-orcid"]
            href = orcid_link.get("href", "")
            orcid_id = href.rstrip("/").split("/")[-1] if href else None
            orcid_by_name[name] = orcid_id

        # CLEANUP: the former "corresp" e-mail pre-scan built a dict that was
        # never read; e-mails are taken from each author's own block below.

        # One author per element carrying a `data-test-author` attribute.
        for author_block in contributors.find_all(attrs={"data-test-author": True}):
            string_name = author_block["data-test-author"]

            # Split name into first and last name (last token = last name).
            parts = string_name.strip().split()
            if len(parts) >= 2:
                first_name = " ".join(parts[:-1])
                last_name = parts[-1]
            else:
                first_name = ""
                last_name = string_name

            # ORCID
            orcid = orcid_by_name.get(string_name)

            # Email: a mailto: link inside the author's own block, if any.
            email = ""
            mailto_tag = author_block.find("a", href=re.compile(r"^mailto:"))
            if mailto_tag:
                email = mailto_tag["href"].replace("mailto:", "")

            xarticle.contributors.append(
                create_contributor(
                    role="author",
                    string_name=string_name,
                    first_name=first_name,
                    last_name=last_name,
                    orcid=orcid,
                    email=email,
                )
            )
        return xarticle