Coverage for src/crawler/utils.py: 47%

76 statements  

« prev     ^ index     » next       coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

1import json 

2import os 

3import re 

4import unicodedata 

5from functools import lru_cache 

6 

7import requests 

8from django.contrib.auth.models import User 

9from history.views import insert_history_event 

10from ptf import model_helpers 

11from ptf.cmds import ptf_cmds 

12from ptf.exceptions import ResourceDoesNotExist 

13from ptf.model_data import ResourceData, create_extlink, create_publicationdata 

14from ptf.models import Collection 

15 

16from crawler.types import JSONCol 

17 

18# from ptf.models import ResourceId 

19 

20 

def insert_crawl_event_in_history(
    colid, source_name, username, status, tasks_count, message, event_type="import", title=None
):
    """
    Record a crawl event in the history log for the given collection/source pair.
    The event pid is "<colid>-<source_name>"; the title falls back to the explicit
    `title` argument, then to the colid, when the collection is not found.
    """
    col = model_helpers.get_collection(colid)
    event_user = User.objects.get(username=username)

    # Prefer the collection's HTML title; otherwise use the caller-supplied
    # title, and finally the pid itself.
    display_title = col.title_html if col else (title or colid)

    insert_history_event(
        {
            "type": event_type,
            "pid": f"{colid}-{source_name}",
            "col": colid,
            "source": source_name,
            "status": status,
            "title": display_title,
            "userid": event_user.id,
            "type_error": "",
            "data": {
                "ids_count": tasks_count,
                "message": message,
                "target": "",
            },
        }
    )

44 

45 

def col_has_source(col: "JSONCol", filter: str) -> bool:
    """
    Return True if the collection lists `filter` among its sources.

    Fix: the previous `any(source for source in col["sources"] if source == filter)`
    returned the truthiness of the *matched value*, so an empty-string source could
    never match. A plain membership test is both correct and clearer.
    """
    return filter in col["sources"]

48 

49 

def get_cols_by_source(source: str) -> list[JSONCol]:
    """
    Get all cols by source
    @param source: str
    @return: list of collections
    """
    matching = []
    for col in get_all_cols().values():
        if col_has_source(col, source):
            matching.append(col)
    return matching

59 

60 

def get_all_cols_by_source():
    """
    Group all known collections by source.

    @return: dict mapping each source name to the list of collections that
             declare it in their "sources" entry
    """
    data = get_all_cols()

    sources: dict = {}
    for col in data.values():
        for source in col["sources"]:
            # setdefault replaces the manual "if source not in sources" dance
            sources.setdefault(source, []).append(col)

    return sources

76 

77 

@lru_cache(maxsize=None)
def get_all_cols() -> "dict[str, JSONCol]":
    """
    Load the reference list of collections from data/all_cols.json, bundled next
    to this module. Cached for the process lifetime, so the file is read once.

    NOTE: because of the cache, every caller shares the same dict object —
    callers must not mutate the returned data.
    """
    # os.path.join instead of string concatenation for path building
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "all_cols.json")
    with open(path, encoding="utf8") as data_collections:
        return json.load(data_collections)

84 

85 

def get_numdam_collections():
    """
    Returns a list of Numdam collection pids.

    Best-effort: returns [] on a non-200 response or a payload without a
    "collections" key.
    """

    url = "http://www.numdam.org/api-all-collections/"

    # A timeout is required: requests.get has no default timeout and would
    # otherwise hang indefinitely if the server stops responding.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        return []

    # Missing "collections" key degrades to an empty list, as before.
    return response.json().get("collections", [])

102 

103 

def get_or_create_collection(pid: str):
    """
    Creates a Collection based on its pid.
    The pid has to be in the list of collections given by the Documentation team
    (CSV then transformed in JSON).

    @raises ValueError: if the pid is not in the reference collection list
    @raises ResourceDoesNotExist: if the collection could not be created
    """

    all_collections = get_all_cols()

    if pid not in all_collections:
        raise ValueError(f"{pid} is not listed in all_cols.csv")

    # Direct lookup: the previous linear scan over .items() was O(n) for no benefit.
    col_data = all_collections[pid]

    collection: Collection | None = model_helpers.get_collection(pid)

    if not collection:
        p = model_helpers.get_provider("mathdoc-id")

        xcol = create_publicationdata()
        xcol.coltype = "journal"
        xcol.pid = pid
        xcol.title_tex = col_data["title"]
        # Commented out because it is too early: Taban has not validated the ISSNs yet
        # xcol.e_issn = col_data["ISSN_électronique"]
        # xcol.issn = col_data["ISSN_papier"]
        xcol.title_html = col_data["title"]
        xcol.title_xml = f"<title-group><title>{col_data['title']}</title></title-group>"
        xcol.lang = "en"

        cmd = ptf_cmds.addCollectionPtfCmd({"xobj": xcol})
        cmd.set_provider(p)
        collection = cmd.do()

        # Commented out because it is too early: Taban has not validated the ISSNs yet
        # if col_data["ISSN_électronique"] != "":
        #     e_issn = {
        #         "resource_id": collection.resource_ptr_id,
        #         "id_type": "e_issn",
        #         "id_value": col_data["ISSN_électronique"],
        #     }
        #     ResourceId.objects.create(**e_issn)
        #
        # if col_data["ISSN_papier"] != "":
        #     issn = {
        #         "resource_id": collection.resource_ptr_id,
        #         "id_type": "issn",
        #         "id_value": col_data["ISSN_papier"],
        #     }
        #     ResourceId.objects.create(**issn)

    if not collection:
        raise ResourceDoesNotExist(f"Resource {pid} does not exist")

    return collection

158 

159 

# NOTE(review): usage of this constant is not visible in this module — confirm
# callers before removing. Hard-coded list of Numdam collection pids; see also
# get_numdam_collections(), which fetches the live list from the Numdam API.
NUMDAM_COLLECTIONS = [
    "ACIRM",
    "ALCO",
    "AFST",
    "AIHPC",
    "AIHPA",
    "AIHPB",
    "AIF",
    "AIHP",
    "AUG",
    "AMPA",
    "AHL",
    "AMBP",
    "ASENS",
    "ASCFPA",
    "ASCFM",
    "ASNSP",
    "AST",
    "BSMF",
    "BSMA",
    "CTGDC",
    "BURO",
    "CSHM",
    "CG",
    "CM",
    "CRMATH",
    "CML",
    "CJPS",
    "CIF",
    "DIA",
    "COCV",
    "M2AN",
    "PS",
    "GAU",
    "GEA",
    "STS",
    "TAN",
    "JSFS",
    "JEP",
    "JMPA",
    "JTNB",
    "JEDP",
    "CAD",
    "CCIRM",
    "RCP25",
    "MSIA",
    "MRR",
    "MSH",
    "MSMF",
    "MSM",
    "NAM",
    "OJMO",
    "PHSC",
    "PSMIR",
    "PDML",
    "PMB",
    "PMIHES",
    "PMIR",
    "RO",
    "RCP",
    "ITA",
    "RSMUP",
    "RSA",
    "RHM",
    "SG",
    "SB",
    "SBCD",
    "SC",
    "SCC",
    "SAF",
    "SDPP",
    "SMJ",
    "SPHM",
    "SPS",
    "STNB",
    "STNG",
    "TSG",
    "SD",
    "SE",
    "SEDP",
    "SHC",
    "SJ",
    "SJL",
    "SLSEDP",
    "SLDB",
    "SL",
    "SPK",
    "SAC",
    "SMS",
    "SLS",
    "SSL",
    "SENL",
    "SSS",
    "SAD",
    "THESE",
    "SMAI-JCM",
    "WBLN",
]

259 

260 

def cleanup_str(input: str):
    """
    Normalize a string scraped from a web page: NFKC-normalize exotic
    whitespace (e.g. \\xa0), drop DEL control characters, and collapse runs
    of newlines/tabs/spaces into single spaces.
    """
    # Some white spaces aren't actual space characters (like \xa0);
    # NFKC normalization maps them to plain spaces.
    normalized = unicodedata.normalize("NFKC", input)
    # Strip DEL (0x7f) control characters.
    without_del = re.sub(r"[\x7f]+", "", normalized)
    # Collapse consecutive newlines, tabs and spaces, then trim the ends.
    return re.sub(r"[\n\t\r ]+", " ", without_del).strip()

268 

269 

def add_pdf_link_to_xarticle(xarticle: ResourceData, pdf_url: str):
    """
    Attach `pdf_url` to the article both as a full-text stream and as an
    "article-pdf" ext link.
    """
    stream = {
        "rel": "full-text",
        "mimetype": "application/pdf",
        "location": pdf_url,
        "base": "",
        "text": "Full Text",
    }
    xarticle.streams.append(stream)

    # The pdf url is already added as a stream (just above) but might be replaced
    # by a file later on. Keep the pdf url as an Extlink if we want to propose
    # both options:
    # - direct download of a local PDF
    # - URL to the remote PDF
    xarticle.ext_links.append(create_extlink(rel="article-pdf", location=pdf_url))