Coverage for src/crawler/utils.py: 52% (71 statements)
coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

import json
import os
import re
import unicodedata
import warnings
from functools import lru_cache

import requests
from django.contrib.auth.models import User
from history.views import insert_history_event
from ptf import model_helpers
from ptf.cmds import ptf_cmds
from ptf.exceptions import ResourceDoesNotExist
from ptf.model_data import ArticleData, create_extlink, create_publicationdata
from ptf.models import Collection

from crawler.types import JSONCol

# from ptf.models import ResourceId


def insert_crawl_event_in_history(
    colid, source_name, username, status, tasks_count, message, event_type="import"
):
    collection = model_helpers.get_collection(colid)
    if collection is None:
        warnings.warn(f"Collection {colid} cannot be found inside model_helpers")
        return
    user = User.objects.get(username=username)

    event_data = {
        "type": event_type,
        "pid": f"{colid}-{source_name}",
        "col": colid,
        "source": source_name,
        "status": status,
        "title": collection.title_html,
        "userid": user.id,
        "type_error": "",
        "data": {
            "ids_count": tasks_count,
            "message": message,
            "target": "",
        },
    }

    insert_history_event(event_data)
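
# Illustrative call (hypothetical values, not from the real crawler config);
# assumes a "CRMATH" collection and an "admin" user already exist in the database:
#
#   insert_crawl_event_in_history(
#       "CRMATH", "numdam", "admin", "OK", tasks_count=42, message="import finished"
#   )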


def col_has_source(col: JSONCol, source_filter: str):
    return source_filter in col["sources"]


def get_cols_by_source(source: str) -> list[JSONCol]:
    """
    Get all collections whose "sources" list contains the given source.
    @param source: str
    @return: list of collections
    """
    data = get_all_cols()

    return [col for col in data.values() if col_has_source(col, source)]
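
# Illustrative usage ("numdam" is a hypothetical source name, not necessarily
# one that appears in data/all_cols.json):
#
#   numdam_cols = get_cols_by_source("numdam")
#   titles = [col["title"] for col in numdam_cols]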


@lru_cache(maxsize=None)
def get_all_cols() -> dict[str, JSONCol]:
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "all_cols.json")
    with open(path, encoding="utf8") as data_collections:
        return json.load(data_collections)
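
# Because of @lru_cache(maxsize=None), the JSON file is read at most once per
# process. In tests, the standard functools API can drop the cached value:
#
#   get_all_cols.cache_clear()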


def get_numdam_collections():
    """
    Returns a list of Numdam collection pids.
    """

    url = "http://www.numdam.org/api-all-collections/"

    response = requests.get(url, timeout=30)  # avoid hanging on an unresponsive server
    if response.status_code != 200:
        return []

    data = response.json()
    if "collections" not in data:
        return []

    return data["collections"]
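
# Illustrative usage (performs a real HTTP request; any non-200 response or a
# payload without a "collections" key yields an empty list):
#
#   numdam_pids = get_numdam_collections()
#   for pid in numdam_pids:
#       print(pid)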


def get_or_create_collection(pid: str):
    """
    Gets the Collection with the given pid, creating it if needed.
    The pid has to be in the list of collections given by the Documentation team
    (a CSV file, later converted to JSON).
    """

    all_collections = get_all_cols()

    if pid not in all_collections:
        raise ValueError(f"{pid} is not listed in all_cols.json")

    col_data = all_collections[pid]

    collection: Collection | None = model_helpers.get_collection(pid)

    if not collection:
        p = model_helpers.get_provider("mathdoc-id")

        xcol = create_publicationdata()
        xcol.coltype = "journal"
        xcol.pid = pid
        xcol.title_tex = col_data["title"]
        # Commented out because it is too early: Taban has not validated the ISSNs yet
        # xcol.e_issn = col_data["ISSN_électronique"]
        # xcol.issn = col_data["ISSN_papier"]
        xcol.title_html = col_data["title"]
        xcol.title_xml = f"<title-group><title>{col_data['title']}</title></title-group>"
        xcol.lang = "en"

        cmd = ptf_cmds.addCollectionPtfCmd({"xobj": xcol})
        cmd.set_provider(p)
        collection = cmd.do()

        # Commented out because it is too early: Taban has not validated the ISSNs yet
        # if col_data["ISSN_électronique"] != "":
        #     e_issn = {
        #         "resource_id": collection.resource_ptr_id,
        #         "id_type": "e_issn",
        #         "id_value": col_data["ISSN_électronique"],
        #     }
        #     ResourceId.objects.create(**e_issn)
        #
        # if col_data["ISSN_papier"] != "":
        #     issn = {
        #         "resource_id": collection.resource_ptr_id,
        #         "id_type": "issn",
        #         "id_value": col_data["ISSN_papier"],
        #     }
        #     ResourceId.objects.create(**issn)

    if not collection:
        raise ResourceDoesNotExist(f"Resource {pid} does not exist")

    return collection
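
# Illustrative usage ("CRMATH" is assumed to be listed in data/all_cols.json,
# and a "mathdoc-id" provider is assumed to exist):
#
#   collection = get_or_create_collection("CRMATH")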


# ??? is this used?
NUMDAM_COLLECTIONS = [
    "ACIRM",
    "ALCO",
    "AFST",
    "AIHPC",
    "AIHPA",
    "AIHPB",
    "AIF",
    "AIHP",
    "AUG",
    "AMPA",
    "AHL",
    "AMBP",
    "ASENS",
    "ASCFPA",
    "ASCFM",
    "ASNSP",
    "AST",
    "BSMF",
    "BSMA",
    "CTGDC",
    "BURO",
    "CSHM",
    "CG",
    "CM",
    "CRMATH",
    "CML",
    "CJPS",
    "CIF",
    "DIA",
    "COCV",
    "M2AN",
    "PS",
    "GAU",
    "GEA",
    "STS",
    "TAN",
    "JSFS",
    "JEP",
    "JMPA",
    "JTNB",
    "JEDP",
    "CAD",
    "CCIRM",
    "RCP25",
    "MSIA",
    "MRR",
    "MSH",
    "MSMF",
    "MSM",
    "NAM",
    "OJMO",
    "PHSC",
    "PSMIR",
    "PDML",
    "PMB",
    "PMIHES",
    "PMIR",
    "RO",
    "RCP",
    "ITA",
    "RSMUP",
    "RSA",
    "RHM",
    "SG",
    "SB",
    "SBCD",
    "SC",
    "SCC",
    "SAF",
    "SDPP",
    "SMJ",
    "SPHM",
    "SPS",
    "STNB",
    "STNG",
    "TSG",
    "SD",
    "SE",
    "SEDP",
    "SHC",
    "SJ",
    "SJL",
    "SLSEDP",
    "SLDB",
    "SL",
    "SPK",
    "SAC",
    "SMS",
    "SLS",
    "SSL",
    "SENL",
    "SSS",
    "SAD",
    "THESE",
    "SMAI-JCM",
    "WBLN",
]


def cleanup_str(text: str):
    # some white spaces aren't actual space characters, like \xa0
    text = unicodedata.normalize("NFKC", text)
    # remove DEL (0x7f) control characters
    text = re.sub(r"[\x7f]+", "", text)
    # collapse consecutive newlines, tabs and spaces into a single space
    return re.sub(r"[\n\t\r ]+", " ", text).strip()
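
# Example: NFKC normalization turns the non-breaking space (\xa0) into a regular
# space, then the whitespace run collapses to a single space:
#
#   >>> cleanup_str("foo\xa0\n\t bar ")
#   'foo bar'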


def add_pdf_link_to_xarticle(xarticle: ArticleData, pdf_url: str):
    data = {
        "rel": "full-text",
        "mimetype": "application/pdf",
        "location": pdf_url,
        "base": "",
        "text": "Full Text",
    }
    xarticle.streams.append(data)

    # The pdf url is already added as a stream (just above) but might be replaced
    # by a file later on. Keep the pdf url as an Extlink if we want to propose
    # both options:
    # - direct download of a local PDF
    # - URL to the remote PDF
    ext_link = create_extlink(rel="article-pdf", location=pdf_url)
    xarticle.ext_links.append(ext_link)
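
# Illustrative usage (create_articledata is assumed here to be the ArticleData
# factory in ptf.model_data, by analogy with create_publicationdata above):
#
#   from ptf.model_data import create_articledata
#   xarticle = create_articledata()
#   add_pdf_link_to_xarticle(xarticle, "https://example.org/article.pdf")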