Coverage for src/crawler/utils.py: 47% (76 statements)
coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

import json
import os
import re
import unicodedata
from functools import lru_cache

import requests
from django.contrib.auth.models import User
from history.views import insert_history_event
from ptf import model_helpers
from ptf.cmds import ptf_cmds
from ptf.exceptions import ResourceDoesNotExist
from ptf.model_data import ResourceData, create_extlink, create_publicationdata
from ptf.models import Collection

from crawler.types import JSONCol

# from ptf.models import ResourceId

def insert_crawl_event_in_history(
    colid, source_name, username, status, tasks_count, message, event_type="import", title=None
):
    collection = model_helpers.get_collection(colid)
    user = User.objects.get(username=username)

    event_data = {
        "type": event_type,
        "pid": f"{colid}-{source_name}",
        "col": colid,
        "source": source_name,
        "status": status,
        "title": collection.title_html if collection else (title or colid),
        "userid": user.id,
        "type_error": "",
        "data": {
            "ids_count": tasks_count,
            "message": message,
            "target": "",
        },
    }

    insert_history_event(event_data)
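
# Usage sketch (illustrative only: the pid, source and username below are made up
# and assume a matching Collection and User exist):
#
#     insert_crawl_event_in_history(
#         colid="AIF",
#         source_name="numdam",
#         username="crawler_admin",
#         status="OK",
#         tasks_count=42,
#         message="Crawl finished",
#     )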

def col_has_source(col: JSONCol, source_filter: str):
    return source_filter in col["sources"]

def get_cols_by_source(source: str) -> list[JSONCol]:
    """
    Get all collections crawled from the given source
    @param source: str
    @return: list of collections
    """
    data = get_all_cols()

    return [col for col in data.values() if col_has_source(col, source)]
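
# Usage sketch (assumes "numdam" appears as a source in data/all_cols.json):
#
#     numdam_cols = get_cols_by_source("numdam")
#     print(f"{len(numdam_cols)} collections are crawled from Numdam")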

def get_all_cols_by_source() -> dict[str, list[JSONCol]]:
    """
    Group all collections by source
    @return: dict of collection lists, keyed by source name
    """
    data = get_all_cols()

    sources = {}
    for col in data.values():
        for source in col["sources"]:
            sources.setdefault(source, []).append(col)

    return sources
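
# Usage sketch: invert the collection catalogue into a per-source index.
#
#     for source, cols in get_all_cols_by_source().items():
#         print(source, len(cols))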

@lru_cache(maxsize=None)
def get_all_cols() -> dict[str, JSONCol]:
    data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "all_cols.json")
    with open(data_path, encoding="utf8") as data_collections:
        return json.load(data_collections)
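
# Because of @lru_cache, data/all_cols.json is read and parsed only once per
# process; every later call returns the same cached dict, so callers should not
# mutate it. If the file changes while the process is running, reset the cache:
#
#     get_all_cols.cache_clear()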

def get_numdam_collections():
    """
    Returns a list of Numdam collection pids
    """

    url = "http://www.numdam.org/api-all-collections/"

    # a timeout prevents the crawler from hanging forever on a stalled connection
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        return []

    data = response.json()
    if "collections" not in data:
        return []

    return data["collections"]
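
# The endpoint is expected to answer with JSON of the form
# {"collections": ["AIF", "AIHPC", ...]}; an HTTP error status or a payload
# without a "collections" key yields an empty list.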

def get_or_create_collection(pid: str):
    """
    Creates a Collection based on its pid.
    The pid has to be in the list of collections given by the Documentation team
    (a CSV later converted to JSON)
    """

    all_collections = get_all_cols()

    if pid not in all_collections:
        raise ValueError(f"{pid} is not listed in all_cols.csv")

    col_data = all_collections[pid]

    collection: Collection | None = model_helpers.get_collection(pid)

    if not collection:
        p = model_helpers.get_provider("mathdoc-id")

        xcol = create_publicationdata()
        xcol.coltype = "journal"
        xcol.pid = pid
        xcol.title_tex = col_data["title"]
        # Commented out because it is too early: Taban has not validated the ISSNs yet
        # xcol.e_issn = col_data["ISSN_électronique"]
        # xcol.issn = col_data["ISSN_papier"]
        xcol.title_html = col_data["title"]
        xcol.title_xml = f"<title-group><title>{col_data['title']}</title></title-group>"
        xcol.lang = "en"

        cmd = ptf_cmds.addCollectionPtfCmd({"xobj": xcol})
        cmd.set_provider(p)
        collection = cmd.do()

    # Commented out because it is too early: Taban has not validated the ISSNs yet
    # if col_data["ISSN_électronique"] != "":
    #     e_issn = {
    #         "resource_id": collection.resource_ptr_id,
    #         "id_type": "e_issn",
    #         "id_value": col_data["ISSN_électronique"],
    #     }
    #     ResourceId.objects.create(**e_issn)
    #
    # if col_data["ISSN_papier"] != "":
    #     issn = {
    #         "resource_id": collection.resource_ptr_id,
    #         "id_type": "issn",
    #         "id_value": col_data["ISSN_papier"],
    #     }
    #     ResourceId.objects.create(**issn)

    if not collection:
        raise ResourceDoesNotExist(f"Resource {pid} does not exist")

    return collection
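
# Usage sketch ("AIF" is illustrative; the pid must be a key of data/all_cols.json):
#
#     try:
#         collection = get_or_create_collection("AIF")
#     except ValueError:
#         pass  # pid is not in the documentation team's list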

# ??? is this used?
NUMDAM_COLLECTIONS = [
    "ACIRM",
    "ALCO",
    "AFST",
    "AIHPC",
    "AIHPA",
    "AIHPB",
    "AIF",
    "AIHP",
    "AUG",
    "AMPA",
    "AHL",
    "AMBP",
    "ASENS",
    "ASCFPA",
    "ASCFM",
    "ASNSP",
    "AST",
    "BSMF",
    "BSMA",
    "CTGDC",
    "BURO",
    "CSHM",
    "CG",
    "CM",
    "CRMATH",
    "CML",
    "CJPS",
    "CIF",
    "DIA",
    "COCV",
    "M2AN",
    "PS",
    "GAU",
    "GEA",
    "STS",
    "TAN",
    "JSFS",
    "JEP",
    "JMPA",
    "JTNB",
    "JEDP",
    "CAD",
    "CCIRM",
    "RCP25",
    "MSIA",
    "MRR",
    "MSH",
    "MSMF",
    "MSM",
    "NAM",
    "OJMO",
    "PHSC",
    "PSMIR",
    "PDML",
    "PMB",
    "PMIHES",
    "PMIR",
    "RO",
    "RCP",
    "ITA",
    "RSMUP",
    "RSA",
    "RHM",
    "SG",
    "SB",
    "SBCD",
    "SC",
    "SCC",
    "SAF",
    "SDPP",
    "SMJ",
    "SPHM",
    "SPS",
    "STNB",
    "STNG",
    "TSG",
    "SD",
    "SE",
    "SEDP",
    "SHC",
    "SJ",
    "SJL",
    "SLSEDP",
    "SLDB",
    "SL",
    "SPK",
    "SAC",
    "SMS",
    "SLS",
    "SSL",
    "SENL",
    "SSS",
    "SAD",
    "THESE",
    "SMAI-JCM",
    "WBLN",
]

def cleanup_str(value: str):
    # some white spaces aren't actual space characters, like \xa0
    value = unicodedata.normalize("NFKC", value)
    # remove DEL (\x7f) control characters
    value = re.sub(r"[\x7f]+", "", value)
    # collapse consecutive newlines, tabs and spaces into a single space
    return re.sub(r"[\n\t\r ]+", " ", value).strip()
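
# Example (illustrative input): the no-break space \xa0 is normalized to a plain
# space, then the whitespace run collapses:
#
#     cleanup_str("Comptes\xa0Rendus.\n  Mathématique ")
#     # -> "Comptes Rendus. Mathématique"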

def add_pdf_link_to_xarticle(xarticle: ResourceData, pdf_url: str):
    xarticle.streams.append(
        {
            "rel": "full-text",
            "mimetype": "application/pdf",
            "location": pdf_url,
            "base": "",
            "text": "Full Text",
        }
    )

    # The pdf url is already added as a stream (just above) but might be replaced by a file later on.
    # Keep the pdf url as an Extlink if we want to propose both options:
    # - direct download of a local PDF
    # - URL to the remote PDF
    ext_link = create_extlink(rel="article-pdf", location=pdf_url)
    xarticle.ext_links.append(ext_link)
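
# Usage sketch (assumes `xarticle` is a ResourceData instance built elsewhere;
# the URL is made up):
#
#     add_pdf_link_to_xarticle(xarticle, "https://example.org/item/article.pdf")
#     # xarticle now carries a "full-text" stream and an "article-pdf" Extlink,
#     # both pointing at the remote PDF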