Coverage for src/crawler/utils.py: 52%
71 statements
coverage.py v7.6.4, created at 2025-01-15 14:09 +0000
import json
import os
import re
import unicodedata
import warnings
from functools import lru_cache

import requests
from django.contrib.auth.models import User
from history.views import insert_history_event
from ptf import model_helpers
from ptf.cmds import ptf_cmds
from ptf.exceptions import ResourceDoesNotExist
from ptf.model_data import ArticleData, create_extlink, create_publicationdata
from ptf.models import Collection

from crawler.types import JSONCol

# from ptf.models import ResourceId


def insert_crawl_event_in_history(
    colid, source_name, username, status, tasks_count, message, event_type="import"
):
    collection = model_helpers.get_collection(colid)
    if collection is None:
        warnings.warn(f"Collection {colid} cannot be found inside model_helpers")
        return
    user = User.objects.get(username=username)

    event_data = {
        "type": event_type,
        "pid": f"{colid}-{source_name}",
        "col": colid,
        "source": source_name,
        "status": status,
        "title": collection.title_html if collection is not None else "",
        "userid": user.id,
        "type_error": "",
        "data": {
            "ids_count": tasks_count,
            "message": message,
            "target": "",
        },
    }

    insert_history_event(event_data)
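
# Illustrative call only; the collection id, source name, user and message below are
# hypothetical values, not taken from the crawler:
#   insert_crawl_event_in_history(
#       "AIF", "numdam", "admin", "OK", tasks_count=3, message="3 issues imported"
#   )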


def col_has_source(col: JSONCol, filter: str):
    return any(source for source in col["sources"] if source == filter)


def get_cols_by_source(source: str) -> list[JSONCol]:
    """
    Get all collections that declare the given source
    @param source: str
    @return: list of collections
    """
    data = get_all_cols()

    return [col for col in data.values() if col_has_source(col, source)]
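
# Illustrative usage; "EUDML" is a hypothetical source name, not necessarily present
# in data/all_cols.json:
#   eudml_cols = get_cols_by_source("EUDML")
#   eudml_pids = [pid for pid, col in get_all_cols().items() if col_has_source(col, "EUDML")]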


@lru_cache(maxsize=None)
def get_all_cols() -> dict[str, JSONCol]:
    with open(
        os.path.dirname(os.path.abspath(__file__)) + "/data/all_cols.json", encoding="utf8"
    ) as data_collections:
        return json.load(data_collections)
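
# Note: get_all_cols is cached for the lifetime of the process. If data/all_cols.json
# is edited while the process runs, get_all_cols.cache_clear() (standard
# functools.lru_cache API) forces a reload on the next call.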


def get_numdam_collections():
    """
    Returns a list of Numdam collection pids
    """

    url = "http://www.numdam.org/api-all-collections/"

    response = requests.get(url)
    if response.status_code != 200:
        return []

    data = response.json()
    if "collections" not in data:
        return []

    return data["collections"]


def get_or_create_collection(pid: str):
    """
    Creates a Collection based on its pid.
    The pid has to be in the list of collections given by the Documentation team (CSV then transformed into JSON)
    """

    all_collections = get_all_cols()

    if pid not in all_collections:
        raise ValueError(f"{pid} is not listed in all_cols.csv")

    col_data = all_collections[pid]

    collection: Collection | None = model_helpers.get_collection(pid)

    if not collection:
        p = model_helpers.get_provider("mathdoc-id")

        xcol = create_publicationdata()
        xcol.coltype = "journal"
        xcol.pid = pid
        xcol.title_tex = col_data["title"]
        # Commented out because it is too early: Taban has not yet validated the ISSNs
        # xcol.e_issn = col_data["ISSN_électronique"]
        # xcol.issn = col_data["ISSN_papier"]
        xcol.title_html = col_data["title"]
        xcol.title_xml = f"<title-group><title>{col_data['title']}</title></title-group>"
        xcol.lang = "en"

        cmd = ptf_cmds.addCollectionPtfCmd({"xobj": xcol})
        cmd.set_provider(p)
        collection = cmd.do()

    # Commented out because it is too early: Taban has not yet validated the ISSNs
    # if col_data["ISSN_électronique"] != "":
    #     e_issn = {
    #         "resource_id": collection.resource_ptr_id,
    #         "id_type": "e_issn",
    #         "id_value": col_data["ISSN_électronique"],
    #     }
    #     ResourceId.objects.create(**e_issn)
    #
    # if col_data["ISSN_papier"] != "":
    #     issn = {
    #         "resource_id": collection.resource_ptr_id,
    #         "id_type": "issn",
    #         "id_value": col_data["ISSN_papier"],
    #     }
    #     ResourceId.objects.create(**issn)

    if not collection:
        raise ResourceDoesNotExist(f"Resource {pid} does not exist")

    return collection
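
# Illustrative usage; "AIF" is just an example pid and must appear in data/all_cols.json
# for the call to succeed:
#   collection = get_or_create_collection("AIF")
#   print(collection.pid, collection.title_html)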


# ??? is this used?
NUMDAM_COLLECTIONS = [
    "ACIRM",
    "ALCO",
    "AFST",
    "AIHPC",
    "AIHPA",
    "AIHPB",
    "AIF",
    "AIHP",
    "AUG",
    "AMPA",
    "AHL",
    "AMBP",
    "ASENS",
    "ASCFPA",
    "ASCFM",
    "ASNSP",
    "AST",
    "BSMF",
    "BSMA",
    "CTGDC",
    "BURO",
    "CSHM",
    "CG",
    "CM",
    "CRMATH",
    "CML",
    "CJPS",
    "CIF",
    "DIA",
    "COCV",
    "M2AN",
    "PS",
    "GAU",
    "GEA",
    "STS",
    "TAN",
    "JSFS",
    "JEP",
    "JMPA",
    "JTNB",
    "JEDP",
    "CAD",
    "CCIRM",
    "RCP25",
    "MSIA",
    "MRR",
    "MSH",
    "MSMF",
    "MSM",
    "NAM",
    "OJMO",
    "PHSC",
    "PSMIR",
    "PDML",
    "PMB",
    "PMIHES",
    "PMIR",
    "RO",
    "RCP",
    "ITA",
    "RSMUP",
    "RSA",
    "RHM",
    "SG",
    "SB",
    "SBCD",
    "SC",
    "SCC",
    "SAF",
    "SDPP",
    "SMJ",
    "SPHM",
    "SPS",
    "STNB",
    "STNG",
    "TSG",
    "SD",
    "SE",
    "SEDP",
    "SHC",
    "SJ",
    "SJL",
    "SLSEDP",
    "SLDB",
    "SL",
    "SPK",
    "SAC",
    "SMS",
    "SLS",
    "SSL",
    "SENL",
    "SSS",
    "SAD",
    "THESE",
    "SMAI-JCM",
    "WBLN",
]


def cleanup_str(input: str):
    # some white spaces aren't actual space characters, like \xa0
    input = unicodedata.normalize("NFKC", input)
    # remove DEL (0x7f) control characters
    input = re.sub(r"[\x7f]+", "", input)
    # remove useless continuous \n and spaces from the string
    return re.sub(r"[\n\t\r ]+", " ", input).strip()
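
# Illustrative example of the normalization performed (the title is arbitrary):
#   cleanup_str("Annales\xa0de  l'institut\n Fourier ")
#   # -> "Annales de l'institut Fourier"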


def add_pdf_link_to_xarticle(xarticle: ArticleData, pdf_url: str):
    data = {
        "rel": "full-text",
        "mimetype": "application/pdf",
        "location": pdf_url,
        "base": "",
        "text": "Full Text",
    }
    xarticle.streams.append(data)

    # The pdf url is already added as a stream (just above) but might be replaced by a file later on.
    # Keep the pdf url as an Extlink if we want to propose both options:
    # - direct download of a local PDF
    # - URL to the remote PDF
    ext_link = create_extlink(rel="article-pdf", location=pdf_url)
    xarticle.ext_links.append(ext_link)
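
# Illustrative usage, assuming an ArticleData instance `xarticle` built elsewhere
# (the URL is a placeholder):
#   add_pdf_link_to_xarticle(xarticle, "https://example.org/article.pdf")
#   # xarticle.streams now holds the "full-text" stream and
#   # xarticle.ext_links the "article-pdf" extlink added above.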