-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathloadOrRun.m
364 lines (316 loc) · 13.8 KB
/
loadOrRun.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
function varargout = loadOrRun(func, args, options)
%LOADORRUN load cached results from a file, or compute and save them.
%
% ... = LOADORRUN(func, {arg1, arg2, ..}, options) If the cache file does not exist, computes
% func(arg1, arg2, ..) and saves the results. If it does exist, simply loads the results and returns
% them. Return values are identical to whatever func returns. If func has multiple outputs, the same
% number of outputs must always be captured. For example the following will NOT work:
%
% x1 = LOADORRUN(@func, args);
% [x1, x2] = LOADORRUN(@func, args);
%
% but the following WILL work:
%
% [x1, ~] = LOADORRUN(@func, args);
% [x1, x2] = LOADORRUN(@func, args);
%
%
% 'options' is an optional struct controlling the behavior of LOADORRUN. It may contain any of the
% following fields:
% - cachePath - where to save results (default '.cache/'). Note that the '.' prefix makes the
%   directory hidden on unix and linux systems.
% - metaPath - where to save metadata about function dependencies (default '.meta/')
% - recompute - boolean flag to force a call to func() even if cached result exists, or a datenum
%   timestamp indicating that all files older than this should be recomputed (this allows recompute
%   to be set to the matlab function 'now' to recompute each function once). (default false)
% - verbose - integer flag for level of extra diagnostic messages in range 0-2 (default 0)
% - errorHandling - how to handle errors saved by earlier calls. Options are 'none' to never rethrow
%   a previously saved error, or 'cache' to immediately rethrow the saved error on future calls. The
%   cache option is recommended if the calling function already contains a 'try/catch' block. The
%   text of an error is written to a .error file in the cachePath directory whenever func() throws,
%   regardless of this setting; the saved text does not give access to stack traces (default
%   'none')
% - numPrecision - precision digits for queries based on numerical values (default 4)
% - onDependencyChange - what to do with cached results of functions whose dependencies have been
%   modified. Options are 'ignore' to skip checks, 'warn' to print a warning, or 'autoremove' to
%   automatically and aggressively delete any upstream file that may have been affected (default
%   'warn')
% - uid - a hard-coded unique identifier for creating the cached file. This overrides the UID that
%   would have been created based on args.
% - defaultArgs - a cell array of the same size or smaller than args. Any args that match those in
%   'options.defaultArgs' will not be added to the UID. Any values in defaultArgs set to [] will
%   always be ignored regardless of value. Defaults are applied recursively to struct or cell array
%   arguments.
% - defaultString - a short string to replace any args that are ignored or have default values.
%   (default 'default')
% - dryRun - a flag indicating that loadOrRun should just return metadata about what it would do,
%   returning early without computing anything. This allows, for example checking if the cache
%   file exists without calling the function (default false). The single output is a struct with
%   fields uid, uidFinal, cacheFile, idFile, errorFile, and needsCompute.
%
% For example, if options.uid = 'myuid12345', then results will be saved in a file (in the
% options.cachePath directory) called '<funcName>-myuid12345.mat' (where <funcName> is the string
% name of 'func'). When using the 'uid' option, it is the responsibility of the user to ensure that
% distinct function calls are given different IDs. When options.uid is not supplied, a UID is
% automatically constructed from 'args'. Args may be numeric, logical, strings, structs, or cell
% arrays. If at any point a filename becomes too long, it will be hashed to something like
% '<funcName>-AF4D2F80.mat', or some other random string of hex characters.
%
%
% The options.cachePath directory will be populated with
% 1. <uid>.mat - contains the results of func()
% 2. <uid>.id.mat - contains the true (long) uid. (only written if uid was hashed, and is used to
%    check for hash collisions)
% 3. <uid>.error - text of the error report from a failed call to func(); rethrown on later calls
%    when options.errorHandling is 'cache'
%
% Copyright (c) 2018, Richard Lange
if nargin < 3, options = struct(); end

%% Configuration and initialization

% Ensure that dependencies are on the path
if exist('string2hash', 'file') ~= 2, addpath('string2hash'); end
if exist('getsemaphore', 'file') ~= 2, addpath('semaphore'); end

% Set up default options.
options = populateDefaultOptions(options);

% Check inputs.
assert(iscell(args), 'loadOrRun(@fun, args): args must be a cell array');
assert(any(options.verbose == [0 1 2]));
assert(any(strcmpi(options.errorHandling, {'cache', 'none'})));
assert(any(strcmpi(options.onDependencyChange, {'ignore', 'warn', 'autoremove'})));

% Create necessary directories if they don't exist yet.
if ~exist(options.cachePath, 'dir')
    if options.verbose
        disp(['Caching directory ' options.cachePath ' does not exist. Creating it now.']);
    end
    mkdir(options.cachePath);
end

if ~exist(options.metaPath, 'dir')
    if options.verbose
        disp(['Metadata directory ' options.metaPath ' does not exist. Creating it now.']);
    end
    mkdir(options.metaPath);
end

% Convert 'recompute' into a timestamp threshold: cache files older than recomputeTime are
% recomputed. A logical true maps to +inf (always recompute), false to -inf (never).
if islogical(options.recompute)
    if options.recompute
        recomputeTime = inf;
    else
        recomputeTime = -inf;
    end
else
    recomputeTime = options.recompute;
end

%% Get information about the true name of 'func', its source file, etc.
funcInfo = functions(func);
funcName = funcInfo.function;
sourceFile = funcInfo.file;
isPackage = contains(funcName, '.');
hasSource = true;

if isPackage
    % Fix odd behavior in Matlab where functions(@package.func) cannot find the source of a file,
    % but which(functions(@package.func).function) can find it.
    sourceFile = which(funcName);
    % Further fix odd behavior where dbstack() from within package functions strips off the name of
    % the package - as far as monitoring dependencies goes, this means that dependencies of
    % packageA.packageFun and packageB.packageFun will be 'merged', which could trigger more
    % warnings and updates than is strictly necessary.
    nameParts = strsplit(funcName, '.');
    if options.verbose
        warning(['Note: package functions have surprising behavior! %s() will be stored as just '...
            '''%s'' when checking for changed dependencies - loadOrRun cannot tell the difference '...
            'between this and a function of the same name in another package!!'], funcName, nameParts{end});
    end
    keyFuncName = nameParts{end};
else
    keyFuncName = funcName;
end

if isempty(sourceFile)
    if ~exist(funcName, 'builtin')
        warning('Source file for %s cannot be inferred (is it an anonymous function??)\n', funcName);
    elseif options.verbose == 2
        fprintf('%s appears to be a built-in function. loadOrRun will not try to check for changes to its source.\n', funcName);
    end
    hasSource = false;
elseif ~exist(sourceFile, 'file')
    warning('Source file for %s is not visible from the current path settings (source: ''%s'')\n', funcName, sourceFile);
    hasSource = false;
end

%% Get UID or create from args
uid = getOrCreateUID(args, options);

% Max name length on unix is 255. Max length is reduced by length(funcName) because '<funcName>-'
% will be prepended. 6 additional characters are subtracted off for the '.error' extension.
MAX_FILENAME_LENGTH = 255 - (length(funcName) + 1) - 6;
[uidFinal, isHashed] = maybeHash(uid, MAX_FILENAME_LENGTH);

% After sorting out the uid and hashing, prepend '<funcName>-' and get filenames.
uidFinal = [funcName '-' uidFinal];
cacheFile = fullfile(options.cachePath, [uidFinal '.mat']);
idFile = fullfile(options.cachePath, [uidFinal '.id.mat']);
errorFile = fullfile(options.cachePath, [uidFinal '.error']);

% Keys passed to getsemaphore() to guard access to each of the three files above.
cacheSem = fullfile(options.metaPath, uidFinal);
idSem = fullfile(options.metaPath, [uidFinal '.id']);
errorSem = fullfile(options.metaPath, [uidFinal '.error']);

if options.verbose == 2
    disp(['Full UID is ''' uid '''']);
    if isHashed
        disp(['UID hashed to ''' uidFinal '''']);
    end
end

%% Update dependencies metadata by searching up the current call stack

% 'dependencies' tracks what 'loadOrRun' functions are called above the current one, so that if the
% current one is changed we can detect from 'higher' ones that they must be recomputed. A file named
% <funcName>-sourceDependencies.mat will contain a cell array of paths to .m files that <funcName>
% depends on.
if hasSource
    % First, add own source file as a dependency to track
    addSourceDependency(keyFuncName, sourceFile, options);

    % Next, search up the stack trace for other calls to 'loadOrRun' to flag this file as a
    % dependency of its parent function(s)
    stack = dbstack();
    for i=2:length(stack)
        if strcmpi(stack(i).name, 'loadorrun')
            % stack(i-1) is the frame that was called from within that loadOrRun invocation.
            callerFuncName = stack(i-1).name;
            addSourceDependency(callerFuncName, sourceFile, options);
        end
    end
end

%% Check modification times and (maybe) remove cache file if dependencies changed
dependencyUpdate = false;
if ~strcmpi(options.onDependencyChange, 'ignore')
    depFile = fullfile(options.metaPath, [keyFuncName '-sourceDependencies.mat']);
    if (exist(cacheFile, 'file') || exist(errorFile, 'file')) && exist(depFile, 'file')
        % Get list of dependencies' source files to compare against the existing cache file (this
        % includes the source file of 'func' itself).
        sem = getsemaphore(depFile);
        contents = load(depFile);
        releasesemaphore(sem);
        dependencies = contents.dependencies;

        if options.verbose == 2
            fprintf('Loaded dependencies from %s:\n', depFile);
            for i=1:length(dependencies)
                fprintf('\t%s -> %s\n', keyFuncName, dependencies{i});
            end
        end

        for i=1:length(dependencies)
            % Run both checks unconditionally BEFORE combining them. Writing them as
            % 'dependencyUpdate || check(...)' would short-circuit and skip every later check as
            % soon as the flag became true, leaving a stale error file unexamined.
            cacheChanged = removeCacheIfSourceChanged(options, cacheFile, dependencies{i});
            % Also remove error files if dependencies changed since the error may now be fixed.
            errorChanged = removeCacheIfSourceChanged(options, errorFile, dependencies{i});
            dependencyUpdate = dependencyUpdate || cacheChanged || errorChanged;
        end
    end
end

%% If last call to func was an error and errorHandling is set to 'cache', rethrow the previous error immediately
if strcmpi(options.errorHandling, 'cache') && exist(errorFile, 'file')
    sem = getsemaphore(errorSem);
    f = fopen(errorFile, 'r');
    errorText = fread(f, inf, 'uint8=>char');
    fclose(f);
    releasesemaphore(sem);
    % fread returns a column; error() expects a row of characters.
    error(errorText(:)');
end

%% Determine whether a call to func is needed
cacheInfo = dir(cacheFile);
% '||' short-circuits, so cacheInfo.datenum is only read when the cache file exists.
doCompute = dependencyUpdate || ~exist(cacheFile, 'file') || (cacheInfo.datenum < recomputeTime);

if doCompute && options.verbose == 2
    if ~exist(cacheFile, 'file')
        fprintf('Reason: no cache file\n');
    elseif cacheInfo.datenum < recomputeTime
        fprintf('Reason: old cache file\n');
    else
        fprintf('Reason: ???\n');
    end
end

% Check for hash collision. Note that cacheFile might be large, so we separately save the full uid
% in the '.id.mat' file, which is very fast to load and verify.
if exist(idFile, 'file')
    sem = getsemaphore(idSem);
    idContents = load(idFile);
    releasesemaphore(sem);

    if ~strcmp(idContents.uid, uid)
        warning('Hash collision!! Original uids:\n\t%s\n\t%s', idContents.uid, uid);
        doCompute = true;
    end
end

%% When 'dryRun' is set, simply return information about filenames and whether computation is needed
if options.dryRun
    info.uid = uid;
    info.uidFinal = uidFinal;
    info.cacheFile = cacheFile;
    info.idFile = idFile;
    info.errorFile = errorFile;
    info.needsCompute = doCompute;
    varargout{1} = info;
    return;
end

%% Call func or load cached results.

% Start by attempting to load. If it fails, we can fall back on recomputing things.
if ~doCompute
    if options.verbose
        fprintf('Loading cached results from %s...\t\n', cacheFile);
    end

    sem = getsemaphore(cacheSem);
    try
        contents = load(cacheFile);
        if options.verbose
            fprintf('done.\n');
        end
        results = contents.results;
    catch err
        if options.verbose == 1
            fprintf('\tLoading failed! Falling back on recomputing.\n');
        elseif options.verbose == 2
            fprintf('\tLoading failed! Message:\n%s\n', getReport(err));
        end
        doCompute = true;
    end
    releasesemaphore(sem);
end

if doCompute
    % Call func(args) and capture as many return values as have been requested by whoever called
    % this function.
    if options.verbose
        fprintf('Computing results for %s-%s with %d outputs\n', funcName, uid, nargout);
    end

    results = cell(1, nargout);
    try
        [results{:}] = func(args{:});
        % The call succeeded, so any error saved by an earlier failed call is obsolete.
        if exist(errorFile, 'file')
            sem = getsemaphore(errorSem);
            delete(errorFile);
            releasesemaphore(sem);
        end
    catch e
        if options.verbose
            fprintf('error!\n');
        end
        % Save text of error to file
        errorText = getReport(e);
        sem = getsemaphore(errorSem);
        f = fopen(errorFile, 'w');
        if f ~= -1
            fwrite(f, errorText);
            fclose(f);
        else
            % fopen signals failure by returning -1. Writing to that identifier would throw a
            % second error from inside this catch block, hiding the original error from func and
            % skipping the semaphore release below.
            warning('Could not open %s to save the error report.', errorFile);
        end
        releasesemaphore(sem);
        rethrow(e);
    end

    if options.verbose
        fprintf('done. Saving to %s...\n', cacheFile);
    end

    % Save results to the file.
    sem = getsemaphore(cacheSem);
    save(cacheFile, 'results', '-v7.3');
    releasesemaphore(sem);

    % The full uid only needs to be stored when the filename no longer contains it.
    if isHashed
        sem = getsemaphore(idSem);
        save(idFile, 'uid', '-v7.3');
        releasesemaphore(sem);
    end
end

% 'results' is a cell array with one entry per requested output.
varargout = results;
end
function [uid, isHashed] = maybeHash( uid, maxLength )
%MAYBEHASH Replace uid with a hex hash string when it exceeds maxLength characters.
%
% [uid, isHashed] = MAYBEHASH(uid, maxLength) returns uid untouched and isHashed = false when
% length(uid) does not exceed maxLength. Otherwise uid is replaced by the result of string2hash(uid)
% printed in upper-case hexadecimal, and isHashed = true. maxLength is 250 when not supplied.
if nargin < 2
    maxLength = 250;
end

% Assume the string fits; only flip the flag when it actually has to be hashed.
isHashed = false;
if length(uid) > maxLength
    isHashed = true;
    uid = sprintf('%X', string2hash(uid));
end
end