hdf5_deeplearn_utils.py
from collections import defaultdict

import h5py
import numpy as np
from scipy.misc import imresize  # NB: removed in SciPy >= 1.3; see the note near resize_int16

def get_real_endpoint(h5f):
    """Return the index just past the last valid (non-zero-padded) timestamp."""
    if h5f['timestamp'][-1] != 0:
        end_len = len(h5f['timestamp'])
    else:
        # Get the last index where the timestamp changes, before the zero padding
        end_len = np.where(abs(np.diff(h5f['timestamp'])) > 0)[0][-1] + 1
    return end_len
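
# Illustrative example (not from the original file): for timestamps
# [0.1, 0.2, 0.3, 0.0, 0.0] the recording really ends at index 3, so
# get_real_endpoint returns 3.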

def chunker(seq, size):
    # Generator of successive slices of length `size` (the last may be shorter)
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

def yield_chunker(seq, size):
    for pos in range(0, len(seq), size):
        yield seq[pos:pos + size]
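
# Usage sketch (illustrative, following this module's commented-example style):
# >>> list(chunker([1, 2, 3, 4, 5], 2))
# [[1, 2], [3, 4], [5]]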

def check_and_fix_timestamps(h5f):
    all_timestamps = np.array(h5f['timestamp']).copy()
    if np.all(np.diff(all_timestamps) >= 0):
        return
    else:
        print('Broken timestamps found! Fixing...')
        curr_idx = 0
        while curr_idx < len(all_timestamps):
            loop_point = np.where(np.diff(all_timestamps) < 0)[0]
            if len(loop_point) == 0:
                # All wraps repaired; keep the originals and write back the fixed values
                orig_timestamps = np.array(h5f['timestamp']).copy()
                if 'orig_timestamp' in h5f:
                    del h5f['orig_timestamp']
                h5f['orig_timestamp'] = orig_timestamps
                if 'timestamp' in h5f:
                    del h5f['timestamp']
                h5f['timestamp'] = all_timestamps
                return
            else:
                # Shift everything after the wrap point up so the sequence stays monotonic
                curr_idx = loop_point[0]
                all_timestamps[curr_idx+1:] += abs(all_timestamps[curr_idx+1] - all_timestamps[curr_idx])
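
# Illustrative sketch: wrapped timestamps like [0., 1., 2., 0., 1.] become the
# monotonic [0., 1., 2., 2., 3.]; the unmodified values are preserved in the
# 'orig_timestamp' dataset.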

def get_start_stop_contigs(all_idxs):
    # Find where the index sequence jumps, i.e. the breaks between contiguous runs
    jumps = np.where(np.diff(all_idxs) > 1)[0]
    starts = [0, ] + list(jumps + 1)
    stops = list(jumps + 1) + [len(all_idxs)]
    return starts, stops
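
# Example: the returned values are positions *within* all_idxs, not dataset
# indices:
# >>> get_start_stop_contigs(np.array([3, 4, 5, 9, 10]))
# ([0, 3], [3, 5])   # runs: all_idxs[0:3] == [3, 4, 5], all_idxs[3:5] == [9, 10]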

def calc_data_mean(h5f, group_key, chunksize=1024*10, axes=0, force=False):
    # Work over contiguous blocks of the training indices
    train_idxs = np.array(h5f['train_idxs'])
    starts, stops = get_start_stop_contigs(train_idxs)
    # Use a nested comprehension over contiguous blocks of chunks of images, summing all.
    # Chunking on ranges (start:stop) is a fair bit faster than chunking on
    # individual indices. starts/stops are positions within train_idxs, so map
    # them back to dataset indices before slicing.
    new_mean = np.sum([np.sum([group.astype('double').sum(axis=axes)
                               for group in chunker(h5f[group_key][train_idxs[start]:train_idxs[stop-1]+1], chunksize)], axis=0)
                       for start, stop in zip(starts, stops)], axis=0)
    new_mean = new_mean.astype('float64') / len(train_idxs)
    # Replace any existing mean dataset
    if force and group_key+'_mean' in h5f:
        del h5f[group_key+'_mean']
    h5f.create_dataset(group_key+'_mean', data=np.array(new_mean))
    return

def calc_data_std(h5f, group_key, chunksize=1024*10, axes=0, force=False):
    train_idxs = np.array(h5f['train_idxs'])
    mean_val = np.array(h5f[group_key+'_mean'])
    starts, stops = get_start_stop_contigs(train_idxs)
    # Sum squared deviations chunk by chunk; subtracting the mean upcasts each
    # chunk to double. As above, starts/stops are positions within train_idxs.
    sum_sq_val = np.sum([np.sum([np.sum((group - mean_val)**2, axis=axes)
                                 for group in chunker(h5f[group_key][train_idxs[start]:train_idxs[stop-1]+1], chunksize)], axis=0)
                         for start, stop in zip(starts, stops)], axis=0)
    std_val = np.sqrt(sum_sq_val / len(train_idxs))
    # Avoid downstream division by zero when normalizing
    std_val[std_val == 0] = 1.
    # Replace any existing std dataset
    if force and group_key+'_std' in h5f:
        del h5f[group_key+'_std']
    h5f.create_dataset(group_key+'_std', data=np.array(std_val))
    return
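
# Hedged usage sketch (the file name is hypothetical; keys follow this module):
# with h5py.File('recording.hdf5', 'r+') as h5f:
#     calc_data_mean(h5f, 'aps_frame', force=True)
#     calc_data_std(h5f, 'aps_frame', force=True)   # requires the mean first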

def build_train_test_split(h5f, train_div=5*60, test_div=1*60, force=False):
    # Alternate train/test blocks of train_div/test_div (timestamp units)
    ep = get_real_endpoint(h5f)
    train_idxs, test_idxs = [], []
    curr_idx, curr_ts = 0, h5f['timestamp'][0]
    while curr_idx < len(h5f['timestamp'][:ep]):
        # Get train indexes
        stop_idx = np.where(h5f['timestamp'][curr_idx:ep] > curr_ts + train_div)[0]
        # Use .size, not np.any(): a first match at position 0 is still a match
        stop_idx = stop_idx[0] if stop_idx.size else ep - curr_idx
        train_idxs.extend(range(curr_idx, curr_idx + stop_idx))
        curr_ts = curr_ts + train_div
        curr_idx = curr_idx + stop_idx
        # Get test indexes
        stop_idx = np.where(h5f['timestamp'][curr_idx:ep] > curr_ts + test_div)[0]
        stop_idx = stop_idx[0] if stop_idx.size else ep - curr_idx
        test_idxs.extend(range(curr_idx, curr_idx + stop_idx))
        curr_ts = curr_ts + test_div
        curr_idx = curr_idx + stop_idx
    # Replace as necessary
    if force and 'train_idxs' in h5f:
        del h5f['train_idxs']
    h5f.create_dataset('train_idxs', data=np.array(train_idxs))
    if force and 'test_idxs' in h5f:
        del h5f['test_idxs']
    h5f.create_dataset('test_idxs', data=np.array(test_idxs))
    return
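
# Hedged usage sketch: 5-minute train blocks alternating with 1-minute test
# blocks (assuming timestamps are in seconds):
# with h5py.File('recording.hdf5', 'r+') as h5f:   # hypothetical file name
#     build_train_test_split(h5f, train_div=5*60, test_div=1*60, force=True)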

class MultiHDF5SeqVisualIterator(object):
    def flow(self, h5fs, dataset_keys, indexes_key, batch_size, seq_length=30, shuffle=True, return_time=False, speed_gt=0):
        # Collect (file, index) pairs for every valid sequence start
        all_data_idxs = []
        dataset_lookup = {h5f: dataset_key for h5f, dataset_key in zip(h5fs, dataset_keys)}
        for h5f in h5fs:
            avail_idxs = set(np.array(h5f[indexes_key]))
            for idx in np.array(h5f[indexes_key]):
                # Make sure the whole sequence is in the same train/test split, and filter on speed
                above_speed = np.all(np.array(h5f['vehicle_speed'][idx:idx+seq_length]) > speed_gt)
                if (idx + seq_length in avail_idxs) and above_speed:
                    all_data_idxs.append((h5f, idx))
        num_examples = len(all_data_idxs)
        num_batches = int(np.ceil(float(num_examples) / batch_size))
        # Shuffle the data
        if shuffle:
            np.random.shuffle(all_data_idxs)
        b = 0
        while b < num_batches:
            curr_idxs = all_data_idxs[b*batch_size:(b+1)*batch_size]
            # Group the batch's indices by file to minimize the number of reads
            todo_dict = defaultdict(list)
            for (h5f, idx) in curr_idxs:
                todo_dict[h5f].append(idx)
            vids, bY, times = [], [], []
            for h5f, idxs in todo_dict.items():
                vids.extend([np.array(h5f[dataset_lookup[h5f]][curr_idx:curr_idx+seq_length]) for curr_idx in idxs])
                bY.extend([h5f['steering_wheel_angle'][curr_idx:curr_idx+seq_length] for curr_idx in idxs])
                times.extend([h5f['timestamp'][curr_idx:curr_idx+seq_length] for curr_idx in idxs])
            # Add a single-dimensional color channel for grayscale, scale to [-0.5, 0.5]
            vids = np.expand_dims(vids, axis=2).astype('float32')/255. - 0.5
            # Get rid of infinity and NaNs, which sometimes appear after normalizing
            vids[np.isnan(vids)] = 0.
            vids[np.isinf(vids)] = 0.
            # Turn into a numpy-compatible vector
            bY = np.array(bY)
            if return_time:
                yield [vids.astype('float32'), bY.astype('float32'), np.array(times).astype('float32')]
            else:
                yield [vids.astype('float32'), bY.astype('float32')]
            b += 1
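
# Hedged usage sketch (file names and dataset keys are assumptions):
# h5fs = [h5py.File(p, 'r') for p in ('run1.hdf5', 'run2.hdf5')]
# it = MultiHDF5SeqVisualIterator()
# for vids, angles in it.flow(h5fs, ['aps_frame', 'aps_frame'], 'train_idxs',
#                             batch_size=32, seq_length=30):
#     pass   # vids: (batch, seq, 1, H, W) float32; angles: (batch, seq)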

class HDF5SeqVisualIterator(object):
    def flow(self, h5f, dataset_key, indexes_key, batch_size, set_length=30, shuffle=True):
        # Load the precomputed normalization statistics (note: std, not mean twice)
        mean_vid, std_vid = np.array(h5f[dataset_key + '_mean']), np.array(h5f[dataset_key + '_std'])
        data_idxs = np.array(h5f[indexes_key])
        data_idx_set = set(data_idxs)
        # Only keep start indices whose whole sequence stays inside this split
        allowed_idxs = np.array([data_idx for data_idx in data_idxs if data_idx + set_length in data_idx_set])
        num_examples = len(allowed_idxs)
        num_batches = int(np.ceil(float(num_examples) / batch_size))
        # Shuffle the data
        if shuffle:
            np.random.shuffle(allowed_idxs)
        b = 0
        while b < num_batches:
            # Batch from allowed_idxs (not data_idxs): that is what was filtered and shuffled
            curr_idxs = list(np.sort(allowed_idxs[b*batch_size:(b+1)*batch_size]))
            vids = [(np.array(h5f[dataset_key][curr_idx:curr_idx+set_length]) - mean_vid) / std_vid for curr_idx in curr_idxs]
            bY = [h5f['steering_wheel_angle'][curr_idx:curr_idx+set_length] for curr_idx in curr_idxs]
            # Add a single-dimensional color channel for grayscale
            vids = np.expand_dims(vids, axis=2)
            vids[np.isnan(vids)] = 0
            vids[np.isinf(vids)] = 0
            bY = np.array(bY)
            yield [vids.astype('float32'), bY.astype('float32')]
            b += 1

class HDF5VisualIterator(object):
    def flow(self, h5f, dataset_key, indexes_key, batch_size, shuffle=True):
        # Load the precomputed normalization statistics (note: std, not mean twice)
        mean_vid, std_vid = np.array(h5f[dataset_key + '_mean']), np.array(h5f[dataset_key + '_std'])
        data_idxs = np.array(h5f[indexes_key])
        num_examples = len(data_idxs)
        num_batches = int(np.ceil(float(num_examples) / batch_size))
        # Shuffle the data
        if shuffle:
            np.random.shuffle(data_idxs)
        b = 0
        while b < num_batches:
            # h5py fancy indexing requires sorted indices
            curr_idxs = list(np.sort(data_idxs[b*batch_size:(b+1)*batch_size]))
            vid = (h5f[dataset_key][curr_idxs] - mean_vid) / std_vid
            bY = h5f['steering_wheel_angle'][curr_idxs]
            # Add a single-dimensional color channel for grayscale
            vid = np.expand_dims(vid, axis=1)
            vid[np.isnan(vid)] = 0.
            vid[np.isinf(vid)] = 0.
            bY = np.expand_dims(bY, axis=1)
            yield [vid.astype('float32'), bY.astype('float32')]
            b += 1
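
# Hedged usage sketch: single-file, single-frame batches, normalized with the
# datasets written by calc_data_mean / calc_data_std above:
# for vid, angle in HDF5VisualIterator().flow(h5f, 'aps_frame', 'test_idxs',
#                                             batch_size=64, shuffle=False):
#     pass   # vid: (batch, 1, H, W) float32; angle: (batch, 1)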

class MultiHDF5VisualIterator(object):
    def flow(self, h5fs, dataset_keys, indexes_key, batch_size, shuffle=True):
        # Collect (file, index) pairs across all files
        all_data_idxs = []
        dataset_lookup = {h5f: dataset_key for h5f, dataset_key in zip(h5fs, dataset_keys)}
        for h5f in h5fs:
            for idx in np.array(h5f[indexes_key]):
                all_data_idxs.append((h5f, idx))
        num_examples = len(all_data_idxs)
        num_batches = int(np.ceil(float(num_examples) / batch_size))
        # Shuffle the data
        if shuffle:
            np.random.shuffle(all_data_idxs)
        b = 0
        while b < num_batches:
            curr_idxs = all_data_idxs[b*batch_size:(b+1)*batch_size]
            todo_dict = defaultdict(list)
            for (h5f, idx) in curr_idxs:
                todo_dict[h5f].append(idx)
            vids = []
            bY = []
            for h5f, idxs in todo_dict.items():
                # h5py fancy indexing requires sorted indices
                vids.extend(h5f[dataset_lookup[h5f]][sorted(idxs)])
                bY.extend(h5f['steering_wheel_angle'][sorted(idxs)])
            # Add a single-dimensional color channel for grayscale, scale to [-0.5, 0.5]
            vids = np.expand_dims(vids, axis=1).astype('float32')/255. - 0.5
            vids[np.isnan(vids)] = 0.
            vids[np.isinf(vids)] = 0.
            bY = np.expand_dims(bY, axis=1)
            yield [vids.astype('float32'), bY.astype('float32')]
            b += 1
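
# Design note: grouping each batch's indices by file via todo_dict means one
# fancy-indexed read per file per batch instead of one read per example.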

def resize_int8(frame, size):
    # Plain resize for 8-bit APS frames
    return imresize(frame, size)

def resize_int16(frame, size=(60, 80), method='bilinear', climit=[-15, 15]):
    # Assumes the data is a small range around the mean, i.e., DVS event sums;
    # clip, shift to the middle of the uint8 range, and resize
    return imresize((np.clip(frame, climit[0], climit[1]).astype('float32') + 127), size, interp=method).astype('uint8')
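
# Note: scipy.misc.imresize was deprecated in SciPy 1.0 and removed in 1.3.
# A minimal drop-in sketch using Pillow (an assumption, not part of the
# original code; unlike scipy's imresize it does not rescale intensities):
# from PIL import Image
# def imresize(frame, size, interp='bilinear'):
#     resample = {'nearest': Image.NEAREST, 'bilinear': Image.BILINEAR,
#                 'bicubic': Image.BICUBIC}[interp]
#     img = Image.fromarray(np.asarray(frame).astype('uint8'))
#     return np.array(img.resize(size[::-1], resample))   # PIL size is (W, H)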

def resize_data_into_new_key(h5f, key, new_key, new_size, chunk_size=1024):
    chunk_generator = yield_chunker(h5f[key], chunk_size)
    # Pick the resize routine for the frame type
    row_idx = 0
    if key == 'aps_frame':
        do_resize = resize_int8
    elif key == 'dvs_frame':
        do_resize = resize_int16
    else:
        raise AssertionError('Unknown data type')
    # Initialize a chunked dataset to hold the output
    resized_shape = (chunk_size,) + new_size
    max_shape = (h5f[key].shape[0],) + resized_shape[1:]
    if new_key in h5f:
        dset = h5f[new_key]
    else:
        dset = h5f.create_dataset(new_key, shape=max_shape, maxshape=max_shape,
                                  chunks=resized_shape, dtype='uint8')
    # Write all data out, one chunk at a time
    for chunk in chunk_generator:
        # Operate on the data
        resized_chunk = np.array([do_resize(frame, new_size) for frame in chunk])
        # Write the next chunk
        dset[row_idx:row_idx+chunk.shape[0]] = resized_chunk
        # Increment the row count
        row_idx += chunk.shape[0]
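
# Hedged usage sketch (the 60x80 target size mirrors resize_int16's default):
# with h5py.File('recording.hdf5', 'r+') as h5f:   # hypothetical file name
#     resize_data_into_new_key(h5f, 'aps_frame', 'aps_frame_60x80', (60, 80))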

# def pad_datastreams(datastreams, rand_pad=10):
#     num_streams = len(datastreams)
#     max_len = np.max([[len(item) for item in datastream] for datastream in datastreams])
#     max_len = max_len + rand_pad
#     returned_streams = []
#     for datastream in datastreams:
#         num_items = len(datastream)
#         # First dim is batch size, second is length (going to be overwritten to max)
#         size_tuple = datastream[0].shape[1:]
#         data_tensor = np.zeros((num_items, max_len) + size_tuple, dtype='float32')
#         mask_tensor = np.zeros((num_items, max_len), dtype='float32')
#         for idx, item in enumerate(datastream):
#             start_offset = np.random.randint(low=0, high=max_len-len(item))
#             data_tensor[idx, start_offset:start_offset+len(item)] = item
#             mask_tensor[idx, start_offset:start_offset+len(item)] = 1.
#         returned_streams.append(data_tensor)
#         returned_streams.append(mask_tensor)
#     return returned_streams