Add shuffling (and a bit of housekeeping) (#92)
Introduce per-batch shuffling during training and use part of the training set as the per-epoch validation set. A future PR should thoroughly revise the code to either deprecate --validfile or make it an option to use either --validfile or a portion of the training set.
caledezma authored Jan 18, 2019
1 parent 932293b commit 1c9fb10
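For orientation, here is a minimal sketch of what per-batch shuffling with a training-set validation split looks like in plain Keras. This is not toupee's actual training code: the toy model, the array shapes, and the hyperparameters below are made up purely for illustration.

import numpy
from keras.models import Sequential
from keras.layers import Dense

# Hypothetical toy data standing in for a real training set.
x_train = numpy.random.rand(1000, 20).astype('float32')
y_train = (numpy.random.rand(1000, 1) > 0.5).astype('float32')

model = Sequential([
    Dense(16, activation='relu', input_shape=(20,)),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])

# shuffle=True re-shuffles the training batches every epoch;
# validation_split=0.1 holds out the last 10% of the training arrays
# as the per-epoch validation set (instead of a separate --validfile).
model.fit(x_train, y_train,
          batch_size=32,
          epochs=5,
          shuffle=True,
          validation_split=0.1)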
Showing 5 changed files with 75 additions and 195 deletions.
6 changes: 3 additions & 3 deletions .circleci/config.yml
@@ -18,9 +18,9 @@ jobs:
       # Download and cache dependencies
       - restore_cache:
           keys:
-            - v2-dependencies-{{ checksum "requirements.txt" }}
+            - v3-dependencies-{{ checksum "requirements.txt" }}
             # fallback to using the latest cache if no exact match is found
-            - v2-dependencies-
+            - v3-dependencies-

       - run:
           name: install dependencies
@@ -31,7 +31,7 @@ jobs:
       - save_cache:
           paths:
             - ./venv
-          key: v2-dependencies-{{ checksum "requirements.txt" }}
+          key: v3-dependencies-{{ checksum "requirements.txt" }}


       - run:
17 changes: 9 additions & 8 deletions requirements.txt
@@ -1,9 +1,10 @@
-pytest
-dill
-sklearn
+numpy==1.15.2
+pytest==4.0.1
+dill==0.2.8.2
+sklearn==0.0
 theano
-scikit-image
-pymongo
-keras
-pyyaml
-tensorflow
+scikit-image==0.14.1
+pymongo==3.7.1
+keras==2.2.4
+PyYAML==3.13
+tensorflow==1.11.0
88 changes: 18 additions & 70 deletions toupee/common.py
@@ -15,6 +15,7 @@
import math
import time
import h5py
import warnings

from keras.callbacks import Callback
from keras.utils import Sequence
@@ -41,7 +42,6 @@ def __init__(self, monitor='val_loss', verbose=0, mode='auto'):
super(ModelCheckpointInMemory, self).__init__()
self.monitor = monitor
self.verbose = verbose

self.best_model = h5py.File("/dev/null", driver = 'core',
backing_store = False)
self.best_epoch = None
@@ -84,30 +84,25 @@ def on_epoch_end(self, epoch, logs={}):
if self.verbose > 0:
print('Epoch %05d: %s did not improve' %
(epoch, self.monitor))




#Joao: I tried to prefetch the data from the disk in this generator, but it led
# to multiple complications (especially with resampled data).
# It could decrease train & test time though - worth doing in the future!
class DataGenerator(Sequence):
'''
Data holder generator class for .npz/.h5 data
-- keras Sequence based (for better generator performance)
[requires __len__(self) and __getitem__(self, idx)]
'''

def __init__(self, data_file, batch_size, sampled_indexes, hold_y = True):

#define x
if 'x' in data_file:
xlabel = 'x'
elif 'X' in data_file:
xlabel = 'X'

self.data_x = data_file[xlabel]


#auxiliary variables
self.sampled_indexes = sampled_indexes
@@ -117,8 +112,6 @@ def __init__(self, data_file, batch_size, sampled_indexes, hold_y = True):
self.num_examples = self.data_x.shape[0]
self.batch_size = batch_size
self.number_of_batches = math.ceil(self.num_examples/self.batch_size)


# define y if needed
self.hold_y = hold_y
if hold_y:
@@ -128,97 +121,75 @@ def __init__(self, data_file, batch_size, sampled_indexes, hold_y = True):
assert self.n_classes > 1
self.data_y = data_file['y']


def sequential_batch(self, step):
#sequential iteration over the data

#defines the indexes for this batch
if (step+1) == self.number_of_batches: #<- last batch
batch_indexes = list(range(step*self.batch_size,
self.num_examples))
else:
batch_indexes = list(range(step*self.batch_size,
(step+1)*self.batch_size))


if self.hold_y:
# Return the arrays in the shape that fit_gen uses (data, target)
return (self.data_x[batch_indexes, ...],
self.data_y[batch_indexes, ...])
# else:
# Return the arrays in the shape that predict_generator uses (data)
return (self.data_x[batch_indexes, ...])

def sliced_batch(self, step):
#problem with returning the "sampled_indexes" only:
# H5 can only slice given i) a sequential list of integers or ii) a
# boolean array [i.e. there is no fancy slicing, as in numpy]
# since ii) might need a giant boolean array, let's do i) and then
# filter stuff

#gets the desired indexes for this batch
if (step+1) == self.number_of_batches: #<- last batch
batch_indexes = (self.sampled_indexes[step*self.batch_size :
self.num_examples])
else:
batch_indexes = (self.sampled_indexes[step*self.batch_size :
(step+1)*self.batch_size])

first_index = batch_indexes[0]
last_index = batch_indexes[-1]

#if the samples are too far apart, loads one by one
if last_index - first_index > 4096: #<- magic number

data_x = []
for i in batch_indexes:
data_x.append(self.data_x[i, ...])
data_x = numpy.asarray(data_x)

if self.hold_y:
data_y = []
for i in batch_indexes:
data_y.append(self.data_y[i, ...])
data_y = numpy.asarray(data_y)

#otherwise, loads the interval and then filters
else:
batch_indexes = batch_indexes - first_index

data_x = self.data_x[first_index:last_index+1, ...]
data_x = data_x[batch_indexes, ...]

if self.hold_y:
data_y = self.data_y[first_index:last_index+1, ...]
data_y = data_y[batch_indexes, ...]


if self.hold_y:
return(data_x, data_y)

# else:
return(data_x)



def __len__(self):
#returns the dataset length
return self.number_of_batches



def __getitem__(self, step):
#gets a batch
if self.sampled_indexes is None:
return self.sequential_batch(step)
else:
return self.sliced_batch(step)





class Toupee:

def __init__(self):
self.reset()

@@ -300,19 +271,15 @@ def get_probabilities(classifier, file_object, batch_size):
"""
Predicts the train set using the trained model
"""

x_holder = DataGenerator(file_object, batch_size, None, hold_y = False)

#applies the correct method, depending on the classifier class
if hasattr(classifier, 'predict_generator'):
class_proba = classifier.predict_generator(x_holder,
max_queue_size=1000)
else:
class_proba = classifier.predict_proba(x_holder)

return class_proba


def errors(classifier, file_object, batch_size):
"""
Gets the model's binary error status for each sample
@@ -331,7 +298,6 @@ def errors(classifier, file_object, batch_size):
end = 0
r = numpy.empty(n_samples)
while end < n_samples:

start = end
end += 131072 # magic number, power of 2 :D
if end > n_samples:
@@ -344,65 +310,49 @@ def errors(classifier, file_object, batch_size):
r[start:end] = r[start:end].astype('int32')

return r



def accuracy(classifier, file_object, batch_size):

e = errors(classifier, file_object, batch_size)

return 1.0 - (float(e.sum()) / float(file_object['y'].shape[0]))



#TODO: this is kinda a redefinition of data.py's one_hot
# -> take care of the duplicates!
def one_hot(data, n_classes):
b = numpy.zeros((data.size, n_classes),dtype='float32')
b[numpy.arange(data.size), data] = 1.
return b



def count_classes(file_object):
"""Counts the number of entries on each class"""
n_samples, n_classes = file_object['y'].shape
sample_count = numpy.asarray([0]*n_classes)

end = 0
while end < n_samples:
start = end
end += 131072 # magic number, power of 2 :D
if end > n_samples:
end = n_samples

data_y = numpy.asarray(file_object['y'][start:end])
sample_count += numpy.sum(data_y, axis = 0)

return sample_count



def confidence(classifier, file_object, batch_size):
"""
Returns the model's confidence for the true label
"""

class_proba = get_probabilities(classifier, file_object, batch_size)
n_samples = file_object['y'].shape[0]

end = 0
h = numpy.empty(n_samples)
while end < n_samples:
start = end
end += 131072 # magic number, power of 2 :D
if end > n_samples:
end = n_samples

data_y = numpy.asarray(file_object['y'][start:end]).argmax(axis=-1)

for i in range(end - start):
h[start + i] = class_proba[start + i][data_y[i]]

return h


#----------------------------------------------------------
#for regression problems:
# def distance(predictor, test_set_x, test_set_y):
@@ -417,11 +367,9 @@ def confidence(classifier, file_object, batch_size):
# euclidian_distance_squared = distance(predictor, test_set_x, test_set_y)
# euclidian_distance = numpy.sqrt(euclidian_distance_squared)
# return(numpy.sum(euclidian_distance) / float(test_set_y.shape[0]))

# def relative_distance(predictor, test_set_x, test_set_y):
# relative_distance = distance(y-y_pred) / sqrt(y^2) [sqrt(y^2) = L2 norm]
# euclidian_distance_squared = distance(predictor, test_set_x, test_set_y)
# y_squared = numpy.sum(numpy.square(test_set_y), axis = 1) #both this and the previous line will need a sqrt, which can be done after the division
# relative_distance = numpy.sqrt(euclidian_distance_squared / y_squared)
# return(numpy.sum(relative_distance) / float(test_set_y.shape[0]))
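To make the DataGenerator changes above easier to follow, here is a hedged usage sketch. It shows the sequential mode that get_probabilities relies on, plus one possible way to drive the generator with shuffled, per-batch-sorted indexes; the file name train.h5, the dataset shapes, and the index-permutation logic are assumptions for illustration, not code from this commit.

import h5py
import numpy
from toupee.common import DataGenerator

batch_size = 128

# Hypothetical HDF5 file with inputs 'x' and one-hot targets 'y'.
with h5py.File('train.h5', 'r') as data_file:
    # Sequential pass, e.g. for prediction (this mirrors get_probabilities):
    x_holder = DataGenerator(data_file, batch_size, None, hold_y=False)
    x_batch = x_holder[0]            # __getitem__ -> one batch of inputs

    # Shuffled pass: draw a fresh permutation of sample indexes, then sort the
    # indexes inside each batch so that sliced_batch's slice-then-filter logic
    # sees them in increasing order.
    n = data_file['x'].shape[0]
    shuffled = numpy.random.permutation(n)
    sampled_indexes = numpy.concatenate(
        [numpy.sort(shuffled[i:i + batch_size])
         for i in range(0, n, batch_size)])

    train_gen = DataGenerator(data_file, batch_size, sampled_indexes)
    x_batch, y_batch = train_gen[0]  # one shuffled (data, target) batch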
