diff --git a/plato/core.py b/plato/core.py index 42c8895..11a6959 100644 --- a/plato/core.py +++ b/plato/core.py @@ -63,6 +63,103 @@ def my_symbolic_function(x, y): Variable.idtype = property(lambda self: (self.ival.dtype if isinstance(self.ival, np.ndarray) else type(self.ival))) + +class IFormat(object): + + @staticmethod + def check(data, f): + """ + Assert that data is in correct format. Otherwise, throw SymbolicFormatError. f is the reference to the function + whose inputs/outputs/updates are being inspected. f is passed in so that it can be used in the error message, + if any. + """ + +class PassAnythingFormat(IFormat): + + @staticmethod + def check(data, f): + pass + + +class AnyReturnFormat(IFormat): + + @staticmethod + def check(data, f): + pass + + +class SingleOutputFormat(IFormat): + + @staticmethod + def check(data, f): + if not _is_tensor(data): + raise SymbolicFormatError('Function %s should have returned a tensor output, but instead returned: %s' % (f, data)) + + +class MultiOutputFormat(IFormat): + + @staticmethod + def check(data, f): + if not _is_tuple_of_tensors(data): + raise SymbolicFormatError('Function %s should have returned a tuple-of-tensors output, but instead returned: %s' % (f, data)) + + +class NoOutputFormat(IFormat): + + @staticmethod + def check(data, f): + assert data is None, "Function %s should have returned no output, but it returned %s. If your intention was to return updates, use add_update instead." % (f, data) + + +class NoUpdatesFormat(IFormat): + + @staticmethod + def check(data, f): + assert isinstance(data, list), "Updates should be in the form of a list. Something is strange if this is not the case. Got {}".format(data) + if len(data)!=0: + raise SymbolicFormatError("Function %s should have created no state updates, but it created updates: %s" % (f, data)) + + +class SomeUpdatesFormat(IFormat): + + @staticmethod + def check(data, f): + assert isinstance(data, list), "Updates should be in the form of a list. Something is strange if this is not the case. Got {}".format(data) + if len(data) == 0: + raise SymbolicFormatError("Function %s should have created state updates, but it failed to update any variables!" % (f, )) + + +class NamedCollectionFormat(IFormat): + + @staticmethod + def check(data, f): + if not _is_named_collection(data): + raise SymbolicFormatError("Data should be a named collection, in a dict format. Right now it looks like this: %s" % (data, )) + + +class CollectionOfCollectionsOfTensorsFormat(IFormat): + + @staticmethod + def check(data, f): + if not _is_tuple_of_tuples_of_tensors(data): + raise SymbolicFormatError("Data should be a collection of collections of tensors. Right now it looks like this: %s" % (data, )) + + +class ConstantFormat(IFormat): + + @staticmethod + def check(data, f): + if not isinstance(data, (float, int, np.ndarray)): + raise SymbolicFormatError("Data should be a constant, numeric data (numpy or python float, etc). Right now it looks like this: %s" % (data, )) + + +class SymbolicFormatError(Exception): + pass + + + + + def symbolic(fcn): """ Use this to decorate a symbolic function with any return format (it will be detected automatically). @@ -203,7 +300,7 @@ def __call__(self, *args, **kwargs): self.output_format.check(symbolic_return, self.fcn) return symbolic_return - def scan(self, **scan_kwargs): + def scan(self, *sequence_args, **scan_kwargs): """ Apply a scan to this function. 
For arguments, see the :param scan_kwargs: See theano.scan doc: http://deeplearning.net/software/theano/library/scan.html#theano.scan @@ -211,26 +308,69 @@ def scan(self, **scan_kwargs): [sequences[0], ... sequences[-1], outputs_info[0], ... outputs_info[-1], non_sequences[0], ... non_sequences[-1]] :return: """ - outputs, updates = theano.scan(self._call_with_updates_returned, **scan_kwargs) - if self._had_to_add_dummies: - # See why this is necessary: https://groups.google.com/forum/#!topic/theano-users/F0-EeC0Lsl8 - # Basically, we need to undo some evil that is done in theano's scan function. See _call_with_updates_returned - outputs = outputs[:-2] + if len(sequence_args)>0: + assert 'sequences' not in scan_kwargs, 'You can specify sequences either as unnamed args or as the "sequences" keyword arg, but not both.' + scan_kwargs = scan_kwargs.copy() + scan_kwargs['sequences'] = sequence_args + + outputs, updates = theano.scan(self._call_with_updates_returned, return_list = True, **scan_kwargs) + + + # + # if self._had_to_add_dummies: + # # See why this is necessary: https://groups.google.com/forum/#!topic/theano-users/F0-EeC0Lsl8 + # # Basically, we need to undo some evil that is done in theano's scan function. See _call_with_updates_returned + # outputs = outputs[:-2] - if len(self._trace_info)>0: + if len(self._trace_info)>0: # Peel off trace variables if any trace_outputs = outputs[-len(self._trace_info):] outputs = outputs[:-len(self._trace_info)] for (trace_name, (_, batch_in_scan, callback)), trace_output in izip_equal(self._trace_info.iteritems(), trace_outputs): CaptureTraceVariables.CURRENT_CATCHER.add_trace(variable=trace_output if batch_in_scan else trace_output[-1], name=trace_name, batch_in_scan=batch_in_scan, callback=callback) - if self._single_output and isinstance(outputs, (list, tuple)): + if self._output_format is None: + outputs = None + elif self._output_format == 'single': assert len(outputs)==1, 'This should always be true, and you should call Peter if it is not. +3163004422 seven' outputs, = outputs + else: + assert self._output_format == 'tuple' + outputs = outputs + + # outputs = \ + # None if self._output_format is None else \ + + + # if self._single_output and isinstance(outputs, (list, tuple)): + # assert len(outputs)==1, 'This should always be true, and you should call Peter if it is not. +3163004422 seven' + # outputs, = outputs for (shared_var, new_val) in updates.items(): add_update(shared_var, new_val) return outputs + def _call_with_updates_returned(self, *args, **kwargs): + with CaptureUpdates(swallow=True) as sc, CaptureTraceVariables(swallow=True) as traces: + outputs = self(*args, **kwargs) + + # self._single_output = isinstance(outputs, Variable) + self._trace_info = traces.get_trace_variable_info() + + # Due to trace variables, we will convert outputs to tuple. We preserve original format here. + self._output_format = None if outputs is None else \ + 'single' if isinstance(outputs, Variable) else \ + 'tuple' + + outputs = \ + () if self._output_format is None else \ + (outputs, ) if self._output_format =='single' else \ + outputs + + if len(traces)>0: + outputs = outputs + tuple(traces.values()) + + return outputs, OrderedDict(sc.get_updates()) + def eval(self, *args, **kwargs): """ Compile and evaluate the function for the given inputs. 
@@ -249,26 +389,8 @@ def __eq__(self, other): return True return False - def _call_with_updates_returned(self, *args, **kwargs): - with CaptureUpdates(swallow=True) as sc, CaptureTraceVariables(swallow=True) as traces: - outputs = self(*args, **kwargs) - - self._single_output = isinstance(outputs, Variable) - self._trace_info = traces.get_trace_variable_info() - - if self._single_output and len(traces)>0: - outputs = (outputs, ) - elif outputs is None: - outputs = (tt.zeros(), ) - - if len(traces)>0: - outputs = outputs + tuple(traces.values()) - - self._had_to_add_dummies = isinstance(outputs, (list, tuple)) and len(outputs)==1 # Necessary evil to force theano.scan to return collection even if length is 1. - if self._had_to_add_dummies: - outputs = outputs + type(outputs)([tt.zeros(()), tt.zeros(())]) - - return outputs, OrderedDict(sc.get_updates()) + # def __hash__(self): + # return hash(self.fcn) def to_format(self, format_decorator): @@ -284,7 +406,7 @@ def partial(self, **fixed_kwargs): """ Partially define the input arguments and return a new symbolic function. """ - fixed_kwargs = {k: (tt.constant(v) if isinstance(v, np.ndarray) else v) for k, v in fixed_kwargs.iteritems()} # This prevents + fixed_kwargs = {k: (tt.constant(v) if isinstance(v, np.ndarray) else v) for k, v in fixed_kwargs.items()} # This prevents return _SymbolicFunctionWrapper(fcn=partial(self.fcn, **fixed_kwargs), input_format = PassAnythingFormat, output_format=self.output_format, update_format=self.update_format, attached_instance=self.attached_instance) @@ -301,7 +423,7 @@ def __get__(self, instance, other): # no reason to create a separate object every time we want to get the method, and (b) debugging - because we # attach the local variables to the method, and want to get them later, so the returned method better have # the same address every time we request it. - if instance in self._dispatched_methods: + if instance in tuple(self._dispatched_methods.keys()): return self._dispatched_methods[instance] else: return _SymbolicFunctionWrapper(self.fcn, input_format=self.input_format, output_format=self.output_format, update_format=self.update_format, attached_instance=instance) @@ -322,15 +444,8 @@ def locals(self): return self._captured_locals -class IFormat(object): - - @staticmethod - def check(data, f): - """ - Assert that data is in correct format. Otherwise, throw SymbolicFormatError. f is the reference to the function - whose inputs/outputs/updates are being inspected. f is passed in so that it can be used in the error message, - if any. - """ +# Need to do this here instead of decorating because _SymbolicFunctionWrapper is not defined yet at decoration-time. 
+_SymbolicFunctionWrapper.scan = symbolic(_SymbolicFunctionWrapper.scan) def _detect_format(data): @@ -376,89 +491,6 @@ def convert_formats(data, src_format, dest_format): raise SymbolicFormatError('No way to convert data from %s to %s' % (src_format, dest_format)) -class PassAnythingFormat(IFormat): - - @staticmethod - def check(data, f): - pass - - -class AnyReturnFormat(IFormat): - - @staticmethod - def check(data, f): - pass - - -class SingleOutputFormat(IFormat): - - @staticmethod - def check(data, f): - if not _is_tensor(data): - raise SymbolicFormatError('Function %s was should have returned a tensor output, but instead returned: %s' % (f, data)) - - -class MultiOutputFormat(IFormat): - - @staticmethod - def check(data, f): - if not _is_tuple_of_tensors(data): - raise SymbolicFormatError('Function %s was should have returned a tuple-of-tensors output, but instead returned: %s' % (f, data)) - - -class NoOutputFormat(IFormat): - - @staticmethod - def check(data, f): - assert data is None, "Function %s should have returned no output, but it returned %s. If your intention was to return updates, use add_update instead." % (f, data) - - -class NoUpdatesFormat(IFormat): - - @staticmethod - def check(data, f): - assert isinstance(data, list), "Updates should be in the form of a list. Something is strange if this is not the case" - if len(data)!=0: - raise SymbolicFormatError("Function %s should have created no state updates, but it created updates: %s" % (f, data)) - - -class SomeUpdatesFormat(IFormat): - - @staticmethod - def check(data, f): - if isinstance(data, list): "Updates should be in the form of a list. Something is strange if this is not the case" - if len(data) == 0: - raise SymbolicFormatError("Function %s should have created state updates, but it failed to update any variables!" % (f, )) - - -class NamedCollectionFormat(IFormat): - - @staticmethod - def check(data, f): - if not _is_named_collection(data): - raise SymbolicFormatError("Data should be a named collection, in a dict format. Right now it looks like this: %s" % (data, )) - - -class CollectionOfCollectionsOfTensorsFormat(IFormat): - - @staticmethod - def check(data, f): - if not _is_tuple_of_tuples_of_tensors(data): - raise SymbolicFormatError("Data should be a collection of collections of tensors. Right now it looks like this: %s" % (data, )) - - -class ConstantFormat(IFormat): - - @staticmethod - def check(data, f): - if not isinstance(data, (float, int, np.ndarray)): - raise SymbolicFormatError("Data should be a constant, numeric data (numpy or python float, etc). 
Right now it looks like this: %s" % (data, )) - - -class SymbolicFormatError(Exception): - pass - - def _is_tensor(arg): return isinstance(arg, (Variable, np.ndarray)) @@ -475,7 +507,7 @@ def _is_tuple_of_tuples_of_tensors(args): def _is_named_collection(arg): if not isinstance(arg, dict): return False - if not all(isinstance(k, (basestring, int)) for k in arg.keys()): + if not all(isinstance(k, (str, int)) for k in arg.keys()): return False if not all(_is_tensor(v) for v in arg.values()): return False @@ -510,7 +542,8 @@ def my_function(x): f will be an AutoCompilingFunction """ - def __init__(self, fcn, cast_to_floatx = 'float', fixed_args = None, add_test_values = False, debug_print_shapes=False, resettable=False, **theano_function_kwargs): + def __init__(self, fcn, cast_to_floatx = 'float', fixed_args = None, add_test_values = False, debug_print_shapes=False, + resettable=False, print_initial_shapes = False, **theano_function_kwargs): """ :param fcn: A symbolic function (decorated with one of the above decorators) :param cast_to_floatx: Case inputs to the global float type (define this in ~/.theanorc). @@ -548,6 +581,8 @@ def __init__(self, fcn, cast_to_floatx = 'float', fixed_args = None, add_test_va self._input_format = None self._output_format = None self.updated_variables = None # Used in reset() + self.print_initial_shapes = print_initial_shapes + self._none_output_indices = None # Indices of outputs of the funcition that are "None"... These are considered special and are passed straingt through # Create convenient debugging functions: showloc() and locinfo() __builtins__['showloc'] = show_all_locals @@ -582,7 +617,7 @@ def __call__(self, *args, **kwargs): # Find tensor versions of inputs based on data in first-call, collect list of inputs self._input_format = NestedType.from_data(input_data) flat_input_data = self._input_format.get_leaves(input_data) - args_and_kwarg_tensors = [_data_to_tensor(d, cast_to_floatx = self._cast_to_floatx, add_test_value = True if self._add_test_values else 'shape') for d in flat_input_data] + args_and_kwarg_tensors = [_data_to_tensor(d, cast_to_floatx = self._cast_to_floatx, add_test_value = True if self._add_test_values else 'shape') if d is not None else None for d in flat_input_data] self._shared_var_inputs = [trace_value for trace_value in args_and_kwarg_tensors if isinstance(trace_value, SharedVariable)] tensor_args, tensor_kwargs = self._input_format.expand_from_leaves(args_and_kwarg_tensors, check_types=False) # Because types will be different @@ -594,8 +629,10 @@ def __call__(self, *args, **kwargs): for cb in cc.get_callbacks(): self._callbacks.append(cb) - if outputs is None: + self.outputs_none = outputs is None + if self.outputs_none: outputs = () + PLATO_LOGGER.info('Done.') updates = sc.get_updates() @@ -620,15 +657,35 @@ def __call__(self, *args, **kwargs): self._local_variable_keys = self._original_fcn.locals().keys() self._n_outputs = len(flat_output_tensors) self._n_trace_vars = len(traces) - flat_output_tensors = flat_output_tensors+traces.values()+self._original_fcn.locals().values() + flat_output_tensors = flat_output_tensors+list(traces.values())+list(self._original_fcn.locals().values()) # Compile the theano function - PLATO_LOGGER.info('Compiling %s with %s inputs, %s outputs, %s updates' % (self._original_fcn.fcn_str(), len(args_and_kwarg_tensors), 1 if isinstance(outputs, Variable) else 0 if outputs is None else len(outputs), len(updates))) - args_and_kwarg_tensors = [a for a in args_and_kwarg_tensors if not isinstance(a, 
SharedVariable)] # Remove shared variables from passed-in tensor args + if self.print_initial_shapes: + PLATO_LOGGER.info('Compiling {func_name} with: \n {n_in} inputs: {in_shapes}\n {n_out} outputs: {out_shapes}\n {n_up} updates ({n_params} parameters): {up_shapes}'.format( + func_name = self._original_fcn.fcn_str(), + n_in = len(args_and_kwarg_tensors), + in_shapes = [f.shape if isinstance(f, np.ndarray) else () for f in flat_input_data], + n_out = 1 if isinstance(outputs, Variable) else 0 if outputs is None else len(outputs), + out_shapes = '???', + n_up = len(updates), + n_params = sum(p.get_value().size for p, u in updates), + up_shapes = [p.get_value().shape for p, u in updates], + )) + else: + PLATO_LOGGER.info('Compiling %s with %s inputs, %s outputs, %s updates' % (self._original_fcn.fcn_str(), len(args_and_kwarg_tensors), 1 if isinstance(outputs, Variable) else 0 if outputs is None else len(outputs), len(updates))) + + args_and_kwarg_tensors = [a for a in args_and_kwarg_tensors if not isinstance(a, SharedVariable) and a is not None] # Remove shared variables from passed-in tensor args if self.resettable: self.updated_variables = [shared_var for shared_var, update in updates] self._original_variable_values = [var.get_value() for var in self.updated_variables] - self._compiled_fcn = theano.function(inputs = args_and_kwarg_tensors, outputs = flat_output_tensors, updates = updates, allow_input_downcast=self._cast_to_floatx, **self.theano_function_kwargs) + + if None in flat_output_tensors: + flat_non_none_output_tensors = list(x for x in flat_output_tensors if x is not None) + self._none_output_indices = [o is None for o in flat_output_tensors] + else: + flat_non_none_output_tensors = flat_output_tensors + self._none_output_indices = None + self._compiled_fcn = theano.function(inputs = args_and_kwarg_tensors, outputs = flat_non_none_output_tensors, updates = updates, allow_input_downcast=self._cast_to_floatx, **self.theano_function_kwargs) PLATO_LOGGER.info('Done.') # Ok, so this code runs every time you call the "compiled" function. @@ -641,7 +698,7 @@ def __call__(self, *args, **kwargs): "The shared variables you passed in, {}, Don't match the shared variables you passed in when you first called this compiled function: {}. " \ "This creates problems for us. Instead, compile your function a second time for the new shared inputs."\ .format(['{}@{}'.format(repr(trace_value), hex(id(trace_value))) for trace_value in shared_passed_in], ['{}@{}'.format(repr(trace_value), hex(id(trace_value))) for trace_value in self._shared_var_inputs]) - arg_and_kwarg_values = [a for a in arg_and_kwarg_values if not isinstance(a, SharedVariable)] # Remove shared variables from passed-in numeric args + arg_and_kwarg_values = [a for a in arg_and_kwarg_values if not isinstance(a, SharedVariable) and a is not None] # Remove shared variables from passed-in numeric args # Now, run the actual numeric function! 
if self._there_are_debug_variables: # Need to take care of stripping off the debug variables @@ -655,6 +712,11 @@ def __call__(self, *args, **kwargs): self._local_values = {k: v for k, v in zip(self._local_variable_keys, local_out)} else: # Normal case flat_output_data = all_out = self._compiled_fcn(*arg_and_kwarg_values) + + if self._none_output_indices is not None: + flat_output_iter = iter(flat_output_data) + flat_output_data = list(next(flat_output_iter) if not isnone else None for isnone in self._none_output_indices) + true_out = self._output_format.expand_from_leaves(flat_output_data, check_types=False) if len(flat_output_data)>0 else () if self._debug_print_shapes: @@ -674,7 +736,7 @@ def __call__(self, *args, **kwargs): for c in self._callbacks: c() - return true_out + return None if self.outputs_none else true_out def reset(self): assert self.resettable, "If you want to reset the state of your compiled function, you must compile with f.compile(resettable=True)" @@ -802,10 +864,10 @@ def flattenit(var, ndim): def show_all_locals(): locals_of_calling_frame = inspect.currentframe().f_back.f_locals - print '=== Locals ===' + print('=== Locals ===') for k, v_info in get_local_info(locals_of_calling_frame).iteritems(): - print '%s = %s' % (k, v_info) - print '--------------' + print('%s = %s' % (k, v_info)) + print('--------------') def get_local_info(locals_of_calling_frame=None): @@ -874,7 +936,7 @@ def find_leaf_ancestors(variable): def printit(var_name, var_val): - print '%s: %s' % (var_name, var_val) + print('%s: %s' % (var_name, var_val)) name_counts = {} @@ -1130,7 +1192,7 @@ def add_update(self, shared_var, new_val, accumulate = None): self._outer_catcher.add_update(shared_var, new_val) def get_updates(self, as_dict = False): - return OrderedDict(self._updates.items()) if as_dict else self._updates.items() + return OrderedDict(self._updates.items()) if as_dict else list(self._updates.items()) StateCatcher = CaptureUpdates # Backwards compatibility @@ -1268,7 +1330,7 @@ def create_shared_variable_from_zeros(shape, name = None, **shared_kwargs): :param shared_kwargs: Other keyword args for shared variable construction :return: A theano shared variable. """ - assert name is None or isinstance(name, basestring) # Mostly checks that you didn't accidentally call like create_shared_variable_from_zeros(3, 4) + assert name is None or isinstance(name, str) # Mostly checks that you didn't accidentally call like create_shared_variable_from_zeros(3, 4) return create_shared_variable(initializer_fcn=np.zeros(shape), name=name, **shared_kwargs) diff --git a/plato/examples/demo_prediction_example.py b/plato/examples/demo_prediction_example.py index 0d10dca..9654bae 100644 --- a/plato/examples/demo_prediction_example.py +++ b/plato/examples/demo_prediction_example.py @@ -22,7 +22,7 @@ def compare_example_predictors( minibatch_size = 10, ): """ - This demo shows how we can compare different online predictors. The demo trains both predictors on the dataset, + This demo shows how we can compare_learning_curves different online predictors. The demo trains both predictors on the dataset, returning an object that contains the results. :param test_mode: Set this to True to just run the demo quicky (but not to completion) to see that it doesn't break. @@ -37,7 +37,7 @@ def compare_example_predictors( n_epochs = 1 n_tests = 3 - # Here we compare three predictors on MNIST - an MLP, a Perceptron, and a Random Forest. 
+ # Here we compare_learning_curves three predictors on MNIST - an MLP, a Perceptron, and a Random Forest. # - The MLP is defined using Plato's interfaces - we create a Symbolic Predictor (GradientBasedPredictor) and # then compile it into an IPredictor object # - The Perceptron directly implements the IPredictor interface. diff --git a/plato/interfaces/helpers.py b/plato/interfaces/helpers.py index 47ec69e..2bb18ec 100644 --- a/plato/interfaces/helpers.py +++ b/plato/interfaces/helpers.py @@ -1,16 +1,16 @@ import numpy as np -from plato.core import symbolic_simple, add_update, create_shared_variable, symbolic -from plato.interfaces.interfaces import IParameterized import theano -from theano.compile.sharedvalue import SharedVariable +import theano.tensor as tt +from theano.gof.graph import Variable from theano.ifelse import ifelse -from theano.sandbox.cuda.rng_curand import CURAND_RandomStreams from theano.sandbox.rng_mrg import MRG_RandomStreams from theano.tensor.shared_randomstreams import RandomStreams -import theano.tensor as tt from theano.tensor.sharedvar import TensorSharedVariable from theano.tensor.var import TensorVariable +from plato.core import symbolic_simple, add_update, create_shared_variable, symbolic +from plato.interfaces.interfaces import IParameterized + __author__ = 'peter' @@ -57,13 +57,17 @@ def get_theano_rng(seed, rngtype = 'mrg'): :return: """ + def load_cuda_rng(): + from theano.sandbox.cuda.rng_curand import CURAND_RandomStreams + return CURAND_RandomStreams + stream_types = { - 'mrg': MRG_RandomStreams_ext, - 'mrg-old': MRG_RandomStreams, - 'default': RandomStreams, - 'cuda': CURAND_RandomStreams + 'mrg': lambda: MRG_RandomStreams_ext, + 'mrg-old': lambda: MRG_RandomStreams, + 'default': lambda: RandomStreams, + 'cuda': load_cuda_rng } - rng_con = stream_types[rngtype] + rng_con = stream_types[rngtype]() if isinstance(seed, np.random.RandomState): return rng_con(seed.randint(1e9)) @@ -71,7 +75,7 @@ def get_theano_rng(seed, rngtype = 'mrg'): return rng_con(seed) elif seed is None: return rng_con(np.random.randint(1e9)) - elif isinstance(seed, tuple(stream_types.values())): + elif isinstance(seed, tuple(v() for v in stream_types.values())): return seed else: raise Exception("Can't initialize a random number generator with %s" % (seed, )) @@ -99,6 +103,7 @@ def identity(x): 'softmax': softmax, 'sigm': tt.nnet.sigmoid, 'sig': tt.nnet.sigmoid, + 'clip': lambda x: tt.clip(x, 0, 1), 'd_sigm': lambda x: tt.nnet.sigmoid(x)-tt.nnet.sigmoid(-x), 'tanh': tt.tanh, 'sech2': lambda x: (4*tt.cosh(x)**2)/(tt.cosh(2*x)+1)**2, @@ -195,7 +200,7 @@ def __call__(self, x): return x - running_mean -def batchify_function(fcn, batch_size): +def batchify_function(fcn, batch_size, **scan_kwargs): """ Given a symbolic function, transform it so that computes its input in a sequence of minibatches, instead of in one go. 
This can be useful when: @@ -214,10 +219,17 @@ def batch_function(*args): start_ixs = tt.arange(0, args[0].shape[0], batch_size) @symbolic - def process_batch(start_ix, end_ix): + def process_batch(start_ix, end_ix, *args): return fcn(*[arg[start_ix:end_ix] for arg in args]) - out = process_batch.scan(sequences = [start_ixs, start_ixs+batch_size]) - return out.reshape((-1, )+tuple(out.shape[i] for i in xrange(2, out.ndim)), ndim=out.ndim-1) + + out = process_batch.scan(sequences = [start_ixs, start_ixs+batch_size], non_sequences = args, **scan_kwargs) + # out = theano.scan(process_batch, sequences = [start_ixs, start_ixs+batch_size]) + if out is None: + return None + elif isinstance(out, Variable): + return out.reshape((-1, )+tuple(out.shape[i] for i in xrange(2, out.ndim)), ndim=out.ndim-1) + else: + return out.__class__(o.reshape((-1, )+tuple(o.shape[i] for i in xrange(2, o.ndim)), ndim=o.ndim-1) for o in out) return batch_function diff --git a/plato/interfaces/interfaces.py b/plato/interfaces/interfaces.py index 0a38516..c1edfb3 100644 --- a/plato/interfaces/interfaces.py +++ b/plato/interfaces/interfaces.py @@ -20,6 +20,14 @@ def set_parameter_states(self, states): p.set_value(s) +def get_parameters(obj): + + if isinstance(obj, IParameterized) or hasattr(obj, 'parameters'): + return obj.parameters + else: + return [] + + class IFreeEnergy(object): __metaclass__ = ABCMeta diff --git a/plato/interfaces/test_helpers.py b/plato/interfaces/test_helpers.py index 95f2a05..f9661ac 100644 --- a/plato/interfaces/test_helpers.py +++ b/plato/interfaces/test_helpers.py @@ -43,6 +43,21 @@ def add_them(a, b): assert np.allclose(out, arr_a+arr_b) +def test_batch_without_return(): + + state = create_shared_variable(np.zeros(2)) + + @symbolic + def do_something_internal(a, b): + new_state = state+ a*b + add_update(state, new_state) + # return new_state + + out = batchify_function(do_something_internal, batch_size=2).compile()(np.arange(6).astype(float), np.arange(1,7).astype(float)) + assert out is None + assert np.array_equal(state.get_value(), [0*1+2*3+4*5, 1*2+3*4+5*6]) + + def test_compute_in_with_state(): @symbolic @@ -105,3 +120,4 @@ def accumulate(x): test_compute_in_with_state() test_on_first_pass() test_reshaping_shared_variable() + test_batch_without_return() diff --git a/plato/test_core.py b/plato/test_core.py index ae9f7c6..d6da9d3 100644 --- a/plato/test_core.py +++ b/plato/test_core.py @@ -1,8 +1,8 @@ from abc import abstractmethod from artemis.general.hashing import compute_fixed_hash, fixed_hash_eq -from plato.interfaces.helpers import create_shared_variable -from plato.tools.common.config import float_precision +from plato.interfaces.helpers import create_shared_variable, shared_like +from plato.tools.common.config import hold_float_precision from pytest import raises from plato.core import symbolic_simple, symbolic_updater, SymbolicFormatError, \ tdb_trace, get_tdb_traces, symbolic, set_enable_omniscence, EnableOmniscence, clear_tdb_traces, add_update, \ @@ -516,7 +516,7 @@ def do_some_ops(x): def test_arbitrary_structures(): - with float_precision(64): + with hold_float_precision(64): @symbolic def my_func(inp): """ @@ -607,6 +607,50 @@ def my_cumsum(x): assert np.array_equal(get_tdb_traces()['x_in_loop_catch_all'], np.arange(4)**3) +def test_easy_scan_syntax(): + + @symbolic + def accumulator(v, shape): + accum = create_shared_variable(np.zeros(shape)) + new_accum = accum + v + add_update(accum, new_accum) + return new_accum + + x = 
np.random.randn(5, 3) + f = accumulator.partial(shape=x.shape[1:]).scan.compile() + + assert np.allclose(f(x), np.cumsum(x, axis=0)) + + +def test_scan_no_return(): + + state = create_shared_variable(np.zeros(())) + + @symbolic + def do_something_internal(a, b): + new_state = state+ a*b + add_update(state, new_state) + + out = do_something_internal.scan.compile()(np.arange(6).astype(float), np.arange(1,7).astype(float)) + + assert out is None + assert np.array_equal(state.get_value(), np.arange(6).dot(np.arange(1, 7))) + + +def test_none_inputs_and_outputs(): + + @symbolic + def double_if_not_none(params): + return [p*2 if p is not None else None for p in params] + + f = double_if_not_none.compile() + assert f([1, 2, 3, None, 4]) == [2, 4, 6, None, 8] + assert f([1, 2, 3, None, 4]) == [2, 4, 6, None, 8] + + with raises(TypeError): # Warns that you're not calling in a consistent way. + f([1, 2, 3, 3, 4]) + + if __name__ == '__main__': test_ival_ishape() test_catch_sneaky_updates() @@ -630,3 +674,6 @@ def my_cumsum(x): test_shared_input() test_function_reset() test_trace_var_in_scan() + test_easy_scan_syntax() + test_scan_no_return() + test_none_inputs_and_outputs() diff --git a/plato/tools/common/basic.py b/plato/tools/common/basic.py index 2ead9e9..c9288ed 100644 --- a/plato/tools/common/basic.py +++ b/plato/tools/common/basic.py @@ -81,7 +81,7 @@ def running_mean_and_variance(data, decay = None, shape = None, elementwise=True var_new = s_new add_update(mean_last, mean_new) add_update(s_last, s_new) - return var_new + return mean_new, var_new @symbolic @@ -93,4 +93,4 @@ def running_variance(data, decay=None, shape = None, elementwise=True, initial_v :param shape: :return: """ - return running_mean_and_variance(data=data, decay=decay, shape=shape, elementwise=elementwise, initial_var = initial_value) + return running_mean_and_variance(data=data, decay=decay, shape=shape, elementwise=elementwise, initial_var = initial_value)[1] diff --git a/plato/tools/common/config.py b/plato/tools/common/config.py index d55c7ef..160ec7d 100644 --- a/plato/tools/common/config.py +++ b/plato/tools/common/config.py @@ -5,7 +5,7 @@ @contextmanager -def float_precision(value): +def hold_float_precision(value): """ Change the theano float precision variable (theano.config.floatX) for all code in a context. Temporarily overrides the value defined in .theanorc. 
@@ -25,3 +25,16 @@ def float_precision(value): theano.config.floatX = value yield theano.config.floatX = old_precision + + +float_precision = hold_float_precision # Back-compatibility + + +@contextmanager +def hold_theano_optimizer(value): + if value is None: + value = 'None' + old_val = theano.config.optimizer + theano.config.optimizer = value + yield + theano.config.optimizer = old_val diff --git a/plato/tools/common/online_predictors.py b/plato/tools/common/online_predictors.py index 4bc6281..56dc8ff 100644 --- a/plato/tools/common/online_predictors.py +++ b/plato/tools/common/online_predictors.py @@ -1,4 +1,6 @@ from abc import ABCMeta, abstractmethod +from contextlib import contextmanager + from plato.interfaces.decorators import symbolic_simple, symbolic_updater from plato.interfaces.interfaces import IParameterized from plato.tools.optimization.cost import get_named_cost_function @@ -80,6 +82,13 @@ def predict(self, inputs): def train(self, inputs, labels): feedforward_module = self._function if isinstance(self._function, FeedForwardModule) else ParametrizedFeedForwardModule(self._function) feedforward_module.train(x=inputs, y=labels, optimizer=self._optimizer, assert_all_params_optimized = self.assert_all_params_optimized, cost_fcn=self._cost_function, regularization_cost=self._regularization_cost) + # if isinstance(self._function, FeedForwardModule): # A bit ugly but oh well + # else: + # outputs = self._function.train_call(inputs) if isinstance(self._function, FeedForwardModule) else self._function(inputs) + # cost = self._cost_function(outputs, labels) + # if self._regularization_cost is not None: + # cost += self._regularization_cost(self._function.parameters) + # self._optimizer.update_parameters(cost = cost, parameters = self._function.parameters) @property def parameters(self): @@ -87,6 +96,25 @@ def parameters(self): return self._function.parameters + opt_params +_LOCAL_LOSSES = None + + +def declare_local_loss(loss): + if _LOCAL_LOSSES is not None: + _LOCAL_LOSSES.append(loss) + + +@contextmanager +def capture_local_losses(): + global _LOCAL_LOSSES + assert _LOCAL_LOSSES is None, "Local loss book already open" + _LOCAL_LOSSES = [] + try: + yield _LOCAL_LOSSES + finally: + _LOCAL_LOSSES = None + + class CompiledSymbolicPredictor(IPredictor, IParameterized): """ A Predictor containing the compiled methods for a SymbolicPredictor. 
@@ -125,7 +153,12 @@ def __call__(self, x): raise NotImplementedError() def train(self, x, y, cost_fcn, optimizer, assert_all_params_optimized=False, regularization_cost = None): - cost = cost_fcn(self.train_call(x), y) + with capture_local_losses() as local_losses: + cost = cost_fcn(self.train_call(x), y) + + if len(local_losses)>0: + cost = cost + sum(local_losses) + if regularization_cost is not None: cost = cost + regularization_cost(self.parameters) if isinstance(optimizer, dict): diff --git a/plato/tools/convnet/conv_specifiers.py b/plato/tools/convnet/conv_specifiers.py index 117741b..519410f 100644 --- a/plato/tools/convnet/conv_specifiers.py +++ b/plato/tools/convnet/conv_specifiers.py @@ -1,6 +1,8 @@ from artemis.fileman.primitive_specifiers import PrimativeSpecifier +from artemis.general.numpy_helpers import get_rng from artemis.general.should_be_builtins import bad_value - +from artemis.ml.tools.neuralnets import initialize_weight_matrix, initialize_conv_kernel +import numpy as np __author__ = 'peter' @@ -46,11 +48,18 @@ def __init__(self, w, b, mode): """ assert w.ndim==4 assert b is False or (b.ndim==1 and w.shape[0] == len(b)), "Number of output maps must match" - assert isinstance(mode, int) or mode in ('same', 'valid', 'full'), 'Mode "%s" not allowed' % (mode, ) + assert isinstance(mode, int) or mode in ('same', 'valid', 'full', 'half'), 'Mode "%s" not allowed' % (mode, ) self.w=w self.b=b self.mode = mode + @staticmethod + def from_init(k_shape, mode, mag='xavier', use_biases=True, rng=None): + n_out_maps, n_in_maps, k_size_y, k_size_x = k_shape + w = initialize_conv_kernel(kernel_shape=k_shape, mag=mag, rng=rng) + b = np.zeros(n_out_maps) if use_biases else False + return ConvolverSpec(w, b, mode) + def shape_transfer(self, (n_samples, n_maps, size_y, size_x)): return (n_samples, self.w.shape[0])+{ 'same': (size_y, size_x), @@ -112,5 +121,28 @@ def shape_transfer(self, input_shape): n_samples, input_dims = input_shape return n_samples, self.w.shape[1] + @classmethod + def from_init(cls, n_in, n_out, mag = 'xavier', rng=None): + w = initialize_weight_matrix(n_in, n_out, mag=mag, rng=rng) + b = np.zeros(n_out) + return FullyConnectedSpec(w=w, b=b) + + +class ConvNetSpec(PrimativeSpecifier): + + def __init__(self, layer_ordered_dict): + self.layer_ordered_dict = layer_ordered_dict + + def shape_transfer(self): + raise NotImplementedError() + + +def compute_feature_shape(input_shape, specs): + + shape = input_shape + for spec in specs: + shape = spec.shape_transfer(shape) + + return shape # class ConvNetSpec \ No newline at end of file diff --git a/plato/tools/convnet/convnet.py b/plato/tools/convnet/convnet.py index b943c87..3a56562 100644 --- a/plato/tools/convnet/convnet.py +++ b/plato/tools/convnet/convnet.py @@ -1,14 +1,19 @@ from collections import OrderedDict +from functools import partial + import numpy as np import theano import theano.tensor as tt +from plato.tools.misc.tdb_plotting import tdbplot +from theano.tensor.nnet import conv3d2d + from artemis.general.numpy_helpers import get_rng from plato.core import symbolic, create_shared_variable from plato.interfaces.helpers import get_named_activation_function, get_theano_rng from plato.interfaces.interfaces import IParameterized from plato.tools.common.online_predictors import FeedForwardModule from plato.tools.convnet.conv_specifiers import ConvInitSpec, ConvolverSpec, PoolerSpec, NonlinearitySpec, DropoutSpec, \ - FullyConnectedSpec + FullyConnectedSpec, ConvNetSpec from theano.tensor.signal.pool import pool_2d 
__author__ = 'peter' import logging @@ -48,6 +53,15 @@ def parameters(self): def to_spec(self): return ConvolverSpec(self.w.get_value(), self.b.get_value() if self.b is not False else False, self.border_mode) + @classmethod + def from_spec(cls, spec): + return ConvLayer( + w=spec.w, + b=spec.b, + border_mode= {'full': 0, 'same': 'half', 'valid': 0}[spec.mode] if spec.mode in ('full', 'same', 'valid') else spec.mode, + filter_flip=False + ) + @symbolic class Nonlinearity(FeedForwardModule): @@ -66,6 +80,49 @@ def to_spec(self): assert isinstance(self._activation_name, basestring), "Can't identify activation fcn" return NonlinearitySpec(self._activation_name) + @classmethod + def from_spec(cls, spec): + return Nonlinearity(spec.func) + + +@symbolic +class ChannelwiseCrossCorr(object): + + def __init__(self, border_mode='full', meansub=True, norm = False, subsample=(1, 1), eps=1e-7, flatten_channels=False): + self.border_mode = border_mode + self.subsample = subsample + self.meansub = meansub + self.norm = norm + self.eps = eps + self.flatten_channels = flatten_channels + + def __call__(self, (x1, x2)): + """ + :param (x1, x2): are each (n_samples, n_channels, size_y, size_x) images + :return: A (n_samples, n_channels, size_y*2-1, size_x*2-1) image representing the channelwise cross-correlation between + each pair of images. OR + (n_samples, size_y*2-1, size_x*2-1) if flatten=True + """ + from theano.tensor.signal.conv import conv2d as sconv2d + if self.meansub: + x1 = x1 - x1.mean(axis=(1, 2, 3), keepdims=True) + x2 = x2 - x2.mean(axis=(1, 2, 3), keepdims=True) + + + + x1_flat = x1.reshape((x1.shape[0]*x1.shape[1], x1.shape[2], x1.shape[3])) + x2_flat = x2.reshape((x2.shape[0]*x2.shape[1], x2.shape[2], x2.shape[3]))[:, ::-1, ::-1] + map_flat, _ = theano.scan(partial(sconv2d, border_mode=self.border_mode, subsample=self.subsample), sequences=[x1_flat, x2_flat]) + conv_maps = map_flat.reshape((x1.shape[0], x1.shape[1], map_flat.shape[1], map_flat.shape[2])) + + if self.norm: + conv_maps = conv_maps / tt.sqrt((conv_maps**2).mean(axis=(1, 2, 3), keepdims=True) + self.eps) + # tdbplot(conv_maps[0, :4, :, :], 'corrmaps') + + if self.flatten_channels: + conv_maps = conv_maps.mean(axis=1) + + return conv_maps @symbolic class Pooler(FeedForwardModule): @@ -92,11 +149,15 @@ def __call__(self, x): :param x: An (n_samples, n_maps, size_y, size_x) tensor :return: An (n_sample, n_maps, size_y/ds[0], size_x/ds[1]) tensor """ - return pool_2d(x, ds = self.region, st = self.stride, mode = self.mode, ignore_border=True) + # return pool_2d(x, ds = self.region, st = self.stride, mode = self.mode, ignore_border=True) + return pool_2d(x, ws = self.region, stride = self.stride, mode = self.mode, ignore_border=True) def to_spec(self): return PoolerSpec(region = self.region, stride=self.stride, mode=self.mode) + @classmethod + def from_spec(cls, spec): + return Pooler(region=spec.region, stride=spec.stride, mode=spec.mode) @symbolic class DropoutLayer(FeedForwardModule): @@ -125,6 +186,9 @@ def test_call(self, x): def to_spec(self): return DropoutSpec(self.dropout_rate) + @classmethod + def from_spec(cls, spec): + return cls(spec.dropout_rate, rng=rng) @symbolic class FullyConnectedLayer(FeedForwardModule): @@ -150,6 +214,10 @@ def parameters(self): def to_spec(self): return FullyConnectedSpec(w=self.w.get_value(), b=self.b.get_value() if self.b is not False else False) + @classmethod + def from_spec(cls, spec): + return FullyConnectedLayer(w=spec.w, b=spec.b) + @symbolic class ConvNet(IParameterized): @@ -194,7 
+262,7 @@ def get_named_layer_activations(self, x, include_input = False, test_call=False) return named_activations @staticmethod - def from_init(specifiers, input_shape, w_init=0.01, force_shared_parameters = True, rng=None): + def from_init(specifiers, input_shape=None, w_init=0.01, force_shared_parameters = True, rng=None): """ Convenient initialization function. :param specifiers: List/OrderedDict of layer speciefier objects (see conv_specifiers.py) @@ -205,12 +273,13 @@ def from_init(specifiers, input_shape, w_init=0.01, force_shared_parameters = Tr :return: A ConvNet """ rng = get_rng(rng) - n_maps, n_rows, n_cols = input_shape + n_maps, n_rows, n_cols = input_shape if input_shape is not None else (None, None, None) layers = OrderedDict() if isinstance(specifiers, (list, tuple)): specifiers = OrderedDict((str(i), val) for i, val in enumerate(specifiers)) for spec_name, spec in specifiers.iteritems(): if isinstance(spec, ConvInitSpec): + assert n_maps is not None spec = ConvolverSpec( w=w_init*rng.randn(spec.n_maps, n_maps, spec.filter_size[0], spec.filter_size[1]), b=np.zeros(spec.n_maps) if spec.use_bias else False, @@ -218,17 +287,7 @@ def from_init(specifiers, input_shape, w_init=0.01, force_shared_parameters = Tr ) if isinstance(spec, ConvolverSpec): n_maps = spec.w.shape[0] - if spec.mode == 'valid': - n_rows += -spec.w.shape[2] + 1 - n_cols += -spec.w.shape[3] + 1 - elif isinstance(spec.mode, int): - n_rows += -spec.w.shape[2] + 1 + spec.mode*2 - n_cols += -spec.w.shape[3] + 1 + spec.mode*2 - elif isinstance(spec, PoolerSpec): - n_rows /= spec.region[0] - n_cols /= spec.region[1] layers[spec_name] = specifier_to_layer(spec, force_shared_parameters=force_shared_parameters, rng=rng) - # LOGGER.info('Layer "%s" (%s) output shape: %s' % (spec_name, spec.__class__.__name__, (n_maps, n_rows, n_cols))) return ConvNet(layers) @property @@ -236,16 +295,25 @@ def parameters(self): return sum([l.parameters if isinstance(l, IParameterized) else [] for l in self.layers.values()], []) def to_spec(self): + # return ConvNetSpec(OrderedDict((layer_name, lay.to_spec()) for layer_name, lay in self.layers.iteritems())) return OrderedDict((layer_name, lay.to_spec()) for layer_name, lay in self.layers.iteritems()) + @classmethod + def from_spec(cls, spec): + if isinstance(spec, (list, tuple, OrderedDict)): # "old" format + return ConvNet.from_init(spec) + else: + return ConvNet.from_init(spec.layer_ordered_dict) + def specifier_to_layer(spec, force_shared_parameters=True, rng = None): + # TODO: Remove, replace with from_spec return { ConvolverSpec: lambda: ConvLayer( w=spec.w, b=spec.b, force_shared_parameters=force_shared_parameters, - border_mode= {'full': 0, 'same': 1, 'valid': 0}[spec.mode] if spec.mode in ('full', 'same', 'valid') else spec.mode, + border_mode= {'full': 0, 'same': 'half', 'valid': 0}[spec.mode] if spec.mode in ('full', 'same', 'valid') else spec.mode, filter_flip=False ), NonlinearitySpec: lambda: Nonlinearity(spec.func), @@ -272,3 +340,16 @@ def normalize_convnet(convnet, inputs): cum_scale = this_std / cum_scale convnet.layers[name].w.set_value(convnet.layers[name].w.get_value()/cum_scale) convnet.layers[name].b.set_value(convnet.layers[name].b.get_value()/this_std) + + +def spec_to_object(spec): # Temporary measure... will do this more clanly later. 
+ cls = { + ConvNetSpec: ConvNet, + NonlinearitySpec: Nonlinearity, + PoolerSpec: Pooler, + FullyConnectedSpec: FullyConnectedLayer, + ConvolverSpec: ConvLayer, + DropoutSpec: DropoutLayer + }[spec.__class__] + + return cls.from_spec(spec) diff --git a/plato/tools/convnet/test_convnet.py b/plato/tools/convnet/test_convnet.py index ab97f1b..8e07f2f 100644 --- a/plato/tools/convnet/test_convnet.py +++ b/plato/tools/convnet/test_convnet.py @@ -1,17 +1,19 @@ -from collections import OrderedDict import pickle +from collections import OrderedDict import numpy as np +from artemis.plotting.db_plotting import dbplot +from artemis.general.mymath import argmaxnd +from artemis.ml.datasets.cifar import get_cifar_10_dataset +from artemis.ml.predictors.train_and_test import percent_argmax_correct from plato.tools.common.online_predictors import GradientBasedPredictor from plato.tools.common.training import assess_online_symbolic_predictor from plato.tools.convnet.conv_specifiers import ConvInitSpec, NonlinearitySpec, PoolerSpec -from plato.tools.convnet.convnet import ConvNet, ConvLayer, Pooler, normalize_convnet, Nonlinearity +from plato.tools.convnet.convnet import ConvNet, ConvLayer, Pooler, normalize_convnet, Nonlinearity, \ + ChannelwiseCrossCorr from plato.tools.optimization.cost import negative_log_likelihood_dangerous from plato.tools.optimization.optimizers import AdaMax -from artemis.ml.predictors.train_and_test import percent_argmax_correct -from artemis.ml.datasets.cifar import get_cifar_10_dataset - __author__ = 'peter' @@ -92,6 +94,32 @@ def test_normalize_convnet(): assert 0.9999 < act[layer_name].std() < 1.0001 +def test_cross_conv_layer(): + + # Part 1: Same size + x_shift, y_shift = 3, -5 + rng = np.random.RandomState(1234) + full_x = rng.randn(1, 10, 40, 40) + x1 = full_x[:, :, 10:30, 10:30] + x2 = full_x[:, :, 10+y_shift:30+y_shift, 10+x_shift:30+x_shift] + func = ChannelwiseCrossCorr().compile() + y = func((x1, x2)) + assert y.shape==(1, 10, 39, 39) + dbplot(y) + ixs = np.array([argmaxnd(y[0, i, :, :]) for i in xrange(10)]) + assert np.all(ixs-39//2 == (y_shift, x_shift)) + + # Part 2: Different sizes + x3 = full_x[:, :, 15+y_shift:25+y_shift, 15+x_shift:25+x_shift] # Same center as before, just smaller + y = func((x1, x3)) + assert y.shape==(1, 10, 20+10-1, 20+10-1) + ixs = np.array([argmaxnd(y[0, i, :, :]) for i in xrange(10)]) + assert np.all(ixs-(20+10-1)//2 == (y_shift, x_shift)) + + + + if __name__ == '__main__': - test_convnet_serialization() - test_normalize_convnet() + # test_convnet_serialization() + # test_normalize_convnet() + test_cross_conv_layer() diff --git a/plato/tools/fa/__init__.py b/plato/tools/fa/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/plato/tools/fa/demo_feedback_alignment.py b/plato/tools/fa/demo_feedback_alignment.py new file mode 100644 index 0000000..f39d3f2 --- /dev/null +++ b/plato/tools/fa/demo_feedback_alignment.py @@ -0,0 +1,90 @@ +from artemis.experiments.experiment_record import experiment_root, capture_created_experiments, ExperimentFunction +from artemis.experiments.ui import browse_experiments +from artemis.general.numpy_helpers import get_rng +from artemis.ml.datasets.mnist import get_mnist_dataset +from artemis.ml.predictors.train_and_test import train_and_test_online_predictor +from plato.tools.common.online_predictors import GradientBasedPredictor +from plato.tools.fa.direct_feedback_alignment import create_direct_feedback_alignment_net +from plato.tools.fa.feedback_alignment import create_feedback_alignment_net 
+from plato.tools.mlp.mlp import MultiLayerPerceptron +from plato.tools.optimization.optimizers import GradientDescent + + +def create_network(version, layer_sizes, optimizer, nonlinearity, final_nonlinearity, backwards_nonlinearity, loss, + w_init = 'xavier', rng=None): + + if version == 'fa': + return create_feedback_alignment_net(layer_sizes=layer_sizes, optimizer=optimizer, backwards_nonlinearity=backwards_nonlinearity, + nonlinearity=nonlinearity, final_nonlinearity=final_nonlinearity, loss=loss, w_init=w_init, rng=rng) + elif version == 'dfa': + return create_direct_feedback_alignment_net(layer_sizes=layer_sizes, optimizer=optimizer, backwards_nonlinearity=backwards_nonlinearity, + nonlinearity=nonlinearity, final_nonlinearity=final_nonlinearity, loss=loss, w_init=w_init, rng=rng) + elif version == 'mlp': + return GradientBasedPredictor( + function=MultiLayerPerceptron.from_init( + layer_sizes=layer_sizes, + hidden_activations = nonlinearity, + output_activation = final_nonlinearity, + w_init=w_init, + rng=rng, + ), + + optimizer=optimizer, + cost_function=loss, + ) + else: + raise Exception('No network version "{}"'.format(version)) + + +@ExperimentFunction(is_root=True, one_liner_results=lambda scores: scores.get_oneliner()) +def demo_feedback_alignment_mnist( + version = 'fa', + hidden_sizes = [100], + nonlinearity = 'relu', + final_nonlinearity = 'linear', + loss = 'logistic-xe', + backwards_nonlinearity = 'deriv', + n_epochs=10, + minibatch_size = 10, + learning_rate = 0.01, + seed = 1234, + ): + + assert version in ('fa', 'dfa', 'mlp') + rng = get_rng(seed) + mnist = get_mnist_dataset(flat=True).to_onehot() + + nnet = create_network( + version=version, + layer_sizes=[mnist.input_size]+hidden_sizes+[10], + optimizer=GradientDescent(learning_rate), + backwards_nonlinearity=backwards_nonlinearity, nonlinearity=nonlinearity, final_nonlinearity=final_nonlinearity, loss=loss, rng=rng + ) + + training_info = train_and_test_online_predictor( + dataset = mnist, + train_fcn=nnet.train.compile(add_test_values = True), + predict_fcn=nnet.predict.compile(add_test_values = True), + minibatch_size=minibatch_size, + n_epochs=n_epochs, + test_epochs=('every', 0.5), + ) + + return training_info + + +with capture_created_experiments() as exs: + demo_feedback_alignment_mnist.add_variant(version='fa') + demo_feedback_alignment_mnist.add_variant(version='dfa') + demo_feedback_alignment_mnist.add_variant(version='mlp') + +for e in exs: + e.add_variant(hidden_sizes=[200, 200, 200], n_epochs = 50) +for e in exs: + e.add_variant(hidden_sizes=[500, 500, 500, 500, 500], n_epochs = 50) + + +if __name__ == '__main__': + browse_experiments() + # demo_feedback_alignment_mnist(version = 'dfa', hidden_sizes=[200, 200, 200]) + diff --git a/plato/tools/fa/direct_feedback_alignment.py b/plato/tools/fa/direct_feedback_alignment.py new file mode 100644 index 0000000..fbdfbd3 --- /dev/null +++ b/plato/tools/fa/direct_feedback_alignment.py @@ -0,0 +1,80 @@ +import numpy as np +from artemis.general.numpy_helpers import get_rng +from artemis.general.should_be_builtins import izip_equal +from artemis.ml.tools.neuralnets import initialize_weight_matrix +from plato.core import create_shared_variable +from plato.interfaces.helpers import get_named_activation_function, get_named_activation_function_derivative +from plato.tools.mlp.manual_backprop_net import IManualBackpropLayer, ManualBackpropNet +import theano.tensor as tt + +class DirectFeedbackAlignmentLayer(IManualBackpropLayer): + """ + + + + """ + + def __init__(self, 
w, w_back, nonlinearity, b=None, backwards_nonlinearity = 'deriv'): + self.n_in, self.n_out = w.shape + + self.w = create_shared_variable(w) + self.b = create_shared_variable(np.zeros(w.shape[1]) if b is None else b) + if w_back is None: + self.w_back = None + else: + assert w_back.shape[1] == self.n_out + self.w_back = create_shared_variable(w_back) + + self.nonlinearity = get_named_activation_function(nonlinearity) if isinstance(nonlinearity, str) else nonlinearity + self.backwards_nonlinearity = \ + get_named_activation_function_derivative(nonlinearity) if backwards_nonlinearity=='deriv' else \ + get_named_activation_function(backwards_nonlinearity) if isinstance(backwards_nonlinearity, basestring) else \ + backwards_nonlinearity + + @property + def parameters(self): + return [self.w, self.b] + + def forward_pass_and_state(self, x): + pre_sig = x.dot(self.w) + return self.nonlinearity(pre_sig), (x, pre_sig, ) + + def backward_pass(self, state, grad, cost): + # assert cost is None, 'You need to initialize the outer network with pass_loss = False' + x, pre_sig = state + if cost is not None: # Just top layer + assert self.w_back is None + grad = this_grad = tt.grad(cost, wrt=pre_sig) + else: # Other layers + this_grad = grad.dot(self.w_back) + grad_presig = this_grad * self.backwards_nonlinearity(pre_sig) + return grad, [x.T.dot(grad_presig), grad_presig.mean(axis=0)] + + @classmethod + def from_init(cls, n_in, n_out, n_final, w_init='xavier', rng=None, **kwargs): + rng = get_rng(rng) + w = initialize_weight_matrix(n_in=n_in, n_out=n_out, mag=w_init, rng=rng) + w_back = None if n_final is None else initialize_weight_matrix(n_in=n_final, n_out=n_out, mag=w_init, rng=rng) + return cls(w=w, w_back=w_back, **kwargs) + + +def create_direct_feedback_alignment_net(layer_sizes, nonlinearity, final_nonlinearity, optimizer, loss, + backwards_nonlinearity='deriv', w_init = 'xavier', rng = None): + + rng = get_rng(rng) + return ManualBackpropNet( + layers = [ + DirectFeedbackAlignmentLayer.from_init( + n_in=n_in, + n_out=n_out, + n_final=layer_sizes[-1] if i < len(layer_sizes)-2 else None, + nonlinearity=nonlinearity if i < len(layer_sizes)-2 else final_nonlinearity, + backwards_nonlinearity = backwards_nonlinearity, + w_init = w_init, + rng=rng + ) for i, (n_in, n_out) in enumerate(izip_equal(layer_sizes[:-1], layer_sizes[1:])) + ], + optimizer=optimizer, + pass_loss=True, + loss=loss + ) diff --git a/plato/tools/fa/feedback_alignment.py b/plato/tools/fa/feedback_alignment.py new file mode 100644 index 0000000..1d171f0 --- /dev/null +++ b/plato/tools/fa/feedback_alignment.py @@ -0,0 +1,71 @@ +import numpy as np +from artemis.general.numpy_helpers import get_rng +from artemis.general.should_be_builtins import izip_equal +from artemis.ml.tools.neuralnets import initialize_weight_matrix +from plato.core import create_shared_variable +from plato.interfaces.helpers import get_named_activation_function, get_named_activation_function_derivative +from plato.tools.mlp.manual_backprop_net import IManualBackpropLayer, ManualBackpropNet +from theano import tensor as tt + + +class FeedbackAlignmentLayer(IManualBackpropLayer): + + def __init__(self, w, w_back, nonlinearity, b=None, backwards_nonlinearity = 'deriv'): + self.n_in, self.n_out = w.shape + + assert w_back.shape == (self.n_out, self.n_in) + self.w = create_shared_variable(w) + self.b = create_shared_variable(np.zeros(w.shape[1]) if b is None else b) + self.w_back = create_shared_variable(w_back) + + self.nonlinearity = 
get_named_activation_function(nonlinearity) if isinstance(nonlinearity, str) else nonlinearity + self.backwards_nonlinearity = \ + get_named_activation_function_derivative(nonlinearity) if backwards_nonlinearity=='deriv' else \ + get_named_activation_function(backwards_nonlinearity) if isinstance(backwards_nonlinearity, basestring) else \ + backwards_nonlinearity + + @property + def parameters(self): + return [self.w, self.b] + + def forward_pass_and_state(self, x): + pre_sig = x.dot(self.w) + out = self.nonlinearity(pre_sig) + return out, (x, pre_sig, out) + + def backward_pass(self, state, grad, cost): + x, pre_sig, out = state + if grad is None: + grad_presig = tt.grad(cost, wrt = pre_sig) + else: + # return self.backward_pass(state=state, grad=grad, cost=None) + assert cost is None and grad is not None + grad_presig = grad * self.backwards_nonlinearity(pre_sig) + return grad_presig.dot(self.w_back), [x.T.dot(grad_presig), grad_presig.mean(axis=0)] + + @classmethod + def from_init(cls, n_in, n_out, w_init='xavier', rng=None, **kwargs): + rng = get_rng(rng) + w = initialize_weight_matrix(n_in=n_in, n_out=n_out, mag=w_init, rng=rng) + w_back = initialize_weight_matrix(n_in=n_out, n_out=n_in, mag=w_init, rng=rng) + return cls(w=w, w_back=w_back, **kwargs) + + +def create_feedback_alignment_net(layer_sizes, nonlinearity, final_nonlinearity, optimizer, loss, backwards_nonlinearity='deriv', + w_init = 'xavier', rng = None): + + rng = get_rng(rng) + return ManualBackpropNet( + layers = [ + FeedbackAlignmentLayer.from_init( + n_in=n_in, + n_out=n_out, + nonlinearity=nonlinearity if i < len(layer_sizes)-2 else final_nonlinearity, + backwards_nonlinearity = backwards_nonlinearity, + w_init = w_init, + rng=rng + ) for i, (n_in, n_out) in enumerate(izip_equal(layer_sizes[:-1], layer_sizes[1:])) + ], + optimizer=optimizer, + loss=loss + ) \ No newline at end of file diff --git a/plato/tools/lstm/long_short_term_memory.py b/plato/tools/lstm/long_short_term_memory.py index dfe64d7..7a2bd38 100644 --- a/plato/tools/lstm/long_short_term_memory.py +++ b/plato/tools/lstm/long_short_term_memory.py @@ -1,6 +1,8 @@ -from plato.core import add_update, symbolic_multi, symbolic_simple, create_shared_variable +from artemis.general.numpy_helpers import get_rng +from artemis.ml.tools.neuralnets import initialize_weight_matrix +from plato.core import add_update, symbolic_multi, symbolic_simple from plato.interfaces.decorators import symbolic_updater -from plato.interfaces.helpers import create_shared_variable, get_theano_rng, get_named_activation_function, softmax +from plato.interfaces.helpers import create_shared_variable, get_theano_rng, get_named_activation_function from plato.tools.optimization.cost import mean_xe from plato.tools.optimization.optimizers import AdaMax import theano @@ -98,7 +100,7 @@ def parameters(self): self.w_ho, self.w_co, self.b_i, self.b_f, self.b_c, self.b_o] @classmethod - def from_initializer(cls, n_input, n_hidden, initializer_fcn, hidden_layer_type='tanh'): + def from_initializer(cls, n_input, n_hidden, initializer_fcn = 'xavier', hidden_layer_type='tanh', rng=None): """ :param n_input: Number of inputs :param n_hidden: Number of hiddens @@ -106,6 +108,11 @@ def from_initializer(cls, n_input, n_hidden, initializer_fcn, hidden_layer_type= :param initializer_fcn: Function taking a shape and returning parameters. 
:return: An LSTMLayer """ + if isinstance(initializer_fcn, basestring): + rng = get_rng(rng) + initializer = initializer_fcn + initializer_fcn = lambda (n_in, n_out): initialize_weight_matrix(n_in=n_in, n_out=n_out, mag=initializer, rng=rng) + return LSTMLayer( w_xi = create_shared_variable(initializer_fcn, shape = (n_input, n_hidden)), w_xf = create_shared_variable(initializer_fcn, shape = (n_input, n_hidden)), @@ -128,10 +135,15 @@ class AutoencodingLSTM(object): """ An LSTM that learns to predict the next element in a sequence. """ - def __init__(self, n_input, n_hidden, initializer_fcn, input_layer_type = 'softmax', hidden_layer_type = 'tanh'): + def __init__(self, n_input, n_hidden, initializer_fcn='xavier', input_layer_type = 'softmax', hidden_layer_type = 'tanh', rng=None): + + if isinstance(initializer_fcn, basestring): + rng = get_rng(rng) + initializer = initializer_fcn + initializer_fcn = lambda (n_in, n_out): initialize_weight_matrix(n_in=n_in, n_out=n_out, mag=initializer, rng=rng) self.lstm = LSTMLayer.from_initializer(n_input=n_input, n_hidden=n_hidden, initializer_fcn=initializer_fcn, - hidden_layer_type = hidden_layer_type) + hidden_layer_type = hidden_layer_type, rng=rng) self.w_hz = create_shared_variable(initializer_fcn, (n_hidden, n_input)) self.b_z = create_shared_variable(0, n_input) @@ -159,7 +171,7 @@ def get_generation_function(self, maintain_state = True, stochastic = True, rng """ Return a symbolic function that generates a sequence (and updates its internal state). :param stochastic: True to sample a onehot-vector from the output. False to simply reinsert the - distribution vector. + distribution vector (only makes sense on categorical variables, not regression). :param rng: A seed, numpy or theano random number generator :return: A symbolic function of the form: (outputs, updates) = generate(primer, n_steps) @@ -190,7 +202,11 @@ def do_step(i, x_, h_, c_): c_: A memory cell vector """ y_prob, h, c = self.step(x_, h_, c_) - y_candidate = ifelse(int(stochastic), rng.multinomial(n=1, pvals=y_prob[None, :])[0].astype(theano.config.floatX), y_prob) + if stochastic: + y_candidate = rng.multinomial(n=1, pvals=y_prob[None, :])[0].astype(theano.config.floatX) + else: + y_candidate = y_prob + # y_candidate = ifelse(int(stochastic), rng.multinomial(n=1, pvals=y_prob[None, :])[0].astype(theano.config.floatX), y_prob) # y_candidate = ifelse(int(stochastic), rng.multinomial(n=1, pvals=y_prob.dimshuffle('x', 1))[0].astype(theano.config.floatX), y_prob) y = ifelse(i < n_primer_steps, primer[i], y_candidate) # Note: If you get error here, you just need to prime with something on first call. return y, h, c @@ -235,8 +251,8 @@ def parameters(self): def mysoftmax(x): # A little kludge we have to do because the build-in softmax is awkwardly restricted to being along the first # axis. 
- if x.indim==1: + if x.ndim==1: newx = x.dimshuffle('x', 0) return tt.nnet.softmax(newx)[0] - elif x.indim==2: + elif x.ndim==2: return tt.nnet.softmax(x) diff --git a/plato/tools/misc/tdb_plotting.py b/plato/tools/misc/tdb_plotting.py index 398a34a..0ab582f 100644 --- a/plato/tools/misc/tdb_plotting.py +++ b/plato/tools/misc/tdb_plotting.py @@ -54,8 +54,7 @@ def tdbplot(var, name = None, plot_type = None, draw_every=None, overwright_name # tdb_trace(var, name, callback=partial(set_plot_data_and_update, name=name, draw_every=draw_every), overwright_names=overwright_names) tdb_trace(var, name, overwrite_names=overwright_names) - CallbackCatcher.get_current().add_callback(plot_all_trace_variables) - + CallbackCatcher.get_current().add_callback(partial(plot_all_trace_variables, draw_every=draw_every)) @contextmanager diff --git a/plato/tools/mlp/demo_mnist_mlp.py b/plato/tools/mlp/demo_mnist_mlp.py index 5918a5c..d8a7fb9 100644 --- a/plato/tools/mlp/demo_mnist_mlp.py +++ b/plato/tools/mlp/demo_mnist_mlp.py @@ -1,8 +1,8 @@ -from artemis.experiments.experiment_record import experiment_function +from artemis.experiments.decorators import experiment_function from artemis.experiments.ui import browse_experiments from artemis.general.test_mode import is_test_mode from artemis.ml.datasets.mnist import get_mnist_dataset -from artemis.ml.predictors.train_and_test import train_and_test_online_predictor +from artemis.ml.predictors.deprecated.train_and_test_old import train_and_test_online_predictor from artemis.plotting.db_plotting import dbplot, hold_dbplots from plato.tools.common.online_predictors import GradientBasedPredictor from plato.tools.mlp.mlp import MultiLayerPerceptron @@ -83,13 +83,20 @@ def vis_callback(info, score): return info_score_pair_sequence -X=demo_mnist_mlp.add_variant('mini-mnist', max_training_samples=1000, max_test_samples=1000, hidden_sizes=[100], n_epochs=100, visualize_params=True) +demo_mnist_mlp.add_variant('full-batch', minibatch_size = 'full', n_epochs = 1000) +demo_mnist_mlp.add_variant('deep', hidden_sizes=[500, 500, 500, 500]) -X.add_variant('full-batch', minibatch_size = 'full', n_epochs = 1000) +# demo_mnist_mlp.get_variant('deep').run() +# print demo_mnist_mlp.get_variant('deep').get_latest_record().get_log() -X.add_variant('L2-loss', cost='mse', onehot=True, learning_rate=0.01) -demo_mnist_mlp.add_variant(hidden_sizes=[]) +# X=demo_mnist_mlp.add_variant('mini-mnist', max_training_samples=1000, max_test_samples=1000, hidden_sizes=[100], n_epochs=100, visualize_params=True) +# +# X.add_variant('full-batch', minibatch_size = 'full', n_epochs = 1000) +# +# X.add_variant('L2-loss', cost='mse', onehot=True, learning_rate=0.01) +# +# demo_mnist_mlp.add_variant(hidden_sizes=[]) if __name__ == '__main__': diff --git a/plato/tools/mlp/manual_backprop_net.py b/plato/tools/mlp/manual_backprop_net.py new file mode 100644 index 0000000..025c04b --- /dev/null +++ b/plato/tools/mlp/manual_backprop_net.py @@ -0,0 +1,357 @@ +from abc import abstractmethod +from collections import OrderedDict + +import numpy as np +from plato.tools.misc.tdb_plotting import tdbplot + +from artemis.general.nested_structures import get_leaf_values, NestedType +from artemis.general.should_be_builtins import izip_equal +from plato.core import create_constant, symbolic, create_shared_variable, add_update +from plato.interfaces.helpers import batchify_function, get_named_activation_function, get_parameters_or_not +from plato.interfaces.interfaces import IParameterized +from 
plato.tools.common.online_predictors import ISymbolicPredictor +from plato.tools.mlp.mlp import FullyConnectedTransform +from plato.tools.optimization.cost import get_named_cost_function +from plato.tools.optimization.optimizers import IGradientOptimizer +from theano import tensor as tt + + +class ManualBackpropNet(ISymbolicPredictor): + """ + A sequential (chain) network where you can insert layers that do backprop manually. + """ + def __init__(self, layers, optimizer, loss, prediction_minibatch_size=None, pass_loss = True, params_to_train = None, + return_prediction = False): + """ + :param layers: A list or OrderedDict of layers - either IManualBackpropLayers or ordinary callables + :param optimizer: An IGradientOptimizer, or a list/tuple of optimizers + :param loss: A loss function of the form loss(output, target), or the name of one (e.g. 'softmax-xe') + """ + if isinstance(layers, (OrderedDict, list, tuple)): # Backwards compatibility baby! + self.model = ChainNetwork(layers) + else: + self.model = layers + self.optimizer = optimizer + self.pass_loss = pass_loss + self.loss = get_named_cost_function(loss) if isinstance(loss, basestring) else loss + self.prediction_minibatch_size = prediction_minibatch_size + self.params_to_train = params_to_train + self.return_prediction = return_prediction + + @symbolic + def predict(self, x): + if self.prediction_minibatch_size is None: + return self._predict_in_single_pass(x) + else: + return batchify_function(self._predict_in_single_pass, batch_size=self.prediction_minibatch_size)(x) + + def _predict_in_single_pass(self, x): + # out, _ = self.model.forward_pass_and_state(x) + out, _ = forward_pass_and_state(self.model, x) + return out + + @symbolic + def train(self, x, y): + out, state = forward_pass_and_state(self.model, x) + loss = self.loss(out, y) + if self.pass_loss: + grad = None + else: + grad = tt.grad(loss, wrt=out) + loss = None + _, param_grad_pairs = backward_pass(self.model, state=state, grad=grad, loss=loss) + + if self.params_to_train is not None: + params_in_net = set(p for p, g in param_grad_pairs) + assert params_in_net.issuperset(self.params_to_train), 'You listed parameters to train {} which were not in the model'.format(set(self.params_to_train).difference(params_in_net)) + param_grad_pairs = [(p, g) for p, g in param_grad_pairs if p in self.params_to_train] + + if isinstance(self.optimizer, IGradientOptimizer): + all_params, all_param_grads = zip(*[(p, g) for p, g in param_grad_pairs]) if len(param_grad_pairs)>0 else ([], []) + self.optimizer.update_from_gradients(parameters=all_params, gradients=all_param_grads) + elif isinstance(self.optimizer, (list, tuple)): + for optimizer, layer_pairs in izip_equal(self.optimizer, param_grad_pairs): + params, grads = zip(*layer_pairs) + optimizer.update_from_gradients(parameters=params, gradients=grads) + + if self.return_prediction: + return out + + @property + def parameters(self): + return self.model.parameters + + +@symbolic +class IManualBackpropLayer(IParameterized): + + def forward_pass(self, x): + """ + :param x: A real (n_samples, n_dims_in) input + :return: A real (n_samples, n_dims_out) output + """ + out, _ = self.forward_pass_and_state(x) + return out + + def __call__(self, *args): + return self.forward_pass(*args) + + @abstractmethod + def forward_pass_and_state(self, x): + """ + :param x: + :return: out, state + Where: + out is the output of the layer + state is a list of state-variables to be passed into the backward pass.
+ Importantly, they must be in order (so that the last element of state is the one used to compute the gradient) + """ + raise NotImplementedError() + + @abstractmethod + def backward_pass(self, state, grad, loss): + """ + :param state: The list of state variables you returned in forward_pass_and_state + :param grad: The incoming gradient + :return: The outgoing gradient + """ + raise NotImplementedError() + + +def forward_pass_and_state(layer, x): + if isinstance(layer, IManualBackpropLayer): + out, layer_state = layer.forward_pass_and_state(x) + else: + out = layer(x) + layer_state = (x, out) + return out, layer_state + + +def backward_pass(layer, state, grad, loss): + if isinstance(layer, IManualBackpropLayer): + grad_inputs, param_grad_pairs = layer.backward_pass(state=state, grad=grad, loss= loss) + else: + inputs, y = state + params = list(get_parameters_or_not(layer)) + grad_inputs = tt.grad(cost=loss, wrt=inputs, known_grads={y: grad} if grad is not None else None) + grad_params = tt.grad(cost=loss, wrt=params, known_grads={y: grad} if grad is not None else None) + param_grad_pairs = [(p, g) for p, g in izip_equal(params, grad_params)] + return grad_inputs, param_grad_pairs + + +SNEAKILY_SAVE_ACTIVATIONS = False + + +def set_sneakily_save_activations(state): + global SNEAKILY_SAVE_ACTIVATIONS + SNEAKILY_SAVE_ACTIVATIONS = state + + +class ChainNetwork(IManualBackpropLayer): + + def __init__(self, layers, backprop_down_to=None, sneakily_save_activations = False): + if isinstance(layers, OrderedDict): + self.layer_names, self.layers = zip(*layers.items()) + else: + self.layer_names = range(len(layers)) + self.layers = layers + + self.sneakily_saved_activations = OrderedDict() + self.sneakily_saved_gradients = OrderedDict() + self.backprop_down_to = backprop_down_to + + @symbolic + def forward_pass_and_state(self, x): + if SNEAKILY_SAVE_ACTIVATIONS: + self.sneakily_saved_activations['input'] = create_shared_variable(np.zeros((1,) * x.ndim)) + add_update(self.sneakily_saved_activations['input'], x) + state = {} + for layer_name, layer in zip(self.layer_names, self.layers): + x, layer_state = forward_pass_and_state(layer, x) + state[layer]=layer_state + if SNEAKILY_SAVE_ACTIVATIONS: + self.sneakily_saved_activations[layer_name] = create_shared_variable(np.zeros((1,) * x.ndim)) + add_update(self.sneakily_saved_activations[layer_name], x) + return x, state + + @symbolic + def backward_pass(self, state, grad, loss): + assert (grad is None) != (loss is None), 'Give me a grad xor give me a loss.' 
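# An illustrative sketch (hypothetical ScaleLayer, not part of this patch) of the layer contract
# defined above: forward_pass_and_state returns (output, state), and backward_pass returns
# (gradient_wrt_input, [(param, param_gradient), ...]), with exactly one of grad/loss supplied
# (loss is only passed to the top layer when pass_loss=True).
import numpy as np
from theano import tensor as tt
from plato.core import create_shared_variable
from plato.tools.mlp.manual_backprop_net import IManualBackpropLayer, ManualBackpropNet
from plato.tools.optimization.optimizers import GradientDescent

class ScaleLayer(IManualBackpropLayer):
    """A toy layer that multiplies each unit by a learned per-unit scale."""

    def __init__(self, n_units):
        self.scale = create_shared_variable(np.ones(n_units))

    @property
    def parameters(self):
        return [self.scale]

    def forward_pass_and_state(self, x):
        out = x * self.scale
        return out, (x, out)

    def backward_pass(self, state, grad, loss):
        x, out = state
        if grad is None:                      # top layer: pull the gradient out of the loss
            grad = tt.grad(loss, wrt=out)
        d_scale = (grad * x).sum(axis=0)      # gradient w.r.t. the per-unit scale
        return grad * self.scale, [(self.scale, d_scale)]

# Usage sketch: net = ManualBackpropNet(layers=[ScaleLayer(20)], optimizer=GradientDescent(eta=0.1), loss='softmax-xe')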
+ param_grad_pairs = [] + # if SNEAKILY_SAVE_ACTIVATIONS: + # self.sneakily_saved_gradients['output'] = create_shared_variable(np.zeros((1,) * grad.ndim)) + # add_update(self.sneakily_saved_activations['input'], grad) + + for layer_name, layer in zip(self.layer_names[::-1], self.layers[::-1]): + grad, layer_param_grad_pairs = backward_pass(layer, state[layer], grad, loss) + loss = None + param_grad_pairs += layer_param_grad_pairs + if SNEAKILY_SAVE_ACTIVATIONS: + self.sneakily_saved_gradients[layer_name] = create_shared_variable(np.zeros((1,) * grad.ndim)) + add_update(self.sneakily_saved_gradients[layer_name], grad) + if self.backprop_down_to is not None and layer_name==self.backprop_down_to: + break + return grad, param_grad_pairs + + @property + def parameters(self): + return [p for layer in self.layers for p in get_parameters_or_not(layer)] + + +class IdentityLayer(object): + + def __call__(self, x): + return x + + +class SiameseNetwork(IManualBackpropLayer): + """ + Implements: + + y = f_merge(f1(f_siamese(x1)), f2(f_siamese(x2))) + + """ + + def __init__(self, f_siamese, f_merge, f1 = None, f2 = None): + """ + :param f_siamese: A function or ManualBackpropLayer of the form f( + :param f_merge: + :return: + """ + self.f_siamese = f_siamese + self.f1 = IdentityLayer() if f1 is None else f1 + self.f2 = IdentityLayer() if f2 is None else f2 + self.f_merge = f_merge + + @symbolic + def forward_pass_and_state(self, (x1, x2)): + out1a, state1a = forward_pass_and_state(self.f_siamese, x1) + out2a, state2a = forward_pass_and_state(self.f_siamese, x2) + + out1b, state1b = forward_pass_and_state(self.f1, out1a) + out2b, state2b = forward_pass_and_state(self.f2, out2a) + + out, state_merge = forward_pass_and_state(self.f_merge, (out1b, out2b)) + return out, (state1a, state2a, state1b, state2b, state_merge) + + @symbolic + def backward_pass(self, state, grad, loss): + state1a, state2a, state1b, state2b, state_merge = state + (grad_out1b, grad_out2b), merge_param_grads = backward_pass(self.f_merge, state=state_merge, grad=grad, loss=loss) + grad_out1a, param_grads_1b = backward_pass(self.f1, state = state1b, grad=grad_out1b, loss=None) + grad_out2a, param_grads_2b = backward_pass(self.f2, state = state2b, grad=grad_out2b, loss=None) + grad1, param_grads_1a = backward_pass(self.f_siamese, state=state1a, grad=grad_out1a, loss=None) + grad2, param_grads_2a = backward_pass(self.f_siamese, state=state2a, grad=grad_out2a, loss=None) + + assert all(param1 is param2 for (param1, _), (param2, _) in zip(param_grads_1a, param_grads_2a)) + param_grads_siamese = [(p1, v1+v2) for (p1, v1), (p2, v2) in zip(param_grads_1a, param_grads_2a)] + param_grads = param_grads_siamese + param_grads_1b + param_grads_2b + merge_param_grads + return (grad1, grad2), param_grads + + @property + def parameters(self): + return get_parameters_or_not(self.f_siamese) + get_parameters_or_not(self.f_merge) + + +class AddingLayer(IManualBackpropLayer): + + def forward_pass_and_state(self, (x1, x2)): + return x1+x2, None + + def backward_pass(self, state, grad, loss): + return (grad, grad), [] + + @property + def parameters(self): + return [] + + +@symbolic +class ConcatenationLayer(object): + + def __call__(self, (x1, x2)): + + # tdbplot(x1[0, :9, :, :], 'x1') + # tdbplot(x2[0, :9, :, :], 'x2') + + return tt.concatenate([x1.flatten(2), x2.flatten(2)], axis=1) + + + +@symbolic +class ChannelConcatenationLayer(object): + + def __call__(self, (x1, x2)): + + # tdbplot(x1[0, :9, :, :], 'x1') + # tdbplot(x2[0, :9, :, :], 'x2') + + return 
tt.concatenate([x1, x2], axis=1) + + + + +@symbolic +class PlottingLayer(object): + + def __init__(self, func=None, name='plot_var'): + self.func = func + self.name = name + + def __call__(self, x): + from plato.tools.misc.tdb_plotting import tdbplot + plot_var = x if self.func is None else self.func(x) + tdbplot(plot_var, self.name) + return x + + +class ExactBackpropLayer(IManualBackpropLayer): + """ + Performs the function of a layer. + + (Not really useful, since you can now just feed any old function into a manual backprop net) + """ + + def __init__(self, linear_transform, nonlinearity): + """ + linear_transform: Can be: + A callable (e.g. FullyConnectedBridge/ConvolutionalBridge) which does a linear transform on the data. + A numpy array - in which case it will be used to instantiate a linear transform. + """ + if isinstance(linear_transform, np.ndarray): + assert (linear_transform.ndim == 2 and nonlinearity!='maxout') or (linear_transform.ndim == 3 and nonlinearity=='maxout'), \ + 'Your weight matrix must be 2-D (or 3-D if you have maxout units)' + linear_transform = FullyConnectedTransform(w=linear_transform) + if isinstance(nonlinearity, str): + nonlinearity = get_named_activation_function(nonlinearity) + self.linear_transform = linear_transform + self.nonlinearity = nonlinearity + + def forward_pass_and_state(self, x): + pre_sig = self.linear_transform(x) + return self.nonlinearity(pre_sig), (x, pre_sig, ) + + def backward_pass(self, state, grad, loss): + x, _ = state + if loss is None: + y, (x, pre_sig) = self.forward_pass_and_state(x) + dydp = tt.grad(y.sum(), wrt=pre_sig) + # Note... we rely on the (linear-transform, pointwise-nonlinearity) design here. We should figure out how + # to do it more generally (maybe using tt.jacobian), or somehow making a virtual cost. 
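# A small numpy sanity check (a standalone sketch, names local to the example) of the chain-rule
# identities that the exact backward pass below relies on: with p = x.dot(w) + b, y = f(p) and an
# incoming gradient g = dC/dy, we have dC/dp = g * f'(p), dC/dw = x.T.dot(dC/dp),
# dC/db = dC/dp summed over samples, and dC/dx = dC/dp.dot(w.T).
import numpy as np
rng = np.random.RandomState(0)
x, w, b = rng.randn(5, 4), rng.randn(4, 3), rng.randn(3)
g = rng.randn(5, 3)                              # stand-in for an incoming gradient dC/dy
f = lambda p: 1. / (1. + np.exp(-p))             # sigmoid nonlinearity
p = x.dot(w) + b
dcdp = g * f(p) * (1 - f(p))                     # g * f'(p) for the sigmoid
dcdw, dcdb, dcdx = x.T.dot(dcdp), dcdp.sum(axis=0), dcdp.dot(w.T)

cost = lambda w_: (g * f(x.dot(w_) + b)).sum()   # scalar cost whose dC/dy is exactly g
dw = np.zeros_like(w); dw[0, 0] = 1e-6
assert abs((cost(w + dw) - cost(w)) / 1e-6 - dcdw[0, 0]) < 1e-4   # finite-difference check on dC/dw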
+ dcdp = grad*dydp + dcdw = x.T.dot(dcdp) # Because I think if we did this directly for the ws we'd be in trouble + dcdb = dcdp.sum(axis=0) + dcdx = dcdp.dot(self.linear_transform.w.T) + return dcdx, list(izip_equal(self.linear_transform.parameters, [dcdw, dcdb])) + else: + param_grads = tt.grad(loss, wrt=self.linear_transform.parameters) + return tt.grad(loss, wrt=x), list(izip_equal(self.linear_transform.parameters, param_grads)) + + @property + def parameters(self): + return self.linear_transform.parameters + +# woooo +#fdsfdsf + +# ccccc \ No newline at end of file diff --git a/plato/tools/mlp/mlp.py b/plato/tools/mlp/mlp.py index c97a2e6..52e947b 100644 --- a/plato/tools/mlp/mlp.py +++ b/plato/tools/mlp/mlp.py @@ -28,8 +28,8 @@ def __call__(self, x): return x @symbolic - def get_layer_activations(self, x): - activations = [] + def get_layer_activations(self, x, include_input = False): + activations = [x] if include_input else [] for lay in self.layers: x = lay(x) activations.append(x) @@ -92,7 +92,7 @@ def from_weights(cls, weights, biases = None, hidden_activations ='sig', output_ nonlinearity=nonlinearity ) for w, b, nonlinearity, layer_no in - izip_equal(weights, [False]*len(weights) if biases is False else [0.]*len(weights) if biases in (True, None) else biases, [hidden_activations] * (n_layers - 1) + [output_activation], xrange(n_layers)) + izip_equal(weights, [False]*len(weights) if biases is False else [0.]*len(weights) if biases in (True, None) else biases, [hidden_activations] * (n_layers - 1) + [output_activation], range(n_layers)) ] return cls(layers) diff --git a/plato/tools/mlp/test_manual_backprop_net.py b/plato/tools/mlp/test_manual_backprop_net.py new file mode 100644 index 0000000..f90046d --- /dev/null +++ b/plato/tools/mlp/test_manual_backprop_net.py @@ -0,0 +1,70 @@ +import numpy as np +from artemis.ml.tools.neuralnets import initialize_network_params +from plato.tools.common.online_predictors import GradientBasedPredictor +from plato.tools.mlp.manual_backprop_net import ManualBackpropNet, ExactBackpropLayer +from plato.tools.mlp.mlp import MultiLayerPerceptron +from plato.tools.optimization.optimizers import GradientDescent + + +def test_exact_manual_backprop_net(): + + rng = np.random.RandomState(1234) + + n_samples = 5 + n_in, n_hid1, n_hid2, n_out = 10, 8, 7, 6 + ws = initialize_network_params(layer_sizes=[n_in, n_hid1, n_hid2, n_out], include_biases=False) + x, y = rng.randn(n_samples, n_in), rng.randn(n_samples, n_out) + + auto_mlp = GradientBasedPredictor( + function = MultiLayerPerceptron.from_weights(weights=ws, hidden_activations='relu', output_activation='linear'), + cost_function='softmax-xe', + optimizer=GradientDescent(0.1) + ) + stick_mlp = ManualBackpropNet( + layers = [ExactBackpropLayer(ws[0], 'relu'), ExactBackpropLayer(ws[1], 'relu'), ExactBackpropLayer(ws[2], 'linear')], + optimizer = GradientDescent(0.1), + loss = 'softmax-xe' + ) + stick_shifted_by_robot = ManualBackpropNet( + layers = MultiLayerPerceptron.from_weights(weights=ws, hidden_activations='relu', output_activation='linear').layers, + optimizer = GradientDescent(0.1), + loss = 'softmax-xe' + ) + + # Check forward passes match + fp_auto = auto_mlp.predict.compile() + fp_stick = stick_mlp.predict.compile() + fp_robot = stick_shifted_by_robot.predict.compile() + + out_auto = fp_auto(x) + out_stick = fp_stick(x) + out_robot = fp_robot(x) + assert np.allclose(out_auto, out_stick) + assert np.allclose(out_auto, out_robot) + + # 1 Iteration of training + ft_auto = auto_mlp.train.compile() 
+ ft_stick = stick_mlp.train.compile() + ft_robot = stick_shifted_by_robot.train.compile() + ft_auto(x, y) + ft_stick(x, y) + ft_robot(x, y) + + # Check parameter changes match + dw0_auto = auto_mlp._function.layers[0].linear_transform.w.get_value() - ws[0] + dw0_stick = stick_mlp.model.layers[0].linear_transform.w.get_value() - ws[0] + dw0_robot = stick_shifted_by_robot.model.layers[0].linear_transform.w.get_value() - ws[0] + assert np.allclose(dw0_auto, dw0_stick) + assert np.allclose(dw0_auto, dw0_robot) + + # Check outputs match + new_out_auto = fp_auto(x) + new_out_stick = fp_stick(x) + new_out_robot = fp_robot(x) + assert np.allclose(new_out_auto, new_out_stick) + assert not np.allclose(new_out_stick, out_auto) + assert np.allclose(new_out_auto, new_out_robot) + + +if __name__ == '__main__': + test_exact_manual_backprop_net() diff --git a/plato/tools/optimization/cost.py b/plato/tools/optimization/cost.py index ef23e2b..11a7e38 100644 --- a/plato/tools/optimization/cost.py +++ b/plato/tools/optimization/cost.py @@ -61,7 +61,11 @@ def negative_log_likelihood_dangerous(actual, target): @symbolic_simple def mean_squared_error(actual, target): - return tt.mean(tt.sum((actual-target)**2, axis = 1), axis = 0) + if actual.ndim==2: + return tt.mean(tt.sum((actual-target)**2, axis = 1), axis = 0) + else: + return tt.mean(tt.sum((actual.flatten(2)-target.flatten(2))**2, axis = 1), axis = 0) + @symbolic_simple @@ -184,6 +188,7 @@ def l1_norm_error(actual, target, eps = 1e-7): 'onehot-mse': onehot_mse, 'norm_l1_error': l1_norm_error, 'softmax-xe': softmax_xe, + 'softmax_xe': softmax_xe, 'categorical-xe': categorical_xe, 'logistic-xe': logistic_xe, } diff --git a/plato/tools/optimization/demo_compare_optimizers.py b/plato/tools/optimization/demo_compare_optimizers.py index 4836c09..9d69a1d 100644 --- a/plato/tools/optimization/demo_compare_optimizers.py +++ b/plato/tools/optimization/demo_compare_optimizers.py @@ -1,10 +1,9 @@ -from artemis.experiments.experiment_record import run_experiment from artemis.general.mymath import sqrtspace from artemis.general.test_mode import is_test_mode, set_test_mode from artemis.ml.datasets.mnist import get_mnist_dataset from artemis.ml.predictors.learning_curve_plots import plot_learning_curves from artemis.ml.predictors.predictor_comparison import compare_predictors -from artemis.ml.predictors.train_and_test import percent_argmax_correct +from artemis.ml.tools.costs import percent_argmax_correct from artemis.ml.tools.processors import OneHotEncoding from artemis.plotting.pyplot_plus import set_default_figure_size from plato.tools.common.online_predictors import GradientBasedPredictor @@ -110,7 +109,7 @@ def backprop_vs_difference_target_prop( ): dataset = get_mnist_dataset(flat = True) - dataset = dataset.process_with(targets_processor=lambda (x, ): (OneHotEncoding(10)(x).astype(int), )) + dataset = dataset.process_with(targets_processor=lambda x_s: (OneHotEncoding(10)(x_s[0]).astype(int), )) if is_test_mode(): dataset = dataset.shorten(200) diff --git a/plato/tools/optimization/optimizers.py b/plato/tools/optimization/optimizers.py index 43b6052..c6ef42b 100644 --- a/plato/tools/optimization/optimizers.py +++ b/plato/tools/optimization/optimizers.py @@ -1,5 +1,8 @@ from abc import abstractmethod -from plato.core import add_update, create_shared_variable, StateCatcher, tdbprint, CaptureUpdates + +from theano.ifelse import ifelse + +from plato.core import add_update, create_shared_variable, StateCatcher, tdbprint, CaptureUpdates, symbolic_stateless from 
plato.interfaces.decorators import symbolic_updater import theano.tensor as tt import theano @@ -19,11 +22,25 @@ def __call__(self, cost, parameters): """ @abstractmethod - def get_updates(self, cost, parameters, constants = []): + def get_updates(self, cost, parameters, constants = ()): + """ + :param Scalar cost: + :param Sequence[Variable] parameters: + :param Sequence[Variable] constants: + :return Sequence[Tuple[Tensor, Tensor]]: Pairs of (variable, new_variable) + """ pass @abstractmethod - def update_parameters(self, cost, parameters, constants=[]): + def get_updates_from_gradients(self, parameters, gradients): + """ + :param Sequence[Tensor] parameters: + :param Sequence[Tensor] gradients: + :return Sequence[Tuple[Tensor, Tensor]]: + """ + + @abstractmethod + def update_parameters(self, cost, parameters, constants=()): pass @abstractmethod @@ -44,14 +61,13 @@ def __call__(self, cost, parameters, constants = []): """ self.update_parameters(cost=cost, parameters=parameters, constants=constants) - def get_updates(self, cost, parameters, constants = [], as_dict = False): + def get_updates(self, cost, parameters, constants = [], clip=None): """ Get the gradient-based parameter updates, but do not apply them. return: A list of (shared_var, new_val) pairs representing the updates. """ - with CaptureUpdates(swallow=True) as sc: - self(cost=cost, parameters=parameters, constants=constants) - return sc.get_updates(as_dict=as_dict) + gradients = theano.grad(cost, parameters, consider_constant = constants) + return self.get_updates_from_gradients(parameters=parameters, gradients=gradients, clip=clip) def update_parameters(self, cost, parameters, constants = []): """ @@ -69,20 +85,39 @@ def update_from_gradients(self, parameters, gradients, clip = None): :param gradients: A list of corresponding gradients :param clip: Optionally, a 2-tuple indicating the range in which to clip parameters, (or """ + updates = self.get_updates_from_gradients(parameters=parameters, gradients=gradients, clip=clip) + for p, v in updates: + add_update(p, v) + + @symbolic_stateless + def get_updates_from_gradients(self, parameters, gradients, clip=None): + """ + :param Sequence[Variable] parameters: The list of symbolic parameters + :param Sequence[Variable] gradients: The list of gradients + :param Optional[Union[float, Tuple[float,float]] clip: The clipping parameter + :return Sequence[Tuple[Variable, Variable]]: The list of updates (the first len(parameters) of which are ordered parameter updates - the rest are for optimizer params). + """ if clip is not None and not isinstance(clip, (list, tuple)): clip = (-clip, clip) assert len(parameters)==len(gradients), 'Lenght of parameter vector must match length of gradients.' 
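# A brief usage sketch of the pair-returning optimizer API introduced here (it mirrors
# test_unknown_shape in plato/tools/optimization/test_optimizers.py further down): get_updates
# yields (variable, new_value) pairs with the parameter updates first, so the caller decides
# which updates to apply with add_update and which to read off directly.
from plato.core import symbolic, add_update
from plato.tools.optimization.optimizers import GradientDescent
import theano.tensor as tt

@symbolic
def take_step(x, optimizer):
    loss = tt.sum((x - 3) ** 2)
    updates = optimizer.get_updates(cost=loss, parameters=[x])
    for p, v in updates[1:]:        # apply any optimizer-state updates (momentum, moments, ...)
        add_update(p, v)
    return updates[0][1]            # the proposed new value of the parameter itself

# step = take_step.partial(optimizer=GradientDescent(eta=0.5)).compile()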
+ parameter_updates_list = [] + optimizer_updates_list = [] for p, g in zip(parameters, gradients): - if clip is None: - self._update_param(p, g) - else: - with CaptureUpdates(swallow=True) as sc: - self._update_param(p, g) - sc.get_updates() + updates = self._get_updates_for_param(p, g) + param_update = updates[0] if clip is None else (updates[0][0], tt.clip(updates[0][1], *clip)) + parameter_updates_list.append(param_update) + optimizer_updates_list += updates[1:] + all_updates = parameter_updates_list + optimizer_updates_list + return all_updates @abstractmethod - def _update_param(self, param, gradient): - pass + def _get_updates_for_param(self, param, gradient): + """ + A stateless method + :param Variable param: The parameter + :param Variable gradient: The gradient of this parameter + :return Sequence[Tuple[Variable, Variable]]: The updates - the first of which is the parameter updates (others may update optimizer state) + """ class GradientStepUpdater(UniformParameterOptimizer): @@ -90,8 +125,9 @@ class GradientStepUpdater(UniformParameterOptimizer): Just subtract the gradient to the parameter. This is mainly useful in some situations the step size doesn't matter (because for instance, the function is invariant to the scale of the weights) """ - def _update_param(self, param, gradient): - add_update(param, param - gradient) + def _get_updates_for_param(self, param, gradient): + return [(param, param-gradient)] + # add_update(param, param - gradient) class SimpleGradientDescent(UniformParameterOptimizer): @@ -106,8 +142,19 @@ def __init__(self, eta): """ self._eta = eta - def _update_param(self, param, gradient): - add_update(param, param - self._eta * gradient) + def _get_updates_for_param(self, param, gradient): + return [(param, param - self._eta * gradient)] + # add_update(param, param - self._eta * gradient) + + +def create_optimizer_param_like(param, name=None): + """ + :param TensorVariable like: A variable which it is "like" + :return Tuple[TensorSharedVariable, Scalar]: The variable and a scalar boolean tensor that can be used in an ifelse to check if its been initialized. + """ + opt_param = theano.shared(np.zeros([0]*param.ndim, dtype=param.dtype), name=name) + initialized = opt_param.size>0 + return opt_param, initialized class LangevinGradientDescent(UniformParameterOptimizer): @@ -123,8 +170,9 @@ def __init__(self, eta, rng = None): self._eta = eta self._rng = get_theano_rng(rng) - def _update_param(self, param, gradient): - add_update(param, param - self._eta*gradient + 2*tt.sqrt(self._eta)*self._rng.normal(size = param.ishape)) + def _get_updates_for_param(self, param, gradient): + # add_update(param, param - self._eta*gradient + 2*tt.sqrt(self._eta)*self._rng.normal(size = param.ishape)) + return[(param, param - self._eta*gradient + 2*tt.sqrt(self._eta)*self._rng.normal(size = param.ishape))] class Adam(UniformParameterOptimizer): @@ -146,25 +194,31 @@ def __init__(self, alpha = 1e-3, beta_1=0.1, beta_2=0.001, eps = 1e-8): self.beta_2 = beta_2 self.eps = eps - def _update_param(self, param, gradient): + def _get_updates_for_param(self, param, gradient): # Initialize variables i = create_shared_variable(0.) - m = theano.shared(param.get_value() * 0.) - v = theano.shared(param.get_value() * 0.) + # m = theano.shared(param.get_value() * 0.) + # v = theano.shared(param.get_value() * 0.) + + m, initialized = create_optimizer_param_like(param) + v, _ = create_optimizer_param_like(param) + # v = theano.shared(param.ndim * 0.) # Recompute values i_t = i + 1. fix1 = 1. - (1. 
- self.beta_1)**i_t fix2 = 1. - (1. - self.beta_2)**i_t lr_t = self.alpha * (tt.sqrt(fix2) / fix1) - m_t = (self.beta_1 * gradient) + ((1. - self.beta_1) * m) - v_t = (self.beta_2 * tt.sqr(gradient)) + ((1. - self.beta_2) * v) + m_t = ifelse(initialized, self.beta_1 * gradient + (1. - self.beta_1) * m, self.beta_1 * gradient) + v_t = ifelse(initialized, self.beta_2 * tt.sqr(gradient) + (1. - self.beta_2) * v, self.beta_2 * tt.sqr(gradient)) g_t = m_t / (tt.sqrt(v_t) + self.eps) p_t = param - (lr_t * g_t) - add_update(param, p_t) - add_update(m, m_t) - add_update(v, v_t) - add_update(i, i_t) + return [(param, p_t), (m, m_t), (v, v_t), (i, i_t)] + + # add_update(param, p_t) + # add_update(m, m_t) + # add_update(v, v_t) + # add_update(i, i_t) class AdaMax(UniformParameterOptimizer): @@ -175,15 +229,21 @@ def __init__(self, alpha = 1e-3, beta_1=0.1, beta_2=0.001, eps = 1e-8): self._beta_2 = beta_2 self._eps = eps - def _update_param(self, param, gradient): - mom1 = theano.shared(np.zeros_like(param.get_value())) - mom2 = theano.shared(np.zeros_like(param.get_value())) - mom1_new = mom1 + self._beta_1 * (gradient - mom1) - mom2_new = tt.maximum(abs(gradient) + self._eps, (1. - self._beta_2) * mom2) + def _get_updates_for_param(self, param, gradient): + + mom1, initialized = create_optimizer_param_like(param) + mom2, _ = create_optimizer_param_like(param) + + # mom1 = theano.shared(np.zeros_like(param.get_value())) + # mom2 = theano.shared(np.zeros_like(param.get_value())) + mom1_new = ifelse(initialized, mom1 + self._beta_1 * (gradient - mom1), self._beta_1*gradient) + mom2_new = ifelse(initialized, tt.maximum(abs(gradient) + self._eps, (1. - self._beta_2) * mom2), abs(gradient) + self._eps) new_param = param - self._alpha * mom1_new / mom2_new - add_update(param, new_param) - add_update(mom1, mom1_new) - add_update(mom2, mom2_new) + return [(param, new_param), (mom1, mom1_new), (mom2, mom2_new)] + + # add_update(param, new_param) + # add_update(mom1, mom1_new) + # add_update(mom2, mom2_new) class RMSProp(UniformParameterOptimizer): @@ -193,12 +253,16 @@ def __init__(self, learning_rate = 0.1, decay = 0.9, max_scaling = 1e5): self.epsilon = 1./max_scaling self.learning_rate = learning_rate - def _update_param(self, param, gradient): - mean_squared_grad = theano.shared(np.zeros_like(param.get_value())) - new_mean_squared_grad = self.decay * mean_squared_grad + (1-self.decay) * gradient**2 + def _get_updates_for_param(self, param, gradient): + # mean_squared_grad = theano.shared(np.zeros_like(param.get_value())) + mean_squared_grad, initialized = create_optimizer_param_like(param) + + new_mean_squared_grad = ifelse(initialized, self.decay * mean_squared_grad + (1-self.decay) * gradient**2, (1-self.decay) * gradient**2) delta_p = - self.learning_rate * gradient / tt.maximum(tt.sqrt(new_mean_squared_grad), self.epsilon) - add_update(param, param + delta_p) - add_update(mean_squared_grad, new_mean_squared_grad) + + return [(param, param + delta_p), (mean_squared_grad, new_mean_squared_grad)] + # add_update(param, param + delta_p) + # add_update(mean_squared_grad, new_mean_squared_grad) class AdaGrad(UniformParameterOptimizer): @@ -216,12 +280,16 @@ def __init__(self, learning_rate = 0.01, decay_rate = 0, max_scaling = 1e5): self.learning_rate = learning_rate self.decay_rate = decay_rate - def _update_param(self, param, gradient): - sum_squared_grad = theano.shared(param.get_value()*0) - new_ssg = (1-self.decay_rate)*sum_squared_grad + gradient**2 + def _get_updates_for_param(self, param, gradient): 
+ # sum_squared_grad = theano.shared(param.get_value()*0) + + sum_squared_grad, initialized = create_optimizer_param_like(param) + + new_ssg = ifelse(initialized, (1-self.decay_rate)*sum_squared_grad + gradient**2, gradient**2) scale = tt.maximum(self.eps, tt.sqrt(new_ssg)) - add_update(param, param - (self.learning_rate / scale) * gradient) - add_update(sum_squared_grad, new_ssg) + return [(param, param - (self.learning_rate / scale) * gradient), (sum_squared_grad, new_ssg)] + # add_update(param, param - (self.learning_rate / scale) * gradient) + # add_update(sum_squared_grad, new_ssg) class GradientDescent(UniformParameterOptimizer): @@ -235,16 +303,23 @@ def __init__(self, eta, momentum = 0, decay = 0): self.momentum = momentum self.decay = decay - def _update_param(self, param, gradient): + def _get_updates_for_param(self, param, gradient): + + updates = [] if self.momentum != 0: - mom = theano.shared(np.zeros_like(param.get_value())) - new_mom = self.momentum * mom + gradient - add_update(mom, new_mom) + mom, initialized = create_optimizer_param_like(param) + # mom = theano.shared(np.zeros_like(param.get_value())) + new_mom = ifelse(initialized, self.momentum * mom + gradient, gradient) + # add_update(mom, new_mom) + updates.append((mom, new_mom)) direction = new_mom # Or mom, something about Nesterov... else: direction = gradient - add_update(param, param - self.eta*direction - self.decay*param) + + updates.insert(0, (param, param - self.eta*direction - self.decay*param)) + return updates + # add_update(param, param - self.eta*direction - self.decay*param) class MultiplicativeGradientDescent(UniformParameterOptimizer): @@ -252,9 +327,10 @@ class MultiplicativeGradientDescent(UniformParameterOptimizer): def __init__(self, factor = 0.01): self.factor = factor - def _update_param(self, param, gradient): + def _get_updates_for_param(self, param, gradient): multiplier = tt.exp(-tt.tanh(gradient)*self.factor) - add_update(param, param*multiplier) + return [(param, param*multiplier)] + # add_update(param, param*multiplier) class PIDOptimizer(UniformParameterOptimizer): @@ -268,35 +344,41 @@ def __init__(self, kp=0.1, ki=0, kd=0): self.ki = ki self.kd = kd - def _update_param(self, param, gradient): + def _get_updates_for_param(self, param, gradient): + + updates = [] new_param = param if self.kp != 0: new_param -= self.kp * gradient if self.ki != 0: grad_integral = create_shared_variable(np.zeros_like(param.get_value())) new_gradient_integral = grad_integral + grad_integral - add_update(grad_integral, new_gradient_integral) + # add_update(grad_integral, new_gradient_integral) + updates.append((grad_integral, new_gradient_integral)) new_param -= self.ki * new_gradient_integral if self.kd != 0: grad_last = create_shared_variable(np.zeros_like(param.get_value())) - add_update(grad_last, gradient) + # add_update(grad_last, gradient) + updates.append((grad_last, gradient)) new_param -= self.kd * (gradient - grad_last) - add_update(param, new_param) + # add_update(param, new_param) + updates.insert(0, (param, new_param)) + return updates -def get_named_optimizer(name, learning_rate, rng = None): +def get_named_optimizer(name, learning_rate, rng = None, **kwargs): """ Convenience function for easily specifying optimizers. :param name: The name of the optimizer :param learning_rate: A scalar, representing the parameter that's most equivalent to a learning rate. - :return: An IGradientOptimizer object. + :return IGradientOptimizer: The optimizer object. 
""" return { - 'sgd': lambda: SimpleGradientDescent(eta = learning_rate), - 'adam': lambda: Adam(alpha=learning_rate), - 'adamax': lambda: AdaMax(alpha=learning_rate), - 'rmsprop': lambda: RMSProp(learning_rate=learning_rate), - 'adagrad': lambda: AdaGrad(learning_rate=learning_rate), - 'mulsgd': lambda: MultiplicativeGradientDescent(factor=learning_rate), - 'langevin': lambda: LangevinGradientDescent(eta = learning_rate, rng = rng), + 'sgd': lambda: GradientDescent(eta = learning_rate, **kwargs), + 'adam': lambda: Adam(alpha=learning_rate, **kwargs), + 'adamax': lambda: AdaMax(alpha=learning_rate, **kwargs), + 'rmsprop': lambda: RMSProp(learning_rate=learning_rate, **kwargs), + 'adagrad': lambda: AdaGrad(learning_rate=learning_rate, **kwargs), + 'mulsgd': lambda: MultiplicativeGradientDescent(factor=learning_rate, **kwargs), + 'langevin': lambda: LangevinGradientDescent(eta = learning_rate, rng = rng, **kwargs), }[name]() diff --git a/plato/tools/optimization/test_optimizers.py b/plato/tools/optimization/test_optimizers.py index 2344355..8e3cdea 100644 --- a/plato/tools/optimization/test_optimizers.py +++ b/plato/tools/optimization/test_optimizers.py @@ -1,7 +1,10 @@ +from plato.core import symbolic, add_update from plato.tools.optimization.demo_compare_optimizers import get_experiments -from plato.tools.optimization.optimizers import GradientDescent, Adam, AdaMax +from plato.tools.optimization.optimizers import GradientDescent, Adam, AdaMax, RMSProp, get_named_optimizer from plato.tools.regressors.online_regressor import OnlineRegressor from artemis.ml.predictors.predictor_tests import assert_online_predictor_not_broken +import theano.tensor as tt +import numpy as np def _test_optimizer_on_simple_classification_problem(optimizer): @@ -32,14 +35,39 @@ def test_adamax_optimizer(): _test_optimizer_on_simple_classification_problem(AdaMax(alpha=0.01)) -if __name__ == '__main__': - test_gradient_descent_optimizer() - test_adam_optimizer() - test_adamax_optimizer() +def test_unknown_shape(): + + @symbolic + def func(x, optimizer): + loss = tt.sum((x-3)**2) + updates = optimizer.get_updates(cost=loss, parameters=[x]) + for p, v in updates[1:]: + add_update(p, v) + return updates[0][1] + + x_base = np.random.RandomState(1234).randn(3, 4) + for opt in ('adam', 'adamax', 'adagrad', 'rmsprop'): + print('Running Optimizer: {}'.format(opt)) + optimizer = get_named_optimizer(opt, learning_rate=0.5) + x = x_base + f = func.partial(optimizer = optimizer).compile() + for _ in range(50): + x = f(x) + error = np.abs(x-3) + print('Mean Error: {}'.format(error.mean())) + assert np.all(np.abs(x-3)<1.) def test_demo_compare_optimizers(): for exp_name, exp in get_experiments().iteritems(): - print 'Running %s' % exp_name - exp() \ No newline at end of file + print('Running %s' % exp_name) + exp() + + +if __name__ == '__main__': + # test_gradient_descent_optimizer() + # test_adam_optimizer() + # test_adamax_optimizer() + test_unknown_shape() + diff --git a/plato/tools/va/demo_gaussian_vae.py b/plato/tools/va/demo_gaussian_vae.py index acc612f..b192003 100644 --- a/plato/tools/va/demo_gaussian_vae.py +++ b/plato/tools/va/demo_gaussian_vae.py @@ -1,5 +1,5 @@ import numpy as np -from artemis.experiments.experiment_record import experiment_function +from artemis.experiments import experiment_function from artemis.experiments.ui import browse_experiments from artemis.general.test_mode import is_test_mode from artemis.ml.datasets.mnist import get_mnist_dataset