Commit
Project clean + Comments + EntropyAdam
steph1793 committed Mar 20, 2020
1 parent 1a57dfa commit 8ed0166
Showing 4 changed files with 317 additions and 107 deletions.
175 changes: 175 additions & 0 deletions EntropySGD/EntropyAdam.py
@@ -0,0 +1,175 @@
from keras.optimizers import Optimizer
from keras import backend as K
from keras.legacy import interfaces
import tensorflow as tf
import math


import keras
import numpy as np
from tensorflow.keras.utils import Progbar
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)


class EntropyAdam(Optimizer):
    """Entropy-SGD optimizer. This implementation applies Adam-style updates in the outer loop.
    # Arguments
        lr: float > 0. Learning rate.
        sgld_step (eta prime): float > 0. Step size of the inner SGLD loop (the x' update).
        L: int > 0. Number of Langevin steps (inner loop) used to estimate the gradient.
        gamma: float > 0. Coupling scope; controls how far the inner SGLD is allowed to explore away from the current parameters when estimating the negative local entropy.
        scoping: float >= 0. Gamma scoping rate; gamma is scaled as gamma*(1+scoping)^t.
        sgld_noise: float > 0. Thermal noise used in the Langevin dynamics update (inner loop).
        alpha: float, 0 < alpha < 1. Exponential averaging parameter for the estimate of mu. More details in the paper.
        beta_1: float, 0 < beta_1 < 1. Generally close to 1.
        beta_2: float, 0 < beta_2 < 1. Generally close to 1.
        amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm from the paper "On the Convergence of Adam and Beyond".
        decay: float >= 0. Learning rate decay; 0 for no decay.
    #Reference
        - [ENTROPY-SGD: BIASING GRADIENT DESCENT INTO WIDE VALLEYS](https://arxiv.org/pdf/1611.01838.pdf)
    """
    def __init__(self, lr=1., sgld_step=0.1, L=20, gamma=0.03, sgld_noise=1e-4, alpha=0.75, scoping=1e-3, beta_1=0.9, beta_2=0.999, amsgrad=False, decay=0., **kwargs):
        super(EntropyAdam, self).__init__(**kwargs)
        self.scoping = scoping
        self.L = L
        self.alpha = alpha
        self.amsgrad = amsgrad
        self.epsilon = kwargs.pop('epsilon', K.epsilon())
        self.initial_decay = decay
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.beta_1 = K.variable(beta_1, name='beta_1')
            self.beta_2 = K.variable(beta_2, name='beta_2')
            self.lr = K.variable(lr, name='lr')
            self.sgld_step = K.variable(sgld_step, name='sgld_step')
            self.sgld_noise = K.variable(sgld_noise, name='sgld_noise')
            self.gamma = K.variable(gamma, name='gamma')

            self.state_counter = K.variable(0, dtype='int64', name='state_counter')
            self.num_steps = K.variable(-1, dtype='int32')
            self.iterator = K.variable(0, dtype='int32', name='iterator')
            self.decay = K.variable(self.initial_decay, name='decay')


    @interfaces.legacy_get_updates_support
    def get_updates(self, loss, params):

        self.updates = []
        self.updates.append(K.update_add(self.state_counter, 1))
        self.updates.append(K.update_add(self.iterator, 1))
        self.updates.append(K.update_add(self.iterations, 1))
        t = K.cast(self.iterations, K.floatx()) + 1

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))
        # Adam bias correction
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        shapes = [K.int_shape(p) for p in params]
        x = [K.update(K.zeros(shape), p) for shape, p in zip(shapes, params)]
        mu = [K.update(K.zeros(shape), p) for shape, p in zip(shapes, params)]
        grads = self.get_gradients(loss, params)


        ms = [K.zeros(K.int_shape(p),
                      dtype=K.dtype(p),
                      name='m_' + str(i))
              for (i, p) in enumerate(params)]
        vs = [K.zeros(K.int_shape(p),
                      dtype=K.dtype(p),
                      name='v_' + str(i))
              for (i, p) in enumerate(params)]


        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p),
                             dtype=K.dtype(p),
                             name='vhat_' + str(i))
                     for (i, p) in enumerate(params)]
        else:
            vhats = [K.zeros(1, name='vhat_' + str(i))
                     for i in range(len(params))]


        for x_i, x_prime_i, mu_i, g, m, v, vhat in zip(x, params, mu, grads, ms, vs, vhats):

            ## We update x_prime (during Langevin steps we update it; otherwise we switch back to the parameters x_i)
            dx_prime_i = g - self.gamma*(x_i - x_prime_i)
            x_prime_update_i = K.switch(K.any(K.stack([K.equal(self.state_counter, 0),
                                                       K.equal(self.num_steps, self.iterator)], axis=0), axis=0),
                                        x_i,
                                        x_prime_i - self.sgld_step*dx_prime_i + K.sqrt(self.sgld_step)*self.sgld_noise*K.random_normal(K.int_shape(x_prime_i))
                                        )
            # Apply constraints.
            if getattr(x_prime_i, 'constraint', None) is not None:
                x_prime_update_i = x_prime_i.constraint(x_prime_update_i)
            self.updates.append(K.update(x_prime_i, x_prime_update_i))

            ## We update mu (during Langevin steps we average it; otherwise we reset it to the parameters x_i)
            mu_update_i = K.switch(K.equal(self.state_counter, 0),
                                   x_i,
                                   (1-self.alpha)*mu_i + self.alpha*x_prime_i)
            self.updates.append(K.update(mu_i, mu_update_i))

            ## We update x only once per cycle, at step L+1; on all other steps the switches leave x unchanged, consistent with the paper
            ## As described in the paper, gamma is removed from the update because it interferes with the learning-rate annealing
            ## gamma is rescaled by a factor of (1 + scoping) as training proceeds (see the gamma scoping update below)


            ## Adam update
            gradient = (x_i-mu_i)
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * gradient
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(gradient)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                x_i_t = x_i - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(
                    K.update(vhat,
                             K.switch(K.equal(self.state_counter, self.L+1), vhat_t, vhat)))
            else:
                x_i_t = x_i - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)


            self.updates.append(
                K.update(m,
                         K.switch(K.equal(self.state_counter, self.L+1), m_t, m)))
            self.updates.append(
                K.update(v,
                         K.switch(K.equal(self.state_counter, self.L+1), v_t, v)))
            new_x_i = x_i_t

            x_i_update = K.switch(K.equal(self.state_counter, self.L+1), new_x_i, x_i)
            self.updates.append(K.update(x_i, x_i_update))


        ## Gamma scoping
        gamma_update = K.switch(K.equal(self.state_counter, self.L+1), self.gamma, self.gamma*(1. + self.scoping))
        self.updates.append(K.update(self.gamma, gamma_update))


        counter = K.switch(K.equal(self.state_counter, self.L+2), K.constant(0, dtype='int64'), self.state_counter)
        self.updates.append(K.update(self.state_counter, counter))
        return self.updates


    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'sgld_step': float(K.get_value(self.sgld_step)),
                  'gamma': float(K.get_value(self.gamma)),
                  'sgld_noise': float(K.get_value(self.sgld_noise)),
                  'L': self.L,
                  'alpha': self.alpha,
                  'scoping': self.scoping,
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'amsgrad': self.amsgrad,
                  'decay': float(K.get_value(self.decay))}
        base_config = super(EntropyAdam, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
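
For reference, here is my reading of the update rules the switches above implement, written in the paper's notation (eta prime is sgld_step, epsilon is sgld_noise). This recap is an editorial addition rather than part of the commit; the last line reflects this implementation's choice to feed the entropic gradient to an Adam step.

\[
x' \leftarrow x' - \eta'\big(\nabla f(x') - \gamma\,(x - x')\big) + \sqrt{\eta'}\,\varepsilon\,\mathcal{N}(0, I)
\]
\[
\mu \leftarrow (1 - \alpha)\,\mu + \alpha\, x'
\]
\[
x \leftarrow \mathrm{Adam}\big(x,\; g = x - \mu\big), \qquad \gamma \leftarrow \gamma\,(1 + \mathrm{scoping})
\]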

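The K.switch calls above implement a small state machine driven by state_counter. A toy trace, in plain Python and purely illustrative (it reflects my reading of the switches, with the counter sampled before the increment):

# Illustrative trace of the state_counter schedule; not part of the commit.
# Counter 0 resets x' and mu to x, counters 1..L are inner SGLD steps, and
# counter L+1 applies the outer Adam step; the counter then wraps to 0,
# mirroring the final K.switch in get_updates.
L = 3  # small value for readability; the optimizer defaults to L=20
state_counter = 0
for step in range(2 * (L + 2)):
    if state_counter == 0:
        phase = "reset: x' <- x, mu <- x"
    elif state_counter <= L:
        phase = "inner SGLD step: update x' and mu"
    else:  # state_counter == L + 1
        phase = "outer step: Adam update of x with gradient (x - mu)"
    print("step %2d | counter %d | %s" % (step, state_counter, phase))
    state_counter += 1
    if state_counter == L + 2:  # wrap, as in the K.update on state_counter
        state_counter = 0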
164 changes: 59 additions & 105 deletions EntropySGD/EntropySGD.py
@@ -12,58 +12,76 @@
 warnings.filterwarnings("ignore", category=RuntimeWarning)


-class ESGD(Optimizer):
-    """Entropy SGD optimizer
+class EntropySgd(Optimizer):
+    """Entropy-SGD optimizer. This implementation can apply Nesterov momentum, as in the Keras SGD implementation.
     # Arguments
-        lr: float >= 0. Learning rate.
-        sgld_step (eta prime): float > 0. The inner sgld step size
-        L: int > 0. Number of Langevin steps.
-        gamma : float >0 . the scope allow the inner SGLD to explore further away from the parameters
-        epsilon : float >0. thermal noise
+        lr: float > 0. Learning rate.
+        sgld_step (eta prime): float > 0. Step size of the inner SGLD loop (the x' update).
+        L: int > 0. Number of Langevin steps (inner loop) used to estimate the gradient.
+        gamma: float > 0. Coupling scope; controls how far the inner SGLD is allowed to explore away from the current parameters when estimating the negative local entropy.
+        scoping: float >= 0. Gamma scoping rate; gamma is scaled as gamma*(1+scoping)^t.
+        sgld_noise: float > 0. Thermal noise used in the Langevin dynamics update (inner loop).
+        alpha: float, 0 < alpha < 1. Exponential averaging parameter for the estimate of mu. More details in the paper.
+        momentum: float >= 0. Accelerates SGD in the relevant direction and dampens oscillations.
+        nesterov: boolean. Whether to apply Nesterov momentum.
+        decay: float >= 0. Learning rate decay; 0 for no decay.
     #Reference
         - [ENTROPY-SGD: BIASING GRADIENT DESCENT INTO WIDE VALLEYS](https://arxiv.org/pdf/1611.01838.pdf)
     """
-    def __init__(self, lr=1., sgld_step=0.1, L=20, gamma=0.03, epsilon=1e-4, alpha=0.75, scoping=1e-3, momentum=0., nesterov=False, **kwargs):
-        super(ESGD, self).__init__(**kwargs)
+    def __init__(self, lr=1., sgld_step=0.1, L=20, gamma=0.03, sgld_noise=1e-4, alpha=0.75, scoping=1e-3, momentum=0., nesterov=False, decay=0., **kwargs):
+        super(EntropySgd, self).__init__(**kwargs)
+        self.scoping = scoping
+        self.momentum = momentum
+        self.nesterov = nesterov
+        self.L = L
+        self.alpha = alpha
+        self.initial_decay = decay
         with K.name_scope(self.__class__.__name__):
-            self.scoping = scoping
-            self.momentum = momentum
-            self.nesterov = nesterov
             self.lr = K.variable(lr, name='lr')
             self.sgld_step = K.variable(sgld_step, name='sgld_step')
-            self.gamma = K.variable(gamma, name='sgld_step')
-            self.epsilon = K.variable(epsilon, name='sgld_step')
-            self.L = L
+            self.gamma = K.variable(gamma, name='gamma')
+            self.sgld_noise = K.variable(sgld_noise, name='sgld_noise')

             self.state_counter = K.variable(0, dtype='int64', name='state_counter')
-            self.alpha = alpha

             self.num_steps = K.variable(-1, dtype='int32')
-            self.iterator = K.variable(0, dtype='int32', name='state_counter')
+            self.iterator = K.variable(0, dtype='int32', name='iterator')
+            self.iterations = K.variable(0, dtype='int64', name='iterations')
+            self.decay = K.variable(self.initial_decay, name='decay')


     @interfaces.legacy_get_updates_support
     def get_updates(self, loss, params):

         self.updates = []
         self.updates.append(K.update_add(self.state_counter, 1))
         self.updates.append(K.update_add(self.iterator, 1))
+        self.updates.append(K.update_add(self.iterations, 1))
+
+        lr = self.lr
+        ## lr exponential decay
+        if self.initial_decay > 0:
+            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
+                                                      K.dtype(self.decay))))

         shapes = [K.int_shape(p) for p in params]
         x = [K.update(K.zeros(shape), p) for shape, p in zip(shapes, params)]
         mu = [K.update(K.zeros(shape), p) for shape, p in zip(shapes, params)]

         grads = self.get_gradients(loss, params)

         moments = [K.zeros(shape, name='moment_' + str(i))
                    for (i, shape) in enumerate(shapes)]

         for x_i, x_prime_i, mu_i, g, m in zip(x, params, mu, grads, moments):

-            ## we update x_prime (if we are in LAngevin steps, we update otherwise we switch to parameters x_i)
+            ## We update x_prime (during Langevin steps we update it; otherwise we switch back to the parameters x_i)
             dx_prime_i = g - self.gamma*(x_i - x_prime_i)
             x_prime_update_i = K.switch(K.any(K.stack([K.equal(self.state_counter, 0),
                                                        K.equal(self.num_steps, self.iterator)], axis=0), axis=0),
                                         x_i,
-                                        x_prime_i - self.sgld_step*dx_prime_i + K.sqrt(self.sgld_step)*self.epsilon*K.random_normal(K.int_shape(x_prime_i))
+                                        x_prime_i - self.sgld_step*dx_prime_i + K.sqrt(self.sgld_step)*self.sgld_noise*K.random_normal(K.int_shape(x_prime_i))
                                         )
             # Apply constraints.
             if getattr(x_prime_i, 'constraint', None) is not None:
@@ -76,16 +94,19 @@ def get_updates(self, loss, params):
                                    (1-self.alpha)*mu_i + self.alpha*x_prime_i)
             self.updates.append(K.update(mu_i, mu_update_i))

-            ## We update x every L steps (Note that at step L+1 or when step < L, the update term is 0. This is coherent with the paper)
-            ## As they described in the paper, we remove the gamma from the update because it interferes with the learning annealing
-            ## After each update we rescale gamme with a factor of 1.001
-
-
-            ## Momentum and Nesterov
-            v = self.momentum * m - lr * (x_i-mu_i) # velocity
-            self.updates.append(K.update(m, v))
+            ## As described in the paper, gamma is removed from the update because it interferes with the learning-rate annealing
+            ## gamma is rescaled by a factor of (1 + scoping) as training proceeds (see the gamma scoping update below)
+            ## The following lines concern the outer-loop updates
+
+            ## Nesterov's momentum
+            gradient = (x_i-mu_i)
+            v = self.momentum * m - lr * gradient  # velocity
+            self.updates.append(
+                K.update(m,
+                         K.switch(K.equal(self.state_counter, self.L+1), v, m)))
             if self.nesterov:
-                new_x_i = x_i + self.momentum * v - lr * (x_i-mu_i)
+                new_x_i = x_i + self.momentum * v - lr * gradient
             else:
                 new_x_i = x_i + v

@@ -94,7 +115,7 @@ def get_updates(self, loss, params):


         ## Gamma scoping
-        gamma_update = K.switch(self.state_counter<self.L , self.gamma, self.gamma*(1. + self.scoping) )
+        gamma_update = K.switch(K.equal(self.state_counter, self.L+1), self.gamma, self.gamma*(1. + self.scoping))
         self.updates.append(K.update(self.gamma, gamma_update))


@@ -107,79 +128,12 @@ def get_config(self):
         config = {'lr': float(K.get_value(self.lr)),
                   'sgld_step': float(K.get_value(self.sgld_step)),
                   'gamma': float(K.get_value(self.gamma)),
-                  'epsilon': float(K.get_value(self.epsilon)),
-                  'L': int(K.get_value(self.L))}
-        base_config = super(SGLD, self).get_config()
+                  'sgld_noise': float(K.get_value(self.sgld_noise)),
+                  'L': self.L,
+                  'alpha': self.alpha,
+                  'scoping': self.scoping,
+                  'momentum': self.momentum,
+                  'nesterov': self.nesterov,
+                  'decay': float(K.get_value(self.decay))}
+        base_config = super(EntropySgd, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
-
-
-
-
-
-
-class History(keras.callbacks.Callback):
-
-    def __init__(self):
-        super(History, self).__init__()
-        self.loss = []
-        self.val_loss = []
-        self.eff_loss = []
-        self.eff_val_loss = []
-        self.i = 0
-
-    def on_train_begin(self, logs=None):
-        self.epochs = self.params['epochs']
-        K.set_value(self.model.optimizer.num_steps,
-                    math.ceil(self.params["samples"]/self.params["batch_size"])*self.params["epochs"])
-
-    def on_epoch_begin(self, epoch, logs={}):
-        self.loss_buff = []
-        self.val_loss_buff = []
-        print('Epoch : %d/%d, Effective Epoch : %d/%d' % (epoch + 1, self.epochs, (epoch+1)//self.model.optimizer.L+1, self.epochs//self.model.optimizer.L))
-        self.target = self.params['samples']
-        self.progbar = Progbar(target=self.target,
-                               verbose=1,
-                               stateful_metrics=['loss', 'val_loss'])
-        self.seen = 0
-
-    def on_train_batch_begin(self, batch, logs=None):
-        if self.seen < self.target:
-            self.log_values = []
-
-    def on_train_batch_end(self, batch, logs={}):
-        self.i = self.i+1
-        batch_size = logs.get('size', 0)
-        self.seen += batch_size
-
-        if K.eval(self.model.optimizer.state_counter) == 0:
-            self.loss_buff.append(logs.get('loss'))
-        self.log_values.append(('loss', np.mean(self.loss_buff)))
-
-        # Skip progbar update for the last batch;
-        # will be handled by on_epoch_end.
-        if self.seen < self.target:
-            self.progbar.update(self.seen, self.log_values)
-        else:
-            self.progbar.update(self.target-1, self.log_values)
-
-    def on_test_batch_end(self, batch, logs={}):
-        self.val_loss_buff.append(logs.get('loss'))
-
-    def on_epoch_end(self, epoch, logs):
-        self.loss.append(np.mean(self.loss_buff))
-        self.val_loss.append(np.mean(self.val_loss_buff))
-
-        if (epoch+1)%self.model.optimizer.L == 0:
-            self.eff_loss.append(np.mean(self.loss[-self.model.optimizer.L:]))
-            self.eff_val_loss.append(np.mean(self.val_loss[-self.model.optimizer.L:]))
-
-        self.log_values.append(('val_loss', np.mean(self.val_loss_buff)))
-        self.progbar.update(self.target, self.log_values)

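To close, a minimal usage sketch, not part of this commit: the import paths, the toy regression data, and the continued availability of the History callback (removed from this file above, presumably relocated within the same commit) are all assumptions.

# Minimal usage sketch under the assumptions stated above.
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from EntropySGD.EntropySGD import EntropySgd  # assumed import path
from EntropySGD.utils import History          # hypothetical location after this commit

model = Sequential([Dense(32, activation='relu', input_shape=(10,)),
                    Dense(1)])
opt = EntropySgd(lr=1., sgld_step=0.1, L=20, gamma=0.03, sgld_noise=1e-4,
                 alpha=0.75, scoping=1e-3, momentum=0.9, nesterov=True)
model.compile(optimizer=opt, loss='mse')

# The History callback sets optimizer.num_steps on train begin and reports
# "effective" epochs: only one outer update happens every L optimizer steps,
# so epochs is best chosen as a multiple of L.
history = History()
x, y = np.random.randn(512, 10), np.random.randn(512, 1)
model.fit(x, y, batch_size=32, epochs=40, validation_split=0.1,
          callbacks=[history])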
