diff --git a/STYLE_GUIDE.md b/STYLE_GUIDE.md index c42ce4e423..045ef6fbd8 100644 --- a/STYLE_GUIDE.md +++ b/STYLE_GUIDE.md @@ -187,7 +187,7 @@ they supersede all previous conventions. 1. Submodule names should be singular, except where they overlap to TF. Justification: Having plural looks strange in user code, ie, - tf.optimizer.Foo reads nicer than tf.keras.optimizers.Foo since submodules + tf.optimizer.Foo reads nicer than tf_keras.optimizers.Foo since submodules are only used to access a single, specific thing (at a time). 1. Use `tf.newaxis` rather than `None` to `tf.expand_dims`. diff --git a/SUBSTRATES.md b/SUBSTRATES.md index 35ccadf034..99edbf1ee0 100644 --- a/SUBSTRATES.md +++ b/SUBSTRATES.md @@ -75,7 +75,7 @@ vmap, etc.), we will special-case using an `if JAX_MODE:` block. tests, TFP impl, etc), with `tfp.math.value_and_gradient` or similar. Then, we can special-case `JAX_MODE` inside the body of `value_and_gradient`. -* __`tf.Variable`, `tf.keras.optimizers.Optimizer`__ +* __`tf.Variable`, `tf_keras.optimizers.Optimizer`__ TF provides a `Variable` abstraction so that graph functions may modify state, including using the Keras `Optimizer` subclasses like `Adam`. JAX, diff --git a/tensorflow_probability/examples/BUILD b/tensorflow_probability/examples/BUILD index 2193b49f76..58c89b9220 100644 --- a/tensorflow_probability/examples/BUILD +++ b/tensorflow_probability/examples/BUILD @@ -84,6 +84,7 @@ py_library( # six dep, # tensorflow dep, "//tensorflow_probability", + "//tensorflow_probability/python/internal:tf_keras", ], ) diff --git a/tensorflow_probability/examples/bayesian_neural_network.py b/tensorflow_probability/examples/bayesian_neural_network.py index fe1f08cd4a..976a99de69 100644 --- a/tensorflow_probability/examples/bayesian_neural_network.py +++ b/tensorflow_probability/examples/bayesian_neural_network.py @@ -37,6 +37,7 @@ import numpy as np import tensorflow.compat.v2 as tf import tensorflow_probability as tfp +from tensorflow_probability.python.internal import tf_keras tf.enable_v2_behavior() @@ -174,26 +175,26 @@ def create_model(): # and two fully connected dense layers. We use the Flipout # Monte Carlo estimator for these layers, which enables lower variance # stochastic gradients than naive reparameterization. - model = tf.keras.models.Sequential([ + model = tf_keras.models.Sequential([ tfp.layers.Convolution2DFlipout( 6, kernel_size=5, padding='SAME', kernel_divergence_fn=kl_divergence_function, activation=tf.nn.relu), - tf.keras.layers.MaxPooling2D( + tf_keras.layers.MaxPooling2D( pool_size=[2, 2], strides=[2, 2], padding='SAME'), tfp.layers.Convolution2DFlipout( 16, kernel_size=5, padding='SAME', kernel_divergence_fn=kl_divergence_function, activation=tf.nn.relu), - tf.keras.layers.MaxPooling2D( + tf_keras.layers.MaxPooling2D( pool_size=[2, 2], strides=[2, 2], padding='SAME'), tfp.layers.Convolution2DFlipout( 120, kernel_size=5, padding='SAME', kernel_divergence_fn=kl_divergence_function, activation=tf.nn.relu), - tf.keras.layers.Flatten(), + tf_keras.layers.Flatten(), tfp.layers.DenseFlipout( 84, kernel_divergence_fn=kl_divergence_function, activation=tf.nn.relu), @@ -203,7 +204,7 @@ def create_model(): ]) # Model compilation. - optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate) + optimizer = tf_keras.optimizers.Adam(lr=FLAGS.learning_rate) # We use the categorical_crossentropy loss since the MNIST dataset contains # ten labels. 
The Keras API will then automatically add the # Kullback-Leibler divergence (contained on the individual layers of @@ -214,7 +215,7 @@ def create_model(): return model -class MNISTSequence(tf.keras.utils.Sequence): +class MNISTSequence(tf_keras.utils.Sequence): """Produces a sequence of MNIST digits with labels.""" def __init__(self, data=None, batch_size=128, fake_data_size=None): @@ -272,7 +273,7 @@ def __preprocessing(images, labels): images = 2 * (images / 255.) - 1. images = images[..., tf.newaxis] - labels = tf.keras.utils.to_categorical(labels) + labels = tf_keras.utils.to_categorical(labels) return images, labels def __len__(self): @@ -298,7 +299,7 @@ def main(argv): heldout_seq = MNISTSequence(batch_size=FLAGS.batch_size, fake_data_size=NUM_HELDOUT_EXAMPLES) else: - train_set, heldout_set = tf.keras.datasets.mnist.load_data() + train_set, heldout_set = tf_keras.datasets.mnist.load_data() train_seq = MNISTSequence(data=train_set, batch_size=FLAGS.batch_size) heldout_seq = MNISTSequence(data=heldout_set, batch_size=FLAGS.batch_size) diff --git a/tensorflow_probability/examples/cifar10_bnn.py b/tensorflow_probability/examples/cifar10_bnn.py index 666f5aca1b..4504667bd7 100644 --- a/tensorflow_probability/examples/cifar10_bnn.py +++ b/tensorflow_probability/examples/cifar10_bnn.py @@ -47,6 +47,8 @@ from tensorflow_probability.examples.models.bayesian_resnet import bayesian_resnet from tensorflow_probability.examples.models.bayesian_vgg import bayesian_vgg +from tensorflow_probability.python.internal import tf_keras + matplotlib.use("Agg") warnings.simplefilter(action="ignore") tfd = tfp.distributions @@ -169,7 +171,7 @@ def main(argv): if FLAGS.fake_data: (x_train, y_train), (x_test, y_test) = build_fake_data() else: - (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data() + (x_train, y_train), (x_test, y_test) = tf_keras.datasets.cifar10.load_data() (images, labels, handle, training_iterator, diff --git a/tensorflow_probability/examples/disentangled_vae.py b/tensorflow_probability/examples/disentangled_vae.py index 483153adb9..de8f823ff9 100644 --- a/tensorflow_probability/examples/disentangled_vae.py +++ b/tensorflow_probability/examples/disentangled_vae.py @@ -102,10 +102,12 @@ from absl import app from absl import flags -import tensorflow.compat.v1 as tf +import tensorflow.compat.v1 as tf1 +import tensorflow.compat.v2 as tf import tensorflow_probability as tfp from tensorflow_probability.examples import sprites_dataset +from tensorflow_probability.python.internal import tf_keras tfd = tfp.distributions @@ -178,7 +180,7 @@ FLAGS = flags.FLAGS -class LearnableMultivariateNormalDiag(tf.keras.Model): +class LearnableMultivariateNormalDiag(tf_keras.v1.Model): """Learnable multivariate diagonal normal distribution. The model is a multivariate normal distribution with learnable @@ -193,19 +195,19 @@ def __init__(self, dimensions): distribution. """ super(LearnableMultivariateNormalDiag, self).__init__() - with tf.compat.v1.name_scope(self._name): + with tf1.name_scope(self._name): self.dimensions = dimensions - self._mean = tf.compat.v2.Variable( - tf.random.normal([dimensions], stddev=0.1), name="mean") + self._mean = tf.Variable( + tf1.random.normal([dimensions], stddev=0.1), name="mean") # Initialize the std dev such that it will be close to 1 after a softplus # function. 
- self._untransformed_stddev = tf.compat.v2.Variable( - tf.random.normal([dimensions], mean=0.55, stddev=0.1), + self._untransformed_stddev = tf.Variable( + tf1.random.normal([dimensions], mean=0.55, stddev=0.1), name="untransformed_stddev") def __call__(self, *args, **kwargs): # Allow this Model to be called without inputs. - dummy = tf.zeros(self.dimensions) + dummy = tf1.zeros(self.dimensions) return super(LearnableMultivariateNormalDiag, self).__call__( dummy, *args, **kwargs) @@ -221,7 +223,7 @@ def call(self, inputs): dimensions]. """ del inputs # unused - with tf.compat.v1.name_scope(self._name): + with tf1.name_scope(self._name): return tfd.MultivariateNormalDiag(self.loc, self.scale_diag) @property @@ -232,10 +234,10 @@ def loc(self): @property def scale_diag(self): """The diagonal standard deviation of the normal distribution.""" - return tf.nn.softplus(self._untransformed_stddev) + 1e-5 # keep > 0 + return tf1.nn.softplus(self._untransformed_stddev) + 1e-5 # keep > 0 -class LearnableMultivariateNormalDiagCell(tf.keras.Model): +class LearnableMultivariateNormalDiagCell(tf_keras.v1.Model): """Multivariate diagonal normal distribution RNN cell. The model is an LSTM-based recurrent function that computes the @@ -254,8 +256,8 @@ def __init__(self, dimensions, hidden_size): super(LearnableMultivariateNormalDiagCell, self).__init__() self.dimensions = dimensions self.hidden_size = hidden_size - self.lstm_cell = tf.keras.layers.LSTMCell(hidden_size) - self.output_layer = tf.keras.layers.Dense(2*dimensions) + self.lstm_cell = tf_keras.v1.layers.LSTMCell(hidden_size) + self.output_layer = tf_keras.v1.layers.Dense(2*dimensions) def zero_state(self, sample_batch_shape=()): """Returns an initial state for the LSTM cell. @@ -268,12 +270,11 @@ def zero_state(self, sample_batch_shape=()): A tuple of the initial previous output at timestep 0 of shape [sample_batch_shape, dimensions], and the cell state. """ - h0 = tf.zeros([1, self.hidden_size]) - c0 = tf.zeros([1, self.hidden_size]) - combined_shape = tf.concat((tf.convert_to_tensor( - value=sample_batch_shape, dtype=tf.int32), [self.dimensions]), - axis=-1) - previous_output = tf.zeros(combined_shape) + h0 = tf1.zeros([1, self.hidden_size]) + c0 = tf1.zeros([1, self.hidden_size]) + combined_shape = tf1.concat((tf1.convert_to_tensor( + value=sample_batch_shape, dtype=tf1.int32), [self.dimensions]), axis=-1) + previous_output = tf1.zeros(combined_shape) return previous_output, (h0, c0) def call(self, inputs, state): @@ -298,20 +299,20 @@ def call(self, inputs, state): # In order to allow the user to pass in a single example without a batch # dimension, we always expand the input to at least two dimensions, then # fix the output shape to remove the batch dimension if necessary. 
- original_shape = inputs.shape - if len(original_shape) < 2: - inputs = tf.reshape(inputs, [1, -1]) + # original_shape = inputs.shape + # if len(original_shape) < 2: + # inputs = tf1.reshape(inputs, [1, -1]) out, state = self.lstm_cell(inputs, state) out = self.output_layer(out) - correct_shape = tf.concat((original_shape[:-1], tf.shape(input=out)[-1:]), - 0) - out = tf.reshape(out, correct_shape) + # correct_shape = tf1.concat( + # (original_shape[:-1], tf1.shape(input=out)[-1:]), 0) + # out = tf1.reshape(out, correct_shape) loc = out[..., :self.dimensions] - scale_diag = tf.nn.softplus(out[..., self.dimensions:]) + 1e-5 # keep > 0 + scale_diag = tf1.nn.softplus(out[..., self.dimensions:]) + 1e-5 # keep > 0 return tfd.MultivariateNormalDiag(loc=loc, scale_diag=scale_diag), state -class Decoder(tf.keras.Model): +class Decoder(tf_keras.v1.Model): """Probabilistic decoder for `p(x_t | z_t, f)`. The decoder generates a sequence of image frames `x_{1:T}` from @@ -341,11 +342,11 @@ def __init__(self, hidden_size, channels=3): """ super(Decoder, self).__init__() self.hidden_size = hidden_size - activation = tf.nn.leaky_relu - self.dense = tf.keras.layers.Dense(hidden_size, activation=activation) + activation = tf1.nn.leaky_relu + self.dense = tf_keras.v1.layers.Dense(hidden_size, activation=activation) # Spatial sizes: (1,1) -> (8,8) -> (16,16) -> (32,32) -> (64,64). - conv_transpose = functools.partial( - tf.keras.layers.Conv2DTranspose, padding="SAME", activation=activation) + conv_transpose = functools.partial(tf_keras.v1.layers.Conv2DTranspose, + padding="SAME", activation=activation) self.conv_transpose1 = conv_transpose(256, 8, 1, padding="VALID") self.conv_transpose2 = conv_transpose(256, 3, 2) self.conv_transpose3 = conv_transpose(256, 3, 2) @@ -367,27 +368,27 @@ def call(self, inputs): batch_size, timesteps, height, width, channels]. """ # We explicitly broadcast f to the same shape as z other than the final - # dimension, because `tf.concat` can't automatically do this. + # dimension, because `tf1.concat` can't automatically do this. dynamic, static = inputs - timesteps = tf.shape(input=dynamic)[-2] - static = static[..., tf.newaxis, :] + tf.zeros([timesteps, 1]) - latents = tf.concat([dynamic, static], axis=-1) # (sample, N, T, latents) + timesteps = tf1.shape(input=dynamic)[-2] + static = static[..., tf1.newaxis, :] + tf1.zeros([timesteps, 1]) + latents = tf1.concat([dynamic, static], axis=-1) # (sample, N, T, latents) out = self.dense(latents) - out = tf.reshape(out, (-1, 1, 1, self.hidden_size)) + out = tf1.reshape(out, (-1, 1, 1, self.hidden_size)) out = self.conv_transpose1(out) out = self.conv_transpose2(out) out = self.conv_transpose3(out) out = self.conv_transpose4(out) # (sample*N*T, h, w, c) - expanded_shape = tf.concat( - (tf.shape(input=latents)[:-1], tf.shape(input=out)[1:]), axis=0) - out = tf.reshape(out, expanded_shape) # (sample, N, T, h, w, c) + expanded_shape = tf1.concat( + (tf1.shape(input=latents)[:-1], tf1.shape(input=out)[1:]), axis=0) + out = tf1.reshape(out, expanded_shape) # (sample, N, T, h, w, c) return tfd.Independent( distribution=tfd.Normal(loc=out, scale=1.), reinterpreted_batch_ndims=3, # wrap (h, w, c) name="decoded_image") -class Compressor(tf.keras.Model): +class Compressor(tf_keras.v1.Model): """Feature extractor. This convolutional model aims to extract features corresponding to a @@ -408,7 +409,7 @@ def __init__(self, hidden_size): self.hidden_size = hidden_size # Spatial sizes: (64,64) -> (32,32) -> (16,16) -> (8,8) -> (1,1). 
conv = functools.partial( - tf.keras.layers.Conv2D, padding="SAME", activation=tf.nn.leaky_relu) + tf_keras.v1.layers.Conv2D, padding="SAME", activation=tf1.nn.leaky_relu) self.conv1 = conv(256, 3, 2) self.conv2 = conv(256, 3, 2) self.conv3 = conv(256, 3, 2) @@ -426,18 +427,18 @@ def call(self, inputs): A batch of intermediate representations of shape [sample_shape, batch_size, timesteps, hidden_size]. """ - image_shape = tf.shape(input=inputs)[-3:] - collapsed_shape = tf.concat(([-1], image_shape), axis=0) - out = tf.reshape(inputs, collapsed_shape) # (sample*batch*T, h, w, c) + image_shape = tf1.shape(input=inputs)[-3:] + collapsed_shape = tf1.concat(([-1], image_shape), axis=0) + out = tf1.reshape(inputs, collapsed_shape) # (sample*batch*T, h, w, c) out = self.conv1(out) out = self.conv2(out) out = self.conv3(out) out = self.conv4(out) - expanded_shape = tf.concat((tf.shape(input=inputs)[:-3], [-1]), axis=0) - return tf.reshape(out, expanded_shape) # (sample, batch, T, hidden) + expanded_shape = tf1.concat((tf1.shape(input=inputs)[:-3], [-1]), axis=0) + return tf1.reshape(out, expanded_shape) # (sample, batch, T, hidden) -class EncoderStatic(tf.keras.Model): +class EncoderStatic(tf_keras.v1.Model): """Probabilistic encoder for the time-invariant latent variable `f`. The conditional distribution `q(f | x_{1:T})` is a multivariate @@ -476,10 +477,10 @@ def __init__(self, latent_size, hidden_size): super(EncoderStatic, self).__init__() self.latent_size = latent_size self.hidden_size = hidden_size - self.bilstm = tf.keras.layers.Bidirectional( - tf.keras.layers.LSTM(hidden_size), + self.bilstm = tf_keras.v1.layers.Bidirectional( + tf_keras.v1.layers.LSTM(hidden_size), merge_mode="sum") - self.output_layer = tf.keras.layers.Dense(2*latent_size) + self.output_layer = tf_keras.v1.layers.Dense(2*latent_size) def call(self, inputs): """Runs the model to generate a distribution `q(f | x_{1:T})`. @@ -500,18 +501,18 @@ def call(self, inputs): """ # TODO(dusenberrymw): Remove these reshaping commands after b/113126249 is # fixed. - collapsed_shape = tf.concat(([-1], tf.shape(input=inputs)[-2:]), axis=0) - out = tf.reshape(inputs, collapsed_shape) # (sample*batch_size, T, hidden) + collapsed_shape = tf1.concat(([-1], tf1.shape(input=inputs)[-2:]), axis=0) + out = tf1.reshape(inputs, collapsed_shape) # (sample*batch_size, T, hidden) out = self.bilstm(out) # (sample*batch_size, hidden) - expanded_shape = tf.concat((tf.shape(input=inputs)[:-2], [-1]), axis=0) - out = tf.reshape(out, expanded_shape) # (sample, batch_size, hidden) + expanded_shape = tf1.concat((tf1.shape(input=inputs)[:-2], [-1]), axis=0) + out = tf1.reshape(out, expanded_shape) # (sample, batch_size, hidden) out = self.output_layer(out) # (sample, batch_size, 2*latent_size) loc = out[..., :self.latent_size] - scale_diag = tf.nn.softplus(out[..., self.latent_size:]) + 1e-5 # keep > 0 + scale_diag = tf1.nn.softplus(out[..., self.latent_size:]) + 1e-5 # keep > 0 return tfd.MultivariateNormalDiag(loc=loc, scale_diag=scale_diag) -class EncoderDynamicFactorized(tf.keras.Model): +class EncoderDynamicFactorized(tf_keras.v1.Model): """Probabilistic encoder for the time-variant latent variable `z_t`. 
The conditional distribution `q(z_t | x_t)` is a multivariate normal @@ -542,8 +543,9 @@ def __init__(self, latent_size, hidden_size): super(EncoderDynamicFactorized, self).__init__() self.latent_size = latent_size self.hidden_size = hidden_size - self.dense = tf.keras.layers.Dense(hidden_size, activation=tf.nn.leaky_relu) - self.output_layer = tf.keras.layers.Dense(2*latent_size) + self.dense = tf_keras.v1.layers.Dense(hidden_size, + activation=tf1.nn.leaky_relu) + self.output_layer = tf_keras.v1.layers.Dense(2*latent_size) def call(self, inputs): """Runs the model to generate a distribution `q(z_{1:T} | x_{1:T})`. @@ -562,11 +564,11 @@ def call(self, inputs): out = self.dense(inputs) # (..., batch, time, hidden) out = self.output_layer(out) # (..., batch, time, 2*latent) loc = out[..., :self.latent_size] - scale_diag = tf.nn.softplus(out[..., self.latent_size:]) + 1e-5 # keep > 0 + scale_diag = tf1.nn.softplus(out[..., self.latent_size:]) + 1e-5 # keep > 0 return tfd.MultivariateNormalDiag(loc=loc, scale_diag=scale_diag) -class EncoderDynamicFull(tf.keras.Model): +class EncoderDynamicFull(tf_keras.v1.Model): """Probabilistic encoder for the time-variant latent variable `z_t`. The conditional distribution `q(z_{1:T} | x_{1:T}, f)` is a @@ -601,11 +603,11 @@ def __init__(self, latent_size, hidden_size): super(EncoderDynamicFull, self).__init__() self.latent_size = latent_size self.hidden_size = hidden_size - self.bilstm = tf.keras.layers.Bidirectional( - tf.keras.layers.LSTM(hidden_size, return_sequences=True), + self.bilstm = tf_keras.v1.layers.Bidirectional( + tf_keras.v1.layers.LSTM(hidden_size, return_sequences=True), merge_mode="sum") - self.rnn = tf.keras.layers.SimpleRNN(hidden_size, return_sequences=True) - self.output_layer = tf.keras.layers.Dense(2*latent_size) + self.rnn = tf_keras.v1.layers.SimpleRNN(hidden_size, return_sequences=True) + self.output_layer = tf_keras.v1.layers.Dense(2*latent_size) def call(self, inputs): """Runs the model to generate a distribution `q(z_{1:T} | x_{1:T}, f)`. @@ -629,37 +631,37 @@ def call(self, inputs): sample. """ # We explicitly broadcast `x` and `f` to the same shape other than the final - # dimension, because `tf.concat` can't automatically do this. This will + # dimension, because `tf1.concat` can't automatically do this. This will # entail adding a `timesteps` dimension to `f` to give the shape `(..., # batch, timesteps, latent)`, and then broadcasting the sample shapes of # both tensors to the same shape. 
features, static_sample = inputs - length = tf.shape(input=features)[-2] - static_sample = static_sample[..., tf.newaxis, :] + tf.zeros([length, 1]) - sample_shape_static = tf.shape(input=static_sample)[:-3] - sample_shape_inputs = tf.shape(input=features)[:-3] - broadcast_shape_inputs = tf.concat((sample_shape_static, [1, 1, 1]), 0) - broadcast_shape_static = tf.concat((sample_shape_inputs, [1, 1, 1]), 0) - features = features + tf.zeros(broadcast_shape_inputs) - static_sample = static_sample + tf.zeros(broadcast_shape_static) + length = tf1.shape(input=features)[-2] + static_sample = static_sample[..., tf1.newaxis, :] + tf1.zeros([length, 1]) + sample_shape_static = tf1.shape(input=static_sample)[:-3] + sample_shape_inputs = tf1.shape(input=features)[:-3] + broadcast_shape_inputs = tf1.concat((sample_shape_static, [1, 1, 1]), 0) + broadcast_shape_static = tf1.concat((sample_shape_inputs, [1, 1, 1]), 0) + features = features + tf1.zeros(broadcast_shape_inputs) + static_sample = static_sample + tf1.zeros(broadcast_shape_static) # `combined` will have shape (..., batch, T, hidden+latent). - combined = tf.concat((features, static_sample), axis=-1) + combined = tf1.concat((features, static_sample), axis=-1) # TODO(dusenberrymw): Remove these reshaping commands after b/113126249 is # fixed. - collapsed_shape = tf.concat(([-1], tf.shape(input=combined)[-2:]), axis=0) - out = tf.reshape(combined, collapsed_shape) + collapsed_shape = tf1.concat(([-1], tf1.shape(input=combined)[-2:]), axis=0) + out = tf1.reshape(combined, collapsed_shape) out = self.bilstm(out) # (sample*batch, T, hidden_size) out = self.rnn(out) # (sample*batch, T, hidden_size) - expanded_shape = tf.concat( - (tf.shape(input=combined)[:-2], tf.shape(input=out)[1:]), axis=0) - out = tf.reshape(out, expanded_shape) # (sample, batch, T, hidden_size) + expanded_shape = tf1.concat( + (tf1.shape(input=combined)[:-2], tf1.shape(input=out)[1:]), axis=0) + out = tf1.reshape(out, expanded_shape) # (sample, batch, T, hidden_size) out = self.output_layer(out) # (sample, batch, T, 2*latent_size) loc = out[..., :self.latent_size] - scale_diag = tf.nn.softplus(out[..., self.latent_size:]) + 1e-5 # keep > 0 + scale_diag = tf1.nn.softplus(out[..., self.latent_size:]) + 1e-5 # keep > 0 return tfd.MultivariateNormalDiag(loc=loc, scale_diag=scale_diag) -class DisentangledSequentialVAE(tf.keras.Model): +class DisentangledSequentialVAE(tf_keras.v1.Model): """Disentangled Sequential Variational Autoencoder. The disentangled sequential variational autoencoder posits a generative @@ -812,8 +814,8 @@ def reconstruct(self, inputs, samples=1, sample_static=False, sample shape [sample_shape, samples, batch_size, timesteps, height, width, channels]. 
""" - batch_size = tf.shape(input=inputs)[-5] - length = len(tf.unstack(inputs, axis=-4)) # hack for graph mode + batch_size = tf1.shape(input=inputs)[-5] + length = len(tf1.unstack(inputs, axis=-4)) # hack for graph mode features = self.compressor(inputs) # (..., batch, timesteps, hidden) @@ -824,7 +826,7 @@ def reconstruct(self, inputs, samples=1, sample_static=False, static_sample, _ = self.sample_static_posterior(features, samples) if swap_static: - static_sample = tf.reverse(static_sample, axis=[1]) + static_sample = tf1.reverse(static_sample, axis=[1]) if sample_dynamic: dynamic_sample, _ = self.sample_dynamic_prior( @@ -834,7 +836,7 @@ def reconstruct(self, inputs, samples=1, sample_static=False, features, samples, static_sample) if swap_dynamic: - dynamic_sample = tf.reverse(dynamic_sample, axis=[1]) + dynamic_sample = tf1.reverse(dynamic_sample, axis=[1]) likelihood = self.decoder((dynamic_sample, static_sample)) return likelihood @@ -856,7 +858,7 @@ def sample_static_prior(self, samples, batch_size, fixed=False): """ dist = self.static_prior() if fixed: # in either case, shape is (samples, batch, latent) - sample = dist.sample((samples, 1)) + tf.zeros([batch_size, 1]) + sample = dist.sample((samples, 1)) + tf1.zeros([batch_size, 1]) else: sample = dist.sample((samples, batch_size)) return sample, dist @@ -913,12 +915,12 @@ def sample_dynamic_prior(self, samples, batch_size, length, fixed=False): scale_diags.append(dist.parameters["scale_diag"]) sample_list.append(sample) - sample = tf.stack(sample_list, axis=2) - loc = tf.stack(locs, axis=2) - scale_diag = tf.stack(scale_diags, axis=2) + sample = tf1.stack(sample_list, axis=2) + loc = tf1.stack(locs, axis=2) + scale_diag = tf1.stack(scale_diags, axis=2) if fixed: # tile along the batch axis - sample = sample + tf.zeros([batch_size, 1, 1]) + sample = sample + tf1.zeros([batch_size, 1, 1]) return sample, tfd.MultivariateNormalDiag(loc=loc, scale_diag=scale_diag) @@ -967,15 +969,15 @@ def image_summary(seqs, name, num=None): num: Integer for the number of examples to visualize. Defaults to all examples. """ - seqs = tf.clip_by_value(seqs, 0., 1.) - seqs = tf.unstack(seqs[:num]) - joined_seqs = [tf.concat(tf.unstack(seq), 1) for seq in seqs] - joined_seqs = tf.expand_dims(tf.concat(joined_seqs, 0), 0) - tf.compat.v2.summary.image( + seqs = tf1.clip_by_value(seqs, 0., 1.) + seqs = tf1.unstack(seqs[:num]) + joined_seqs = [tf1.concat(tf1.unstack(seq), 1) for seq in seqs] + joined_seqs = tf1.expand_dims(tf1.concat(joined_seqs, 0), 0) + tf.summary.image( name, joined_seqs, max_outputs=1, - step=tf.compat.v1.train.get_or_create_global_step()) + step=tf1.train.get_or_create_global_step()) def visualize_reconstruction(inputs, reconstruct, num=3, name="reconstruction"): @@ -989,8 +991,8 @@ def visualize_reconstruction(inputs, reconstruct, num=3, name="reconstruction"): num: Integer for the number of examples to visualize. name: String name of this summary. """ - reconstruct = tf.clip_by_value(reconstruct, 0., 1.) - inputs_and_reconstruct = tf.concat((inputs[:num], reconstruct[:num]), axis=0) + reconstruct = tf1.clip_by_value(reconstruct, 0., 1.) + inputs_and_reconstruct = tf1.concat((inputs[:num], reconstruct[:num]), axis=0) image_summary(inputs_and_reconstruct, name) @@ -1006,9 +1008,9 @@ def visualize_qualitative_analysis(inputs, model, samples=1, batch_size=3, batch_size: Number of sequences to generate. length: Number of timesteps to generate for each sequence. 
""" - average = lambda dist: tf.reduce_mean( + average = lambda dist: tf1.reduce_mean( input_tensor=dist.mean(), axis=0) # avg over samples - with tf.compat.v1.name_scope("val_reconstruction"): + with tf1.name_scope("val_reconstruction"): reconstruct = functools.partial(model.reconstruct, inputs=inputs, samples=samples) visualize_reconstruction(inputs, average(reconstruct())) @@ -1021,7 +1023,7 @@ def visualize_qualitative_analysis(inputs, model, samples=1, batch_size=3, visualize_reconstruction(inputs, average(reconstruct(swap_dynamic=True)), name="swap_dynamic") - with tf.compat.v1.name_scope("generation"): + with tf1.name_scope("generation"): generate = functools.partial(model.generate, batch_size=batch_size, length=length, samples=samples) image_summary(average(generate(fix_static=True)), "fix_static") @@ -1037,15 +1039,15 @@ def summarize_dist_params(dist, name, name_scope="dist_params"): name: The name of the distribution. name_scope: The name scope of this summary. """ - with tf.compat.v1.name_scope(name_scope): - tf.compat.v2.summary.histogram( + with tf1.name_scope(name_scope): + tf.summary.histogram( name="{}/{}".format(name, "mean"), data=dist.mean(), - step=tf.compat.v1.train.get_or_create_global_step()) - tf.compat.v2.summary.histogram( + step=tf1.train.get_or_create_global_step()) + tf.summary.histogram( name="{}/{}".format(name, "stddev"), data=dist.stddev(), - step=tf.compat.v1.train.get_or_create_global_step()) + step=tf1.train.get_or_create_global_step()) def summarize_mean_in_nats_and_bits(inputs, units, name, @@ -1061,29 +1063,29 @@ def summarize_mean_in_nats_and_bits(inputs, units, name, nats_name_scope: The name scope of the nats summary. bits_name_scope: The name scope of the bits summary. """ - mean = tf.reduce_mean(input_tensor=inputs) - with tf.compat.v1.name_scope(nats_name_scope): - tf.compat.v2.summary.scalar( + mean = tf1.reduce_mean(input_tensor=inputs) + with tf1.name_scope(nats_name_scope): + tf.summary.scalar( name, mean, - step=tf.compat.v1.train.get_or_create_global_step()) - with tf.compat.v1.name_scope(bits_name_scope): - tf.compat.v2.summary.scalar( + step=tf1.train.get_or_create_global_step()) + with tf1.name_scope(bits_name_scope): + tf.summary.scalar( name, - mean / units / tf.math.log(2.), - step=tf.compat.v1.train.get_or_create_global_step()) + mean / units / tf1.math.log(2.), + step=tf1.train.get_or_create_global_step()) def main(argv): del argv # unused - tf.compat.v1.enable_eager_execution() - tf.compat.v1.set_random_seed(FLAGS.seed) + tf1.enable_eager_execution() + tf1.set_random_seed(FLAGS.seed) timestamp = datetime.strftime(datetime.today(), "%y%m%d_%H%M%S") FLAGS.logdir = FLAGS.logdir.format(timestamp=timestamp) FLAGS.model_dir = FLAGS.model_dir.format(timestamp=timestamp) - if not tf.io.gfile.exists(FLAGS.model_dir): - tf.io.gfile.makedirs(FLAGS.model_dir) + if not tf1.io.gfile.exists(FLAGS.model_dir): + tf1.io.gfile.makedirs(FLAGS.model_dir) sprites_data = sprites_dataset.SpritesDataset(fake_data=FLAGS.fake_data) @@ -1093,18 +1095,17 @@ def main(argv): hidden_size=FLAGS.hidden_size, channels=sprites_data.channels, latent_posterior=FLAGS.latent_posterior) - global_step = tf.compat.v1.train.get_or_create_global_step() - optimizer = tf.compat.v1.train.AdamOptimizer( - tf.compat.v1.train.cosine_decay(FLAGS.learning_rate, global_step, - FLAGS.max_steps)) + global_step = tf1.train.get_or_create_global_step() + optimizer = tf1.train.AdamOptimizer( + tf1.train.cosine_decay(FLAGS.learning_rate, global_step, FLAGS.max_steps)) - checkpoint = 
tf.train.Checkpoint(model=model, global_step=global_step, - optimizer=optimizer) - checkpoint_manager = tf.train.CheckpointManager( + checkpoint = tf1.train.Checkpoint(model=model, global_step=global_step, + optimizer=optimizer) + checkpoint_manager = tf1.train.CheckpointManager( checkpoint, directory=FLAGS.model_dir, max_to_keep=5) checkpoint.restore(checkpoint_manager.latest_checkpoint) - writer = tf.compat.v2.summary.create_file_writer(FLAGS.logdir) + writer = tf.summary.create_file_writer(FLAGS.logdir) writer.set_as_default() dataset = sprites_data.train.map(lambda *x: x[0]).shuffle(1000).repeat() @@ -1112,14 +1113,14 @@ def main(argv): if FLAGS.enable_debug_logging: for inputs in dataset.prefetch(buffer_size=None): - with tf.compat.v2.summary.record_if( - lambda: tf.math.equal(0, global_step % FLAGS.log_steps)): - tf.compat.v2.summary.histogram( + with tf.summary.record_if( + lambda: tf1.math.equal(0, global_step % FLAGS.log_steps)): + tf.summary.histogram( "image", data=inputs, - step=tf.compat.v1.train.get_or_create_global_step()) + step=tf1.train.get_or_create_global_step()) - with tf.GradientTape() as tape: + with tf1.GradientTape() as tape: features = model.compressor(inputs) # (batch, timesteps, hidden) static_sample, static_posterior = model.sample_static_posterior( features, FLAGS.num_samples) # (samples, batch, latent) @@ -1127,7 +1128,7 @@ def main(argv): features, FLAGS.num_samples, static_sample) # (sampl, N, T, latent) likelihood = model.decoder((dynamic_sample, static_sample)) - reconstruction = tf.reduce_mean( # integrate samples + reconstruction = tf1.reduce_mean( # integrate samples input_tensor=likelihood.mean()[:FLAGS.num_reconstruction_samples], axis=0) visualize_reconstruction(inputs, reconstruction, @@ -1146,17 +1147,17 @@ def main(argv): static_prior_log_prob = static_prior.log_prob(static_sample) static_posterior_log_prob = static_posterior.log_prob(static_sample) - dynamic_prior_log_prob = tf.reduce_sum( + dynamic_prior_log_prob = tf1.reduce_sum( input_tensor=dynamic_prior.log_prob(dynamic_sample), axis=-1) # sum time - dynamic_posterior_log_prob = tf.reduce_sum( + dynamic_posterior_log_prob = tf1.reduce_sum( input_tensor=dynamic_posterior.log_prob(dynamic_sample), axis=-1) # sum time - likelihood_log_prob = tf.reduce_sum( + likelihood_log_prob = tf1.reduce_sum( input_tensor=likelihood.log_prob(inputs), axis=-1) # sum time if FLAGS.enable_debug_logging: - with tf.compat.v1.name_scope("log_probs"): + with tf1.name_scope("log_probs"): summarize_mean_in_nats_and_bits( static_prior_log_prob, FLAGS.latent_size_static, "static_prior") summarize_mean_in_nats_and_bits( @@ -1172,40 +1173,40 @@ def main(argv): likelihood_log_prob, sprites_data.frame_size ** 2 * sprites_data.channels * sprites_data.length, "likelihood") - elbo = tf.reduce_mean(input_tensor=static_prior_log_prob - - static_posterior_log_prob + - dynamic_prior_log_prob - - dynamic_posterior_log_prob + likelihood_log_prob) + elbo = tf1.reduce_mean(input_tensor=static_prior_log_prob - + static_posterior_log_prob + + dynamic_prior_log_prob - + dynamic_posterior_log_prob + likelihood_log_prob) loss = -elbo - tf.compat.v2.summary.scalar( + tf.summary.scalar( "elbo", elbo, - step=tf.compat.v1.train.get_or_create_global_step()) + step=tf1.train.get_or_create_global_step()) grads = tape.gradient(loss, model.variables) - grads, global_norm = tf.clip_by_global_norm(grads, FLAGS.clip_norm) + grads, global_norm = tf1.clip_by_global_norm(grads, FLAGS.clip_norm) grads_and_vars = list(zip(grads, model.variables)) # allow 
reuse in py3 if FLAGS.enable_debug_logging: - with tf.compat.v1.name_scope("grads"): - tf.compat.v2.summary.scalar( + with tf1.name_scope("grads"): + tf.summary.scalar( "global_norm_grads", global_norm, - step=tf.compat.v1.train.get_or_create_global_step()) - tf.compat.v2.summary.scalar( + step=tf1.train.get_or_create_global_step()) + tf.summary.scalar( "global_norm_grads_clipped", - tf.linalg.global_norm(grads), - step=tf.compat.v1.train.get_or_create_global_step()) + tf1.linalg.global_norm(grads), + step=tf1.train.get_or_create_global_step()) for grad, var in grads_and_vars: - with tf.compat.v1.name_scope("grads"): - tf.compat.v2.summary.histogram( + with tf1.name_scope("grads"): + tf.summary.histogram( "{}/grad".format(var.name), data=grad, - step=tf.compat.v1.train.get_or_create_global_step()) - with tf.compat.v1.name_scope("vars"): - tf.compat.v2.summary.histogram( + step=tf1.train.get_or_create_global_step()) + with tf1.name_scope("vars"): + tf.summary.histogram( var.name, data=var, - step=tf.compat.v1.train.get_or_create_global_step()) + step=tf1.train.get_or_create_global_step()) optimizer.apply_gradients(grads_and_vars, global_step) is_log_step = global_step.numpy() % FLAGS.log_steps == 0 @@ -1214,7 +1215,7 @@ def main(argv): checkpoint_manager.save() print("ELBO ({}/{}): {}".format(global_step.numpy(), FLAGS.max_steps, elbo.numpy())) - with tf.compat.v2.summary.record_if(True): + with tf.summary.record_if(True): val_data = sprites_data.test.take(20) inputs = next(iter(val_data.shuffle(20).batch(3)))[0] visualize_qualitative_analysis(inputs, model, diff --git a/tensorflow_probability/examples/logistic_regression.py b/tensorflow_probability/examples/logistic_regression.py index 095d362d34..c2171a3e8e 100644 --- a/tensorflow_probability/examples/logistic_regression.py +++ b/tensorflow_probability/examples/logistic_regression.py @@ -25,6 +25,7 @@ import numpy as np import tensorflow.compat.v2 as tf import tensorflow_probability as tfp +from tensorflow_probability.python.internal import tf_keras tf.enable_v2_behavior() @@ -132,7 +133,7 @@ def toy_logistic_data(num_examples, input_size=2, weights_prior_stddev=5.0): return random_weights, random_bias, np.float32(design_matrix), labels -class ToyDataSequence(tf.keras.utils.Sequence): +class ToyDataSequence(tf_keras.utils.Sequence): """Creates a sequence of labeled points from provided numpy arrays.""" def __init__(self, features, labels, batch_size): @@ -177,7 +178,7 @@ def create_model(num_samples, num_dimensions): # parameterized by logits from a single linear layer. We use the Flipout # Monte Carlo estimator for the layer: this enables lower variance # stochastic gradients than naive reparameterization. - input_layer = tf.keras.layers.Input(shape=num_dimensions) + input_layer = tf_keras.layers.Input(shape=num_dimensions) dense_layer = tfp.layers.DenseFlipout( units=1, activation='sigmoid', @@ -186,8 +187,8 @@ def create_model(num_samples, num_dimensions): kernel_divergence_fn=kl_divergence_function)(input_layer) # Model compilation. - model = tf.keras.Model(inputs=input_layer, outputs=dense_layer) - optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate) + model = tf_keras.Model(inputs=input_layer, outputs=dense_layer) + optimizer = tf_keras.optimizers.Adam(lr=FLAGS.learning_rate) # We use the binary_crossentropy loss since this toy example contains # two labels. 
The Keras API will then automatically add the # Kullback-Leibler divergence (contained on the individual layers of diff --git a/tensorflow_probability/examples/models/bayesian_resnet.py b/tensorflow_probability/examples/models/bayesian_resnet.py index 1ad4f9be24..8a2c16e824 100644 --- a/tensorflow_probability/examples/models/bayesian_resnet.py +++ b/tensorflow_probability/examples/models/bayesian_resnet.py @@ -16,6 +16,7 @@ import tensorflow.compat.v1 as tf import tensorflow_probability as tfp +from tensorflow_probability.python.internal import tf_keras def bayesian_resnet(input_shape, @@ -42,7 +43,7 @@ def bayesian_resnet(input_shape, i.e. log_var <= log(kernel_posterior_scale_constraint). Returns: - tf.keras.Model. + tf_keras.Model. """ filters = [64, 128, 256, 512] @@ -59,7 +60,7 @@ def _untransformed_scale_constraint(t): stddev=kernel_posterior_scale_stddev), untransformed_scale_constraint=_untransformed_scale_constraint) - image = tf.keras.layers.Input(shape=input_shape, dtype='float32') + image = tf_keras.layers.Input(shape=input_shape, dtype='float32') x = tfp.layers.Convolution2DFlipout( 64, 3, @@ -75,23 +76,23 @@ def _untransformed_scale_constraint(t): strides[i], kernel_posterior_fn) - x = tf.keras.layers.BatchNormalization()(x) - x = tf.keras.layers.Activation('relu')(x) - x = tf.keras.layers.AveragePooling2D(4, 1)(x) - x = tf.keras.layers.Flatten()(x) + x = tf_keras.layers.BatchNormalization()(x) + x = tf_keras.layers.Activation('relu')(x) + x = tf_keras.layers.AveragePooling2D(4, 1)(x) + x = tf_keras.layers.Flatten()(x) x = tfp.layers.DenseFlipout( num_classes, kernel_posterior_fn=kernel_posterior_fn)(x) - model = tf.keras.Model(inputs=image, outputs=x, name='resnet18') + model = tf_keras.Model(inputs=image, outputs=x, name='resnet18') return model def _resnet_block(x, filters, kernel, stride, kernel_posterior_fn): """Network block for ResNet.""" - x = tf.keras.layers.BatchNormalization()(x) - x = tf.keras.layers.Activation('relu')(x) + x = tf_keras.layers.BatchNormalization()(x) + x = tf_keras.layers.Activation('relu')(x) if stride != 1 or filters != x.shape[1]: shortcut = _projection_shortcut(x, filters, stride, kernel_posterior_fn) @@ -104,8 +105,8 @@ def _resnet_block(x, filters, kernel, stride, kernel_posterior_fn): strides=stride, padding='same', kernel_posterior_fn=kernel_posterior_fn)(x) - x = tf.keras.layers.BatchNormalization()(x) - x = tf.keras.layers.Activation('relu')(x) + x = tf_keras.layers.BatchNormalization()(x) + x = tf_keras.layers.Activation('relu')(x) x = tfp.layers.Convolution2DFlipout( filters, @@ -113,7 +114,7 @@ def _resnet_block(x, filters, kernel, stride, kernel_posterior_fn): strides=1, padding='same', kernel_posterior_fn=kernel_posterior_fn)(x) - x = tf.keras.layers.add([x, shortcut]) + x = tf_keras.layers.add([x, shortcut]) return x diff --git a/tensorflow_probability/examples/models/bayesian_vgg.py b/tensorflow_probability/examples/models/bayesian_vgg.py index 339e4e6015..f3a8826e9e 100644 --- a/tensorflow_probability/examples/models/bayesian_vgg.py +++ b/tensorflow_probability/examples/models/bayesian_vgg.py @@ -16,6 +16,7 @@ import tensorflow.compat.v1 as tf import tensorflow_probability as tfp +from tensorflow_probability.python.internal import tf_keras def bayesian_vgg(input_shape, @@ -42,7 +43,7 @@ def bayesian_vgg(input_shape, i.e. log_var <= log(kernel_posterior_scale_constraint). Returns: - tf.keras.Model. + tf_keras.Model. 
""" filters = [64, 128, 256, 512, 512] @@ -59,7 +60,7 @@ def _untransformed_scale_constraint(t): stddev=kernel_posterior_scale_stddev), untransformed_scale_constraint=_untransformed_scale_constraint) - image = tf.keras.layers.Input(shape=input_shape, dtype='float32') + image = tf_keras.layers.Input(shape=input_shape, dtype='float32') x = image for i in range(len(kernels)): @@ -70,11 +71,11 @@ def _untransformed_scale_constraint(t): strides[i], kernel_posterior_fn) - x = tf.keras.layers.Flatten()(x) + x = tf_keras.layers.Flatten()(x) x = tfp.layers.DenseFlipout( num_classes, kernel_posterior_fn=kernel_posterior_fn)(x) - model = tf.keras.Model(inputs=image, outputs=x, name='vgg16') + model = tf_keras.Model(inputs=image, outputs=x, name='vgg16') return model @@ -85,17 +86,17 @@ def _vggconv_block(x, filters, kernel, stride, kernel_posterior_fn): kernel, padding='same', kernel_posterior_fn=kernel_posterior_fn)(x) - out = tf.keras.layers.BatchNormalization()(out) - out = tf.keras.layers.Activation('relu')(out) + out = tf_keras.layers.BatchNormalization()(out) + out = tf_keras.layers.Activation('relu')(out) out = tfp.layers.Convolution2DFlipout( filters, kernel, padding='same', kernel_posterior_fn=kernel_posterior_fn)(out) - out = tf.keras.layers.BatchNormalization()(out) - out = tf.keras.layers.Activation('relu')(out) + out = tf_keras.layers.BatchNormalization()(out) + out = tf_keras.layers.Activation('relu')(out) - out = tf.keras.layers.MaxPooling2D( + out = tf_keras.layers.MaxPooling2D( pool_size=(2, 2), strides=stride)(out) return out diff --git a/tensorflow_probability/examples/vq_vae.py b/tensorflow_probability/examples/vq_vae.py index 2bb73e6bb6..d2b4e08f35 100644 --- a/tensorflow_probability/examples/vq_vae.py +++ b/tensorflow_probability/examples/vq_vae.py @@ -43,6 +43,7 @@ import tensorflow.compat.v1 as tf from tensorflow_probability import distributions as tfd +from tensorflow_probability.python.internal import tf_keras from tensorflow.contrib.learn.python.learn.datasets import mnist from tensorflow.python.training import moving_averages @@ -174,17 +175,17 @@ def make_encoder(base_depth, activation, latent_size, code_size): `[..., latent_size, code_size]`. """ conv = functools.partial( - tf.keras.layers.Conv2D, padding="SAME", activation=activation) + tf_keras.layers.Conv2D, padding="SAME", activation=activation) - encoder_net = tf.keras.Sequential([ + encoder_net = tf_keras.Sequential([ conv(base_depth, 5, 1), conv(base_depth, 5, 2), conv(2 * base_depth, 5, 1), conv(2 * base_depth, 5, 2), conv(4 * latent_size, 7, padding="VALID"), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(latent_size * code_size, activation=None), - tf.keras.layers.Reshape([latent_size, code_size]) + tf_keras.layers.Flatten(), + tf_keras.layers.Dense(latent_size * code_size, activation=None), + tf_keras.layers.Reshape([latent_size, code_size]) ]) def encoder(images): @@ -219,11 +220,11 @@ def make_decoder(base_depth, activation, input_size, output_shape): `tfd.Distribution` instance over images. 
""" deconv = functools.partial( - tf.keras.layers.Conv2DTranspose, padding="SAME", activation=activation) + tf_keras.layers.Conv2DTranspose, padding="SAME", activation=activation) conv = functools.partial( - tf.keras.layers.Conv2D, padding="SAME", activation=activation) - decoder_net = tf.keras.Sequential([ - tf.keras.layers.Reshape((1, 1, input_size)), + tf_keras.layers.Conv2D, padding="SAME", activation=activation) + decoder_net = tf_keras.Sequential([ + tf_keras.layers.Reshape((1, 1, input_size)), deconv(2 * base_depth, 7, padding="VALID"), deconv(2 * base_depth, 5), deconv(2 * base_depth, 5, 2), @@ -231,7 +232,7 @@ def make_decoder(base_depth, activation, input_size, output_shape): deconv(base_depth, 5, 2), deconv(base_depth, 5), conv(output_shape[-1], 5, activation=None), - tf.keras.layers.Reshape(output_shape), + tf_keras.layers.Reshape(output_shape), ]) def decoder(codes): diff --git a/tensorflow_probability/python/bijectors/BUILD b/tensorflow_probability/python/bijectors/BUILD index b1a9b79912..14b1dc619f 100644 --- a/tensorflow_probability/python/bijectors/BUILD +++ b/tensorflow_probability/python/bijectors/BUILD @@ -260,6 +260,7 @@ multi_substrate_py_library( deps = [ ":bijector", # tensorflow dep, + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -477,6 +478,7 @@ py_library( ":tanh", ":transpose", # tensorflow dep, + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/util", ], ) @@ -612,6 +614,7 @@ multi_substrate_py_library( # tensorflow dep, "//tensorflow_probability/python/internal:dtype_util", "//tensorflow_probability/python/internal:tensorshape_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/math:numeric", ], ) @@ -745,6 +748,7 @@ multi_substrate_py_library( "//tensorflow_probability/python/internal:parameter_properties", "//tensorflow_probability/python/internal:tensor_util", "//tensorflow_probability/python/internal:tensorshape_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/util", ], ) @@ -770,6 +774,7 @@ multi_substrate_py_library( # numpy dep, # tensorflow dep, "//tensorflow_probability/python/internal:tensorshape_util", + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -1108,6 +1113,7 @@ multi_substrate_py_test( "//tensorflow_probability/python/internal:parameter_properties", "//tensorflow_probability/python/internal:tensor_util", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -1207,6 +1213,7 @@ py_test( "//tensorflow_probability/python/distributions:normal", "//tensorflow_probability/python/distributions:transformed_distribution", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -1271,6 +1278,7 @@ multi_substrate_py_test( "//tensorflow_probability/python/distributions:sample", "//tensorflow_probability/python/distributions:transformed_distribution", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -1699,6 +1707,7 @@ multi_substrate_py_test( "//tensorflow_probability/python/distributions:transformed_distribution", "//tensorflow_probability/python/internal:tensorshape_util", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/math:gradient", ], ) @@ -1796,6 +1805,7 @@ multi_substrate_py_test( # numpy dep, # tensorflow dep, 
"//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -1859,6 +1869,7 @@ multi_substrate_py_test( "//tensorflow_probability/python/distributions:transformed_distribution", "//tensorflow_probability/python/internal:tensorshape_util", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -1879,6 +1890,7 @@ multi_substrate_py_test( "//tensorflow_probability/python/internal:hypothesis_testlib", "//tensorflow_probability/python/internal:prefer_static", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", ], ) diff --git a/tensorflow_probability/python/bijectors/batch_normalization.py b/tensorflow_probability/python/bijectors/batch_normalization.py index 1c7619880c..74537c9c6c 100644 --- a/tensorflow_probability/python/bijectors/batch_normalization.py +++ b/tensorflow_probability/python/bijectors/batch_normalization.py @@ -16,10 +16,10 @@ # Dependency imports -import tensorflow.compat.v1 as tf1 import tensorflow.compat.v2 as tf from tensorflow_probability.python.bijectors import bijector +from tensorflow_probability.python.internal import tf_keras __all__ = [ @@ -128,7 +128,7 @@ def __init__(self, Args: batchnorm_layer: `tf.layers.BatchNormalization` layer object. If `None`, - defaults to a `tf.keras.layers.BatchNormalization` with + defaults to a `tf_keras.layers.BatchNormalization` with `gamma_constraint=tf.nn.relu(x) + 1e-6)`. This ensures positivity of the scale variable. @@ -146,7 +146,7 @@ def __init__(self, with tf.name_scope(name) as name: # Scale must be positive. g_constraint = lambda x: tf.nn.relu(x) + 1e-6 - self.batchnorm = batchnorm_layer or tf.keras.layers.BatchNormalization( + self.batchnorm = batchnorm_layer or tf_keras.layers.BatchNormalization( gamma_constraint=g_constraint) self._validate_bn_layer(self.batchnorm) self._training = training @@ -174,11 +174,11 @@ def _validate_bn_layer(self, layer): `tf.layers.BatchNormalization`, or if `batchnorm_layer.renorm=True` or if `batchnorm_layer.virtual_batch_size` is specified. """ - if (not isinstance(layer, tf.keras.layers.BatchNormalization) and - not isinstance(layer, tf1.layers.BatchNormalization)): + if (not isinstance(layer, tf_keras.layers.BatchNormalization) and + not isinstance(layer, tf_keras.tf1_layers.BatchNormalization)): raise ValueError( 'batchnorm_layer must be an instance of ' - '`tf.keras.layers.BatchNormalization` or ' + '`tf_keras.layers.BatchNormalization` or ' '`tf.compat.v1.layers.BatchNormalization`. 
Got {}'.format( type(layer))) if layer.renorm: diff --git a/tensorflow_probability/python/bijectors/batch_normalization_test.py b/tensorflow_probability/python/bijectors/batch_normalization_test.py index f5b3a50788..bb29345223 100644 --- a/tensorflow_probability/python/bijectors/batch_normalization_test.py +++ b/tensorflow_probability/python/bijectors/batch_normalization_test.py @@ -29,6 +29,7 @@ from tensorflow_probability.python.distributions import sample from tensorflow_probability.python.distributions import transformed_distribution from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras @test_util.test_all_tf_execution_regimes @@ -68,7 +69,7 @@ def testForwardInverse(self, input_shape, event_dims, training): x_, input_shape if 0 in event_dims else (None,) + input_shape[1:]) # When training, memorize the exact mean of the last # minibatch that it normalized (instead of moving average assignment). - layer = tf.keras.layers.BatchNormalization( + layer = tf_keras.layers.BatchNormalization( axis=event_dims, momentum=0., epsilon=0.) batch_norm = batch_normalization.BatchNormalization( batchnorm_layer=layer, training=training) @@ -140,13 +141,13 @@ def testForwardInverse(self, input_shape, event_dims, training): @parameterized.named_parameters( ("2d_event_ndims_v1", - (10, 4), [-1], False, tf1.layers.BatchNormalization), + (10, 4), [-1], False, tf_keras.tf1_layers.BatchNormalization), ("1d_event_ndims_v1", - 2, [-1], False, tf1.layers.BatchNormalization), + 2, [-1], False, tf_keras.tf1_layers.BatchNormalization), ("2d_event_ndims_keras", - (10, 4), [-1], False, tf.keras.layers.BatchNormalization), + (10, 4), [-1], False, tf_keras.layers.BatchNormalization), ("1d_event_ndims_keras", - 2, [-1], False, tf.keras.layers.BatchNormalization)) + 2, [-1], False, tf_keras.layers.BatchNormalization)) def testLogProb(self, event_shape, event_dims, training, layer_cls): training = tf1.placeholder_with_default(training, (), "training") layer = layer_cls(axis=event_dims, epsilon=0.) @@ -173,8 +174,8 @@ def testLogProb(self, event_shape, event_dims, training, layer_cls): self.assertAllClose(base_log_prob_, dist_log_prob_) @parameterized.named_parameters( - ("v1", tf1.layers.BatchNormalization), - ("keras", tf.keras.layers.BatchNormalization)) + ("v1", tf_keras.tf1_layers.BatchNormalization), + ("keras", tf_keras.layers.BatchNormalization)) def testMutuallyConsistent(self, layer_cls): # BatchNorm bijector is only mutually consistent when training=False. dims = 4 @@ -195,8 +196,8 @@ def testMutuallyConsistent(self, layer_cls): rtol=0.02) @parameterized.named_parameters( - ("v1", tf1.layers.BatchNormalization), - ("keras", tf.keras.layers.BatchNormalization)) + ("v1", tf_keras.tf1_layers.BatchNormalization), + ("keras", tf_keras.layers.BatchNormalization)) def testInvertMutuallyConsistent(self, layer_cls): # BatchNorm bijector is only mutually consistent when training=False. 
dims = 4 @@ -227,9 +228,9 @@ def testWithKeras(self): bijector=batch_normalization.BatchNormalization(batchnorm_layer=layer), validate_args=True) - x_ = tf.keras.Input(shape=(1,)) + x_ = tf_keras.Input(shape=(1,)) log_prob_ = dist.log_prob(x_) - model = tf.keras.Model(x_, log_prob_) + model = tf_keras.Model(x_, log_prob_) model.compile(optimizer="adam", loss=lambda _, log_prob: -log_prob) diff --git a/tensorflow_probability/python/bijectors/bijector_test.py b/tensorflow_probability/python/bijectors/bijector_test.py index 92f7e3d1fb..2d87e67daa 100644 --- a/tensorflow_probability/python/bijectors/bijector_test.py +++ b/tensorflow_probability/python/bijectors/bijector_test.py @@ -46,6 +46,7 @@ from tensorflow_probability.python.internal import tensor_util from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras JAX_MODE = False @@ -978,7 +979,7 @@ def testJacobianRespectsCache(self, keras): bijector = InverseOnlyBijector(scale=2.) y = tf.constant(10.) if keras: - y = tf.keras.layers.Input(shape=(), dtype=tf.float32, tensor=y) + y = tf_keras.layers.Input(shape=(), dtype=tf.float32, tensor=y) x = bijector.inverse(y) # Forward computation should work here because it should look up # `y` in the cache and call `inverse_log_det_jacobian`. diff --git a/tensorflow_probability/python/bijectors/glow.py b/tensorflow_probability/python/bijectors/glow.py index e59703ee03..bdcd5cde42 100644 --- a/tensorflow_probability/python/bijectors/glow.py +++ b/tensorflow_probability/python/bijectors/glow.py @@ -34,10 +34,11 @@ from tensorflow_probability.python.internal import dtype_util from tensorflow_probability.python.internal import prefer_static from tensorflow_probability.python.internal import tensorshape_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.util.deferred_tensor import TransformedVariable from tensorflow_probability.python.util.seed_stream import SeedStream -tfk = tf.keras +tfk = tf_keras tfkl = tfk.layers __all__ = [ @@ -859,15 +860,15 @@ def __init__(self, input_shape, num_hidden=400, kernel_shape=3): conv_last = functools.partial( tfkl.Conv2D, padding='same', - kernel_initializer=tf.keras.initializers.zeros(), - bias_initializer=tf.keras.initializers.zeros()) + kernel_initializer=tf_keras.initializers.zeros(), + bias_initializer=tf_keras.initializers.zeros()) super(GlowDefaultNetwork, self).__init__([ tfkl.Input(shape=input_shape), tfkl.Conv2D(num_hidden, kernel_shape, padding='same', - kernel_initializer=tf.keras.initializers.he_normal(), + kernel_initializer=tf_keras.initializers.he_normal(), activation='relu'), tfkl.Conv2D(num_hidden, 1, padding='same', - kernel_initializer=tf.keras.initializers.he_normal(), + kernel_initializer=tf_keras.initializers.he_normal(), activation='relu'), conv_last(this_nchan, kernel_shape) ]) @@ -886,8 +887,8 @@ def __init__(self, input_shape, output_chan, kernel_shape=3): conv = functools.partial( tfkl.Conv2D, padding='same', - kernel_initializer=tf.keras.initializers.zeros(), - bias_initializer=tf.keras.initializers.zeros()) + kernel_initializer=tf_keras.initializers.zeros(), + bias_initializer=tf_keras.initializers.zeros()) super(GlowDefaultExitNetwork, self).__init__([ tfkl.Input(input_shape), diff --git a/tensorflow_probability/python/bijectors/glow_test.py b/tensorflow_probability/python/bijectors/glow_test.py index 735d365ce7..37903ea362 100644 --- a/tensorflow_probability/python/bijectors/glow_test.py +++ 
b/tensorflow_probability/python/bijectors/glow_test.py @@ -29,6 +29,7 @@ from tensorflow_probability.python.distributions import independent from tensorflow_probability.python.distributions import normal from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.math.gradient import batch_jacobian @@ -331,14 +332,14 @@ def testDtypes(self): def float64_net(input_shape): input_nchan = input_shape[-1] - return tf.keras.Sequential([ - tf.keras.layers.Input(input_shape, dtype=tf.float64), - tf.keras.layers.Conv2D( + return tf_keras.Sequential([ + tf_keras.layers.Input(input_shape, dtype=tf.float64), + tf_keras.layers.Conv2D( 2 * input_nchan, 3, padding='same', dtype=tf.float64)]) def float64_exit(input_shape, output_chan): - return tf.keras.Sequential([ - tf.keras.layers.Input(input_shape, dtype=tf.float64), - tf.keras.layers.Conv2D( + return tf_keras.Sequential([ + tf_keras.layers.Input(input_shape, dtype=tf.float64), + tf_keras.layers.Conv2D( 2*output_chan, 3, padding='same', dtype=tf.float64)]) float64_bijection = glow.Glow( @@ -359,15 +360,15 @@ def testBijectorFn(self): ims = self._make_images() def shiftfn(input_shape): input_nchan = input_shape[-1] - return tf.keras.Sequential([ - tf.keras.layers.Input(input_shape), - tf.keras.layers.Conv2D( + return tf_keras.Sequential([ + tf_keras.layers.Input(input_shape), + tf_keras.layers.Conv2D( input_nchan, 3, padding='same')]) def shiftexitfn(input_shape, output_chan): - return tf.keras.Sequential([ - tf.keras.layers.Input(input_shape), - tf.keras.layers.Conv2D( + return tf_keras.Sequential([ + tf_keras.layers.Input(input_shape), + tf_keras.layers.Conv2D( output_chan, 3, padding='same')]) shiftonlyglow = glow.Glow( diff --git a/tensorflow_probability/python/bijectors/masked_autoregressive.py b/tensorflow_probability/python/bijectors/masked_autoregressive.py index e948d3c63c..7c1fb5b60d 100644 --- a/tensorflow_probability/python/bijectors/masked_autoregressive.py +++ b/tensorflow_probability/python/bijectors/masked_autoregressive.py @@ -27,6 +27,7 @@ from tensorflow_probability.python.internal import dtype_util from tensorflow_probability.python.internal import prefer_static as ps from tensorflow_probability.python.internal import tensorshape_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.math.numeric import clip_by_value_preserve_gradient from tensorflow.python.util import deprecation # pylint: disable=g-direct-tensorflow-import @@ -87,7 +88,7 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector): is possible that this architecture is suboptimal for your task. To build alternative networks, either change the arguments to `tfp.bijectors.AutoregressiveNetwork` or use some other architecture, e.g., - using `tf.keras.layers`. + using `tf_keras.layers`. Warning: no attempt is made to validate that the `shift_and_log_scale_fn` enforces the 'autoregressive property'. @@ -215,7 +216,7 @@ def inverse(y): track variables used inside `shift_and_log_scale_fn` or `bijector_fn`. To get `tfb.MaskedAutoregressiveFlow` to track such variables, either: - 1. Replace the Python function with a `tf.Module`, `tf.keras.Layer`, + 1. Replace the Python function with a `tf.Module`, `tf_keras.Layer`, or other callable object through which `tf.Module` can find variables. 2. 
Or, add a reference to the variables to the `tfb.MaskedAutoregressiveFlow` @@ -482,7 +483,7 @@ def masked_initializer(shape, dtype=None, partition_info=None): return mask * kernel_initializer(shape, dtype, partition_info) with tf.name_scope(name or 'masked_dense'): - layer = tf1.layers.Dense( + layer = tf_keras.tf1_layers.Dense( units, kernel_initializer=masked_initializer, kernel_constraint=lambda x: mask * x, @@ -621,7 +622,7 @@ def _fn(x): return tf1.make_template(name, _fn) -class AutoregressiveNetwork(tf.keras.layers.Layer): +class AutoregressiveNetwork(tf_keras.layers.Layer): r"""Masked Autoencoder for Distribution Estimation [Germain et al. (2015)][1]. A `AutoregressiveNetwork` takes as input a Tensor of shape `[..., event_size]` @@ -664,7 +665,7 @@ class AutoregressiveNetwork(tf.keras.layers.Layer): log_prob_ = distribution.log_prob(x_) model = tfk.Model(x_, log_prob_) - model.compile(optimizer=tf.keras.optimizers.Adam(), + model.compile(optimizer=tf_keras.optimizers.Adam(), loss=lambda _, log_prob: -log_prob) batch_size = 25 @@ -718,7 +719,7 @@ class AutoregressiveNetwork(tf.keras.layers.Layer): x_, bijector_kwargs={'conditional_input': c_}) model = tfk.Model([x_, c_], log_prob_) - model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), + model.compile(optimizer=tf_keras.optimizers.Adam(learning_rate=0.1), loss=lambda _, log_prob: -log_prob) batch_size = 25 @@ -780,7 +781,7 @@ class AutoregressiveNetwork(tf.keras.layers.Layer): log_prob_ = distribution.log_prob(x_) model = tfk.Model(x_, log_prob_) - model.compile(optimizer=tf.keras.optimizers.Adam(), + model.compile(optimizer=tf_keras.optimizers.Adam(), loss=lambda _, log_prob: -log_prob) batch_size = 10 @@ -838,7 +839,7 @@ class AutoregressiveNetwork(tf.keras.layers.Layer): log_prob_ = distribution.log_prob(x_) model = tfk.Model(x_, log_prob_) - model.compile(optimizer=tf.keras.optimizers.Adam(), + model.compile(optimizer=tf_keras.optimizers.Adam(), loss=lambda _, log_prob: -log_prob) batch_size = 10 @@ -923,10 +924,10 @@ def __init__(self, hidden_degrees: Method for assigning degrees to the hidden units: 'equal', 'random'. If 'equal', hidden units in each layer are allocated equally (up to a remainder term) to each degree. Default: 'equal'. - activation: An activation function. See `tf.keras.layers.Dense`. Default: + activation: An activation function. See `tf_keras.layers.Dense`. Default: `None`. use_bias: Whether or not the dense layers constructed in this layer - should have a bias term. See `tf.keras.layers.Dense`. Default: `True`. + should have a bias term. See `tf_keras.layers.Dense`. Default: `True`. kernel_initializer: Initializer for the `Dense` kernel weight matrices. Default: 'glorot_uniform'. bias_initializer: Initializer for the `Dense` bias vectors. Default: @@ -944,7 +945,7 @@ def __init__(self, performance. When `False` invalid inputs may silently render incorrect outputs. **kwargs: Additional keyword arguments passed to this layer (but not to - the `tf.keras.layer.Dense` layers constructed by this layer). + the `tf_keras.layer.Dense` layers constructed by this layer). 
""" super().__init__(**kwargs) @@ -964,7 +965,7 @@ def __init__(self, self._bias_initializer = bias_initializer self._kernel_regularizer = kernel_regularizer self._bias_regularizer = bias_regularizer - self._kernel_constraint = tf.keras.constraints.get(kernel_constraint) + self._kernel_constraint = tf_keras.constraints.get(kernel_constraint) self._bias_constraint = bias_constraint self._validate_args = validate_args self._kwargs = kwargs @@ -1030,10 +1031,10 @@ def build(self, input_shape): hidden_degrees=self._hidden_degrees, ) - outputs = [tf.keras.Input((self._event_size,), dtype=self.dtype)] + outputs = [tf_keras.Input((self._event_size,), dtype=self.dtype)] inputs = outputs[0] if self._conditional: - conditional_input = tf.keras.Input((self._conditional_size,), + conditional_input = tf_keras.Input((self._conditional_size,), dtype=self.dtype) inputs = [inputs, conditional_input] @@ -1043,7 +1044,7 @@ def build(self, input_shape): # [..., self._hidden_units[-1]] -> [..., event_size * self._params]. layer_output_sizes = self._hidden_units + [self._event_size * self._params] for k in range(len(self._masks)): - autoregressive_output = tf.keras.layers.Dense( + autoregressive_output = tf_keras.layers.Dense( layer_output_sizes[k], activation=None, use_bias=self._use_bias, @@ -1059,7 +1060,7 @@ def build(self, input_shape): if (self._conditional and ((self._conditional_layers == 'all_layers') or ((self._conditional_layers == 'first_layer') and (k == 0)))): - conditional_output = tf.keras.layers.Dense( + conditional_output = tf_keras.layers.Dense( layer_output_sizes[k], activation=None, use_bias=False, @@ -1070,16 +1071,16 @@ def build(self, input_shape): kernel_constraint=self._kernel_constraint, bias_constraint=None, dtype=self.dtype)(conditional_input) - outputs.append(tf.keras.layers.Add()([ + outputs.append(tf_keras.layers.Add()([ autoregressive_output, conditional_output])) else: outputs.append(autoregressive_output) if k + 1 < len(self._masks): outputs.append( - tf.keras.layers.Activation(self._activation) + tf_keras.layers.Activation(self._activation) (outputs[-1])) - self._network = tf.keras.models.Model( + self._network = tf_keras.models.Model( inputs=inputs, outputs=outputs[-1]) # Allow network to be called with inputs of shapes that don't match @@ -1352,10 +1353,10 @@ def _create_masks(degrees): def _make_masked_initializer(mask, initializer): """Returns a masked version of the given initializer.""" - initializer = tf.keras.initializers.get(initializer) + initializer = tf_keras.initializers.get(initializer) def masked_initializer(shape, dtype=None, partition_info=None): # If no `partition_info` is given, then don't pass it to `initializer`, as - # `initializer` may be a `tf.keras.initializers.Initializer` (which don't + # `initializer` may be a `tf_keras.initializers.Initializer` (which don't # accept a `partition_info` argument). 
if partition_info is None: x = initializer(shape, dtype) @@ -1366,7 +1367,7 @@ def masked_initializer(shape, dtype=None, partition_info=None): def _make_masked_constraint(mask, constraint=None): - constraint = tf.keras.constraints.get(constraint) + constraint = tf_keras.constraints.get(constraint) def masked_constraint(x): x = tf.convert_to_tensor(x, dtype_hint=tf.float32, name='x') if constraint is not None: diff --git a/tensorflow_probability/python/bijectors/masked_autoregressive_test.py b/tensorflow_probability/python/bijectors/masked_autoregressive_test.py index 4c4dad6152..11e126fff6 100644 --- a/tensorflow_probability/python/bijectors/masked_autoregressive_test.py +++ b/tensorflow_probability/python/bijectors/masked_autoregressive_test.py @@ -39,10 +39,11 @@ from tensorflow_probability.python.internal import prefer_static as ps from tensorflow_probability.python.internal import tensorshape_util from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.math import gradient -tfk = tf.keras -tfkl = tf.keras.layers +tfk = tf_keras +tfkl = tf_keras.layers def _funnel_bijector_fn(x): @@ -711,7 +712,7 @@ def test_layer_no_hidden_units(self): self.assertIsAutoregressive(made, event_size=3, order="left-to-right") def test_layer_v2_kernel_initializer(self): - init = tf.keras.initializers.GlorotNormal() + init = tf_keras.initializers.GlorotNormal() made = masked_autoregressive.AutoregressiveNetwork( params=2, event_shape=4, @@ -798,9 +799,9 @@ def test_doc_string_2(self): model = tfk.Model([x_, c_], log_prob_) if tf.__internal__.tf2.enabled() and tf.executing_eagerly(): - optimizer = tf.keras.optimizers.Adam(learning_rate=0.1) + optimizer = tf_keras.optimizers.Adam(learning_rate=0.1) else: - optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.1) + optimizer = tf_keras.optimizers.legacy.Adam(learning_rate=0.1) model.compile( optimizer=optimizer, loss=lambda _, log_prob: -log_prob) diff --git a/tensorflow_probability/python/bijectors/permute_test.py b/tensorflow_probability/python/bijectors/permute_test.py index eef1994567..cce4e5b439 100644 --- a/tensorflow_probability/python/bijectors/permute_test.py +++ b/tensorflow_probability/python/bijectors/permute_test.py @@ -22,6 +22,7 @@ from tensorflow_probability.python.bijectors import bijector_test_util from tensorflow_probability.python.bijectors import permute from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras @test_util.test_all_tf_execution_regimes @@ -88,7 +89,7 @@ def testPreservesShape(self): # TODO(b/131157549, b/131124359): Test should not be needed. Consider # deleting when underlying issue with constant eager tensors is fixed. 
permutation = [2, 1, 0] - x = tf.keras.Input((3,), batch_size=None) + x = tf_keras.Input((3,), batch_size=None) bijector = permute.Permute( permutation=permutation, axis=-1, validate_args=True) diff --git a/tensorflow_probability/python/bijectors/rational_quadratic_spline.py b/tensorflow_probability/python/bijectors/rational_quadratic_spline.py index 2b3f12e785..8c2e2e13ca 100644 --- a/tensorflow_probability/python/bijectors/rational_quadratic_spline.py +++ b/tensorflow_probability/python/bijectors/rational_quadratic_spline.py @@ -100,11 +100,11 @@ def _slopes(x): x = tf.reshape(x, out_shape) return tf.math.softplus(x) + self._min_slope - self._bin_widths = tf.keras.layers.Dense( + self._bin_widths = tf_keras.layers.Dense( nunits * self._nbins, activation=_bin_positions, name='w') - self._bin_heights = tf.keras.layers.Dense( + self._bin_heights = tf_keras.layers.Dense( nunits * self._nbins, activation=_bin_positions, name='h') - self._knot_slopes = tf.keras.layers.Dense( + self._knot_slopes = tf_keras.layers.Dense( nunits * (self._nbins - 1), activation=_slopes, name='s') self._built = True diff --git a/tensorflow_probability/python/bijectors/rational_quadratic_spline_test.py b/tensorflow_probability/python/bijectors/rational_quadratic_spline_test.py index 9e6210bdb7..dbad3a4ed6 100644 --- a/tensorflow_probability/python/bijectors/rational_quadratic_spline_test.py +++ b/tensorflow_probability/python/bijectors/rational_quadratic_spline_test.py @@ -31,6 +31,8 @@ from tensorflow_probability.python.internal import prefer_static as ps from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras + JAX_MODE = False @@ -96,11 +98,11 @@ def _slopes(x): x = tf.reshape(x, out_shape) return tf.math.softplus(x) + 1e-2 - self._bin_widths = tf.keras.layers.Dense( + self._bin_widths = tf_keras.layers.Dense( nunits * self._nbins, activation=_bin_positions, name='w') - self._bin_heights = tf.keras.layers.Dense( + self._bin_heights = tf_keras.layers.Dense( nunits * self._nbins, activation=_bin_positions, name='h') - self._knot_slopes = tf.keras.layers.Dense( + self._knot_slopes = tf_keras.layers.Dense( nunits * (self._nbins - 1), activation=_slopes, name='s') self._built = True diff --git a/tensorflow_probability/python/bijectors/real_nvp.py b/tensorflow_probability/python/bijectors/real_nvp.py index d9b7f5deb1..c51e857a9f 100644 --- a/tensorflow_probability/python/bijectors/real_nvp.py +++ b/tensorflow_probability/python/bijectors/real_nvp.py @@ -23,6 +23,7 @@ from tensorflow_probability.python.bijectors import scale as scale_lib from tensorflow_probability.python.bijectors import shift as shift_lib from tensorflow_probability.python.internal import tensorshape_util +from tensorflow_probability.python.internal import tf_keras __all__ = [ @@ -389,13 +390,13 @@ def _fn(x, output_units, **condition_kwargs): else: reshape_output = lambda x: x for units in hidden_layers: - x = tf1.layers.dense( + x = tf_keras.tf1_layers.dense( inputs=x, units=units, activation=activation, *args, # pylint: disable=keyword-arg-before-vararg **kwargs) - x = tf1.layers.dense( + x = tf_keras.tf1_layers.dense( inputs=x, units=(1 if shift_only else 2) * output_units, activation=None, diff --git a/tensorflow_probability/python/bijectors/real_nvp_test.py b/tensorflow_probability/python/bijectors/real_nvp_test.py index 1af9299353..43dce97222 100644 --- a/tensorflow_probability/python/bijectors/real_nvp_test.py +++ b/tensorflow_probability/python/bijectors/real_nvp_test.py @@ 
-30,6 +30,7 @@ from tensorflow_probability.python.distributions import transformed_distribution from tensorflow_probability.python.internal import tensorshape_util from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras @test_util.test_all_tf_execution_regimes @@ -226,7 +227,7 @@ def _bijector_fn(x, output_units): else: reshape_output = lambda x: x - out = tf1.layers.dense(inputs=x, units=2 * output_units) + out = tf_keras.tf1_layers.dense(inputs=x, units=2 * output_units) shift, logit_gate = tf.split(out, 2, axis=-1) shift = reshape_output(shift) logit_gate = reshape_output(logit_gate) diff --git a/tensorflow_probability/python/build_defs.bzl b/tensorflow_probability/python/build_defs.bzl index 51bff587ca..47de202039 100644 --- a/tensorflow_probability/python/build_defs.bzl +++ b/tensorflow_probability/python/build_defs.bzl @@ -212,6 +212,7 @@ def multi_substrate_py_library( remove_deps = [ "//third_party/py/tensorflow", "//third_party/py/tensorflow:tensorflow", + "//tensorflow_probability/python/internal:tf_keras", ] trimmed_deps = [dep for dep in deps if (dep not in substrates_omit_deps and @@ -337,6 +338,7 @@ def multi_substrate_py_test( remove_deps = [ "//third_party/py/tensorflow", "//third_party/py/tensorflow:tensorflow", + "//tensorflow_probability/python/internal:tf_keras", ] trimmed_deps = [dep for dep in deps if dep not in remove_deps] diff --git a/tensorflow_probability/python/distributions/BUILD b/tensorflow_probability/python/distributions/BUILD index c42e793ebb..228f0a6532 100644 --- a/tensorflow_probability/python/distributions/BUILD +++ b/tensorflow_probability/python/distributions/BUILD @@ -762,6 +762,7 @@ multi_substrate_py_library( "//tensorflow_probability/python/internal:reparameterization", "//tensorflow_probability/python/internal:tensor_util", "//tensorflow_probability/python/internal:tensorshape_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/math:linalg", "//tensorflow_probability/python/math/psd_kernels/internal:util", ], @@ -780,6 +781,7 @@ multi_substrate_py_library( "//tensorflow_probability/python/internal:dtype_util", "//tensorflow_probability/python/internal:nest_util", "//tensorflow_probability/python/internal:tensorshape_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/math/psd_kernels:schur_complement", "//tensorflow_probability/python/util", ], @@ -1156,6 +1158,7 @@ multi_substrate_py_library( "//tensorflow_probability/python/distributions:joint_distribution_coroutine", "//tensorflow_probability/python/internal:auto_composite_tensor", "//tensorflow_probability/python/internal:distribution_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/util:seed_stream", ], ) @@ -1857,6 +1860,7 @@ multi_substrate_py_library( "//tensorflow_probability/python/internal:prefer_static", "//tensorflow_probability/python/internal:reparameterization", "//tensorflow_probability/python/internal:tensorshape_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/layers:weight_norm", ], ) @@ -2195,6 +2199,7 @@ multi_substrate_py_library( "//tensorflow_probability/python/internal:reparameterization", "//tensorflow_probability/python/internal:tensor_util", "//tensorflow_probability/python/internal:tensorshape_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/math:linalg", 
"//tensorflow_probability/python/math:special", ], @@ -2406,6 +2411,7 @@ multi_substrate_py_library( "//tensorflow_probability/python/internal:dtype_util", "//tensorflow_probability/python/internal:parameter_properties", "//tensorflow_probability/python/internal:tensor_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/math:linalg", "//tensorflow_probability/python/math/psd_kernels:positive_semidefinite_kernel", "//tensorflow_probability/python/math/psd_kernels/internal:util", @@ -3623,6 +3629,7 @@ multi_substrate_py_test( "//tensorflow_probability/python/internal:reparameterization", "//tensorflow_probability/python/internal:tensor_util", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/layers:distribution_layer", ], ) @@ -4215,6 +4222,7 @@ multi_substrate_py_test( # numpy dep, # tensorflow dep, "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/math:gradient", ], ) diff --git a/tensorflow_probability/python/distributions/gaussian_process.py b/tensorflow_probability/python/distributions/gaussian_process.py index 6c30c6a69e..6131b688e4 100644 --- a/tensorflow_probability/python/distributions/gaussian_process.py +++ b/tensorflow_probability/python/distributions/gaussian_process.py @@ -221,7 +221,7 @@ class GaussianProcess( gp = tfd.GaussianProcess(kernel, observed_index_points) - optimizer = tf.keras.optimizers.Adam() + optimizer = tf_keras.optimizers.Adam() @tf.function def optimize(): diff --git a/tensorflow_probability/python/distributions/gaussian_process_regression_model.py b/tensorflow_probability/python/distributions/gaussian_process_regression_model.py index 84f9b4966b..1df5cb596e 100644 --- a/tensorflow_probability/python/distributions/gaussian_process_regression_model.py +++ b/tensorflow_probability/python/distributions/gaussian_process_regression_model.py @@ -190,7 +190,7 @@ class GaussianProcessRegressionModel( index_points=observation_index_points, observation_noise_variance=observation_noise_variance) - optimizer = tf.keras.optimizers.Adam(learning_rate=.05, beta_1=.5, beta_2=.99) + optimizer = tf_keras.optimizers.Adam(learning_rate=.05, beta_1=.5, beta_2=.99) @tf.function def optimize(): diff --git a/tensorflow_probability/python/distributions/joint_distribution_sequential.py b/tensorflow_probability/python/distributions/joint_distribution_sequential.py index 341cbb55c3..4653ba0941 100644 --- a/tensorflow_probability/python/distributions/joint_distribution_sequential.py +++ b/tensorflow_probability/python/distributions/joint_distribution_sequential.py @@ -54,10 +54,10 @@ class _JointDistributionSequential(joint_distribution_lib.JointDistribution): a single model specification. A joint distribution is a collection of possibly interdependent distributions. - Like `tf.keras.Sequential`, the `JointDistributionSequential` can be specified + Like `tf_keras.Sequential`, the `JointDistributionSequential` can be specified via a `list` of functions (each responsible for making a `tfp.distributions.Distribution`-like instance). Unlike - `tf.keras.Sequential`, each function can depend on the output of all previous + `tf_keras.Sequential`, each function can depend on the output of all previous elements rather than only the immediately previous. 
#### Mathematical Details diff --git a/tensorflow_probability/python/distributions/lambertw_f_test.py b/tensorflow_probability/python/distributions/lambertw_f_test.py index a5e3f6b4e3..95d8da2918 100644 --- a/tensorflow_probability/python/distributions/lambertw_f_test.py +++ b/tensorflow_probability/python/distributions/lambertw_f_test.py @@ -27,6 +27,7 @@ from tensorflow_probability.python.distributions import transformed_distribution from tensorflow_probability.python.distributions import uniform from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras @test_util.test_all_tf_execution_regimes @@ -190,16 +191,16 @@ def dist_lambda(t): from tensorflow_probability.python.layers import distribution_layer # pylint:disable=g-import-not-at-top dist_layer = distribution_layer.DistributionLambda(dist_lambda) - model = tf.keras.Sequential([ - tf.keras.layers.Dense(10, "relu"), - tf.keras.layers.Dense(5, "selu"), - tf.keras.layers.Dense(1 + 1 + 1), + model = tf_keras.Sequential([ + tf_keras.layers.Dense(10, "relu"), + tf_keras.layers.Dense(5, "selu"), + tf_keras.layers.Dense(1 + 1 + 1), dist_layer]) negloglik = lambda y, p_y: -p_y.log_prob(y) if tf.__internal__.tf2.enabled() and tf.executing_eagerly(): - optimizer = tf.keras.optimizers.Adam(learning_rate=0.01) + optimizer = tf_keras.optimizers.Adam(learning_rate=0.01) else: - optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.01) + optimizer = tf_keras.optimizers.legacy.Adam(learning_rate=0.01) model.compile(optimizer=optimizer, loss=negloglik) diff --git a/tensorflow_probability/python/distributions/pixel_cnn.py b/tensorflow_probability/python/distributions/pixel_cnn.py index 08582f88ce..d1b325a0f8 100644 --- a/tensorflow_probability/python/distributions/pixel_cnn.py +++ b/tensorflow_probability/python/distributions/pixel_cnn.py @@ -30,6 +30,7 @@ from tensorflow_probability.python.internal import prefer_static from tensorflow_probability.python.internal import reparameterization from tensorflow_probability.python.internal import tensorshape_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.layers import weight_norm @@ -103,8 +104,8 @@ class PixelCNN(distribution.Distribution): import tensorflow_probability as tfp tfd = tfp.distributions - tfk = tf.keras - tfkl = tf.keras.layers + tfk = tf_keras + tfkl = tf_keras.layers # Load MNIST from tensorflow_datasets data = tfds.load('mnist') @@ -381,7 +382,7 @@ class labels), or `None`. May have leading batch dimension(s), which must broadcast to the leading batch dimensions of `value`. training: `bool` or `None`. If `bool`, it controls the dropout layer, where `True` implies dropout is active. If `None`, it defaults to - `tf.keras.backend.learning_phase()`. + `tf_keras.backend.learning_phase()`. Returns: log_prob_values: `Tensor`. """ @@ -618,7 +619,7 @@ def _event_shape(self): return tf.TensorShape(self.image_shape) -class _PixelCNNNetwork(tf.keras.layers.Layer): +class _PixelCNNNetwork(tf_keras.layers.Layer): """Keras `Layer` to parameterize a Pixel CNN++ distribution. 
This is a Keras implementation of the Pixel CNN++ network, as described in @@ -701,33 +702,33 @@ def build(self, input_shape): dtype = self.dtype if len(input_shape) == 2: batch_image_shape, batch_conditional_shape = input_shape - conditional_input = tf.keras.layers.Input( + conditional_input = tf_keras.layers.Input( shape=batch_conditional_shape[1:], dtype=dtype) else: batch_image_shape = input_shape conditional_input = None image_shape = batch_image_shape[1:] - image_input = tf.keras.layers.Input(shape=image_shape, dtype=dtype) + image_input = tf_keras.layers.Input(shape=image_shape, dtype=dtype) if self._resnet_activation == 'concat_elu': - activation = tf.keras.layers.Lambda( + activation = tf_keras.layers.Lambda( lambda x: tf.nn.elu(tf.concat([x, -x], axis=-1)), dtype=dtype) else: - activation = tf.keras.activations.get(self._resnet_activation) + activation = tf_keras.activations.get(self._resnet_activation) # Define layers with default inputs and layer wrapper applied Conv2D = functools.partial( # pylint:disable=invalid-name - self._layer_wrapper(tf.keras.layers.Convolution2D), + self._layer_wrapper(tf_keras.layers.Convolution2D), filters=self._num_filters, padding='same', dtype=dtype) Dense = functools.partial( # pylint:disable=invalid-name - self._layer_wrapper(tf.keras.layers.Dense), dtype=dtype) + self._layer_wrapper(tf_keras.layers.Dense), dtype=dtype) Conv2DTranspose = functools.partial( # pylint:disable=invalid-name - self._layer_wrapper(tf.keras.layers.Conv2DTranspose), + self._layer_wrapper(tf_keras.layers.Conv2DTranspose), filters=self._num_filters, padding='same', strides=(2, 2), @@ -773,7 +774,7 @@ def build(self, input_shape): kernel_constraint=_make_kernel_constraint( (3, cols), (0, 2), (0, cols // 2)))(image_input) - horizontal_stack_init = tf.keras.layers.add( + horizontal_stack_init = tf_keras.layers.add( [horizontal_stack_up, horizontal_stack_left], dtype=dtype) layer_stacks = { @@ -803,10 +804,10 @@ def build(self, input_shape): if stack == 'horizontal': h = activation(layer_stacks['vertical'][-1]) h = Dense(self._num_filters)(h) - x = tf.keras.layers.add([h, x], dtype=dtype) + x = tf_keras.layers.add([h, x], dtype=dtype) x = activation(x) - x = tf.keras.layers.Dropout(self._dropout_p, dtype=dtype)(x) + x = tf_keras.layers.Dropout(self._dropout_p, dtype=dtype)(x) x = Conv2D(filters=2*self._num_filters, kernel_size=kernel_sizes[stack], kernel_constraint=kernel_constraints[stack])(x) @@ -814,12 +815,12 @@ def build(self, input_shape): if conditional_input is not None: h_projection = _build_and_apply_h_projection( conditional_input, self._num_filters, dtype=dtype) - x = tf.keras.layers.add([x, h_projection], dtype=dtype) + x = tf_keras.layers.add([x, h_projection], dtype=dtype) x = _apply_sigmoid_gating(x) # Add a residual connection from the layer's input. - out = tf.keras.layers.add([input_x, x], dtype=dtype) + out = tf_keras.layers.add([input_x, x], dtype=dtype) layer_stacks[stack].append(out) if i < self._num_hierarchies - 1: @@ -872,17 +873,17 @@ def build(self, input_shape): # Include the vertical-stack layer of the upward pass in the layers # to be added to the horizontal layer. if stack == 'horizontal': - x_symmetric = tf.keras.layers.Concatenate(axis=-1, dtype=dtype)( + x_symmetric = tf_keras.layers.Concatenate(axis=-1, dtype=dtype)( [upward_pass['vertical'], x_symmetric]) # Add a skip-connection from the symmetric layer in the downward # pass to the layer `x` in the upward pass. 
h = activation(x_symmetric) h = Dense(self._num_filters)(h) - x = tf.keras.layers.add([h, x], dtype=dtype) + x = tf_keras.layers.add([h, x], dtype=dtype) x = activation(x) - x = tf.keras.layers.Dropout(self._dropout_p, dtype=dtype)(x) + x = tf_keras.layers.Dropout(self._dropout_p, dtype=dtype)(x) x = Conv2D(filters=2*self._num_filters, kernel_size=kernel_sizes[stack], kernel_constraint=kernel_constraints[stack])(x) @@ -890,10 +891,10 @@ def build(self, input_shape): if conditional_input is not None: h_projection = _build_and_apply_h_projection( conditional_input, self._num_filters, dtype=dtype) - x = tf.keras.layers.add([x, h_projection], dtype=dtype) + x = tf_keras.layers.add([x, h_projection], dtype=dtype) x = _apply_sigmoid_gating(x) - upward_pass[stack] = tf.keras.layers.add([input_x, x], dtype=dtype) + upward_pass[stack] = tf_keras.layers.add([input_x, x], dtype=dtype) # Define deconvolutional layers that expand height/width dimensions on the # upward pass (e.g. expanding from 8x8 to 16x16 in Figure 2 of [1]), with @@ -918,7 +919,7 @@ def build(self, input_shape): kernel_constraint=kernel_constraint)(x) upward_pass[stack] = x - x_out = tf.keras.layers.ELU(dtype=dtype)(upward_pass['horizontal']) + x_out = tf_keras.layers.ELU(dtype=dtype)(upward_pass['horizontal']) # Build final Dense/Reshape layers to output the correct number of # parameters per pixel. @@ -948,7 +949,7 @@ def build(self, input_shape): inputs = (image_input if conditional_input is None else [image_input, conditional_input]) - self._network = tf.keras.Model(inputs=inputs, outputs=outputs) + self._network = tf_keras.Model(inputs=inputs, outputs=outputs) super(_PixelCNNNetwork, self).build(input_shape) def call(self, inputs, training=None): @@ -962,7 +963,7 @@ def call(self, inputs, training=None): same leading batch dimension as the image `Tensor`. training: `bool` or `None`. If `bool`, it controls the dropout layer, where `True` implies dropout is active. 
If `None`, it it defaults to - `tf.keras.backend.learning_phase()` + `tf_keras.backend.learning_phase()` Returns: outputs: a 3- or 4-element `list` of `Tensor`s in the following order: @@ -996,8 +997,8 @@ def _make_kernel_constraint(kernel_size, valid_rows, valid_columns): def _build_and_apply_h_projection(h, num_filters, dtype): """Project the conditional input.""" - h = tf.keras.layers.Flatten(dtype=dtype)(h) - h_projection = tf.keras.layers.Dense( + h = tf_keras.layers.Flatten(dtype=dtype)(h) + h_projection = tf_keras.layers.Dense( 2*num_filters, kernel_initializer='random_normal', dtype=dtype)(h) return h_projection[..., tf.newaxis, tf.newaxis, :] @@ -1006,6 +1007,6 @@ def _apply_sigmoid_gating(x): """Apply the sigmoid gating in Figure 2 of [2].""" activation_tensor, gate_tensor = tf.split(x, 2, axis=-1) sigmoid_gate = tf.sigmoid(gate_tensor) - return tf.keras.layers.multiply( + return tf_keras.layers.multiply( [sigmoid_gate, activation_tensor], dtype=x.dtype) diff --git a/tensorflow_probability/python/distributions/pixel_cnn_test.py b/tensorflow_probability/python/distributions/pixel_cnn_test.py index 630f862ac3..a035a61c18 100644 --- a/tensorflow_probability/python/distributions/pixel_cnn_test.py +++ b/tensorflow_probability/python/distributions/pixel_cnn_test.py @@ -21,6 +21,7 @@ from tensorflow_probability.python.distributions import pixel_cnn from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.math import gradient @@ -64,7 +65,7 @@ def _make_fake_inputs(self): return self._make_fake_images() def _make_input_layers(self): - return tf.keras.layers.Input(self.image_shape) + return tf_keras.layers.Input(self.image_shape) def _get_single_pixel_logit_gradients(self, dist, logit_ind, pixel_ind): @@ -170,12 +171,12 @@ def testAutoregression(self): log_prob = dist.log_prob(inputs) # Build/fit a model to activate autoregressive kernel constraints - model = tf.keras.Model(inputs=inputs, outputs=log_prob) + model = tf_keras.Model(inputs=inputs, outputs=log_prob) model.add_loss(-tf.reduce_mean(log_prob)) model.compile() if not tf.executing_eagerly() and isinstance( - model.optimizer, tf.keras.optimizers.experimental.Optimizer): + model.optimizer, tf_keras.optimizers.experimental.Optimizer): return train_data = self._make_fake_inputs() model.fit(x=train_data) @@ -276,8 +277,8 @@ def _make_fake_inputs(self): return [self._make_fake_images(), self._make_fake_conditional()] def _make_input_layers(self): - return [tf.keras.layers.Input(shape=self.image_shape), - tf.keras.layers.Input(shape=self.h_shape)] + return [tf_keras.layers.Input(shape=self.image_shape), + tf_keras.layers.Input(shape=self.h_shape)] def testScalarConditional(self): dist = pixel_cnn.PixelCNN( diff --git a/tensorflow_probability/python/distributions/student_t_process.py b/tensorflow_probability/python/distributions/student_t_process.py index 6395474f99..f19c0346af 100644 --- a/tensorflow_probability/python/distributions/student_t_process.py +++ b/tensorflow_probability/python/distributions/student_t_process.py @@ -226,7 +226,7 @@ class StudentTProcess(distribution.AutoCompositeTensorDistribution): tp = tfd.StudentTProcess(3., kernel, observed_index_points) - optimizer = tf.keras.optimizers.Adam() + optimizer = tf_keras.optimizers.Adam() @tf.function def optimize(): diff --git a/tensorflow_probability/python/distributions/variational_gaussian_process.py 
b/tensorflow_probability/python/distributions/variational_gaussian_process.py index ae06d41631..88aa7aa7ea 100644 --- a/tensorflow_probability/python/distributions/variational_gaussian_process.py +++ b/tensorflow_probability/python/distributions/variational_gaussian_process.py @@ -558,7 +558,7 @@ class VariationalGaussianProcess(gaussian_process.GaussianProcess, # For training, we use some simplistic numpy-based minibatching. batch_size = 64 - optimizer = tf.keras.optimizers.Adam(learning_rate=.1) + optimizer = tf_keras.optimizers.Adam(learning_rate=.1) @tf.function def optimize(x_train_batch, y_train_batch): @@ -670,7 +670,7 @@ def optimize(x_train_batch, y_train_batch): # For training, we use some simplistic numpy-based minibatching. batch_size = 64 - optimizer = tf.keras.optimizers.Adam(learning_rate=.05, beta_1=.5, beta_2=.99) + optimizer = tf_keras.optimizers.Adam(learning_rate=.05, beta_1=.5, beta_2=.99) @tf.function def optimize(x_train_batch, y_train_batch): diff --git a/tensorflow_probability/python/experimental/bijectors/BUILD b/tensorflow_probability/python/experimental/bijectors/BUILD index 53b07f6a16..b41602157b 100644 --- a/tensorflow_probability/python/experimental/bijectors/BUILD +++ b/tensorflow_probability/python/experimental/bijectors/BUILD @@ -96,6 +96,7 @@ multi_substrate_py_library( "//tensorflow_probability/python/distributions:sample", "//tensorflow_probability/python/distributions:transformed_distribution", "//tensorflow_probability/python/distributions:uniform", + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -124,6 +125,7 @@ multi_substrate_py_test( "//tensorflow_probability/python/internal:tensor_util", "//tensorflow_probability/python/internal:tensorshape_util", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/math:gradient", "//tensorflow_probability/python/mcmc:dual_averaging_step_size_adaptation", "//tensorflow_probability/python/mcmc:nuts", diff --git a/tensorflow_probability/python/experimental/bijectors/distribution_bijectors.py b/tensorflow_probability/python/experimental/bijectors/distribution_bijectors.py index baf0ced5f5..d794ba1655 100644 --- a/tensorflow_probability/python/experimental/bijectors/distribution_bijectors.py +++ b/tensorflow_probability/python/experimental/bijectors/distribution_bijectors.py @@ -107,7 +107,7 @@ def make_distribution_bijector(distribution, name='make_distribution_bijector'): pinned_model) _ = tfp.vi.fit_surrogate_posterior(pinned_model.unnormalized_log_prob, surrogate_posterior=surrogate_posterior, - optimizer=tf.keras.optimizers.Adam(0.01), + optimizer=tf_keras.optimizers.Adam(0.01), num_steps=200) ``` diff --git a/tensorflow_probability/python/experimental/bijectors/distribution_bijectors_test.py b/tensorflow_probability/python/experimental/bijectors/distribution_bijectors_test.py index 83a86b670e..344a9467b7 100644 --- a/tensorflow_probability/python/experimental/bijectors/distribution_bijectors_test.py +++ b/tensorflow_probability/python/experimental/bijectors/distribution_bijectors_test.py @@ -35,6 +35,7 @@ from tensorflow_probability.python.internal import hypothesis_testlib as tfp_hps from tensorflow_probability.python.internal import prefer_static as ps from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.math import gradient from tensorflow_probability.python.mcmc import 
dual_averaging_step_size_adaptation as dassa from tensorflow_probability.python.mcmc import nuts @@ -205,7 +206,7 @@ def model_with_funnel(): optimization.fit_surrogate_posterior( pinned_model.unnormalized_log_prob, surrogate_posterior=surrogate_posterior, - optimizer=tf.keras.optimizers.Adam(0.01), + optimizer=tf_keras.optimizers.Adam(0.01), sample_size=10, num_steps=1) bijector = ( diff --git a/tensorflow_probability/python/experimental/distributions/BUILD b/tensorflow_probability/python/experimental/distributions/BUILD index 6019005130..c6d3b45c87 100644 --- a/tensorflow_probability/python/experimental/distributions/BUILD +++ b/tensorflow_probability/python/experimental/distributions/BUILD @@ -58,6 +58,7 @@ multi_substrate_py_library( "//tensorflow_probability/python/internal:parameter_properties", "//tensorflow_probability/python/internal:prefer_static", "//tensorflow_probability/python/internal:samplers", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/mcmc/internal:util", ], ) @@ -120,6 +121,7 @@ multi_substrate_py_library( deps = [ # numpy dep, # tensorflow dep, + "//tensorflow_probability/python/internal:tf_keras", ], ) diff --git a/tensorflow_probability/python/experimental/distributions/importance_resample.py b/tensorflow_probability/python/experimental/distributions/importance_resample.py index 8acc7bcecd..5b4ce87917 100644 --- a/tensorflow_probability/python/experimental/distributions/importance_resample.py +++ b/tensorflow_probability/python/experimental/distributions/importance_resample.py @@ -142,7 +142,7 @@ def target_log_prob_fn(x): importance_weighted_losses = tfp.vi.fit_surrogate_posterior( target_log_prob_fn, surrogate_posterior=proposal_distribution, - optimizer=tf.keras.optimizers.Adam(0.1), + optimizer=tf_keras.optimizers.Adam(0.1), num_steps=200, importance_sample_size=importance_sample_size) approximate_posterior = tfed.ImportanceResample( @@ -167,7 +167,7 @@ def target_log_prob_fn(x): proposal_distribution=proposal_distribution, target_log_prob_fn=target_log_prob_fn, importance_sample_size=importance_sample_size), - optimizer=tf.keras.optimizers.Adam(0.1), + optimizer=tf_keras.optimizers.Adam(0.1), num_steps=200) ``` diff --git a/tensorflow_probability/python/experimental/distributions/joint_distribution_pinned.py b/tensorflow_probability/python/experimental/distributions/joint_distribution_pinned.py index ff685713f1..e376b8462a 100644 --- a/tensorflow_probability/python/experimental/distributions/joint_distribution_pinned.py +++ b/tensorflow_probability/python/experimental/distributions/joint_distribution_pinned.py @@ -246,7 +246,7 @@ def target_log_prob_fn(loc, scale): pulled_back_shape) vars = tf.nest.map_structure(tf.Variable, uniform_init) - opt = tf.keras.optimizers.Adam(.01) + opt = tf_keras.optimizers.Adam(.01) @tf.function(autograph=False) def one_step(): diff --git a/tensorflow_probability/python/experimental/nn/BUILD b/tensorflow_probability/python/experimental/nn/BUILD index 35de967cb7..8b3a035d06 100644 --- a/tensorflow_probability/python/experimental/nn/BUILD +++ b/tensorflow_probability/python/experimental/nn/BUILD @@ -55,6 +55,7 @@ py_library( "//tensorflow_probability/python/distributions:normal", "//tensorflow_probability/python/experimental/nn/util:kernel_bias", "//tensorflow_probability/python/internal:prefer_static", + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -73,6 +74,7 @@ py_test( "//tensorflow_probability/python/distributions:normal", 
"//tensorflow_probability/python/internal:prefer_static", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/util:deferred_tensor", ], ) @@ -88,6 +90,7 @@ py_library( "//tensorflow_probability/python/experimental/nn/util", "//tensorflow_probability/python/internal:dtype_util", "//tensorflow_probability/python/internal:prefer_static", + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -104,6 +107,7 @@ py_test( "//tensorflow_probability/python/distributions:independent", "//tensorflow_probability/python/distributions:normal", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/util:deferred_tensor", ], ) @@ -137,6 +141,7 @@ py_test( "//tensorflow_probability/python/distributions:independent", "//tensorflow_probability/python/distributions:normal", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/util:deferred_tensor", ], ) @@ -151,6 +156,7 @@ py_library( "//tensorflow_probability/python/distributions:distribution", "//tensorflow_probability/python/experimental/nn/util:utils", "//tensorflow_probability/python/internal:prefer_static", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/util:deferred_tensor", ], ) @@ -167,6 +173,7 @@ py_test( "//tensorflow_probability/python/distributions:independent", "//tensorflow_probability/python/distributions:normal", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", ], ) diff --git a/tensorflow_probability/python/experimental/nn/README.md b/tensorflow_probability/python/experimental/nn/README.md index 4384fe1b9b..6a95f75bfb 100644 --- a/tensorflow_probability/python/experimental/nn/README.md +++ b/tensorflow_probability/python/experimental/nn/README.md @@ -11,7 +11,7 @@ Design goals include but are not limited to: - extensibility - simple implementations. -The primary differences from `tf.keras` are: +The primary differences from `tf_keras` are: 1. The TFP NN toolbox use `tf.Module` for `tf.Variable` tracking. 2. Users are expected to implement their own train loops. diff --git a/tensorflow_probability/python/experimental/nn/affine_layers.py b/tensorflow_probability/python/experimental/nn/affine_layers.py index 66ca98b72d..5181fd7a39 100644 --- a/tensorflow_probability/python/experimental/nn/affine_layers.py +++ b/tensorflow_probability/python/experimental/nn/affine_layers.py @@ -45,7 +45,7 @@ def __init__( output_size, # Weights kernel_initializer=None, # tfp.nn.initializers.glorot_uniform() - bias_initializer=None, # tf.keras.initializers.zeros() + bias_initializer=None, # tf_keras.initializers.zeros() make_kernel_bias_fn=kernel_bias_lib.make_kernel_bias, dtype=tf.float32, batch_shape=(), @@ -61,7 +61,7 @@ def __init__( Default value: `None` (i.e., `tfp.experimental.nn.initializers.glorot_uniform()`). bias_initializer: ... - Default value: `None` (i.e., `tf.keras.initializers.zeros()`). + Default value: `None` (i.e., `tf_keras.initializers.zeros()`). make_kernel_bias_fn: ... Default value: `tfp.experimental.nn.util.make_kernel_bias`. dtype: ... @@ -179,11 +179,11 @@ def _preprocess(image, label): padding='same', filter_shape=5, # Use `he_uniform` because we'll use the `relu` family. 
- kernel_initializer=tf.keras.initializers.he_uniform()) + kernel_initializer=tf_keras.initializers.he_uniform()) BayesAffine = functools.partial( tfn.AffineVariationalReparameterization, - kernel_initializer=tf.keras.initializers.he_normal()) + kernel_initializer=tf_keras.initializers.he_normal()) scale = tfp.util.TransformedVariable(1., tfb.Softplus()) bnn = tfn.Sequential([ @@ -206,7 +206,7 @@ def loss_fn(): kl = bnn.extra_loss / tf.cast(train_size, tf.float32) loss = nll + kl return loss, (nll, kl) - opt = tf.keras.optimizers.Adam() + opt = tf_keras.optimizers.Adam() fit_op = tfn.util.make_fit_op(loss_fn, opt, bnn.trainable_variables) for _ in range(200): loss, (nll, kl), g = fit_op() @@ -232,7 +232,7 @@ def __init__( output_size, # Weights kernel_initializer=None, # tfp.nn.initializers.glorot_uniform() - bias_initializer=None, # tf.keras.initializers.zeros() + bias_initializer=None, # tf_keras.initializers.zeros() make_posterior_fn=kernel_bias_lib.make_kernel_bias_posterior_mvn_diag, make_prior_fn=kernel_bias_lib.make_kernel_bias_prior_spike_and_slab, posterior_value_fn=tfd.Distribution.sample, @@ -252,7 +252,7 @@ def __init__( Default value: `None` (i.e., `tfp.experimental.nn.initializers.glorot_uniform()`). bias_initializer: ... - Default value: `None` (i.e., `tf.keras.initializers.zeros()`). + Default value: `None` (i.e., `tf_keras.initializers.zeros()`). make_posterior_fn: ... Default value: `tfp.experimental.nn.util.make_kernel_bias_posterior_mvn_diag`. @@ -363,7 +363,7 @@ def __init__( output_size, # Weights kernel_initializer=None, # tfp.nn.initializers.glorot_uniform() - bias_initializer=None, # tf.keras.initializers.zeros() + bias_initializer=None, # tf_keras.initializers.zeros() make_posterior_fn=kernel_bias_lib.make_kernel_bias_posterior_mvn_diag, make_prior_fn=kernel_bias_lib.make_kernel_bias_prior_spike_and_slab, posterior_value_fn=tfd.Distribution.sample, @@ -383,7 +383,7 @@ def __init__( Default value: `None` (i.e., `tfp.experimental.nn.initializers.glorot_uniform()`). bias_initializer: ... - Default value: `None` (i.e., `tf.keras.initializers.zeros()`). + Default value: `None` (i.e., `tf_keras.initializers.zeros()`). make_posterior_fn: ... Default value: `tfp.experimental.nn.util.make_kernel_bias_posterior_mvn_diag`. @@ -502,7 +502,7 @@ def __init__( output_size, # Weights kernel_initializer=None, # tfp.nn.initializers.glorot_uniform() - bias_initializer=None, # tf.keras.initializers.zeros() + bias_initializer=None, # tf_keras.initializers.zeros() make_posterior_fn=kernel_bias_lib.make_kernel_bias_posterior_mvn_diag, make_prior_fn=kernel_bias_lib.make_kernel_bias_prior_spike_and_slab, posterior_value_fn=tfd.Distribution.sample, @@ -522,7 +522,7 @@ def __init__( Default value: `None` (i.e., `tfp.nn.initializers.glorot_uniform()`). bias_initializer: ... - Default value: `None` (i.e., `tf.keras.initializers.zeros()`). + Default value: `None` (i.e., `tf_keras.initializers.zeros()`). make_posterior_fn: ... Default value: `tfp.experimental.nn.util.make_kernel_bias_posterior_mvn_diag`. 
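The `tfp.experimental.nn` docstring examples above now build their optimizer as `tf_keras.optimizers.Adam()` through the internal shim this patch adds. A minimal, hypothetical sketch of that optimizer inside a hand-written train step (the toy variable and loss are placeholders, not from the patch):

```python
import tensorflow.compat.v2 as tf
from tensorflow_probability.python.internal import tf_keras

# Placeholder parameter and loss; only the optimizer namespace mirrors the patch.
w = tf.Variable(tf.zeros([3]))
opt = tf_keras.optimizers.Adam(learning_rate=0.01)

def loss_fn():
  return tf.reduce_sum(tf.square(w - 1.))

for _ in range(10):
  with tf.GradientTape() as tape:
    loss = loss_fn()
  grads = tape.gradient(loss, [w])
  opt.apply_gradients(zip(grads, [w]))
```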
diff --git a/tensorflow_probability/python/experimental/nn/affine_layers_test.py b/tensorflow_probability/python/experimental/nn/affine_layers_test.py index 10682d0091..43433f4199 100644 --- a/tensorflow_probability/python/experimental/nn/affine_layers_test.py +++ b/tensorflow_probability/python/experimental/nn/affine_layers_test.py @@ -29,6 +29,7 @@ from tensorflow_probability.python.experimental import nn as tfn from tensorflow_probability.python.internal import prefer_static as ps from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.util import deferred_tensor @@ -87,7 +88,7 @@ def loss_fn(): nll = -tf.reduce_mean(bnn(x).log_prob(y), axis=-1) kl = tfn.losses.compute_extra_loss(bnn) / n return nll + kl, (nll, kl) - opt = tf.keras.optimizers.Adam() + opt = tf_keras.optimizers.Adam() fit_op = tfn.util.make_fit_op(loss_fn, opt, bnn.trainable_variables) for _ in range(2): loss, (nll, kl) = fit_op() # pylint: disable=unused-variable diff --git a/tensorflow_probability/python/experimental/nn/convolutional_layers.py b/tensorflow_probability/python/experimental/nn/convolutional_layers.py index 4255bcbb7a..38d5ab6550 100644 --- a/tensorflow_probability/python/experimental/nn/convolutional_layers.py +++ b/tensorflow_probability/python/experimental/nn/convolutional_layers.py @@ -91,7 +91,7 @@ def __init__( dilations=1, # keras::Conv::dilation_rate # Weights kernel_initializer=None, # tfp.nn.initializers.glorot_uniform() - bias_initializer=None, # tf.keras.initializers.zeros() + bias_initializer=None, # tf_keras.initializers.zeros() make_kernel_bias_fn=kernel_bias_lib.make_kernel_bias, dtype=tf.float32, batch_shape=(), @@ -147,7 +147,7 @@ def __init__( Default value: `None` (i.e., `tfp.experimental.nn.initializers.glorot_uniform()`). bias_initializer: ... - Default value: `None` (i.e., `tf.keras.initializers.zeros()`). + Default value: `None` (i.e., `tf_keras.initializers.zeros()`). make_kernel_bias_fn: ... Default value: `tfp.experimental.nn.util.make_kernel_bias`. dtype: ... @@ -288,7 +288,7 @@ def _preprocess(image, label): padding='same', filter_shape=5, # Use `he_uniform` because we'll use the `relu` family. - kernel_initializer=tf.keras.initializers.he_uniform(), + kernel_initializer=tf_keras.initializers.he_uniform(), penalty_weight=1. / n) BayesAffine = functools.partial( @@ -316,7 +316,7 @@ def loss_fn(): kl = bnn.extra_loss # Already normalized via `penalty_weight` arg. loss = nll + kl return loss, (nll, kl) - opt = tf.keras.optimizers.Adam() + opt = tf_keras.optimizers.Adam() fit_op = tfn.util.make_fit_op(loss_fn, opt, bnn.trainable_variables) for _ in range(200): loss, (nll, kl), g = fit_op() @@ -349,7 +349,7 @@ def __init__( dilations=1, # keras::Conv::dilation_rate # Weights kernel_initializer=None, # tfp.nn.initializers.glorot_uniform() - bias_initializer=None, # tf.keras.initializers.zeros() + bias_initializer=None, # tf_keras.initializers.zeros() make_posterior_fn=kernel_bias_lib.make_kernel_bias_posterior_mvn_diag, make_prior_fn=kernel_bias_lib.make_kernel_bias_prior_spike_and_slab, posterior_value_fn=tfd.Distribution.sample, @@ -408,7 +408,7 @@ def __init__( Default value: `None` (i.e., `tfp.experimental.nn.initializers.glorot_uniform()`). bias_initializer: ... - Default value: `None` (i.e., `tf.keras.initializers.zeros()`). + Default value: `None` (i.e., `tf_keras.initializers.zeros()`). make_posterior_fn: ... 
Default value: `tfp.experimental.nn.util.make_kernel_bias_posterior_mvn_diag`. @@ -538,7 +538,7 @@ def __init__( dilations=1, # keras::Conv::dilation_rate # Weights kernel_initializer=None, # tfp.nn.initializers.glorot_uniform() - bias_initializer=None, # tf.keras.initializers.zeros() + bias_initializer=None, # tf_keras.initializers.zeros() make_posterior_fn=kernel_bias_lib.make_kernel_bias_posterior_mvn_diag, make_prior_fn=kernel_bias_lib.make_kernel_bias_prior_spike_and_slab, posterior_value_fn=tfd.Distribution.sample, @@ -597,7 +597,7 @@ def __init__( Default value: `None` (i.e., `tfp.experimental.nn.initializers.glorot_uniform()`). bias_initializer: ... - Default value: `None` (i.e., `tf.keras.initializers.zeros()`). + Default value: `None` (i.e., `tf_keras.initializers.zeros()`). make_posterior_fn: ... Default value: `tfp.experimental.nn.util.make_kernel_bias_posterior_mvn_diag`. diff --git a/tensorflow_probability/python/experimental/nn/convolutional_layers_test.py b/tensorflow_probability/python/experimental/nn/convolutional_layers_test.py index 1fb3b95337..a6525de128 100644 --- a/tensorflow_probability/python/experimental/nn/convolutional_layers_test.py +++ b/tensorflow_probability/python/experimental/nn/convolutional_layers_test.py @@ -25,6 +25,7 @@ from tensorflow_probability.python.distributions import normal from tensorflow_probability.python.experimental import nn as tfn from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.util import deferred_tensor @@ -79,7 +80,7 @@ def loss_fn(): nll = -tf.reduce_mean(bnn(x).log_prob(y), axis=-1) kl = tfn.losses.compute_extra_loss(bnn) / n return nll + kl, (nll, kl) - opt = tf.keras.optimizers.Adam() + opt = tf_keras.optimizers.Adam() fit_op = tfn.util.make_fit_op(loss_fn, opt, bnn.trainable_variables) for _ in range(2): loss, (nll, kl) = fit_op() # pylint: disable=unused-variable diff --git a/tensorflow_probability/python/experimental/nn/convolutional_layers_v2.py b/tensorflow_probability/python/experimental/nn/convolutional_layers_v2.py index 5ca655cd3c..039755846d 100644 --- a/tensorflow_probability/python/experimental/nn/convolutional_layers_v2.py +++ b/tensorflow_probability/python/experimental/nn/convolutional_layers_v2.py @@ -94,7 +94,7 @@ def __init__( dilations=1, # keras::Conv::dilation_rate # Weights kernel_initializer=None, # tfp.nn.initializers.glorot_uniform() - bias_initializer=None, # tf.keras.initializers.zeros() + bias_initializer=None, # tf_keras.initializers.zeros() make_kernel_bias_fn=kernel_bias_lib.make_kernel_bias, dtype=tf.float32, index_dtype=tf.int32, @@ -151,7 +151,7 @@ def __init__( Default value: `None` (i.e., `tfp.experimental.nn.initializers.glorot_uniform()`). bias_initializer: ... - Default value: `None` (i.e., `tf.keras.initializers.zeros()`). + Default value: `None` (i.e., `tf_keras.initializers.zeros()`). make_kernel_bias_fn: ... Default value: `tfp.experimental.nn.util.make_kernel_bias`. dtype: ... @@ -288,7 +288,7 @@ def _preprocess(image, label): padding='same', filter_shape=5, # Use `he_uniform` because we'll use the `relu` family. - kernel_initializer=tf.keras.initializers.he_uniform(), + kernel_initializer=tf_keras.initializers.he_uniform(), penalty_weight=1. / n) BayesAffine = functools.partial( @@ -316,7 +316,7 @@ def loss_fn(): kl = bnn.extra_loss # Already normalized via `penalty_weight` arg. 
loss = nll + kl return loss, (nll, kl) - opt = tf.keras.optimizers.Adam() + opt = tf_keras.optimizers.Adam() fit_op = tfn.util.make_fit_op(loss_fn, opt, bnn.trainable_variables) for _ in range(200): loss, (nll, kl), g = fit_op() @@ -349,7 +349,7 @@ def __init__( dilations=1, # keras::Conv::dilation_rate # Weights kernel_initializer=None, # tfp.nn.initializers.glorot_uniform() - bias_initializer=None, # tf.keras.initializers.zeros() + bias_initializer=None, # tf_keras.initializers.zeros() make_posterior_fn=kernel_bias_lib.make_kernel_bias_posterior_mvn_diag, make_prior_fn=kernel_bias_lib.make_kernel_bias_prior_spike_and_slab, posterior_value_fn=tfd.Distribution.sample, @@ -409,7 +409,7 @@ def __init__( Default value: `None` (i.e., `tfp.experimental.nn.initializers.glorot_uniform()`). bias_initializer: ... - Default value: `None` (i.e., `tf.keras.initializers.zeros()`). + Default value: `None` (i.e., `tf_keras.initializers.zeros()`). make_posterior_fn: ... Default value: `tfp.experimental.nn.util.make_kernel_bias_posterior_mvn_diag`. @@ -549,7 +549,7 @@ def __init__( dilations=1, # keras::Conv::dilation_rate # Weights kernel_initializer=None, # tfp.nn.initializers.glorot_uniform() - bias_initializer=None, # tf.keras.initializers.zeros() + bias_initializer=None, # tf_keras.initializers.zeros() make_posterior_fn=kernel_bias_lib.make_kernel_bias_posterior_mvn_diag, make_prior_fn=kernel_bias_lib.make_kernel_bias_prior_spike_and_slab, posterior_value_fn=tfd.Distribution.sample, @@ -609,7 +609,7 @@ def __init__( Default value: `None` (i.e., `tfp.experimental.nn.initializers.glorot_uniform()`). bias_initializer: ... - Default value: `None` (i.e., `tf.keras.initializers.zeros()`). + Default value: `None` (i.e., `tf_keras.initializers.zeros()`). make_posterior_fn: ... Default value: `tfp.experimental.nn.util.make_kernel_bias_posterior_mvn_diag`. 
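The convolutional-layer docstrings above change their default-value comments from `tf.keras.initializers.*` to `tf_keras.initializers.*`. A small sketch of those initializers reached through the shim (the kernel and bias shapes are arbitrary, for illustration only):

```python
from tensorflow_probability.python.internal import tf_keras

# Same Keras initializers, now accessed through the tf_keras shim.
kernel_init = tf_keras.initializers.he_uniform()
bias_init = tf_keras.initializers.zeros()

# Arbitrary shapes: a 5x5 conv kernel mapping 1 channel to 8, plus its bias.
kernel = kernel_init(shape=[5, 5, 1, 8])
bias = bias_init(shape=[8])
```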
diff --git a/tensorflow_probability/python/experimental/nn/convolutional_layers_v2_test.py b/tensorflow_probability/python/experimental/nn/convolutional_layers_v2_test.py index 4ff83235f1..0893af1b25 100644 --- a/tensorflow_probability/python/experimental/nn/convolutional_layers_v2_test.py +++ b/tensorflow_probability/python/experimental/nn/convolutional_layers_v2_test.py @@ -27,6 +27,7 @@ from tensorflow_probability.python.distributions import normal from tensorflow_probability.python.experimental import nn as tfn from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.util import deferred_tensor @@ -78,7 +79,7 @@ def loss_fn(): nll = -tf.reduce_mean(bnn(x).log_prob(y), axis=-1) kl = tfn.losses.compute_extra_loss(bnn) / n return nll + kl, (nll, kl) - opt = tf.keras.optimizers.Adam() + opt = tf_keras.optimizers.Adam() fit_op = tfn.util.make_fit_op(loss_fn, opt, bnn.trainable_variables) for _ in range(2): loss, (nll, kl) = fit_op() # pylint: disable=unused-variable diff --git a/tensorflow_probability/python/experimental/nn/convolutional_transpose_layers.py b/tensorflow_probability/python/experimental/nn/convolutional_transpose_layers.py index 8c6b3f3288..ead55e8430 100644 --- a/tensorflow_probability/python/experimental/nn/convolutional_transpose_layers.py +++ b/tensorflow_probability/python/experimental/nn/convolutional_transpose_layers.py @@ -91,7 +91,7 @@ def __init__( method='auto', # Weights kernel_initializer=None, # tfp.nn.initializers.glorot_uniform() - bias_initializer=None, # tf.keras.initializers.zeros() + bias_initializer=None, # tf_keras.initializers.zeros() make_kernel_bias_fn=kernel_bias_lib.make_kernel_bias, dtype=tf.float32, index_dtype=tf.int32, @@ -156,7 +156,7 @@ def __init__( Default value: `None` (i.e., `tfp.experimental.nn.initializers.glorot_uniform()`). bias_initializer: ... - Default value: `None` (i.e., `tf.keras.initializers.zeros()`). + Default value: `None` (i.e., `tf_keras.initializers.zeros()`). make_kernel_bias_fn: ... Default value: `tfp.experimental.nn.util.make_kernel_bias`. dtype: ... @@ -278,7 +278,7 @@ def _preprocess(image, label): padding='same', filter_shape=5, # Use `he_uniform` because we'll use the `relu` family. - kernel_initializer=tf.keras.initializers.he_uniform()) + kernel_initializer=tf_keras.initializers.he_uniform()) BayesDeconv2D = functools.partial( tfn.ConvolutionTransposeVariationalReparameterization, @@ -286,7 +286,7 @@ def _preprocess(image, label): padding='same', filter_shape=5, # Use `he_uniform` because we'll use the `relu` family. 
- kernel_initializer=tf.keras.initializers.he_uniform()) + kernel_initializer=tf_keras.initializers.he_uniform()) scale = tfp.util.TransformedVariable(1., tfb.Softplus()) bnn = tfn.Sequential([ @@ -316,7 +316,7 @@ def loss_fn(): kl = bnn.extra_loss / tf.cast(train_size, tf.float32) loss = nll + kl return loss, (nll, kl) - opt = tf.keras.optimizers.Adam() + opt = tf_keras.optimizers.Adam() fit_op = tfn.util.make_fit_op(loss_fn, opt, bnn.trainable_variables) for _ in range(200): loss, (nll, kl), g = fit_op() @@ -351,7 +351,7 @@ def __init__( method='auto', # Weights kernel_initializer=None, # tfp.nn.initializers.glorot_uniform() - bias_initializer=None, # tf.keras.initializers.zeros() + bias_initializer=None, # tf_keras.initializers.zeros() make_posterior_fn=kernel_bias_lib.make_kernel_bias_posterior_mvn_diag, make_prior_fn=kernel_bias_lib.make_kernel_bias_prior_spike_and_slab, posterior_value_fn=tfd.Distribution.sample, @@ -420,7 +420,7 @@ def __init__( Default value: `None` (i.e., `tfp.experimental.nn.initializers.glorot_uniform()`). bias_initializer: ... - Default value: `None` (i.e., `tf.keras.initializers.zeros()`). + Default value: `None` (i.e., `tf_keras.initializers.zeros()`). make_posterior_fn: ... Default value: `tfp.experimental.nn.util.make_kernel_bias_posterior_mvn_diag`. @@ -527,14 +527,14 @@ class ConvolutionTransposeVariationalFlipout( padding='same', filter_shape=5, # Use `he_uniform` because we'll use the `relu` family. - kernel_initializer=tf.keras.initializers.he_uniform()) + kernel_initializer=tf_keras.initializers.he_uniform()) BayesDeconv2D = functools.partial( tfn.ConvolutionTransposeVariationalFlipout, rank=2, padding='same', filter_shape=5, # Use `he_uniform` because we'll use the `relu` family. - kernel_initializer=tf.keras.initializers.he_uniform()) + kernel_initializer=tf_keras.initializers.he_uniform()) ``` This example uses reparameterization gradients to minimize the @@ -567,7 +567,7 @@ def __init__( method='auto', # Weights kernel_initializer=None, # tfp.nn.initializers.glorot_uniform() - bias_initializer=None, # tf.keras.initializers.zeros() + bias_initializer=None, # tf_keras.initializers.zeros() make_posterior_fn=kernel_bias_lib.make_kernel_bias_posterior_mvn_diag, make_prior_fn=kernel_bias_lib.make_kernel_bias_prior_spike_and_slab, posterior_value_fn=tfd.Distribution.sample, @@ -636,7 +636,7 @@ def __init__( Default value: `None` (i.e., `tfp.experimental.nn.initializers.glorot_uniform()`). bias_initializer: ... - Default value: `None` (i.e., `tf.keras.initializers.zeros()`). + Default value: `None` (i.e., `tf_keras.initializers.zeros()`). make_posterior_fn: ... Default value: `tfp.experimental.nn.util.make_kernel_bias_posterior_mvn_diag`. 
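Across the test and example hunks in this patch, `tf.keras.Sequential`, `tf.keras.layers.*`, and `tf.keras.optimizers.Adam` all become their `tf_keras` counterparts. A generic, self-contained sketch of that post-migration usage (layer widths and the loss are arbitrary, not taken from any file above):

```python
from tensorflow_probability.python.internal import tf_keras

# Arbitrary toy regression model; only the tf_keras namespacing mirrors the patch.
model = tf_keras.Sequential([
    tf_keras.layers.Dense(10, activation='relu'),
    tf_keras.layers.Dense(1),
])
model.compile(optimizer=tf_keras.optimizers.Adam(learning_rate=0.01),
              loss='mse')
```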
diff --git a/tensorflow_probability/python/experimental/nn/convolutional_transpose_layers_test.py b/tensorflow_probability/python/experimental/nn/convolutional_transpose_layers_test.py index 86e4018c52..eceba593ec 100644 --- a/tensorflow_probability/python/experimental/nn/convolutional_transpose_layers_test.py +++ b/tensorflow_probability/python/experimental/nn/convolutional_transpose_layers_test.py @@ -24,6 +24,7 @@ from tensorflow_probability.python.distributions import normal from tensorflow_probability.python.experimental import nn as tfn from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.util import deferred_tensor @@ -78,7 +79,7 @@ def loss_fn(): kl = tfn.losses.compute_extra_loss(bnn) / tf.cast(train_size, tf.float32) loss = nll + kl return loss, (nll, kl) - opt = tf.keras.optimizers.Adam() + opt = tf_keras.optimizers.Adam() fit_op = tfn.util.make_fit_op(loss_fn, opt, bnn.trainable_variables) for _ in range(2): loss, (nll, kl) = fit_op() # pylint: disable=unused-variable diff --git a/tensorflow_probability/python/experimental/nn/examples/bnn_mnist_advi.ipynb b/tensorflow_probability/python/experimental/nn/examples/bnn_mnist_advi.ipynb index 9ee6a94450..0fa1c85003 100644 --- a/tensorflow_probability/python/experimental/nn/examples/bnn_mnist_advi.ipynb +++ b/tensorflow_probability/python/experimental/nn/examples/bnn_mnist_advi.ipynb @@ -91,6 +91,8 @@ "\n", "from tensorflow_probability.python.internal import prefer_static\n", "\n", + "from tensorflow_probability.python.internal import tf_keras\n", + "\n", "# Globally Enable XLA.\n", "# tf.config.optimizer.set_jit(True)\n", "\n", @@ -229,7 +231,7 @@ " kernel_name='posterior_kernel',\n", " bias_name='posterior_bias'):\n", " if kernel_initializer is None:\n", - " kernel_initializer = tf.keras.initializers.glorot_uniform()\n", + " kernel_initializer = tf_keras.initializers.glorot_uniform()\n", " if bias_initializer is None:\n", " bias_initializer = tf.zeros\n", " make_loc = lambda shape, init, name: tf.Variable( # pylint: disable=g-long-lambda\n", @@ -325,7 +327,7 @@ } ], "source": [ - "max_pool = tf.keras.layers.MaxPooling2D( # Has no tf.Variables.\n", + "max_pool = tf_keras.layers.MaxPooling2D( # Has no tf.Variables.\n", " pool_size=(2, 2),\n", " strides=(2, 2),\n", " padding='SAME',\n", @@ -348,7 +350,7 @@ " output_size=8,\n", " filter_shape=5,\n", " padding='SAME',\n", - " init_kernel_fn=tf.keras.initializers.he_uniform(),\n", + " init_kernel_fn=tf_keras.initializers.he_uniform(),\n", " penalty_weight=1 / train_size,\n", " # penalty_weight=1e2 / train_size, # Layer specific \"beta\".\n", " # make_posterior_fn=make_posterior,\n", @@ -361,7 +363,7 @@ " output_size=16,\n", " filter_shape=5,\n", " padding='SAME',\n", - " init_kernel_fn=tf.keras.initializers.he_uniform(),\n", + " init_kernel_fn=tf_keras.initializers.he_uniform(),\n", " penalty_weight=1 / train_size,\n", " # penalty_weight=1e2 / train_size, # Layer specific \"beta\".\n", " # make_posterior_fn=make_posterior,\n", @@ -375,7 +377,7 @@ " output_size=32,\n", " filter_shape=5,\n", " padding='SAME',\n", - " init_kernel_fn=tf.keras.initializers.he_uniform(),\n", + " init_kernel_fn=tf_keras.initializers.he_uniform(),\n", " penalty_weight=1 / train_size,\n", " # penalty_weight=1e2 / train_size, # Layer specific \"beta\".\n", " # make_posterior_fn=make_posterior,\n", @@ -448,7 +450,7 @@ " loss, (nll, kl), _ = compute_loss_bnn(x, y)\n", " return loss, (nll, kl)\n", "\n", - "opt_bnn 
= tf.keras.optimizers.Adam(learning_rate=0.003)\n", + "opt_bnn = tf_keras.optimizers.Adam(learning_rate=0.003)\n", " \n", "fit_bnn = tfn.util.make_fit_op(\n", " train_loss_bnn,\n", @@ -1191,7 +1193,7 @@ } ], "source": [ - "max_pool = tf.keras.layers.MaxPooling2D( # Has no tf.Variables.\n", + "max_pool = tf_keras.layers.MaxPooling2D( # Has no tf.Variables.\n", " pool_size=(2, 2),\n", " strides=(2, 2),\n", " padding='SAME',\n", @@ -1207,7 +1209,7 @@ " output_size=8,\n", " filter_shape=5,\n", " padding='SAME',\n", - " init_kernel_fn=tf.keras.initializers.he_uniform(),\n", + " init_kernel_fn=tf_keras.initializers.he_uniform(),\n", " name='conv1'),\n", " maybe_batchnorm,\n", " tf.nn.leaky_relu,\n", @@ -1216,7 +1218,7 @@ " output_size=16,\n", " filter_shape=5,\n", " padding='SAME',\n", - " init_kernel_fn=tf.keras.initializers.he_uniform(),\n", + " init_kernel_fn=tf_keras.initializers.he_uniform(),\n", " name='conv1'),\n", " maybe_batchnorm,\n", " tf.nn.leaky_relu,\n", @@ -1226,7 +1228,7 @@ " output_size=32,\n", " filter_shape=5,\n", " padding='SAME',\n", - " init_kernel_fn=tf.keras.initializers.he_uniform(),\n", + " init_kernel_fn=tf_keras.initializers.he_uniform(),\n", " name='conv2'),\n", " maybe_batchnorm,\n", " tf.nn.leaky_relu,\n", @@ -1280,7 +1282,7 @@ " nll, _ = compute_loss_dnn(x, y)\n", " return nll, None\n", "\n", - "opt_dnn = tf.keras.optimizers.Adam(learning_rate=0.003)\n", + "opt_dnn = tf_keras.optimizers.Adam(learning_rate=0.003)\n", " \n", "fit_dnn = tfn.util.make_fit_op(\n", " train_loss_dnn,\n", diff --git a/tensorflow_probability/python/experimental/nn/examples/single_column_mnist.ipynb b/tensorflow_probability/python/experimental/nn/examples/single_column_mnist.ipynb index dc5a30fdd4..575f613919 100644 --- a/tensorflow_probability/python/experimental/nn/examples/single_column_mnist.ipynb +++ b/tensorflow_probability/python/experimental/nn/examples/single_column_mnist.ipynb @@ -283,7 +283,7 @@ "\n", " # Convenience function\n", " affine = functools.partial(tfn.Affine,\n", - " init_kernel_fn=tf.keras.initializers.he_normal(),\n", + " init_kernel_fn=tf_keras.initializers.he_normal(),\n", " init_bias_fn = tf.zeros_initializer())\n", "\n", " self._dnn = tfn.Sequential([\n", @@ -333,7 +333,7 @@ "\n", " # Convenience function\n", " affine = functools.partial(tfn.Affine, \n", - " init_kernel_fn=tf.keras.initializers.he_normal(),\n", + " init_kernel_fn=tf_keras.initializers.he_normal(),\n", " init_bias_fn = tf.zeros_initializer())\n", "\n", " # DNN is just an affine transformation for the decoder\n", @@ -475,7 +475,7 @@ " beta=beta,\n", " seed=seedstream)\n", "\n", - "opt = tf.keras.optimizers.Adam(lr)\n", + "opt = tf_keras.optimizers.Adam(lr)\n", "train_op = tfn.util.make_fit_op(\n", " loss_fn=loss_fn, optimizer=opt,\n", " trainable_variables=loss_fn.trainable_variables,\n", @@ -675,7 +675,7 @@ " beta=beta,\n", " seed=seedstream)\n", "\n", - " opt = tf.keras.optimizers.Adam(lr)\n", + " opt = tf_keras.optimizers.Adam(lr)\n", " train_op = tfn.util.make_fit_op(\n", " loss_fn=loss_fn, optimizer=opt,\n", " trainable_variables=loss_fn.trainable_variables,\n", diff --git a/tensorflow_probability/python/experimental/nn/examples/vae_mnist_advi.ipynb b/tensorflow_probability/python/experimental/nn/examples/vae_mnist_advi.ipynb index ce5f50d62a..c55819d5ee 100644 --- a/tensorflow_probability/python/experimental/nn/examples/vae_mnist_advi.ipynb +++ b/tensorflow_probability/python/experimental/nn/examples/vae_mnist_advi.ipynb @@ -240,7 +240,7 @@ "source": [ "Conv = functools.partial(\n", " 
tfn.Convolution,\n", - " init_kernel_fn=tf.keras.initializers.he_uniform()) # Better for leaky_relu.\n", + " init_kernel_fn=tf_keras.initializers.he_uniform()) # Better for leaky_relu.\n", "\n", "encoder = tfn.Sequential([\n", " lambda x: 2. * tf.cast(x, tf.float32) - 1., # Center.\n", @@ -303,7 +303,7 @@ "source": [ "DeConv = functools.partial(\n", " tfn.ConvolutionTranspose,\n", - " init_kernel_fn=tf.keras.initializers.he_uniform()) # Better for leaky_relu.\n", + " init_kernel_fn=tf_keras.initializers.he_uniform()) # Better for leaky_relu.\n", " \n", "decoder = tfn.Sequential([\n", " lambda x: x[..., tf.newaxis, tf.newaxis, :],\n", @@ -380,7 +380,7 @@ " loss, (nll, kl), _ = compute_loss(x)\n", " return loss, (nll, kl)\n", "\n", - "opt = tf.keras.optimizers.Adam(learning_rate=1e-3)\n", + "opt = tf_keras.optimizers.Adam(learning_rate=1e-3)\n", "\n", "fit = tfn.util.make_fit_op(\n", " loss,\n", diff --git a/tensorflow_probability/python/experimental/nn/examples/vib_dose.ipynb b/tensorflow_probability/python/experimental/nn/examples/vib_dose.ipynb index 3c3581c392..2b717f6f81 100644 --- a/tensorflow_probability/python/experimental/nn/examples/vib_dose.ipynb +++ b/tensorflow_probability/python/experimental/nn/examples/vib_dose.ipynb @@ -275,7 +275,7 @@ "Conv = functools.partial(\n", " tfn.Convolution,\n", " init_bias_fn=tf.zeros_initializer(),\n", - " init_kernel_fn=tf.keras.initializers.he_uniform()) # Better for leaky_relu.\n", + " init_kernel_fn=tf_keras.initializers.he_uniform()) # Better for leaky_relu.\n", "\n", "encoder = tfn.Sequential([\n", " lambda x: 2. * tf.cast(x, tf.float32) - 1., # Center.\n", @@ -326,11 +326,11 @@ "source": [ "DeConv = functools.partial(\n", " tfn.ConvolutionTranspose,\n", - " init_kernel_fn=tf.keras.initializers.he_uniform()) # Better for leaky_relu.\n", + " init_kernel_fn=tf_keras.initializers.he_uniform()) # Better for leaky_relu.\n", " \n", "Affine = functools.partial(\n", " tfn.Affine,\n", - " init_kernel_fn=tf.keras.initializers.he_uniform())\n", + " init_kernel_fn=tf_keras.initializers.he_uniform())\n", "\n", "decoder = tfn.Sequential([\n", " Affine(encoded_size, 10),\n", @@ -390,7 +390,7 @@ " loss, (nll, kl), _ = compute_loss(x, y, beta=0.075)\n", " return loss, (nll, kl)\n", "\n", - "opt = tf.keras.optimizers.Adam(learning_rate=1e-3, decay=0.00005)\n", + "opt = tf_keras.optimizers.Adam(learning_rate=1e-3, decay=0.00005)\n", "\n", "fit = tfn.util.make_fit_op(\n", " loss,\n", diff --git a/tensorflow_probability/python/experimental/nn/util/BUILD b/tensorflow_probability/python/experimental/nn/util/BUILD index 99e0c450c2..64e7557c72 100644 --- a/tensorflow_probability/python/experimental/nn/util/BUILD +++ b/tensorflow_probability/python/experimental/nn/util/BUILD @@ -49,6 +49,7 @@ py_test( # tensorflow dep, "//tensorflow_probability/python/internal:prefer_static", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -68,6 +69,7 @@ py_library( "//tensorflow_probability/python/distributions:sample", "//tensorflow_probability/python/experimental/nn/initializers:initializers_lib", "//tensorflow_probability/python/internal:prefer_static", + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -131,5 +133,6 @@ py_library( "//tensorflow_probability/python/internal:dtype_util", "//tensorflow_probability/python/internal:prefer_static", "//tensorflow_probability/python/internal:tensorshape_util", + "//tensorflow_probability/python/internal:tf_keras", ], ) diff --git 
a/tensorflow_probability/python/experimental/nn/util/convolution_util_test.py b/tensorflow_probability/python/experimental/nn/util/convolution_util_test.py index 7028a1e949..d86b31a3cd 100644 --- a/tensorflow_probability/python/experimental/nn/util/convolution_util_test.py +++ b/tensorflow_probability/python/experimental/nn/util/convolution_util_test.py @@ -24,7 +24,7 @@ from tensorflow_probability.python.experimental.nn.util import convolution_util from tensorflow_probability.python.internal import prefer_static as ps from tensorflow_probability.python.internal import test_util - +from tensorflow_probability.python.internal import tf_keras # pylint: disable=bad-whitespace _CONV_TEST_CASES = ( @@ -374,7 +374,7 @@ def test_works_like_conv2d_transpose( perm=[0, 1, 3, 2]) # conv2d_transpose does not support dilations > 1; use Keras instead. if any(d > 1 for d in dilations): - keras_convt = tf.keras.layers.Conv2DTranspose( + keras_convt = tf_keras.layers.Conv2DTranspose( filters=channels_out, kernel_size=filter_shape, strides=strides, diff --git a/tensorflow_probability/python/experimental/nn/util/kernel_bias.py b/tensorflow_probability/python/experimental/nn/util/kernel_bias.py index cd331b8c58..5b24b5002d 100644 --- a/tensorflow_probability/python/experimental/nn/util/kernel_bias.py +++ b/tensorflow_probability/python/experimental/nn/util/kernel_bias.py @@ -1,3 +1,4 @@ + # Copyright 2020 The TensorFlow Probability Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,6 +29,7 @@ from tensorflow_probability.python.distributions.sample import Sample from tensorflow_probability.python.experimental.nn import initializers as nn_init_lib from tensorflow_probability.python.internal import prefer_static as ps +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.util.deferred_tensor import TransformedVariable @@ -58,9 +60,9 @@ def make_kernel_bias( kernel_shape: ... bias_shape: ... kernel_initializer: ... - Default value: `None` (i.e., `tf.keras.initializers.glorot_uniform()`). + Default value: `None` (i.e., `tf_keras.initializers.glorot_uniform()`). bias_initializer: ... - Default value: `None` (i.e., `tf.keras.initializers.zeros()`). + Default value: `None` (i.e., `tf_keras.initializers.zeros()`). kernel_batch_ndims: ... Default value: `0`. bias_batch_ndims: ... 
@@ -79,13 +81,13 @@ def make_kernel_bias( #### Recommendations: ```python - # tf.nn.relu ==> tf.keras.initializers.he_* - # tf.nn.elu ==> tf.keras.initializers.he_* - # tf.nn.selu ==> tf.keras.initializers.lecun_* - # tf.nn.tanh ==> tf.keras.initializers.glorot_* - # tf.nn.sigmoid ==> tf.keras.initializers.glorot_* - # tf.nn.softmax ==> tf.keras.initializers.glorot_* - # None ==> tf.keras.initializers.glorot_* + # tf.nn.relu ==> tf_keras.initializers.he_* + # tf.nn.elu ==> tf_keras.initializers.he_* + # tf.nn.selu ==> tf_keras.initializers.lecun_* + # tf.nn.tanh ==> tf_keras.initializers.glorot_* + # tf.nn.sigmoid ==> tf_keras.initializers.glorot_* + # tf.nn.softmax ==> tf_keras.initializers.glorot_* + # None ==> tf_keras.initializers.glorot_* # https://towardsdatascience.com/hyper-parameters-in-action-part-ii-weight-initializers-35aee1a28404 # https://stats.stackexchange.com/a/393012/1835 @@ -112,7 +114,7 @@ def make_normal(size): if kernel_initializer is None: kernel_initializer = nn_init_lib.glorot_uniform() if bias_initializer is None: - bias_initializer = tf.keras.initializers.zeros() + bias_initializer = tf_keras.initializers.zeros() return ( tf.Variable(_try_call_init_fn(kernel_initializer, kernel_shape, @@ -156,9 +158,9 @@ def make_kernel_bias_prior_spike_and_slab( kernel_shape: ... bias_shape: ... kernel_initializer: Ignored. - Default value: `None` (i.e., `tf.keras.initializers.glorot_uniform()`). + Default value: `None` (i.e., `tf_keras.initializers.glorot_uniform()`). bias_initializer: Ignored. - Default value: `None` (i.e., `tf.keras.initializers.zeros()`). + Default value: `None` (i.e., `tf_keras.initializers.zeros()`). kernel_batch_ndims: ... Default value: `0`. bias_batch_ndims: ... @@ -200,9 +202,9 @@ def make_kernel_bias_posterior_mvn_diag( kernel_shape: ... bias_shape: ... kernel_initializer: ... - Default value: `None` (i.e., `tf.keras.initializers.glorot_uniform()`). + Default value: `None` (i.e., `tf_keras.initializers.glorot_uniform()`). bias_initializer: ... - Default value: `None` (i.e., `tf.keras.initializers.zeros()`). + Default value: `None` (i.e., `tf_keras.initializers.zeros()`). kernel_batch_ndims: ... Default value: `0`. bias_batch_ndims: ... @@ -220,7 +222,7 @@ def make_kernel_bias_posterior_mvn_diag( if kernel_initializer is None: kernel_initializer = nn_init_lib.glorot_uniform() if bias_initializer is None: - bias_initializer = tf.keras.initializers.zeros() + bias_initializer = tf_keras.initializers.zeros() make_loc = lambda init_fn, shape, batch_ndims, name: tf.Variable( # pylint: disable=g-long-lambda _try_call_init_fn(init_fn, shape, dtype, batch_ndims), name=name + '_loc') diff --git a/tensorflow_probability/python/experimental/nn/util/utils.py b/tensorflow_probability/python/experimental/nn/util/utils.py index cde61fe0d9..c502298721 100644 --- a/tensorflow_probability/python/experimental/nn/util/utils.py +++ b/tensorflow_probability/python/experimental/nn/util/utils.py @@ -249,7 +249,7 @@ def make_fit_op(loss_fn, optimizer, trainable_variables, loss_fn: Python `callable` which returns the pair `loss` (`tf.Tensor`) and any other second result such that `tf.nest.map_structure(tf.convert_to_tensor, other)` will succeed. - optimizer: `tf.keras.optimizers.Optimizer`-like instance which has members + optimizer: `tf_keras.optimizers.Optimizer`-like instance which has members `gradient` and `apply_gradients`. trainable_variables: `tf.nest.flatten`-able structure of `tf.Variable` instances. 
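The recommendation table and `make_kernel_bias` defaults above pair activation families with `tf_keras` initializer families. A minimal sketch of following that pairing, not part of the patch; the shapes are illustrative and the keyword names follow the Args list shown above:

```python
import tensorflow_probability as tfp
from tensorflow_probability.python.internal import tf_keras

tfn = tfp.experimental.nn

# relu-family activation ==> he_* initializer, per the table above.
kernel, bias = tfn.util.make_kernel_bias(
    kernel_shape=[5, 5, 1, 8],   # illustrative 2D conv kernel shape
    bias_shape=[8],
    kernel_initializer=tf_keras.initializers.he_uniform(),
    # Leaving bias_initializer=None falls back to tf_keras.initializers.zeros(),
    # per the docstring default above.
)
```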
diff --git a/tensorflow_probability/python/experimental/util/BUILD b/tensorflow_probability/python/experimental/util/BUILD index ba24229343..464e167ce6 100644 --- a/tensorflow_probability/python/experimental/util/BUILD +++ b/tensorflow_probability/python/experimental/util/BUILD @@ -149,6 +149,7 @@ multi_substrate_py_test( "//tensorflow_probability/python/distributions:wishart", "//tensorflow_probability/python/internal:structural_tuple", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/math:gradient", "//tensorflow_probability/python/math:minimize", ], diff --git a/tensorflow_probability/python/experimental/util/trainable.py b/tensorflow_probability/python/experimental/util/trainable.py index 84c5d5f833..6dea8680db 100644 --- a/tensorflow_probability/python/experimental/util/trainable.py +++ b/tensorflow_probability/python/experimental/util/trainable.py @@ -185,7 +185,7 @@ def _make_trainable(cls, model = tfp.util.make_trainable(tfd.Normal) losses = tfp.math.minimize( lambda: -model.log_prob(samples), - optimizer=tf.keras.optimizers.Adam(0.1), + optimizer=tf_keras.optimizers.Adam(0.1), num_steps=200) print('Fit Normal distribution with mean {} and stddev {}'.format( model.mean(), diff --git a/tensorflow_probability/python/experimental/util/trainable_test.py b/tensorflow_probability/python/experimental/util/trainable_test.py index 68b3c1c0ef..c9e23aae6b 100644 --- a/tensorflow_probability/python/experimental/util/trainable_test.py +++ b/tensorflow_probability/python/experimental/util/trainable_test.py @@ -35,6 +35,7 @@ from tensorflow_probability.python.experimental.util import trainable from tensorflow_probability.python.internal import samplers from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.math import gradient from tensorflow_probability.python.math.minimize import minimize from tensorflow_probability.python.math.minimize import minimize_stateless @@ -198,7 +199,7 @@ def test_docstring_example_normal(self): normal.Normal, seed=test_util.test_seed(sampler_type='stateless')) losses = minimize( lambda: -model.log_prob(samples), - optimizer=tf.keras.optimizers.Adam(0.1), + optimizer=tf_keras.optimizers.Adam(0.1), num_steps=200) self.evaluate(tf1.global_variables_initializer()) self.evaluate(losses) diff --git a/tensorflow_probability/python/experimental/vi/BUILD b/tensorflow_probability/python/experimental/vi/BUILD index 70de67fb3c..b21fb43958 100644 --- a/tensorflow_probability/python/experimental/vi/BUILD +++ b/tensorflow_probability/python/experimental/vi/BUILD @@ -69,6 +69,7 @@ multi_substrate_py_library( "//tensorflow_probability/python/internal:dtype_util", "//tensorflow_probability/python/internal:prefer_static", "//tensorflow_probability/python/internal:samplers", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/internal:trainable_state_util", "//tensorflow_probability/python/util", ], @@ -141,6 +142,7 @@ multi_substrate_py_test( "//tensorflow_probability/python/internal:custom_gradient", "//tensorflow_probability/python/internal:samplers", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/math:gradient", "//tensorflow_probability/python/math:minimize", "//tensorflow_probability/python/vi:optimization", @@ -180,6 +182,7 @@ multi_substrate_py_test( 
"//tensorflow_probability/python/distributions:student_t", "//tensorflow_probability/python/experimental/vi/util", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/vi:optimization", ], ) diff --git a/tensorflow_probability/python/experimental/vi/automatic_structured_vi.py b/tensorflow_probability/python/experimental/vi/automatic_structured_vi.py index 596d0c4c59..fca05b0f71 100644 --- a/tensorflow_probability/python/experimental/vi/automatic_structured_vi.py +++ b/tensorflow_probability/python/experimental/vi/automatic_structured_vi.py @@ -497,7 +497,7 @@ def model_fn(): target_log_prob_fn, surrogate_posterior=surrogate_posterior, num_steps=100, - optimizer=tf.keras.optimizers.Adam(0.1), + optimizer=tf_keras.optimizers.Adam(0.1), sample_size=10) # After optimization, samples from the surrogate will approximate diff --git a/tensorflow_probability/python/experimental/vi/automatic_structured_vi_test.py b/tensorflow_probability/python/experimental/vi/automatic_structured_vi_test.py index 61160fc595..e9287768dd 100644 --- a/tensorflow_probability/python/experimental/vi/automatic_structured_vi_test.py +++ b/tensorflow_probability/python/experimental/vi/automatic_structured_vi_test.py @@ -48,6 +48,7 @@ from tensorflow_probability.python.internal import prefer_static as ps from tensorflow_probability.python.internal import samplers from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.math import gradient from tensorflow_probability.python.math.minimize import minimize_stateless from tensorflow_probability.python.vi import optimization @@ -239,7 +240,7 @@ def test_fitting_surrogate_posterior(self, dtype): target_log_prob, surrogate_posterior, num_steps=3, # Don't optimize to completion. - optimizer=tf.keras.optimizers.Adam(0.1), + optimizer=tf_keras.optimizers.Adam(0.1), sample_size=5) # Compute posterior statistics. 
diff --git a/tensorflow_probability/python/experimental/vi/surrogate_posteriors.py b/tensorflow_probability/python/experimental/vi/surrogate_posteriors.py index 6b189120e6..6ff693367d 100644 --- a/tensorflow_probability/python/experimental/vi/surrogate_posteriors.py +++ b/tensorflow_probability/python/experimental/vi/surrogate_posteriors.py @@ -153,7 +153,7 @@ def model_fn(): lambda rate, concentration: model.log_prob([rate, concentration, y]), surrogate_posterior=surrogate_posterior, num_steps=100, - optimizer=tf.keras.optimizers.Adam(0.1), + optimizer=tf_keras.optimizers.Adam(0.1), sample_size=10) # After optimization, samples from the surrogate will approximate @@ -350,7 +350,7 @@ def model_fn(): target_model.unnormalized_log_prob, surrogate_posterior, num_steps=100, - optimizer=tf.keras.optimizers.Adam(0.1), + optimizer=tf_keras.optimizers.Adam(0.1), sample_size=10) ``` """ @@ -532,7 +532,7 @@ def model_fn(): target_model.unnormalized_log_prob, surrogate_posterior, num_steps=100, - optimizer=tf.keras.optimizers.Adam(0.1), + optimizer=tf_keras.optimizers.Adam(0.1), sample_size=10) ``` @@ -728,7 +728,7 @@ def build_split_flow_surrogate_posterior( target_model.unnormalized_log_prob, surrogate_posterior, num_steps=100, - optimizer=tf.keras.optimizers.Adam(0.1), + optimizer=tf_keras.optimizers.Adam(0.1), sample_size=10) ``` diff --git a/tensorflow_probability/python/experimental/vi/surrogate_posteriors_test.py b/tensorflow_probability/python/experimental/vi/surrogate_posteriors_test.py index f215928b0b..bfe84b0bb2 100644 --- a/tensorflow_probability/python/experimental/vi/surrogate_posteriors_test.py +++ b/tensorflow_probability/python/experimental/vi/surrogate_posteriors_test.py @@ -44,6 +44,7 @@ from tensorflow_probability.python.internal import prefer_static as ps from tensorflow_probability.python.internal import samplers from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.vi import optimization from tensorflow.python.util import nest # pylint: disable=g-direct-tensorflow-import @@ -131,7 +132,7 @@ def _test_fitting(self, model, surrogate_posterior): lambda rate, concentration: model.log_prob((rate, concentration, y)), surrogate_posterior, num_steps=5, # Don't optimize to completion. - optimizer=tf.keras.optimizers.Adam(0.1), + optimizer=tf_keras.optimizers.Adam(0.1), sample_size=10) # Compute posterior statistics. diff --git a/tensorflow_probability/python/internal/BUILD b/tensorflow_probability/python/internal/BUILD index d11bb3e7ba..2182c40599 100644 --- a/tensorflow_probability/python/internal/BUILD +++ b/tensorflow_probability/python/internal/BUILD @@ -15,6 +15,8 @@ # Description: # Internal utilities for TensorFlow probability. 
+# [internal] load pytype.bzl (pytype_strict_test) +# [internal] load strict.bzl # Placeholder: py_library # Placeholder: py_test load( @@ -22,8 +24,6 @@ load( "multi_substrate_py_library", "multi_substrate_py_test", ) -# [internal] load pytype.bzl (pytype_strict_test) -# [internal] load strict.bzl licenses(["notice"]) @@ -653,6 +653,7 @@ multi_substrate_py_test( srcs = ["trainable_state_util_test.py"], jax_size = "medium", numpy_tags = ["notap"], + tf_tags = ["no-oss-ci"], # TODO(b/308579205) deps = [ # optax dep, # tensorflow dep, @@ -666,6 +667,7 @@ multi_substrate_py_test( "//tensorflow_probability/python/distributions:normal", "//tensorflow_probability/python/experimental/util", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/internal:trainable_state_util", "//tensorflow_probability/python/math:gradient", "//tensorflow_probability/python/math:minimize", @@ -921,3 +923,11 @@ exports_files( # "//tensorflow_probability/google:friends", # DisableOnExport ], ) + +py_library( + name = "tf_keras", + srcs = ["tf_keras.py"], + deps = [ + # tensorflow dep, + ], +) diff --git a/tensorflow_probability/python/internal/tf_keras.py b/tensorflow_probability/python/internal/tf_keras.py new file mode 100644 index 0000000000..4a7873081e --- /dev/null +++ b/tensorflow_probability/python/internal/tf_keras.py @@ -0,0 +1,36 @@ +# Copyright 2023 The TensorFlow Probability Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Utility for importing the correct version of Keras.""" + +import tensorflow.compat.v2 as tf + +# pylint: disable=g-bad-import-order +# pylint: disable=g-import-not-at-top +# pylint: disable=unused-import +# pylint: disable=wildcard-import +_keras_version_fn = getattr(tf.keras, "version", None) +if _keras_version_fn and _keras_version_fn().startswith("3."): + from tf_keras import * + from tf_keras import __internal__ + import tf_keras.api._v1.keras.__internal__.legacy.layers as tf1_layers + import tf_keras.api._v1.keras as v1 +else: + from tensorflow.compat.v2.keras import * + from tensorflow.compat.v2.keras import __internal__ + import tensorflow.compat.v1.layers as tf1_layers + import tensorflow.compat.v1.keras as v1 + +del tf +del _keras_version_fn diff --git a/tensorflow_probability/python/internal/trainable_state_util_test.py b/tensorflow_probability/python/internal/trainable_state_util_test.py index cf0ff57c8b..47bcfea474 100644 --- a/tensorflow_probability/python/internal/trainable_state_util_test.py +++ b/tensorflow_probability/python/internal/trainable_state_util_test.py @@ -33,6 +33,7 @@ from tensorflow_probability.python.internal import prefer_static as ps from tensorflow_probability.python.internal import samplers from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.internal import trainable_state_util from tensorflow_probability.python.math import gradient from tensorflow_probability.python.math.minimize import minimize @@ -347,7 +348,7 @@ def test_fitting_example(self): trainable_dist = build_trainable_normal( shape=[], seed=test_util.test_seed(sampler_type='stateless')) - optimizer = tf.keras.optimizers.Adam(1.0) + optimizer = tf_keras.optimizers.Adam(1.0) # Find the maximum likelihood distribution given observed data. 
x_observed = [3., -2., 1.7] losses = minimize( diff --git a/tensorflow_probability/python/layers/BUILD b/tensorflow_probability/python/layers/BUILD index 8851579e91..ad7677b477 100644 --- a/tensorflow_probability/python/layers/BUILD +++ b/tensorflow_probability/python/layers/BUILD @@ -54,6 +54,7 @@ py_library( "//tensorflow_probability/python/distributions:kullback_leibler", "//tensorflow_probability/python/distributions:normal", "//tensorflow_probability/python/internal:docstring_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/random", "//tensorflow_probability/python/util:seed_stream", ], @@ -72,6 +73,7 @@ py_test( "//tensorflow_probability/python/distributions:independent", "//tensorflow_probability/python/distributions:normal", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/random:random_ops", "//tensorflow_probability/python/util:seed_stream", ], @@ -89,6 +91,7 @@ py_library( "//tensorflow_probability/python/distributions:kullback_leibler", "//tensorflow_probability/python/distributions:normal", "//tensorflow_probability/python/internal:docstring_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/random", "//tensorflow_probability/python/util", ], @@ -106,6 +109,7 @@ py_test( "//tensorflow_probability/python/distributions:independent", "//tensorflow_probability/python/distributions:normal", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/util:seed_stream", ], ) @@ -118,6 +122,7 @@ py_library( deps = [ # tensorflow dep, "//tensorflow_probability/python/distributions:kullback_leibler", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/util:seed_stream", ], ) @@ -136,6 +141,7 @@ py_test( "//tensorflow_probability/python/distributions:independent", "//tensorflow_probability/python/distributions:normal", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -160,6 +166,7 @@ py_library( "//tensorflow_probability/python/distributions:poisson", "//tensorflow_probability/python/distributions:transformed_distribution", "//tensorflow_probability/python/distributions:variational_gaussian_process", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/layers/internal", ], ) @@ -191,6 +198,7 @@ py_test( "//tensorflow_probability/python/distributions:poisson", "//tensorflow_probability/python/distributions:uniform", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/math:generic", "//tensorflow_probability/python/math/psd_kernels:exponentiated_quadratic", "//tensorflow_probability/python/util:deferred_tensor", @@ -204,6 +212,7 @@ py_library( ], deps = [ # tensorflow dep, + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -213,8 +222,8 @@ py_test( srcs = ["initializers_test.py"], deps = [ ":initializers", - # tensorflow dep, "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -225,6 +234,7 @@ py_library( # tensorflow dep, "//tensorflow_probability/python/bijectors:masked_autoregressive", "//tensorflow_probability/python/distributions:transformed_distribution", + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -240,6 
+250,7 @@ py_test( "//tensorflow_probability/python/bijectors:masked_autoregressive", "//tensorflow_probability/python/distributions:mvn_diag", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -254,6 +265,7 @@ py_library( "//tensorflow_probability/python/distributions:deterministic", "//tensorflow_probability/python/distributions:independent", "//tensorflow_probability/python/distributions:normal", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/util", ], ) @@ -265,6 +277,7 @@ py_library( ], deps = [ # tensorflow dep, + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -279,6 +292,7 @@ py_test( "//tensorflow_probability/python/distributions:independent", "//tensorflow_probability/python/distributions:normal", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -289,6 +303,7 @@ py_library( ], deps = [ # tensorflow dep, + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -301,6 +316,7 @@ py_test( # numpy dep, # tensorflow dep, "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/layers:weight_norm", ], ) diff --git a/tensorflow_probability/python/layers/conv_variational.py b/tensorflow_probability/python/layers/conv_variational.py index a4af7a9be5..2003f96118 100644 --- a/tensorflow_probability/python/layers/conv_variational.py +++ b/tensorflow_probability/python/layers/conv_variational.py @@ -21,6 +21,7 @@ from tensorflow_probability.python.distributions import kullback_leibler as kl_lib from tensorflow_probability.python.distributions import normal as normal_lib from tensorflow_probability.python.internal import docstring_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.layers import util as tfp_layers_util from tensorflow_probability.python.util.seed_stream import SeedStream from tensorflow.python.ops import nn_ops # pylint: disable=g-direct-tensorflow-import @@ -73,7 +74,7 @@ sample is a `Tensor`.""" -class _ConvVariational(tf.keras.layers.Layer): +class _ConvVariational(tf_keras.layers.Layer): """Abstract nD convolution layer (private, used as implementation base). This layer creates a convolution kernel that is convolved @@ -155,8 +156,8 @@ def __init__( self.data_format = normalize_data_format(data_format) self.dilation_rate = normalize_tuple( dilation_rate, rank, 'dilation_rate') - self.activation = tf.keras.activations.get(activation) - self.input_spec = tf.keras.layers.InputSpec(ndim=self.rank + 2) + self.activation = tf_keras.activations.get(activation) + self.input_spec = tf_keras.layers.InputSpec(ndim=self.rank + 2) self.kernel_posterior_fn = kernel_posterior_fn self.kernel_posterior_tensor_fn = kernel_posterior_tensor_fn self.kernel_prior_fn = kernel_prior_fn @@ -179,7 +180,7 @@ def build(self, input_shape): kernel_shape = self.kernel_size + (input_dim, self.filters) # If self.dtype is None, build weights using the default dtype. - dtype = tf.as_dtype(self.dtype or tf.keras.backend.floatx()) + dtype = tf.as_dtype(self.dtype or tf_keras.backend.floatx()) # Must have a posterior kernel. 
self.kernel_posterior = self.kernel_posterior_fn( @@ -207,7 +208,7 @@ def build(self, input_shape): dtype, (self.filters,), 'bias_prior', self.trainable, self.add_variable) - self.input_spec = tf.keras.layers.InputSpec( + self.input_spec = tf_keras.layers.InputSpec( ndim=self.rank + 2, axes={channel_axis: input_dim}) self._convolution_op = nn_ops.Convolution( input_shape, @@ -294,10 +295,10 @@ def get_config(self): 'padding': self.padding, 'data_format': self.data_format, 'dilation_rate': self.dilation_rate, - 'activation': (tf.keras.activations.serialize(self.activation) + 'activation': (tf_keras.activations.serialize(self.activation) if self.activation else None), 'activity_regularizer': - tf.keras.initializers.serialize(self.activity_regularizer), + tf_keras.initializers.serialize(self.activity_regularizer), } function_keys = [ 'kernel_posterior_fn', @@ -490,7 +491,7 @@ def __init__( padding=padding, data_format=data_format, dilation_rate=dilation_rate, - activation=tf.keras.activations.get(activation), + activation=tf_keras.activations.get(activation), activity_regularizer=activity_regularizer, kernel_posterior_fn=kernel_posterior_fn, kernel_posterior_tensor_fn=kernel_posterior_tensor_fn, @@ -554,11 +555,11 @@ class Conv1DReparameterization(_ConvReparameterization): import tensorflow as tf import tensorflow_probability as tfp - model = tf.keras.Sequential([ - tf.keras.layers.Reshape([128, 1]), + model = tf_keras.Sequential([ + tf_keras.layers.Reshape([128, 1]), tfp.layers.Convolution1DReparameterization( 64, kernel_size=5, padding='SAME', activation=tf.nn.relu), - tf.keras.layers.Flatten(), + tf_keras.layers.Flatten(), tfp.layers.DenseReparameterization(10), ]) @@ -638,7 +639,7 @@ def __init__( padding=padding, data_format=data_format, dilation_rate=dilation_rate, - activation=tf.keras.activations.get(activation), + activation=tf_keras.activations.get(activation), activity_regularizer=activity_regularizer, kernel_posterior_fn=kernel_posterior_fn, kernel_posterior_tensor_fn=kernel_posterior_tensor_fn, @@ -694,14 +695,14 @@ class Conv2DReparameterization(_ConvReparameterization): import tensorflow as tf import tensorflow_probability as tfp - model = tf.keras.Sequential([ - tf.keras.layers.Reshape([32, 32, 3]), + model = tf_keras.Sequential([ + tf_keras.layers.Reshape([32, 32, 3]), tfp.layers.Convolution2DReparameterization( 64, kernel_size=5, padding='SAME', activation=tf.nn.relu), - tf.keras.layers.MaxPooling2D(pool_size=[2, 2], + tf_keras.layers.MaxPooling2D(pool_size=[2, 2], strides=[2, 2], padding='SAME'), - tf.keras.layers.Flatten(), + tf_keras.layers.Flatten(), tfp.layers.DenseReparameterization(10), ]) @@ -787,7 +788,7 @@ def __init__( padding=padding, data_format=data_format, dilation_rate=dilation_rate, - activation=tf.keras.activations.get(activation), + activation=tf_keras.activations.get(activation), activity_regularizer=activity_regularizer, kernel_posterior_fn=kernel_posterior_fn, kernel_posterior_tensor_fn=kernel_posterior_tensor_fn, @@ -839,14 +840,14 @@ class Conv3DReparameterization(_ConvReparameterization): import tensorflow as tf import tensorflow_probability as tfp - model = tf.keras.Sequential([ - tf.keras.layers.Reshape([256, 32, 32, 3]), + model = tf_keras.Sequential([ + tf_keras.layers.Reshape([256, 32, 32, 3]), tfp.layers.Convolution3DReparameterization( 64, kernel_size=5, padding='SAME', activation=tf.nn.relu), - tf.keras.layers.MaxPooling3D(pool_size=[2, 2, 2], + tf_keras.layers.MaxPooling3D(pool_size=[2, 2, 2], strides=[2, 2, 2], padding='SAME'), - 
tf.keras.layers.Flatten(), + tf_keras.layers.Flatten(), tfp.layers.DenseReparameterization(10), ]) @@ -933,7 +934,7 @@ def __init__( padding=padding, data_format=data_format, dilation_rate=dilation_rate, - activation=tf.keras.activations.get(activation), + activation=tf_keras.activations.get(activation), activity_regularizer=activity_regularizer, kernel_posterior_fn=kernel_posterior_fn, kernel_posterior_tensor_fn=kernel_posterior_tensor_fn, @@ -1038,7 +1039,7 @@ def __init__( padding=padding, data_format=data_format, dilation_rate=dilation_rate, - activation=tf.keras.activations.get(activation), + activation=tf_keras.activations.get(activation), activity_regularizer=activity_regularizer, kernel_posterior_fn=kernel_posterior_fn, kernel_posterior_tensor_fn=kernel_posterior_tensor_fn, @@ -1165,11 +1166,11 @@ class Conv1DFlipout(_ConvFlipout): import tensorflow as tf import tensorflow_probability as tfp - model = tf.keras.Sequential([ - tf.keras.layers.Reshape([128, 1]), + model = tf_keras.Sequential([ + tf_keras.layers.Reshape([128, 1]), tfp.layers.Convolution1DFlipout( 64, kernel_size=5, padding='SAME', activation=tf.nn.relu), - tf.keras.layers.Flatten(), + tf_keras.layers.Flatten(), tfp.layers.DenseFlipout(10), ]) @@ -1253,7 +1254,7 @@ def __init__( padding=padding, data_format=data_format, dilation_rate=dilation_rate, - activation=tf.keras.activations.get(activation), + activation=tf_keras.activations.get(activation), activity_regularizer=activity_regularizer, kernel_posterior_fn=kernel_posterior_fn, kernel_posterior_tensor_fn=kernel_posterior_tensor_fn, @@ -1308,14 +1309,14 @@ class Conv2DFlipout(_ConvFlipout): import tensorflow as tf import tensorflow_probability as tfp - model = tf.keras.Sequential([ - tf.keras.layers.Reshape([32, 32, 3]), + model = tf_keras.Sequential([ + tf_keras.layers.Reshape([32, 32, 3]), tfp.layers.Convolution2DFlipout( 64, kernel_size=5, padding='SAME', activation=tf.nn.relu), - tf.keras.layers.MaxPooling2D(pool_size=[2, 2], + tf_keras.layers.MaxPooling2D(pool_size=[2, 2], strides=[2, 2], padding='SAME'), - tf.keras.layers.Flatten(), + tf_keras.layers.Flatten(), tfp.layers.DenseFlipout(10), ]) @@ -1405,7 +1406,7 @@ def __init__( padding=padding, data_format=data_format, dilation_rate=dilation_rate, - activation=tf.keras.activations.get(activation), + activation=tf_keras.activations.get(activation), activity_regularizer=activity_regularizer, kernel_posterior_fn=kernel_posterior_fn, kernel_posterior_tensor_fn=kernel_posterior_tensor_fn, @@ -1460,14 +1461,14 @@ class Conv3DFlipout(_ConvFlipout): import tensorflow as tf import tensorflow_probability as tfp - model = tf.keras.Sequential([ - tf.keras.layers.Reshape([256, 32, 32, 3]), + model = tf_keras.Sequential([ + tf_keras.layers.Reshape([256, 32, 32, 3]), tfp.layers.Convolution3DFlipout( 64, kernel_size=5, padding='SAME', activation=tf.nn.relu), - tf.keras.layers.MaxPooling3D(pool_size=[2, 2, 2], + tf_keras.layers.MaxPooling3D(pool_size=[2, 2, 2], strides=[2, 2, 2], padding='SAME'), - tf.keras.layers.Flatten(), + tf_keras.layers.Flatten(), tfp.layers.DenseFlipout(10), ]) @@ -1558,7 +1559,7 @@ def __init__( padding=padding, data_format=data_format, dilation_rate=dilation_rate, - activation=tf.keras.activations.get(activation), + activation=tf_keras.activations.get(activation), activity_regularizer=activity_regularizer, kernel_posterior_fn=kernel_posterior_fn, kernel_posterior_tensor_fn=kernel_posterior_tensor_fn, diff --git a/tensorflow_probability/python/layers/conv_variational_test.py 
b/tensorflow_probability/python/layers/conv_variational_test.py index a942c808b7..3822257aa2 100644 --- a/tensorflow_probability/python/layers/conv_variational_test.py +++ b/tensorflow_probability/python/layers/conv_variational_test.py @@ -26,6 +26,7 @@ from tensorflow_probability.python.distributions import independent from tensorflow_probability.python.distributions import normal from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.layers import conv_variational from tensorflow_probability.python.layers import util from tensorflow_probability.python.random import random_ops @@ -216,7 +217,7 @@ def kernel_posterior_fn(dtype, shape, name, trainable, add_variable_fn): if self.data_format == 'channels_first': input_shape = channels_last_to_first(input_shape) - with tf.keras.utils.CustomObjectScope({layer_class.__name__: layer_class}): + with tf_keras.utils.CustomObjectScope({layer_class.__name__: layer_class}): with self.cached_session(): # TODO(scottzhu): reenable the test when the repo switch change reach # the TF PIP package. @@ -606,7 +607,7 @@ def _testLayerInSequential(self, layer_class): # pylint: disable=invalid-name inputs = self.maybe_transpose_tensor(inputs) outputs = self.maybe_transpose_tensor(outputs) - net = tf.keras.Sequential([ + net = tf_keras.Sequential([ layer_class(filters=2, kernel_size=3, data_format=self.data_format, input_shape=inputs.shape[1:]), layer_class(filters=2, kernel_size=1, data_format=self.data_format)]) @@ -717,7 +718,7 @@ def testSequentialConvolution3DFlipout(self): self._testLayerInSequential(conv_variational.Convolution3DFlipout) def testGradients(self): - net = tf.keras.Sequential([ + net = tf_keras.Sequential([ conv_variational.Convolution1DFlipout( 1, 1, data_format=self.data_format), conv_variational.Convolution1DReparameterization( diff --git a/tensorflow_probability/python/layers/dense_variational.py b/tensorflow_probability/python/layers/dense_variational.py index 2f842016b9..c58ce88061 100644 --- a/tensorflow_probability/python/layers/dense_variational.py +++ b/tensorflow_probability/python/layers/dense_variational.py @@ -21,6 +21,7 @@ from tensorflow_probability.python.distributions import kullback_leibler as kl_lib from tensorflow_probability.python.distributions import normal as normal_lib from tensorflow_probability.python.internal import docstring_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.layers import util as tfp_layers_util from tensorflow_probability.python.util import SeedStream @@ -70,7 +71,7 @@ sample is a `Tensor`.""" -class _DenseVariational(tf.keras.layers.Layer): +class _DenseVariational(tf_keras.layers.Layer): """Abstract densely-connected class (private, used as implementation base). 
This layer implements the Bayesian variational inference analogue to @@ -115,8 +116,8 @@ def __init__( activity_regularizer=activity_regularizer, **kwargs) self.units = units - self.activation = tf.keras.activations.get(activation) - self.input_spec = tf.keras.layers.InputSpec(min_ndim=2) + self.activation = tf_keras.activations.get(activation) + self.input_spec = tf_keras.layers.InputSpec(min_ndim=2) self.kernel_posterior_fn = kernel_posterior_fn self.kernel_posterior_tensor_fn = kernel_posterior_tensor_fn self.kernel_prior_fn = kernel_prior_fn @@ -132,10 +133,10 @@ def build(self, input_shape): if in_size is None: raise ValueError('The last dimension of the inputs to `Dense` ' 'should be defined. Found `None`.') - self._input_spec = tf.keras.layers.InputSpec(min_ndim=2, axes={-1: in_size}) + self._input_spec = tf_keras.layers.InputSpec(min_ndim=2, axes={-1: in_size}) # If self.dtype is None, build weights using the default dtype. - dtype = tf.as_dtype(self.dtype or tf.keras.backend.floatx()) + dtype = tf.as_dtype(self.dtype or tf_keras.backend.floatx()) # Must have a posterior kernel. self.kernel_posterior = self.kernel_posterior_fn( @@ -221,10 +222,10 @@ def get_config(self): """ config = { 'units': self.units, - 'activation': (tf.keras.activations.serialize(self.activation) + 'activation': (tf_keras.activations.serialize(self.activation) if self.activation else None), 'activity_regularizer': - tf.keras.initializers.serialize(self.activity_regularizer), + tf_keras.initializers.serialize(self.activity_regularizer), } function_keys = [ 'kernel_posterior_fn', @@ -346,7 +347,7 @@ class DenseReparameterization(_DenseVariational): import tensorflow as tf import tensorflow_probability as tfp - model = tf.keras.Sequential([ + model = tf_keras.Sequential([ tfp.layers.DenseReparameterization(512, activation=tf.nn.relu), tfp.layers.DenseReparameterization(10), ]) @@ -465,7 +466,7 @@ class DenseLocalReparameterization(_DenseVariational): ```python import tensorflow_probability as tfp - model = tf.keras.Sequential([ + model = tf_keras.Sequential([ tfp.layers.DenseLocalReparameterization(512, activation=tf.nn.relu), tfp.layers.DenseLocalReparameterization(10), ]) @@ -592,7 +593,7 @@ class DenseFlipout(_DenseVariational): ```python import tensorflow_probability as tfp - model = tf.keras.Sequential([ + model = tf_keras.Sequential([ tfp.layers.DenseFlipout(512, activation=tf.nn.relu), tfp.layers.DenseFlipout(10), ]) diff --git a/tensorflow_probability/python/layers/dense_variational_test.py b/tensorflow_probability/python/layers/dense_variational_test.py index 33b53423bf..7f06b1ade5 100644 --- a/tensorflow_probability/python/layers/dense_variational_test.py +++ b/tensorflow_probability/python/layers/dense_variational_test.py @@ -25,6 +25,7 @@ from tensorflow_probability.python.distributions import independent from tensorflow_probability.python.distributions import normal from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.layers import dense_variational from tensorflow_probability.python.layers import util from tensorflow_probability.python.random import random_ops @@ -124,7 +125,7 @@ def kernel_posterior_fn(dtype, shape, name, trainable, add_variable_fn): 'kernel_prior_fn': None, 'bias_posterior_fn': None, 'bias_prior_fn': None} - with tf.keras.utils.CustomObjectScope({layer_class.__name__: layer_class}): + with tf_keras.utils.CustomObjectScope({layer_class.__name__: layer_class}): # TODO(scottzhu): 
reenable the test when the repo switch change reach # the TF PIP package. self.skipTest('Skip the test until the TF and Keras has a new PIP.') @@ -500,7 +501,7 @@ def testDenseLayersInSequential(self): y = np.random.uniform( -1., 1., size=(data_size, out_size)).astype(np.float32) - model = tf.keras.Sequential([ + model = tf_keras.Sequential([ dense_variational.DenseReparameterization(6, activation=tf.nn.relu), dense_variational.DenseFlipout(6, activation=tf.nn.relu), dense_variational.DenseLocalReparameterization(out_size) @@ -514,7 +515,7 @@ def testDenseLayersInSequential(self): self.assertAllEqual(batch_output.shape, [batch_size, out_size]) def testGradients(self): - net = tf.keras.Sequential([ + net = tf_keras.Sequential([ dense_variational.DenseReparameterization(1), dense_variational.DenseFlipout(1), dense_variational.DenseLocalReparameterization(1) diff --git a/tensorflow_probability/python/layers/dense_variational_v2.py b/tensorflow_probability/python/layers/dense_variational_v2.py index 9f8dd3ebcd..3f6bf70566 100644 --- a/tensorflow_probability/python/layers/dense_variational_v2.py +++ b/tensorflow_probability/python/layers/dense_variational_v2.py @@ -18,13 +18,15 @@ from tensorflow_probability.python.distributions import kullback_leibler +from tensorflow_probability.python.internal import tf_keras -class DenseVariational(tf.keras.layers.Layer): + +class DenseVariational(tf_keras.layers.Layer): """Dense layer with random `kernel` and `bias`. This layer uses variational inference to fit a "surrogate" posterior to the distribution over both the `kernel` matrix and the `bias` terms which are - otherwise used in a manner similar to `tf.keras.layers.Dense`. + otherwise used in a manner similar to `tf_keras.layers.Dense`. This layer fits the "weights posterior" according to the following generative process: @@ -67,12 +69,12 @@ def __init__(self, use_bias: Boolean, whether the layer uses a bias vector. activity_regularizer: Regularizer function applied to the output of the layer (its "activation").. - **kwargs: Extra arguments forwarded to `tf.keras.layers.Layer`. + **kwargs: Extra arguments forwarded to `tf_keras.layers.Layer`. """ if 'input_shape' not in kwargs and 'input_dim' in kwargs: kwargs['input_shape'] = (kwargs.pop('input_dim'),) super(DenseVariational, self).__init__( - activity_regularizer=tf.keras.regularizers.get(activity_regularizer), + activity_regularizer=tf_keras.regularizers.get(activity_regularizer), **kwargs) self.units = int(units) @@ -81,13 +83,13 @@ def __init__(self, self._kl_divergence_fn = _make_kl_divergence_penalty( kl_use_exact, weight=kl_weight) - self.activation = tf.keras.activations.get(activation) + self.activation = tf_keras.activations.get(activation) self.use_bias = use_bias self.supports_masking = False - self.input_spec = tf.keras.layers.InputSpec(min_ndim=2) + self.input_spec = tf_keras.layers.InputSpec(min_ndim=2) def build(self, input_shape): - dtype = tf.as_dtype(self.dtype or tf.keras.backend.floatx()) + dtype = tf.as_dtype(self.dtype or tf_keras.backend.floatx()) if not (dtype.is_floating or dtype.is_complex): raise TypeError('Unable to build `Dense` layer with non-floating point ' 'dtype %s' % (dtype,)) @@ -96,7 +98,7 @@ def build(self, input_shape): if last_dim is None: raise ValueError('The last dimension of the inputs to `DenseVariational` ' 'should be defined. 
Found `None`.') - self.input_spec = tf.keras.layers.InputSpec( + self.input_spec = tf_keras.layers.InputSpec( min_ndim=2, axes={-1: last_dim}) with tf.name_scope('posterior'): @@ -113,7 +115,7 @@ def build(self, input_shape): self.built = True def call(self, inputs): - dtype = tf.as_dtype(self.dtype or tf.keras.backend.floatx()) + dtype = tf.as_dtype(self.dtype or tf_keras.backend.floatx()) inputs = tf.cast(inputs, dtype, name='inputs') q = self._posterior(inputs) diff --git a/tensorflow_probability/python/layers/dense_variational_v2_test.py b/tensorflow_probability/python/layers/dense_variational_v2_test.py index aca410fc45..51c61d9fae 100644 --- a/tensorflow_probability/python/layers/dense_variational_v2_test.py +++ b/tensorflow_probability/python/layers/dense_variational_v2_test.py @@ -22,6 +22,7 @@ from tensorflow_probability.python.distributions import independent from tensorflow_probability.python.distributions import normal from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.layers import dense_variational_v2 from tensorflow_probability.python.layers import distribution_layer from tensorflow_probability.python.layers import variable_input @@ -51,7 +52,7 @@ def s(x): def posterior_mean_field(kernel_size, bias_size=0, dtype=None): n = kernel_size + bias_size c = np.log(np.expm1(1.)) - return tf.keras.Sequential([ + return tf_keras.Sequential([ variable_input.VariableLayer(2 * n, dtype=dtype), distribution_layer.DistributionLambda(lambda t: independent.Independent( # pylint: disable=g-long-lambda normal.Normal(loc=t[..., :n], @@ -62,7 +63,7 @@ def posterior_mean_field(kernel_size, bias_size=0, dtype=None): def prior_trainable(kernel_size, bias_size=0, dtype=None): n = kernel_size + bias_size - return tf.keras.Sequential([ + return tf_keras.Sequential([ variable_input.VariableLayer(n, dtype=dtype), distribution_layer.DistributionLambda( lambda t: independent.Independent(normal.Normal(loc=t, scale=1), # pylint: disable=g-long-lambda @@ -83,16 +84,16 @@ def test_end_to_end(self): layer = dense_variational_v2.DenseVariational(1, posterior_mean_field, prior_trainable) - model = tf.keras.Sequential([ + model = tf_keras.Sequential([ layer, distribution_layer.DistributionLambda( lambda t: normal.Normal(loc=t, scale=1)) ]) if tf.__internal__.tf2.enabled() and tf.executing_eagerly(): - optimizer = tf.keras.optimizers.Adam(learning_rate=0.05) + optimizer = tf_keras.optimizers.Adam(learning_rate=0.05) else: - optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.05) + optimizer = tf_keras.optimizers.legacy.Adam(learning_rate=0.05) # Do inference. model.compile(optimizer=optimizer, loss=negloglik) diff --git a/tensorflow_probability/python/layers/distribution_layer.py b/tensorflow_probability/python/layers/distribution_layer.py index c51fb533a4..638d15e61d 100644 --- a/tensorflow_probability/python/layers/distribution_layer.py +++ b/tensorflow_probability/python/layers/distribution_layer.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================ -"""Layers for combining `tfp.distributions` and `tf.keras`.""" +"""Layers for combining `tfp.distributions` and `tf_keras`.""" import codecs import collections @@ -43,6 +43,7 @@ from tensorflow_probability.python.distributions import transformed_distribution as transformed_distribution_lib from tensorflow_probability.python.distributions import variational_gaussian_process as variational_gaussian_process_lib from tensorflow_probability.python.internal import distribution_util as dist_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.layers.internal import distribution_tensor_coercible as dtc from tensorflow_probability.python.layers.internal import tensor_tuple @@ -65,7 +66,7 @@ ] -tf.keras.__internal__.utils.register_symbolic_tensor_type(dtc._TensorCoercible) # pylint: disable=protected-access +tf_keras.__internal__.utils.register_symbolic_tensor_type(dtc._TensorCoercible) # pylint: disable=protected-access def _event_size(event_shape, name=None): @@ -92,7 +93,7 @@ def _event_size(event_shape, name=None): return tf.reduce_prod(event_shape) -class DistributionLambda(tf.keras.layers.Lambda): +class DistributionLambda(tf_keras.layers.Lambda): """Keras layer enabling plumbing TFP distributions through Keras models. A `DistributionLambda` is minimially characterized by a function that returns @@ -108,8 +109,8 @@ class DistributionLambda(tf.keras.layers.Lambda): #### Examples ```python - tfk = tf.keras - tfkl = tf.keras.layers + tfk = tf_keras + tfkl = tf_keras.layers tfd = tfp.distributions tfpl = tfp.layers @@ -139,7 +140,7 @@ def __init__(self, instance and returns a `tf.Tensor`-like object. For examples, see `class` docstring. Default value: `tfd.Distribution.sample`. - **kwargs: Additional keyword arguments passed to `tf.keras.Layer`. + **kwargs: Additional keyword arguments passed to `tf_keras.Layer`. """ # TODO(b/120440642): See if something like this code block is needed. # if output_shape is None: @@ -298,8 +299,8 @@ class MultivariateNormalTriL(DistributionLambda): #### Example ```python - tfk = tf.keras - tfkl = tf.keras.layers + tfk = tf_keras + tfkl = tf_keras.layers tfd = tfp.distributions tfpl = tfp.layers @@ -355,7 +356,7 @@ def __init__(self, performance. When `False` invalid inputs may silently render incorrect outputs. Default value: `False`. - **kwargs: Additional keyword arguments passed to `tf.keras.Layer`. + **kwargs: Additional keyword arguments passed to `tf_keras.Layer`. """ super(MultivariateNormalTriL, self).__init__( lambda t: MultivariateNormalTriL.new(t, event_size, validate_args), @@ -396,8 +397,8 @@ class OneHotCategorical(DistributionLambda): #### Example ```python - tfk = tf.keras - tfkl = tf.keras.layers + tfk = tf_keras + tfkl = tf_keras.layers tfd = tfp.distributions tfpl = tfp.layers @@ -459,7 +460,7 @@ def __init__(self, performance. When `False` invalid inputs may silently render incorrect outputs. Default value: `False`. - **kwargs: Additional keyword arguments passed to `tf.keras.Layer`. + **kwargs: Additional keyword arguments passed to `tf_keras.Layer`. 
""" super(OneHotCategorical, self).__init__( lambda t: OneHotCategorical.new( # pylint: disable=g-long-lambda @@ -500,8 +501,8 @@ class CategoricalMixtureOfOneHotCategorical(DistributionLambda): #### Example ```python - tfk = tf.keras - tfkl = tf.keras.layers + tfk = tf_keras + tfkl = tf_keras.layers tfd = tfp.distributions tfpl = tfp.layers @@ -564,7 +565,7 @@ def __init__(self, performance. When `False` invalid inputs may silently render incorrect outputs. Default value: `False`. - **kwargs: Additional keyword arguments passed to `tf.keras.Layer`. + **kwargs: Additional keyword arguments passed to `tf_keras.Layer`. """ super(CategoricalMixtureOfOneHotCategorical, self).__init__( # pylint: disable=g-long-lambda @@ -622,8 +623,8 @@ class IndependentBernoulli(DistributionLambda): #### Example ```python - tfk = tf.keras - tfkl = tf.keras.layers + tfk = tf_keras + tfkl = tf_keras.layers tfd = tfp.distributions tfpl = tfp.layers @@ -685,7 +686,7 @@ def __init__(self, performance. When `False` invalid inputs may silently render incorrect outputs. Default value: `False`. - **kwargs: Additional keyword arguments passed to `tf.keras.Layer`. + **kwargs: Additional keyword arguments passed to `tf_keras.Layer`. """ convert_to_tensor_fn = _get_convert_to_tensor_fn(convert_to_tensor_fn) @@ -788,8 +789,8 @@ class IndependentLogistic(DistributionLambda): ```python tfd = tfp.distributions tfpl = tfp.layers - tfk = tf.keras - tfkl = tf.keras.layers + tfk = tf_keras + tfkl = tf_keras.layers # Create a stochastic encoder -- e.g., for use in a variational auto-encoder. input_shape = [28, 28, 1] @@ -823,7 +824,7 @@ def __init__(self, performance. When `False` invalid inputs may silently render incorrect outputs. Default value: `False`. - **kwargs: Additional keyword arguments passed to `tf.keras.Layer`. + **kwargs: Additional keyword arguments passed to `tf_keras.Layer`. """ convert_to_tensor_fn = _get_convert_to_tensor_fn(convert_to_tensor_fn) @@ -903,8 +904,8 @@ class IndependentNormal(DistributionLambda): ```python tfd = tfp.distributions tfpl = tfp.layers - tfk = tf.keras - tfkl = tf.keras.layers + tfk = tf_keras + tfkl = tf_keras.layers # Create a stochastic encoder -- e.g., for use in a variational auto-encoder. input_shape = [28, 28, 1] @@ -938,7 +939,7 @@ def __init__(self, performance. When `False` invalid inputs may silently render incorrect outputs. Default value: `False`. - **kwargs: Additional keyword arguments passed to `tf.keras.Layer`. + **kwargs: Additional keyword arguments passed to `tf_keras.Layer`. """ convert_to_tensor_fn = _get_convert_to_tensor_fn(convert_to_tensor_fn) @@ -1018,8 +1019,8 @@ class IndependentPoisson(DistributionLambda): ```python tfd = tfp.distributions tfpl = tfp.layers - tfk = tf.keras - tfkl = tf.keras.layers + tfk = tf_keras + tfkl = tf_keras.layers # Create example data. n = 2000 @@ -1069,7 +1070,7 @@ def __init__(self, performance. When `False` invalid inputs may silently render incorrect outputs. Default value: `False`. - **kwargs: Additional keyword arguments passed to `tf.keras.Layer`. + **kwargs: Additional keyword arguments passed to `tf_keras.Layer`. """ convert_to_tensor_fn = _get_convert_to_tensor_fn(convert_to_tensor_fn) @@ -1141,7 +1142,7 @@ def get_config(self): # We mix-in `tf.Module` since Keras `Regularizer` base class tracks neither # tf.Variables nor tf.Modules. 
-class KLDivergenceRegularizer(tf.keras.regularizers.Regularizer, tf.Module): +class KLDivergenceRegularizer(tf_keras.regularizers.Regularizer, tf.Module): """Regularizer that adds a KL divergence penalty to the model loss. When using Monte Carlo approximation (e.g., `use_exact=False`), it is presumed @@ -1154,8 +1155,8 @@ class KLDivergenceRegularizer(tf.keras.regularizers.Regularizer, tf.Module): ```python tfd = tfp.distributions tfpl = tfp.layers - tfk = tf.keras - tfkl = tf.keras.layers + tfk = tf_keras + tfkl = tf_keras.layers # Create a variational encoder and add a KL Divergence penalty to the # loss that encourages marginal coherence with a unit-MVN (the "prior"). @@ -1251,7 +1252,7 @@ def __call__(self, distribution_a): return self._kl_divergence_fn(distribution_a) -class KLDivergenceAddLoss(tf.keras.layers.Layer): +class KLDivergenceAddLoss(tf_keras.layers.Layer): """Pass-through layer that adds a KL divergence penalty to the model loss. When using Monte Carlo approximation (e.g., `use_exact=False`), it is presumed @@ -1264,8 +1265,8 @@ class KLDivergenceAddLoss(tf.keras.layers.Layer): ```python tfd = tfp.distributions tfpl = tfp.layers - tfk = tf.keras - tfkl = tf.keras.layers + tfk = tf_keras + tfkl = tf_keras.layers # Create a variational encoder and add a KL Divergence penalty to the # loss that encourages marginal coherence with a unit-MVN (the "prior"). @@ -1315,7 +1316,7 @@ def __init__(self, weight: Multiplier applied to the calculated KL divergence for each Keras batch member. Default value: `None` (i.e., do not weight each batch member). - **kwargs: Additional keyword arguments passed to `tf.keras.Layer`. + **kwargs: Additional keyword arguments passed to `tf_keras.Layer`. """ super(KLDivergenceAddLoss, self).__init__(**kwargs) self._regularizer = KLDivergenceRegularizer( @@ -1358,7 +1359,7 @@ def kl_divergence_fn(distribution_a, distribution_b): def _fn(distribution_a): """Closure that computes KLDiv as a function of `a` as in `KL[a, b]`.""" with tf.name_scope('kldivergence_loss'): - if isinstance(distribution_b, tf.keras.Model): + if isinstance(distribution_b, tf_keras.Model): distribution_b_ = distribution_b(0.) # Pass a dummy arg. elif callable(distribution_b): # TODO(b/119756336): Due to eager/graph Jacobian graph caching bug we @@ -1391,8 +1392,8 @@ class MixtureSameFamily(DistributionLambda): ```python tfd = tfp.distributions tfpl = tfp.layers - tfk = tf.keras - tfkl = tf.keras.layers + tfk = tf_keras + tfkl = tf_keras.layers # Load data -- graph of a [cardioid](https://en.wikipedia.org/wiki/Cardioid). n = 2000 @@ -1449,7 +1450,7 @@ def __init__(self, performance. When `False` invalid inputs may silently render incorrect outputs. Default value: `False`. - **kwargs: Additional keyword arguments passed to `tf.keras.Layer`. + **kwargs: Additional keyword arguments passed to `tf_keras.Layer`. """ super(MixtureSameFamily, self).__init__( lambda t: MixtureSameFamily.new( # pylint: disable=g-long-lambda @@ -1518,8 +1519,8 @@ class MixtureNormal(DistributionLambda): ```python tfd = tfp.distributions tfpl = tfp.layers - tfk = tf.keras - tfkl = tf.keras.layers + tfk = tf_keras + tfkl = tf_keras.layers # Load data -- graph of a [cardioid](https://en.wikipedia.org/wiki/Cardioid). n = 2000 @@ -1571,7 +1572,7 @@ def __init__(self, performance. When `False` invalid inputs may silently render incorrect outputs. Default value: `False`. - **kwargs: Additional keyword arguments passed to `tf.keras.Layer`. + **kwargs: Additional keyword arguments passed to `tf_keras.Layer`. 
""" convert_to_tensor_fn = _get_convert_to_tensor_fn(convert_to_tensor_fn) @@ -1643,8 +1644,8 @@ class MixtureLogistic(DistributionLambda): ```python tfd = tfp.distributions tfpl = tfp.layers - tfk = tf.keras - tfkl = tf.keras.layers + tfk = tf_keras + tfkl = tf_keras.layers # Load data -- graph of a [cardioid](https://en.wikipedia.org/wiki/Cardioid). n = 2000 @@ -1696,7 +1697,7 @@ def __init__(self, performance. When `False` invalid inputs may silently render incorrect outputs. Default value: `False`. - **kwargs: Additional keyword arguments passed to `tf.keras.Layer`. + **kwargs: Additional keyword arguments passed to `tf_keras.Layer`. """ convert_to_tensor_fn = _get_convert_to_tensor_fn(convert_to_tensor_fn) @@ -1766,7 +1767,7 @@ class VariationalGaussianProcess(DistributionLambda): Create a VariationalGaussianProcess distribtuion whose `index_points` are the inputs to the layer. Parameterized by number of inducing points and a - `kernel_provider`, which should be a `tf.keras.Layer` with an @property that + `kernel_provider`, which should be a `tf_keras.Layer` with an @property that late-binds variable parameters to a `tfp.positive_semidefinite_kernel.PositiveSemidefiniteKernel` instance (this requirement has to do with the way that variables must be created in a keras @@ -1782,7 +1783,7 @@ def __init__( event_shape=(1,), inducing_index_points_initializer=None, unconstrained_observation_noise_variance_initializer=( - tf.keras.initializers.constant(-10.)), + tf_keras.initializers.constant(-10.)), variational_inducing_observations_scale_initializer=None, mean_fn=None, jitter=1e-6, @@ -1802,17 +1803,17 @@ def __init__( example, `event_shape = [3]` means we are modeling a batch of 3 distributions over functions. We can think of this as a distrbution over 3-dimensional vector-valued functions. - inducing_index_points_initializer: a `tf.keras.initializer.Initializer` + inducing_index_points_initializer: a `tf_keras.initializer.Initializer` used to initialize the trainable `inducing_index_points` variables. Training VGP's is pretty sensitive to choice of initial inducing index point locations. A reasonable heuristic is to scatter them near the data, not too close to each other. unconstrained_observation_noise_variance_initializer: a - `tf.keras.initializer.Initializer` used to initialize the unconstrained + `tf_keras.initializer.Initializer` used to initialize the unconstrained observation noise variable. The observation noise variance is computed from this variable via the `tf.nn.softplus` function. variational_inducing_observations_scale_initializer: a - `tf.keras.initializer.Initializer` used to initialize the variational + `tf_keras.initializer.Initializer` used to initialize the variational inducing observations scale. mean_fn: a callable that maps layer inputs to mean function values. Passed to the mean_fn parameter of VariationalGaussianProcess distribution. 
If @@ -1869,7 +1870,7 @@ def build(self, input_shape): if self._mean_fn is None: self.mean = self.add_weight( - initializer=tf.keras.initializers.constant([0.]), + initializer=tf_keras.initializers.constant([0.]), dtype=self._dtype, name='mean') self._mean_fn = lambda x: self.mean @@ -1896,14 +1897,14 @@ def build(self, input_shape): self._variational_inducing_observations_loc = self.add_weight( name='variational_inducing_observations_loc', shape=self._event_shape.as_list() + [self._num_inducing_points], - initializer=tf.keras.initializers.zeros(), + initializer=tf_keras.initializers.zeros(), dtype=self._dtype) if self._variational_inducing_observations_scale_initializer is None: eyes = (np.ones(self._event_shape.as_list() + [1, 1]) * np.eye(self._num_inducing_points, dtype=self._dtype)) self._variational_inducing_observations_scale_initializer = ( - tf.keras.initializers.constant(1e-5 * eyes)) + tf_keras.initializers.constant(1e-5 * eyes)) self._variational_inducing_observations_scale = self.add_weight( name='variational_inducing_observations_scale', shape=(self._event_shape.as_list() + @@ -1945,7 +1946,7 @@ def _transposed_variational_loss(y, kl_weight=1.): # For deserialization. -tf.keras.utils.get_custom_objects().update({ +tf_keras.utils.get_custom_objects().update({ 'DistributionLambda': DistributionLambda, 'IndependentBernoulli': IndependentBernoulli, 'IndependentLogistic': IndependentLogistic, @@ -1963,11 +1964,11 @@ def _transposed_variational_loss(y, kl_weight=1.): def _serialize(convert_to_tensor_fn): - return tf.keras.utils.legacy.serialize_keras_object(convert_to_tensor_fn) + return tf_keras.utils.legacy.serialize_keras_object(convert_to_tensor_fn) def _deserialize(name, custom_objects=None): - return tf.keras.utils.legacy.deserialize_keras_object( + return tf_keras.utils.legacy.deserialize_keras_object( name, module_objects=globals(), custom_objects=custom_objects, diff --git a/tensorflow_probability/python/layers/distribution_layer_test.py b/tensorflow_probability/python/layers/distribution_layer_test.py index 663fdf52fb..197f889a7f 100644 --- a/tensorflow_probability/python/layers/distribution_layer_test.py +++ b/tensorflow_probability/python/layers/distribution_layer_test.py @@ -37,15 +37,15 @@ from tensorflow_probability.python.distributions import poisson from tensorflow_probability.python.distributions import uniform from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.layers import distribution_layer from tensorflow_probability.python.layers import variable_input from tensorflow_probability.python.math import generic from tensorflow_probability.python.math.psd_kernels import exponentiated_quadratic from tensorflow_probability.python.util import deferred_tensor -tfk = tf.keras - -tfkl = tf.keras.layers +tfk = tf_keras +tfkl = tf_keras.layers def _logit_avg_expit(t): @@ -72,8 +72,8 @@ def _unwrap_tensor_coercible(dist): def _get_adam_optimizer(learning_rate): if tf.__internal__.tf2.enabled() and tf.executing_eagerly(): - return tf.keras.optimizers.Adam(learning_rate=learning_rate) - return tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate) + return tf_keras.optimizers.Adam(learning_rate=learning_rate) + return tf_keras.optimizers.legacy.Adam(learning_rate=learning_rate) # TODO(b/143642032): Figure out how to solve issues with save/load, so that we @@ -92,9 +92,9 @@ class EndToEndTest(test_util.TestCase): registered via 
`tf.register_tensor_conversion_function`. Fundamentally, there are three ways to be Keras models: - 1. `tf.keras.Sequential` + 1. `tf_keras.Sequential` 2. Functional API - 3. Subclass `tf.keras.Model`. + 3. Subclass `tf_keras.Model`. Its important to have end-to-end tests for all three, because #1 and #2 call `__call__` and `call` differently. (#3's call pattern depends on user @@ -336,8 +336,8 @@ def test_side_variable_is_auto_tracked(self): # `s` is the "side variable". s = deferred_tensor.TransformedVariable(1., softplus.Softplus()) prior = normal_lib.Normal(tf.Variable(0.), 1.) - linear_regression = tf.keras.Sequential([ - tf.keras.layers.Dense(1), + linear_regression = tf_keras.Sequential([ + tf_keras.layers.Dense(1), distribution_layer.DistributionLambda( lambda t: normal_lib.Normal(t, s), activity_regularizer=distribution_layer.KLDivergenceRegularizer( @@ -600,8 +600,8 @@ def test_doc_string(self): true_bias = np.array([0, 0, np.log(scale_noise), 0, np.log(scale_noise)]) # Create model. - model = tf.keras.Sequential([ - tf.keras.layers.Dense( + model = tf_keras.Sequential([ + tf_keras.layers.Dense( distribution_layer.MultivariateNormalTriL.params_size(d), kernel_initializer=lambda s, **_: true_kernel, bias_initializer=lambda s, **_: true_bias), @@ -660,10 +660,10 @@ def test_doc_string(self): d = y.shape[-1] # Create model. - model = tf.keras.Sequential([ - tf.keras.layers.Dense( + model = tf_keras.Sequential([ + tf_keras.layers.Dense( distribution_layer.OneHotCategorical.params_size(d) - 1), - tf.keras.layers.Lambda(_vec_pad), + tf_keras.layers.Lambda(_vec_pad), distribution_layer.OneHotCategorical(d), ]) @@ -748,8 +748,8 @@ def test_doc_string(self): k = 2 p = distribution_layer.CategoricalMixtureOfOneHotCategorical.params_size( d, k) - model = tf.keras.Sequential([ - tf.keras.layers.Dense(p), + model = tf_keras.Sequential([ + tf_keras.layers.Dense(p), distribution_layer.CategoricalMixtureOfOneHotCategorical(d, k), ]) @@ -908,8 +908,8 @@ def test_doc_string(self): event_shape = y.shape[1:] # Create model. - model = tf.keras.Sequential([ - tf.keras.layers.Dense( + model = tf_keras.Sequential([ + tf_keras.layers.Dense( distribution_layer.IndependentBernoulli.params_size(event_shape)), distribution_layer.IndependentBernoulli(event_shape), ]) @@ -1510,13 +1510,13 @@ def s(x): y = (w0 * x * (1 + np.sin(x)) + b0) + eps x0 = np.linspace(*x_range, num=1000) - class KernelFn(tf.keras.layers.Layer): + class KernelFn(tf_keras.layers.Layer): def __init__(self, **kwargs): super(KernelFn, self).__init__(**kwargs) self._amplitude = self.add_weight( - initializer=tf.keras.initializers.constant(.54), + initializer=tf_keras.initializers.constant(.54), dtype=dtype, name='amplitude') @@ -1533,17 +1533,17 @@ def kernel(self): # Add a leading dimension for the event_shape. 
eyes = np.expand_dims(np.eye(num_inducing_points), 0) variational_inducing_observations_scale_initializer = ( - tf.keras.initializers.constant(1e-3 * eyes)) + tf_keras.initializers.constant(1e-3 * eyes)) - model = tf.keras.Sequential([ - tf.keras.layers.InputLayer(input_shape=[1], dtype=dtype), - tf.keras.layers.Dense(1, kernel_initializer='Ones', use_bias=False, + model = tf_keras.Sequential([ + tf_keras.layers.InputLayer(input_shape=[1], dtype=dtype), + tf_keras.layers.Dense(1, kernel_initializer='Ones', use_bias=False, activation=None, dtype=dtype), distribution_layer.VariationalGaussianProcess( num_inducing_points=num_inducing_points, kernel_provider=KernelFn(dtype=dtype), inducing_index_points_initializer=( - tf.keras.initializers.constant( + tf_keras.initializers.constant( np.linspace(*x_range, num=num_inducing_points, dtype=dtype)[..., np.newaxis])), diff --git a/tensorflow_probability/python/layers/initializers.py b/tensorflow_probability/python/layers/initializers.py index 52a092d0bb..c0b57bfddb 100644 --- a/tensorflow_probability/python/layers/initializers.py +++ b/tensorflow_probability/python/layers/initializers.py @@ -18,9 +18,10 @@ import numpy as np import tensorflow.compat.v2 as tf +from tensorflow_probability.python.internal import tf_keras -class BlockwiseInitializer(tf.keras.initializers.Initializer): +class BlockwiseInitializer(tf_keras.initializers.Initializer): """Initializer which concats other intializers.""" def __init__(self, initializers, sizes, validate_args=False): @@ -28,7 +29,7 @@ def __init__(self, initializers, sizes, validate_args=False): Args: initializers: `list` of Keras initializers, e.g., `"glorot_uniform"` or - `tf.keras.initializers.Constant(0.5413)`. + `tf_keras.initializers.Constant(0.5413)`. sizes: `list` of `int` scalars representing the number of elements associated with each initializer in `initializers`. validate_args: Python `bool` indicating we should do (possibly expensive) @@ -58,7 +59,7 @@ def __call__(self, shape, dtype=None): dtype: Optional dtype of the tensor. If not provided will return tensor of `tf.float32`. 
""" - dtype = tf.as_dtype(dtype or tf.keras.backend.floatx()) + dtype = tf.as_dtype(dtype or tf_keras.backend.floatx()) if isinstance(shape, tf.TensorShape): shape_dtype = tf.int32 shape_ = np.int32(shape) @@ -88,14 +89,14 @@ def __call__(self, shape, dtype=None): else shape_[:-1]) if sizes_ is not None and isinstance(s, (np.ndarray, np.generic)): return tf.concat([ - tf.keras.initializers.get(init)(np.concatenate([ + tf_keras.initializers.get(init)(np.concatenate([ s, np.array([e], shape_dtype.as_numpy_dtype)], axis=-1), dtype) for init, e in zip(self.initializers, sizes_.tolist()) ], axis=-1) sizes = tf.split(self.sizes, len(self.initializers)) return tf.concat([ - tf.keras.initializers.get(init)(tf.concat([s, e], axis=-1), dtype) + tf_keras.initializers.get(init)(tf.concat([s, e], axis=-1), dtype) for init, e in zip(self.initializers, sizes) ], axis=-1) @@ -103,8 +104,8 @@ def get_config(self): """Returns initializer configuration as a JSON-serializable dict.""" return { 'initializers': [ - tf.keras.initializers.serialize( - tf.keras.initializers.get(init)) + tf_keras.initializers.serialize( + tf_keras.initializers.get(init)) for init in self.initializers ], 'sizes': self.sizes, @@ -115,12 +116,12 @@ def get_config(self): def from_config(cls, config): """Instantiates an initializer from a configuration dictionary.""" return cls(**{ - 'initializers': [tf.keras.initializers.deserialize(init) + 'initializers': [tf_keras.initializers.deserialize(init) for init in config.get('initializers', [])], 'sizes': config.get('sizes', []), 'validate_args': config.get('validate_args', False), }) -tf.keras.utils.get_custom_objects()[ +tf_keras.utils.get_custom_objects()[ 'BlockwiseInitializer'] = BlockwiseInitializer diff --git a/tensorflow_probability/python/layers/initializers_test.py b/tensorflow_probability/python/layers/initializers_test.py index 8148df0b78..dc451cee26 100644 --- a/tensorflow_probability/python/layers/initializers_test.py +++ b/tensorflow_probability/python/layers/initializers_test.py @@ -17,8 +17,8 @@ # Dependency imports import numpy as np -import tensorflow.compat.v2 as tf from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.layers import initializers @@ -34,9 +34,9 @@ def test_works_correctly(self): self.assertAllEqual(np.zeros([2, 1, 4]), x_[..., 3:]) def test_de_serialization(self): - s = tf.keras.initializers.serialize( + s = tf_keras.initializers.serialize( initializers.BlockwiseInitializer(['glorot_uniform', 'zeros'], [3, 4])) - init_clone = tf.keras.initializers.deserialize(s) + init_clone = tf_keras.initializers.deserialize(s) x = init_clone([2, 1, 7]) self.assertEqual((2, 1, 7), x.shape) x_ = self.evaluate(x) diff --git a/tensorflow_probability/python/layers/masked_autoregressive.py b/tensorflow_probability/python/layers/masked_autoregressive.py index 8ff923c125..07a406ec5a 100644 --- a/tensorflow_probability/python/layers/masked_autoregressive.py +++ b/tensorflow_probability/python/layers/masked_autoregressive.py @@ -19,7 +19,7 @@ from tensorflow_probability.python.bijectors import masked_autoregressive as masked_autoregressive_lib from tensorflow_probability.python.distributions import transformed_distribution as transformed_distribution_lib - +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.layers.distribution_layer import DistributionLambda @@ -61,7 +61,7 @@ def f_inverse(x): tfd = tfp.distributions tfpl = tfp.layers 
tfb = tfp.bijectors - tfk = tf.keras + tfk = tf_keras # Generate data -- as in Figure 1 in [Papamakarios et al. (2017)][1]). n = 2000 @@ -121,7 +121,7 @@ def __init__(self, made, **kwargs): Args: made: A `Made` layer, which must output two parameters for each input. - **kwargs: Additional keyword arguments passed to `tf.keras.Layer`. + **kwargs: Additional keyword arguments passed to `tf_keras.Layer`. """ super(AutoregressiveTransform, self).__init__(self._transform, **kwargs) @@ -132,8 +132,8 @@ def __init__(self, made, **kwargs): self._made = made def build(self, input_shape): - tf.keras.Sequential([ - tf.keras.layers.InputLayer( + tf_keras.Sequential([ + tf_keras.layers.InputLayer( input_shape=input_shape[1:], dtype=self.dtype), self._made ]) diff --git a/tensorflow_probability/python/layers/masked_autoregressive_test.py b/tensorflow_probability/python/layers/masked_autoregressive_test.py index ebddc2eb4d..24b382ffba 100644 --- a/tensorflow_probability/python/layers/masked_autoregressive_test.py +++ b/tensorflow_probability/python/layers/masked_autoregressive_test.py @@ -19,11 +19,12 @@ from tensorflow_probability.python.bijectors import masked_autoregressive as masked_autoregressive_lib from tensorflow_probability.python.distributions import mvn_diag from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.layers import distribution_layer from tensorflow_probability.python.layers import masked_autoregressive -tfk = tf.keras -tfkl = tf.keras.layers +tfk = tf_keras +tfkl = tf_keras.layers @test_util.test_all_tf_execution_regimes diff --git a/tensorflow_probability/python/layers/util.py b/tensorflow_probability/python/layers/util.py index 5fcdb72ca7..6f3010920b 100644 --- a/tensorflow_probability/python/layers/util.py +++ b/tensorflow_probability/python/layers/util.py @@ -29,6 +29,8 @@ from tensorflow_probability.python.distributions import independent as independent_lib from tensorflow_probability.python.distributions import normal as normal_lib +from tensorflow_probability.python.internal import tf_keras + __all__ = [ 'default_loc_scale_fn', @@ -235,7 +237,7 @@ def deserialize_function(serial, function_type): Keras-deserialized functions do not perform lexical scoping. Any modules that the function requires must be imported within the function itself. - This serialization mimicks the implementation in `tf.keras.layers.Lambda`. + This serialization mimicks the implementation in `tf_keras.layers.Lambda`. Args: serial: Serialized Keras object: typically a dict, string, or bytecode. @@ -255,7 +257,7 @@ def deserialize_function(serial, function_type): """ if function_type == 'function': # Simple lookup in custom objects - function = tf.keras.utils.legacy.deserialize_keras_object(serial) + function = tf_keras.utils.legacy.deserialize_keras_object(serial) elif function_type == 'lambda': # Unsafe deserialization from bytecode function = _func_load(serial) @@ -273,7 +275,7 @@ def serialize_function(func): us use the Python scope to obtain the function rather than reload it from bytecode. (Note that both cases are brittle!) - This serialization mimicks the implementation in `tf.keras.layers.Lambda`. + This serialization mimicks the implementation in `tf_keras.layers.Lambda`. Args: func: Python function to serialize. 
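The `layers/initializers.py` hunks a little above migrate `BlockwiseInitializer` and its (de)serialization onto the `tf_keras` shim. As a quick orientation for reviewers, here is a minimal sketch of the behaviour that `initializers_test.py` exercises, written against the public `tfp.layers` alias and plain TensorFlow (an assumption for self-containedness; the patch itself routes these calls through the internal shim):

```python
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

# Blockwise init: the first 3 slots of the last axis come from
# 'glorot_uniform', the remaining 4 from 'zeros', matching the test above.
init = tfp.layers.BlockwiseInitializer(['glorot_uniform', 'zeros'], sizes=[3, 4])
x = init([2, 1, 7])

print(x.shape)                            # (2, 1, 7)
print(np.all(x.numpy()[..., 3:] == 0.))   # True: the 'zeros' block
```

Serialization round-trips through `tf_keras.initializers.serialize`/`deserialize` exactly as the test shows; only the module the initializer registry lives in changes.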
diff --git a/tensorflow_probability/python/layers/variable_input.py b/tensorflow_probability/python/layers/variable_input.py index 9dbdb2edc9..0dae6ff7ef 100644 --- a/tensorflow_probability/python/layers/variable_input.py +++ b/tensorflow_probability/python/layers/variable_input.py @@ -18,27 +18,28 @@ import numpy as np import tensorflow.compat.v2 as tf +from tensorflow_probability.python.internal import tf_keras -class VariableLayer(tf.keras.layers.Layer): +class VariableLayer(tf_keras.layers.Layer): """Simply returns a (trainable) variable, regardless of input. This layer implements the mathematical function `f(x) = c` where `c` is a constant, i.e., unchanged for all `x`. Like other Keras layers, the constant is `trainable`. This layer can also be interpretted as the special case of - `tf.keras.layers.Dense` when the `kernel` is forced to be the zero matrix + `tf_keras.layers.Dense` when the `kernel` is forced to be the zero matrix (`tf.zeros`). #### Examples ```python - trainable_normal = tf.keras.models.Sequential([ + trainable_normal = tf_keras.models.Sequential([ tfp.layers.VariableLayer( shape=[3, 4, 2], dtype=tf.float64, initializer=tfp.layers.BlockwiseInitializer([ 'zeros', - tf.keras.initializers.Constant(np.log(np.expm1(1.))), + tf_keras.initializers.Constant(np.log(np.expm1(1.))), ], sizes=[1, 1])), tfp.layers.DistributionLambda(lambda t: tfd.Independent( tfd.Normal(loc=t[..., 0], scale=tf.math.softplus(t[..., 1])), @@ -83,7 +84,7 @@ def __init__(self, shape: integer or integer vector specifying the shape of the output of this layer. dtype: TensorFlow `dtype` of the variable created by this layer. - Default value: `None` (i.e., `tf.as_dtype(tf.keras.backend.floatx())`). + Default value: `None` (i.e., `tf.as_dtype(tf_keras.backend.floatx())`). activation: Activation function to use. If you don't specify anything, no activation is applied (ie. "linear" activation: `a(x) = x`). Default value: `None`. @@ -93,7 +94,7 @@ def __init__(self, ```python tfp.layers.BlockwiseInitializer([ 'zeros', - tf.keras.initializers.Constant(np.log(np.expm1(1.))), # = 0.541325 + tf_keras.initializers.Constant(np.log(np.expm1(1.))), # = 0.541325 ], sizes=[1, 1]) ``` Default value: `'zeros'`. @@ -101,14 +102,14 @@ def __init__(self, Default value: `None`. constraint: Constraint function applied to the `constant` vector. Default value: `None`. - **kwargs: Extra arguments forwarded to `tf.keras.layers.Layer`. + **kwargs: Extra arguments forwarded to `tf_keras.layers.Layer`. 
""" super(VariableLayer, self).__init__(**kwargs) - self.activation = tf.keras.activations.get(activation) - self.initializer = tf.keras.initializers.get(initializer) - self.regularizer = tf.keras.regularizers.get(regularizer) - self.constraint = tf.keras.constraints.get(constraint) + self.activation = tf_keras.activations.get(activation) + self.initializer = tf_keras.initializers.get(initializer) + self.regularizer = tf_keras.regularizers.get(regularizer) + self.constraint = tf_keras.constraints.get(constraint) shape = tf.get_static_value(shape) if shape is None: diff --git a/tensorflow_probability/python/layers/variable_input_test.py b/tensorflow_probability/python/layers/variable_input_test.py index 94f9a57e1d..d80899bc64 100644 --- a/tensorflow_probability/python/layers/variable_input_test.py +++ b/tensorflow_probability/python/layers/variable_input_test.py @@ -18,6 +18,7 @@ from tensorflow_probability.python.distributions import independent from tensorflow_probability.python.distributions import normal from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.layers import distribution_layer from tensorflow_probability.python.layers import variable_input @@ -27,13 +28,13 @@ class VariableInputLayerTest(test_util.TestCase): def test_sequential_api(self): # Create a trainable distribution using the Sequential API. - model = tf.keras.models.Sequential([ + model = tf_keras.models.Sequential([ variable_input.VariableLayer( shape=[2, 3, 4], dtype=tf.float64, trainable=False), # You'd probably never want this in IRL. # The Dense serves no real purpose; it will change the event_shape. - tf.keras.layers.Dense(5, use_bias=False, dtype=tf.float64), + tf_keras.layers.Dense(5, use_bias=False, dtype=tf.float64), distribution_layer.DistributionLambda( lambda t: independent.Independent( # pylint: disable=g-long-lambda normal.Normal(loc=t[0], scale=t[1]), @@ -68,19 +69,19 @@ def test_sequential_api(self): def test_functional_api(self): # Create a trainable distribution using the functional API. - dummy_input = tf.keras.Input(shape=()) + dummy_input = tf_keras.Input(shape=()) x = variable_input.VariableLayer( shape=[2, 3, 4], dtype=tf.float64, trainable=False, # You'd probably never want this in IRL. )(dummy_input) # The Dense serves no real purpose; it will change the event_shape. - x = tf.keras.layers.Dense(5, use_bias=False, dtype=tf.float64)(x) + x = tf_keras.layers.Dense(5, use_bias=False, dtype=tf.float64)(x) x = distribution_layer.DistributionLambda( lambda t: independent.Independent(normal.Normal(loc=t[0], scale=t[1]), # pylint: disable=g-long-lambda reinterpreted_batch_ndims=1), dtype=tf.float64)(x) - model = tf.keras.Model(dummy_input, x) + model = tf_keras.Model(dummy_input, x) # Instantiate the model (as a TFP distribution). dist = model(tf.zeros([])) diff --git a/tensorflow_probability/python/layers/weight_norm.py b/tensorflow_probability/python/layers/weight_norm.py index b8b7b84925..c255f1c5a1 100644 --- a/tensorflow_probability/python/layers/weight_norm.py +++ b/tensorflow_probability/python/layers/weight_norm.py @@ -17,9 +17,10 @@ import warnings import tensorflow.compat.v2 as tf +from tensorflow_probability.python.internal import tf_keras -class WeightNorm(tf.keras.layers.Wrapper): +class WeightNorm(tf_keras.layers.Wrapper): """Layer wrapper to decouple magnitude and direction of the layer's weights. 
This wrapper reparameterizes a layer by decoupling the weight's @@ -32,13 +33,13 @@ class WeightNorm(tf.keras.layers.Wrapper): #### Example ```python - net = WeightNorm(tf.keras.layers.Conv2D(2, 2, activation='relu'), + net = WeightNorm(tf_keras.layers.Conv2D(2, 2, activation='relu'), input_shape=(32, 32, 3), data_init=True)(x) - net = WeightNorm(tf.keras.layers.Conv2DTranspose(16, 5, activation='relu'), + net = WeightNorm(tf_keras.layers.Conv2DTranspose(16, 5, activation='relu'), data_init=True) - net = WeightNorm(tf.keras.layers.Dense(120, activation='relu'), + net = WeightNorm(tf_keras.layers.Dense(120, activation='relu'), data_init=True)(net) - net = WeightNorm(tf.keras.layers.Dense(num_classes), + net = WeightNorm(tf_keras.layers.Dense(num_classes), data_init=True)(net) ``` @@ -54,19 +55,19 @@ def __init__(self, layer, data_init=True, **kwargs): """Initialize WeightNorm wrapper. Args: - layer: A `tf.keras.layers.Layer` instance. Supported layer types are + layer: A `tf_keras.layers.Layer` instance. Supported layer types are `Dense`, `Conv2D`, and `Conv2DTranspose`. Layers with multiple inputs are not supported. data_init: `bool`, if `True` use data dependent variable initialization. - **kwargs: Additional keyword args passed to `tf.keras.layers.Wrapper`. + **kwargs: Additional keyword args passed to `tf_keras.layers.Wrapper`. Raises: - ValueError: If `layer` is not a `tf.keras.layers.Layer` instance. + ValueError: If `layer` is not a `tf_keras.layers.Layer` instance. """ - if not isinstance(layer, tf.keras.layers.Layer): + if not isinstance(layer, tf_keras.layers.Layer): raise ValueError( - 'Please initialize `WeightNorm` layer with a `tf.keras.layers.Layer` ' + 'Please initialize `WeightNorm` layer with a `tf_keras.layers.Layer` ' 'instance. You passed: {input}'.format(input=layer)) layer_type = type(layer).__name__ @@ -138,7 +139,7 @@ def build(self, input_shape=None): input_shape = tf.TensorShape(input_shape).as_list() input_shape[0] = None - self.input_spec = tf.keras.layers.InputSpec(shape=input_shape) + self.input_spec = tf_keras.layers.InputSpec(shape=input_shape) if not self.layer.built: self.layer.build(input_shape) diff --git a/tensorflow_probability/python/layers/weight_norm_test.py b/tensorflow_probability/python/layers/weight_norm_test.py index 5d47d1a9ab..bdbda2fa49 100644 --- a/tensorflow_probability/python/layers/weight_norm_test.py +++ b/tensorflow_probability/python/layers/weight_norm_test.py @@ -24,10 +24,11 @@ import tensorflow.compat.v2 as tf from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.layers import weight_norm -tfk = tf.keras -tfkl = tf.keras.layers +tfk = tf_keras +tfkl = tf_keras.layers # TODO(b/143642032): Figure out how to get this working with @@ -225,9 +226,9 @@ def testGradientValues(self, model_type): @parameterized.parameters(['sequential', 'sequential_no_input', 'functional']) def testTrainableVariableInitializationInModelFit(self, model_type): if tf.__internal__.tf2.enabled() and tf.executing_eagerly(): - sgd = tf.keras.optimizers.SGD(learning_rate=0.) + sgd = tf_keras.optimizers.SGD(learning_rate=0.) else: - sgd = tf.keras.optimizers.legacy.SGD(learning_rate=0.) + sgd = tf_keras.optimizers.legacy.SGD(learning_rate=0.) 
model = self._define_model(model_type, self.data_dim, self.num_hidden) model.compile(optimizer=sgd, loss='mse') model.fit( diff --git a/tensorflow_probability/python/math/BUILD b/tensorflow_probability/python/math/BUILD index 2d962b39ca..f9b7557749 100644 --- a/tensorflow_probability/python/math/BUILD +++ b/tensorflow_probability/python/math/BUILD @@ -338,6 +338,7 @@ multi_substrate_py_library( "//tensorflow_probability/python/internal:dtype_util", "//tensorflow_probability/python/internal:loop_util", "//tensorflow_probability/python/internal:prefer_static", + "//tensorflow_probability/python/internal:tf_keras", ], ) @@ -353,6 +354,7 @@ multi_substrate_py_test( # tensorflow dep, "//tensorflow_probability/python/distributions:normal", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/optimizer", # "//third_party/tensorflow/compiler/jit:xla_cpu_jit", # DisableOnExport ], diff --git a/tensorflow_probability/python/math/minimize.py b/tensorflow_probability/python/math/minimize.py index 12f23cbae1..001ee1f5a0 100644 --- a/tensorflow_probability/python/math/minimize.py +++ b/tensorflow_probability/python/math/minimize.py @@ -410,7 +410,7 @@ def minimize_stateless(loss_fn, def _make_stateful_optimizer_step_fn(loss_fn, optimizer, trainable_variables): - """Constructs a single step of a stateful (`tf.keras.optimizers`) optimizer.""" + """Constructs a single step of a stateful (`tf_keras.optimizers`) optimizer.""" @tf.function(autograph=False) def optimizer_step(parameters, @@ -460,7 +460,7 @@ def minimize(loss_fn, `tfp.random.sanitize_seed`). num_steps: Python `int` maximum number of steps to run the optimizer. optimizer: Optimizer instance to use. This may be a TF1-style - `tf.train.Optimizer`, TF2-style `tf.keras.optimizers.Optimizer`, or any + `tf.train.Optimizer`, TF2-style `tf_keras.optimizers.Optimizer`, or any Python object that implements `optimizer.apply_gradients(grads_and_vars)`. convergence_criterion: Optional instance of `tfp.optimizer.convergence_criteria.ConvergenceCriterion` @@ -531,7 +531,7 @@ def minimize(loss_fn, losses = tfp.math.minimize( loss_fn, num_steps=100, - optimizer=tf.keras.optimizers.Adam(learning_rate=0.1)) + optimizer=tf_keras.optimizers.Adam(learning_rate=0.1)) # In TF2/eager mode, the optimization runs immediately. 
print("optimized value is {} with loss {}".format(x, losses[-1])) @@ -555,7 +555,7 @@ def minimize(loss_fn, losses = tfp.math.minimize( loss_fn, num_steps=1000, - optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), + optimizer=tf_keras.optimizers.Adam(learning_rate=0.1), convergence_criterion=( tfp.optimizers.convergence_criteria.LossNotDecreasing(atol=0.01))) ``` @@ -577,7 +577,7 @@ def minimize(loss_fn, trace_fn = lambda traceable_quantities: { 'loss': traceable_quantities.loss, 'x': x} trace = tfp.math.minimize(loss_fn, num_steps=100, - optimizer=tf.keras.optimizers.Adam(0.1), + optimizer=tf_keras.optimizers.Adam(0.1), trace_fn=trace_fn) print(trace['loss'].shape, # => [100] trace['x'].shape) # => [100] @@ -597,7 +597,7 @@ def minimize(loss_fn, 'loss': traceable_quantities.loss, 'has_converged': traceable_quantities.has_converged} trace = tfp.math.minimize(loss_fn, num_steps=100, - optimizer=tf.keras.optimizers.Adam(0.1),, + optimizer=tf_keras.optimizers.Adam(0.1),, trace_fn=trace_fn, convergence_criterion=( tfp.optimizers.convergence_criteria.LossNotDecreasing(atol=0.01))) diff --git a/tensorflow_probability/python/math/minimize_test.py b/tensorflow_probability/python/math/minimize_test.py index ab16d8f602..ef373022cc 100644 --- a/tensorflow_probability/python/math/minimize_test.py +++ b/tensorflow_probability/python/math/minimize_test.py @@ -24,6 +24,7 @@ from tensorflow_probability.python import optimizer from tensorflow_probability.python.distributions import normal from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.math.minimize import minimize from tensorflow_probability.python.math.minimize import minimize_stateless @@ -32,14 +33,14 @@ def _get_adam_optimizer(learning_rate): if tf.__internal__.tf2.enabled(): - return tf.keras.optimizers.Adam(learning_rate=learning_rate) - return tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate) + return tf_keras.optimizers.Adam(learning_rate=learning_rate) + return tf_keras.optimizers.legacy.Adam(learning_rate=learning_rate) def _get_sgd_optimizer(learning_rate): if tf.__internal__.tf2.enabled(): - return tf.keras.optimizers.SGD(learning_rate=learning_rate) - return tf.keras.optimizers.legacy.SGD(learning_rate=learning_rate) + return tf_keras.optimizers.SGD(learning_rate=learning_rate) + return tf_keras.optimizers.legacy.SGD(learning_rate=learning_rate) @test_util.test_all_tf_execution_regimes diff --git a/tensorflow_probability/python/mcmc/BUILD b/tensorflow_probability/python/mcmc/BUILD index 8350fb5ea8..0e7b126578 100644 --- a/tensorflow_probability/python/mcmc/BUILD +++ b/tensorflow_probability/python/mcmc/BUILD @@ -141,6 +141,7 @@ multi_substrate_py_library( "//tensorflow_probability/python/internal:distribute_lib", "//tensorflow_probability/python/internal:dtype_util", "//tensorflow_probability/python/internal:prefer_static", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/mcmc/internal:leapfrog_integrator", "//tensorflow_probability/python/mcmc/internal:util", "//tensorflow_probability/python/util:seed_stream", @@ -175,6 +176,7 @@ multi_substrate_py_test( "//tensorflow_probability/python/internal:samplers", "//tensorflow_probability/python/internal:tensorshape_util", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/math:generic", "//tensorflow_probability/python/util:deferred_tensor", ], 
diff --git a/tensorflow_probability/python/mcmc/hmc.py b/tensorflow_probability/python/mcmc/hmc.py index 8f4f9d5a60..aceae587de 100644 --- a/tensorflow_probability/python/mcmc/hmc.py +++ b/tensorflow_probability/python/mcmc/hmc.py @@ -308,7 +308,7 @@ def make_response_likelihood(w, x): log_sigma = tf.Variable(0., dtype=dtype, name='log_sigma') - optimizer = tf.keras.optimizers.SGD(learning_rate=0.01) + optimizer = tf_keras.optimizers.SGD(learning_rate=0.01) @tf.function def mcem_iter(weights_chain_start, step_size): diff --git a/tensorflow_probability/python/mcmc/hmc_test.py b/tensorflow_probability/python/mcmc/hmc_test.py index 464e655f41..ef0fd4d4a4 100644 --- a/tensorflow_probability/python/mcmc/hmc_test.py +++ b/tensorflow_probability/python/mcmc/hmc_test.py @@ -40,6 +40,7 @@ from tensorflow_probability.python.internal import samplers from tensorflow_probability.python.internal import tensorshape_util from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.math import generic from tensorflow_probability.python.mcmc import hmc from tensorflow_probability.python.mcmc import sample as sample_lib @@ -997,7 +998,7 @@ def test_mcem_converges(self): sigma = deferred_tensor.TransformedVariable( name='sigma', initial_value=np.array(1, dtype), bijector=exp.Exp()) - optimizer = tf.keras.optimizers.SGD(learning_rate=0.01) + optimizer = tf_keras.optimizers.SGD(learning_rate=0.01) # TODO(b/144045420): eliminate the need for this tf.function decorator. The # reason it was added was that the test code is written to work in both diff --git a/tensorflow_probability/python/optimizer/BUILD b/tensorflow_probability/python/optimizer/BUILD index fd6dc3df8e..46c51d0cf4 100644 --- a/tensorflow_probability/python/optimizer/BUILD +++ b/tensorflow_probability/python/optimizer/BUILD @@ -55,6 +55,7 @@ multi_substrate_py_library( deps = [ # tensorflow dep, "//tensorflow_probability/python/internal:assert_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/math:diag_jacobian", ], ) @@ -84,6 +85,7 @@ multi_substrate_py_library( # tensorflow dep, "//tensorflow_probability/python/internal:assert_util", "//tensorflow_probability/python/internal:distribution_util", + "//tensorflow_probability/python/internal:tf_keras", ], ) diff --git a/tensorflow_probability/python/optimizer/convergence_criteria/BUILD b/tensorflow_probability/python/optimizer/convergence_criteria/BUILD index fb18821b74..731d84822e 100644 --- a/tensorflow_probability/python/optimizer/convergence_criteria/BUILD +++ b/tensorflow_probability/python/optimizer/convergence_criteria/BUILD @@ -99,6 +99,7 @@ multi_substrate_py_test( "//tensorflow_probability/python/bijectors:softplus", "//tensorflow_probability/python/distributions:normal", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/util:deferred_tensor", "//tensorflow_probability/python/vi:csiszar_divergence", ], diff --git a/tensorflow_probability/python/optimizer/convergence_criteria/successive_gradients_are_uncorrelated_test.py b/tensorflow_probability/python/optimizer/convergence_criteria/successive_gradients_are_uncorrelated_test.py index d1dc41f262..33c46a6011 100644 --- a/tensorflow_probability/python/optimizer/convergence_criteria/successive_gradients_are_uncorrelated_test.py +++ 
b/tensorflow_probability/python/optimizer/convergence_criteria/successive_gradients_are_uncorrelated_test.py @@ -20,6 +20,7 @@ from tensorflow_probability.python.bijectors import softplus from tensorflow_probability.python.distributions import normal from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.optimizer.convergence_criteria import successive_gradients_are_uncorrelated as sgau from tensorflow_probability.python.util import deferred_tensor from tensorflow_probability.python.vi import csiszar_divergence @@ -44,7 +45,7 @@ def test_stochastic_optimization(self): trained_dist = normal.Normal(locs, scales) target_dist = normal.Normal(loc=-0.4, scale=1.2) - optimizer = tf.keras.optimizers.Adam(learning_rate=0.1) + optimizer = tf_keras.optimizers.Adam(learning_rate=0.1) @tf.function(autograph=False) def optimization_step(): with tf.GradientTape() as tape: diff --git a/tensorflow_probability/python/optimizer/sgld.py b/tensorflow_probability/python/optimizer/sgld.py index d51cd29574..8e27f87ff6 100644 --- a/tensorflow_probability/python/optimizer/sgld.py +++ b/tensorflow_probability/python/optimizer/sgld.py @@ -19,6 +19,7 @@ from tensorflow_probability.python.internal import assert_util from tensorflow_probability.python.internal import distribution_util from tensorflow_probability.python.internal import dtype_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.math.diag_jacobian import diag_jacobian @@ -28,7 +29,7 @@ # pylint: disable=g-classes-have-attributes -class StochasticGradientLangevinDynamics(tf.keras.optimizers.legacy.Optimizer): +class StochasticGradientLangevinDynamics(tf_keras.optimizers.legacy.Optimizer): """An optimizer module for stochastic gradient Langevin dynamics. This implements the preconditioned Stochastic Gradient Langevin Dynamics @@ -167,7 +168,7 @@ def __init__(self, diagonal_bias, name='diagonal_bias') # TODO(b/124800185): Consider migrating `learning_rate` to be a # hyperparameter handled by the base Optimizer class. This would allow - # users to plug in a `tf.keras.optimizers.schedules.LearningRateSchedule` + # users to plug in a `tf_keras.optimizers.schedules.LearningRateSchedule` # object in addition to Tensors. self._learning_rate = tf.convert_to_tensor( learning_rate, name='learning_rate') diff --git a/tensorflow_probability/python/optimizer/variational_sgd.py b/tensorflow_probability/python/optimizer/variational_sgd.py index 40285776d6..8109f8ae5b 100644 --- a/tensorflow_probability/python/optimizer/variational_sgd.py +++ b/tensorflow_probability/python/optimizer/variational_sgd.py @@ -20,6 +20,8 @@ from tensorflow_probability.python.internal import distribution_util from tensorflow_probability.python.internal import dtype_util +from tensorflow_probability.python.internal import tf_keras + __all__ = [ 'VariationalSGD', @@ -27,7 +29,7 @@ # pylint: disable=g-classes-have-attributes -class VariationalSGD(tf.keras.optimizers.legacy.Optimizer): +class VariationalSGD(tf_keras.optimizers.legacy.Optimizer): """An optimizer module for constant stochastic gradient descent. 
This implements an optimizer module for the constant stochastic gradient diff --git a/tensorflow_probability/python/sts/BUILD b/tensorflow_probability/python/sts/BUILD index aab722e1f4..2c5c923dfb 100644 --- a/tensorflow_probability/python/sts/BUILD +++ b/tensorflow_probability/python/sts/BUILD @@ -113,6 +113,7 @@ multi_substrate_py_test( "//tensorflow_probability/python/distributions:exponential", "//tensorflow_probability/python/distributions:normal", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/optimizer", "//tensorflow_probability/python/sts/components:local_linear_trend", "//tensorflow_probability/python/sts/components:seasonal", diff --git a/tensorflow_probability/python/sts/default_model.py b/tensorflow_probability/python/sts/default_model.py index 988e6328b2..6b494486db 100644 --- a/tensorflow_probability/python/sts/default_model.py +++ b/tensorflow_probability/python/sts/default_model.py @@ -95,7 +95,7 @@ def build_default_model(observed_time_series, losses = tfp.vi.fit_surrogate_posterior( target_log_prob_fn=model.joint_distribution(series).log_prob, surrogate_posterior=surrogate_posterior, - optimizer=tf.keras.optimizers.Adam(0.1), + optimizer=tf_keras.optimizers.Adam(0.1), num_steps=1000, convergence_criterion=( tfp.optimizer.convergence_criteria.SuccessiveGradientsAreUncorrelated( diff --git a/tensorflow_probability/python/sts/default_model_test.py b/tensorflow_probability/python/sts/default_model_test.py index ede2c6b6c3..d679401533 100644 --- a/tensorflow_probability/python/sts/default_model_test.py +++ b/tensorflow_probability/python/sts/default_model_test.py @@ -22,6 +22,7 @@ from tensorflow_probability.python.distributions import exponential from tensorflow_probability.python.distributions import normal from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.optimizer.convergence_criteria import successive_gradients_are_uncorrelated from tensorflow_probability.python.sts import default_model from tensorflow_probability.python.sts import fitting @@ -111,7 +112,7 @@ def test_docstring_fitting_example(self): _ = optimization.fit_surrogate_posterior( target_log_prob_fn=model.joint_distribution(series).log_prob, surrogate_posterior=surrogate_posterior, - optimizer=tf.keras.optimizers.Adam(0.1), + optimizer=tf_keras.optimizers.Adam(0.1), num_steps=1000, convergence_criterion=(successive_gradients_are_uncorrelated .SuccessiveGradientsAreUncorrelated( diff --git a/tensorflow_probability/python/sts/fitting.py b/tensorflow_probability/python/sts/fitting.py index 8daea0c33a..38eec124e3 100644 --- a/tensorflow_probability/python/sts/fitting.py +++ b/tensorflow_probability/python/sts/fitting.py @@ -132,7 +132,7 @@ def build_factored_surrogate_posterior( loss_curve = tfp.vi.fit_surrogate_posterior( target_log_prob_fn=model.joint_distribution(observed_time_series).log_prob, surrogate_posterior=surrogate_posterior, - optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), + optimizer=tf_keras.optimizers.Adam(learning_rate=0.1), num_steps=200) posterior_samples = surrogate_posterior.sample(50) @@ -152,7 +152,7 @@ def loss_fn(): surrogate_posterior, sample_size=10) - optimizer = tf.keras.optimizers.Adam(learning_rate=0.1) + optimizer = tf_keras.optimizers.Adam(learning_rate=0.1) for step in range(200): with tf.GradientTape() as tape: loss = loss_fn() diff --git 
a/tensorflow_probability/python/sts/forecast.py b/tensorflow_probability/python/sts/forecast.py index c23154e69e..3950b559af 100644 --- a/tensorflow_probability/python/sts/forecast.py +++ b/tensorflow_probability/python/sts/forecast.py @@ -120,7 +120,7 @@ def one_step_predictive(model, observed_time_series, parameter_samples, loss_curve = tfp.vi.fit_surrogate_posterior( target_log_prob_fn=model.joint_distribution(observed_time_series).log_prob, surrogate_posterior=surrogate_posterior, - optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), + optimizer=tf_keras.optimizers.Adam(learning_rate=0.1), num_steps=200) samples = surrogate_posterior.sample(30) @@ -272,7 +272,7 @@ def forecast(model, loss_curve = tfp.vi.fit_surrogate_posterior( target_log_prob_fn=model.joint_distribution(observed_time_series).log_prob, surrogate_posterior=surrogate_posterior, - optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), + optimizer=tf_keras.optimizers.Adam(learning_rate=0.1), num_steps=200) samples = surrogate_posterior.sample(30) diff --git a/tensorflow_probability/python/sts/structural_time_series.py b/tensorflow_probability/python/sts/structural_time_series.py index 9b3a4c92fb..483ecd4100 100644 --- a/tensorflow_probability/python/sts/structural_time_series.py +++ b/tensorflow_probability/python/sts/structural_time_series.py @@ -346,7 +346,7 @@ def joint_distribution(self, losses = tfp.vi.fit_surrogate_posterior( target_log_prob_fn=jd.unnormalized_log_prob, surrogate_posterior=surrogate_posterior, - optimizer=tf.keras.optimizers.Adam(0.1), + optimizer=tf_keras.optimizers.Adam(0.1), num_steps=200) parameter_samples = surrogate_posterior.sample(50) diff --git a/tensorflow_probability/python/util/BUILD b/tensorflow_probability/python/util/BUILD index 1c9df28512..66e603cf60 100644 --- a/tensorflow_probability/python/util/BUILD +++ b/tensorflow_probability/python/util/BUILD @@ -48,6 +48,7 @@ multi_substrate_py_library( "//tensorflow_probability/python/internal:name_util", "//tensorflow_probability/python/internal:tensor_util", "//tensorflow_probability/python/internal:tensorshape_util", + "//tensorflow_probability/python/internal:tf_keras", ], ) diff --git a/tensorflow_probability/python/util/deferred_tensor.py b/tensorflow_probability/python/util/deferred_tensor.py index 4d846a2577..65858e5286 100644 --- a/tensorflow_probability/python/util/deferred_tensor.py +++ b/tensorflow_probability/python/util/deferred_tensor.py @@ -156,7 +156,7 @@ class DeferredTensor(six.with_metaclass( Which we could then fit as: ```python - opt = tf.keras.optimizers.Adam(learning_rate=0.05) + opt = tf_keras.optimizers.Adam(learning_rate=0.05) loss = tf.function(lambda: -trainable_normal.log_prob(0.5), autograph=True) for _ in range(int(1e3)): opt.minimize(loss, trainable_normal.trainable_variables) @@ -477,7 +477,7 @@ class TransformedVariable(DeferredTensor): g = tape.gradient(negloglik, trainable_normal.trainable_variables) # ==> (-0.5, 0.75) - opt = tf.keras.optimizers.Adam(learning_rate=0.05) + opt = tf_keras.optimizers.Adam(learning_rate=0.05) loss = tf.function(lambda: -trainable_normal.log_prob(0.5)) for _ in range(int(1e3)): opt.minimize(loss, trainable_normal.trainable_variables) diff --git a/tensorflow_probability/python/vi/BUILD b/tensorflow_probability/python/vi/BUILD index 3f9b74522b..80bcc0ea12 100644 --- a/tensorflow_probability/python/vi/BUILD +++ b/tensorflow_probability/python/vi/BUILD @@ -149,6 +149,7 @@ multi_substrate_py_test( "//tensorflow_probability/python/experimental/util", 
"//tensorflow_probability/python/internal:samplers", "//tensorflow_probability/python/internal:test_util", + "//tensorflow_probability/python/internal:tf_keras", "//tensorflow_probability/python/math/psd_kernels:exponentiated_quadratic", "//tensorflow_probability/python/util:deferred_tensor", ], diff --git a/tensorflow_probability/python/vi/optimization.py b/tensorflow_probability/python/vi/optimization.py index 233499c7db..983fa8a8aa 100644 --- a/tensorflow_probability/python/vi/optimization.py +++ b/tensorflow_probability/python/vi/optimization.py @@ -442,7 +442,7 @@ def fit_surrogate_posterior(target_log_prob_fn, transformations of unconstrained variables, so that the transformations execute at runtime instead of at distribution creation. optimizer: Optimizer instance to use. This may be a TF1-style - `tf.train.Optimizer`, TF2-style `tf.keras.optimizers.Optimizer`, or any + `tf.train.Optimizer`, TF2-style `tf_keras.optimizers.Optimizer`, or any Python object that implements `optimizer.apply_gradients(grads_and_vars)`. num_steps: Python `int` number of steps to run the optimizer. convergence_criterion: Optional instance of @@ -522,7 +522,7 @@ def log_prob(z, x): losses = tfp.vi.fit_surrogate_posterior( conditioned_log_prob, surrogate_posterior=q_z, - optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), + optimizer=tf_keras.optimizers.Adam(learning_rate=0.1), num_steps=100) print(q_z.mean(), q_z.stddev()) # => approximately [2.5, 1/sqrt(2)] ``` @@ -535,7 +535,7 @@ def log_prob(z, x): losses = tfp.vi.fit_surrogate_posterior( conditioned_log_prob, surrogate_posterior=q_z, - optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), + optimizer=tf_keras.optimizers.Adam(learning_rate=0.1), num_steps=100, discrepancy_fn=tfp.vi.kl_forward) ``` @@ -589,7 +589,7 @@ def log_prob(z, x): conditioned_log_prob, surrogate_posterior=q_z, importance_sample_size=10, - optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), + optimizer=tf_keras.optimizers.Adam(learning_rate=0.1), num_steps=200) # Estimate posterior statistics with importance sampling. 
@@ -680,7 +680,7 @@ def variational_model_fn(): losses, log_amplitude_path, sample_path = tfp.vi.fit_surrogate_posterior( target_log_prob_fn=lambda *args: model.log_prob(args), surrogate_posterior=q, - optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), + optimizer=tf_keras.optimizers.Adam(learning_rate=0.1), sample_size=1, num_steps=500, trace_fn=lambda loss, grads, vars: (loss, kernel_log_amplitude, diff --git a/tensorflow_probability/python/vi/optimization_test.py b/tensorflow_probability/python/vi/optimization_test.py index 0db4a217ff..5193f5b898 100644 --- a/tensorflow_probability/python/vi/optimization_test.py +++ b/tensorflow_probability/python/vi/optimization_test.py @@ -33,6 +33,7 @@ from tensorflow_probability.python.experimental.util import trainable from tensorflow_probability.python.internal import samplers from tensorflow_probability.python.internal import test_util +from tensorflow_probability.python.internal import tf_keras from tensorflow_probability.python.math.psd_kernels import exponentiated_quadratic from tensorflow_probability.python.util import deferred_tensor from tensorflow_probability.python.vi import optimization @@ -79,7 +80,7 @@ def trainable_log_prob(z): q, num_steps=1000, sample_size=10, - optimizer=tf.keras.optimizers.Adam(0.1), + optimizer=tf_keras.optimizers.Adam(0.1), seed=seed) self.evaluate(tf1.global_variables_initializer()) with tf.control_dependencies([loss_curve]): @@ -112,7 +113,7 @@ def log_prob(z, x): conditioned_log_prob, surrogate_posterior=q_z, importance_sample_size=10, - optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), + optimizer=tf_keras.optimizers.Adam(learning_rate=0.1), num_steps=100, seed=opt_seed) self.evaluate(tf1.global_variables_initializer()) @@ -140,7 +141,7 @@ def log_prob(z, x): conditioned_log_prob, surrogate_posterior=q_z_again, importance_sample_size=10, - optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), + optimizer=tf_keras.optimizers.Adam(learning_rate=0.1), num_steps=100, seed=opt_seed) self.evaluate(tf1.global_variables_initializer()) @@ -172,7 +173,7 @@ def trainable_q_fn(): q, num_steps=1000, sample_size=100, - optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), + optimizer=tf_keras.optimizers.Adam(learning_rate=0.1), seed=seed) self.evaluate(tf1.global_variables_initializer()) loss_curve_ = self.evaluate((loss_curve)) @@ -230,7 +231,7 @@ def variational_model_fn(): losses, sample_path = optimization.fit_surrogate_posterior( target_log_prob_fn=lambda *args: model.log_prob(args), surrogate_posterior=q, - optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), + optimizer=tf_keras.optimizers.Adam(learning_rate=0.1), num_steps=100, seed=test_util.test_seed(), sample_size=1, diff --git a/tensorflow_probability/substrates/meta/rewrite.py b/tensorflow_probability/substrates/meta/rewrite.py index 773292cc01..0fa004ecfd 100644 --- a/tensorflow_probability/substrates/meta/rewrite.py +++ b/tensorflow_probability/substrates/meta/rewrite.py @@ -86,7 +86,12 @@ 'pass', ('from tensorflow.python ' 'import pywrap_tensorflow as c_api'): - 'pass' + 'pass', + 'from tensorflow_probability.python.internal import tf_keras': + ('from tensorflow_probability.python.internal.backend.numpy ' + 'import keras as tf_keras'), + # 'import tf_keras.api': '# import tf_keras.api', + # 'tf1_layers = tf1.layers': '# tf1_layers = tf1.layers', } DISABLED_BY_PKG = { diff --git a/testing/run_tfp_test.sh b/testing/run_tfp_test.sh index d7ec5a9ff3..ce96c2840c 100755 --- a/testing/run_tfp_test.sh +++ b/testing/run_tfp_test.sh @@ -66,7 +66,6 @@ 
bazel test \ --test_timeout 300,450,1200,3600 \ --test_tag_filters="-gpu,-requires-gpu-nvidia,-notap,-no-oss-ci,-tf2-broken,-tf2-kokoro-broken" \ --test_env=TFP_HYPOTHESIS_MAX_EXAMPLES=2 \ - --test_env=TF_USE_LEGACY_KERAS=1 \ --action_env=PATH \ --action_env=LD_LIBRARY_PATH \ --test_output=errors \
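Finally, the `substrates/meta/rewrite.py` hunk adds one entry to the replacement table so that NumPy/JAX substrate builds resolve the shim import to the backend's Keras stub, and `run_tfp_test.sh` drops the `TF_USE_LEGACY_KERAS=1` test environment override (presumably unnecessary once the shim pins the Keras surface itself). A toy sketch of that string-replacement mechanism, using a hypothetical helper name rather than the real rewrite entry point, just to show what the new table entry does:

```python
# Hypothetical, simplified illustration of the substrate rewrite step.
SUBSTRATE_REPLACEMENTS = {
    'from tensorflow_probability.python.internal import tf_keras':
        ('from tensorflow_probability.python.internal.backend.numpy '
         'import keras as tf_keras'),
}


def rewrite_source(source: str) -> str:
  """Applies the plain-string replacements to a module's source text."""
  for old, new in SUBSTRATE_REPLACEMENTS.items():
    source = source.replace(old, new)
  return source


print(rewrite_source(
    'from tensorflow_probability.python.internal import tf_keras\n'))
# -> from tensorflow_probability.python.internal.backend.numpy import keras as tf_keras
```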