From 6cdbf6f7de10a82eec6ec74302c3326edbee23a8 Mon Sep 17 00:00:00 2001
From: Matt Watson <1389937+mattdangerw@users.noreply.github.com>
Date: Wed, 10 Apr 2024 10:20:30 -0700
Subject: [PATCH 1/3] Fix saving bug for untied weights with Keras 3.2 (#1568)

Tricky bug: keras.layers.Embedding has started overriding
save_own_variables/load_own_variables for quantization. We can no longer
rely on absolute indexing for the reverse embedding, e.g.
store["1"] = self.reverse_embeddings.

I also couldn't figure out a way to keep supporting loading a tied
checkpoint into an untied layer. This is because the embedding will now
unconditionally raise an error if weight counts are mismatched here:
https://github.com/keras-team/keras/blob/v3.2.0/keras/layers/core/embedding.py#L232-L264

I don't think we have a direct need for this functionality today, so I
just removed it, but it could be worth considering how to bring it back
in the future.
---
 .../layers/modeling/reversible_embedding.py  | 24 ++++++-----
 .../modeling/reversible_embedding_test.py    | 41 ++++++++++---------
 2 files changed, 36 insertions(+), 29 deletions(-)

diff --git a/keras_nlp/layers/modeling/reversible_embedding.py b/keras_nlp/layers/modeling/reversible_embedding.py
index 1fa5f5f903..9f80460e0a 100644
--- a/keras_nlp/layers/modeling/reversible_embedding.py
+++ b/keras_nlp/layers/modeling/reversible_embedding.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
-
 from keras_nlp.api_export import keras_nlp_export
 from keras_nlp.backend import keras
 from keras_nlp.backend import ops
@@ -140,18 +138,24 @@ def get_config(self):
         )
         return config
 
+    def save_own_variables(self, store):
+        if not self.built:
+            return
+        super().save_own_variables(store)
+        # Before Keras 3.2, the reverse weight is saved in the super() call.
+        # After Keras 3.2, the reverse weight must be saved manually.
+        if len(store.keys()) < len(self.weights):
+            # Store the reverse embedding as the last weight.
+            store[str(len(store.keys()))] = self.reverse_embeddings
+
     def load_own_variables(self, store):
         if not self.built:
             self.build()
-        self.embeddings.assign(store["0"])
+        super().load_own_variables(store)
         if not self.tie_weights:
-            # Handle the case where saved weights are tied, but the layer
-            # weights untied. We can simply assign the embedding weights to both
-            # variables in this case.
-            if len(store.keys()) == 1:
-                self.reverse_embeddings.assign(np.transpose(store["0"]))
-            else:
-                self.reverse_embeddings.assign(store["1"])
+            # Last weight in the store is the reverse embedding weights.
+            key = str(len(store.keys()) - 1)
+            self.reverse_embeddings.assign(store[key])
 
     def compute_output_spec(self, inputs, reverse=False):
         output_shape = list(inputs.shape)
diff --git a/keras_nlp/layers/modeling/reversible_embedding_test.py b/keras_nlp/layers/modeling/reversible_embedding_test.py
index 0875759a77..62ddd49d77 100644
--- a/keras_nlp/layers/modeling/reversible_embedding_test.py
+++ b/keras_nlp/layers/modeling/reversible_embedding_test.py
@@ -44,6 +44,28 @@ def test_layer_behaviors_tied(self, tie_weights):
             expected_num_trainable_weights=1 if tie_weights else 2,
         )
 
+    @parameterized.named_parameters(
+        ("tie_weights", True),
+        ("untie_weights", False),
+    )
+    def test_saving(self, tie_weights):
+        input_data = random.randint(minval=0, maxval=100, shape=(4, 10))
+        model = keras.Sequential(
+            [
+                ReversibleEmbedding(
+                    input_dim=100,
+                    output_dim=32,
+                    tie_weights=tie_weights,
+                )
+            ]
+        )
+        path = os.path.join(self.get_temp_dir(), "model.keras")
+        model_output = model(input_data)
+        model.save(path, save_format="keras_v3")
+        restored_model = keras.models.load_model(path)
+        restored_output = restored_model(input_data)
+        self.assertAllClose(model_output, restored_output)
+
     def test_correctness(self):
         layer = ReversibleEmbedding(input_dim=3, output_dim=2)
         layer.build()
@@ -57,25 +79,6 @@ def test_correctness(self):
         out = layer(np.array(([[1.0, 1.0]])), reverse=True)
         self.assertAllClose(out, np.array([[0.0, 4.0, 6.0]]))
 
-    def test_tied_checkpoint_untied_weights(self):
-        embedding = ReversibleEmbedding(100, 16, tie_weights=True)
-        inputs = keras.Input(shape=(10,), dtype="int32")
-        hidden_states = embedding(inputs)
-        outputs = embedding(hidden_states, reverse=True)
-        tied_model = keras.Model(inputs, outputs)
-        path = os.path.join(self.get_temp_dir(), "checkpoint.weights.h5")
-        tied_model.save_weights(path)
-
-        embedding = ReversibleEmbedding(100, 16, tie_weights=False)
-        inputs = keras.Input(shape=(10,), dtype="int32")
-        hidden_states = embedding(inputs)
-        outputs = embedding(hidden_states, reverse=True)
-        untied_model = keras.Model(inputs, outputs)
-        untied_model.load_weights(path)
-
-        input_data = ops.ones(shape=(4, 10), dtype="int32")
-        self.assertAllClose(untied_model(input_data), tied_model(input_data))
-
     def test_reverse_dtype(self):
         embedding = ReversibleEmbedding(100, 16, reverse_dtype="float32")
         input_data = ops.ones(shape=(4, 10, 16))
From c157ac2a0f073e0dc1c472d57255fc674ea5688c Mon Sep 17 00:00:00 2001
From: Matt Watson <1389937+mattdangerw@users.noreply.github.com>
Date: Wed, 10 Apr 2024 14:30:42 -0700
Subject: [PATCH 2/3] 0.9 is out, nightly should be a preview of 0.10 now (#1570)

---
 keras_nlp/version_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_nlp/version_utils.py b/keras_nlp/version_utils.py
index 5735e45aaa..8d719dc4da 100644
--- a/keras_nlp/version_utils.py
+++ b/keras_nlp/version_utils.py
@@ -15,7 +15,7 @@
 from keras_nlp.api_export import keras_nlp_export
 
 # Unique source of truth for the version number.
-__version__ = "0.9.0"
+__version__ = "0.10.0"
 
 
 @keras_nlp_export("keras_nlp.version")
From ab649f570435503b0ab6c665f4a2f72363036522 Mon Sep 17 00:00:00 2001
From: Matt Watson <1389937+mattdangerw@users.noreply.github.com>
Date: Wed, 10 Apr 2024 14:35:17 -0700
Subject: [PATCH 3/3] Do the reverse embedding in the same dtype as the input embedding (#1548)

---
 keras_nlp/layers/modeling/reversible_embedding.py | 10 +++++-----
 keras_nlp/models/llama/llama_backbone.py          |  1 -
 keras_nlp/models/mistral/mistral_backbone.py      |  1 -
 keras_nlp/samplers/sampler.py                     |  4 +---
 4 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/keras_nlp/layers/modeling/reversible_embedding.py b/keras_nlp/layers/modeling/reversible_embedding.py
index 9f80460e0a..df57860c4d 100644
--- a/keras_nlp/layers/modeling/reversible_embedding.py
+++ b/keras_nlp/layers/modeling/reversible_embedding.py
@@ -48,8 +48,7 @@ class ReversibleEmbedding(keras.layers.Embedding):
         mask_zero: Boolean, whether or not the input value 0 is a special
             "padding" value that should be masked out.
         reverse_dtype: The dtype for the reverse projection computation.
-            For stability, it is usually best to use full precision even when
-            working with half or mixed precision training.
+            Defaults to the `compute_dtype` of the layer.
         **kwargs: other keyword arguments passed to `keras.layers.Embedding`,
             including `name`, `trainable`, `dtype` etc.
 
@@ -90,7 +89,7 @@ def __init__(
         embeddings_regularizer=None,
         embeddings_constraint=None,
         mask_zero=False,
-        reverse_dtype="float32",
+        reverse_dtype=None,
         **kwargs,
     ):
         super().__init__(
@@ -122,8 +121,9 @@ def call(self, inputs, reverse=False):
                 kernel = ops.transpose(ops.convert_to_tensor(self.embeddings))
             else:
                 kernel = self.reverse_embeddings
-            inputs = ops.cast(inputs, self.reverse_dtype)
-            kernel = ops.cast(kernel, self.reverse_dtype)
+            if self.reverse_dtype is not None:
+                inputs = ops.cast(inputs, self.reverse_dtype)
+                kernel = ops.cast(kernel, self.reverse_dtype)
             return ops.matmul(inputs, kernel)
 
         return super().call(inputs)
diff --git a/keras_nlp/models/llama/llama_backbone.py b/keras_nlp/models/llama/llama_backbone.py
index 087944c552..e586fa97f5 100644
--- a/keras_nlp/models/llama/llama_backbone.py
+++ b/keras_nlp/models/llama/llama_backbone.py
@@ -109,7 +109,6 @@ def __init__(
             tie_weights=False,
             embeddings_initializer=_llama_kernel_initializer(stddev=0.01),
             dtype=dtype,
-            reverse_dtype=dtype,
             name="token_embedding",
         )
         self.transformer_layers = []
diff --git a/keras_nlp/models/mistral/mistral_backbone.py b/keras_nlp/models/mistral/mistral_backbone.py
index 6e9315690a..1ee2dfce66 100644
--- a/keras_nlp/models/mistral/mistral_backbone.py
+++ b/keras_nlp/models/mistral/mistral_backbone.py
@@ -121,7 +121,6 @@ def __init__(
             tie_weights=False,
             embeddings_initializer=_mistral_kernel_initializer(stddev=0.01),
             dtype=dtype,
-            reverse_dtype=dtype,
             name="token_embedding",
         )
         self.transformer_layers = []
diff --git a/keras_nlp/samplers/sampler.py b/keras_nlp/samplers/sampler.py
index 43950dea2f..d3680a3d51 100644
--- a/keras_nlp/samplers/sampler.py
+++ b/keras_nlp/samplers/sampler.py
@@ -145,10 +145,8 @@ def compute_probabilities(self, logits):
         This will always be done in full precision, regardless of dtype, and
         scale by `temperature`.
         """
-        logits_dtype = logits.dtype
         logits = ops.cast(logits, "float32")
-        probs = keras.activations.softmax(logits / self.temperature)
-        return ops.cast(probs, logits_dtype)
+        return keras.activations.softmax(logits / self.temperature)
 
     def run_loop(
         self, cond, body, model=None, loop_vars=None, maximum_iterations=None