diff --git a/generative_ai/embeddings/normalize_embeddings.py b/generative_ai/embeddings/normalize_embeddings.py
new file mode 100644
index 00000000000..0052f6b3ad1
--- /dev/null
+++ b/generative_ai/embeddings/normalize_embeddings.py
@@ -0,0 +1,39 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+
+def normalize_embedding(embedding_np: np.ndarray) -> np.ndarray:
+    """
+    Normalizes an embedding array to have a magnitude (L2 norm) of 1.
+
+    Args:
+        embedding_np: The input NumPy array to be normalized.
+
+    Returns:
+        The normalized NumPy array with a magnitude of 1.
+        Returns the original array if its magnitude is 0.
+    """
+    # Calculate the L2 norm (magnitude) of the array
+    norm = np.linalg.norm(embedding_np)
+
+    # Avoid division by zero if the array is all zeros.
+    #
+    # In theory an all-zeros embedding array should never occur.
+    if norm == 0:
+        return embedding_np
+
+    # Divide the array by its norm to normalize it
+    return embedding_np / norm
diff --git a/generative_ai/embeddings/test_embeddings_examples.py b/generative_ai/embeddings/test_embeddings_examples.py
index b430b978e2c..70b48f52dc1 100644
--- a/generative_ai/embeddings/test_embeddings_examples.py
+++ b/generative_ai/embeddings/test_embeddings_examples.py
@@ -31,6 +31,7 @@
 import multimodal_example
 import multimodal_image_example
 import multimodal_video_example
+import normalize_embeddings
 
 
 @backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
@@ -97,6 +98,22 @@ def test_code_embed_text() -> None:
     assert [len(e) for e in embeddings] == [dimensionality or 768] * len(texts)
 
 
+@backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
+def test_embedding_normalization() -> None:
+    import numpy as np
+
+    embedding_value = [0.01] * 256
+    embedding_np = np.array(embedding_value)
+    assert np.isclose(np.linalg.norm(embedding_np), 0.16)
+
+    normalized_embedding_np = normalize_embeddings.normalize_embedding(embedding_np)
+    assert np.isclose(np.linalg.norm(normalized_embedding_np), 1)
+
+    invalid_embedding_np = np.array([0.0])
+    normalized_embedding_np = normalize_embeddings.normalize_embedding(invalid_embedding_np)
+    assert np.isclose(np.linalg.norm(normalized_embedding_np), 0)
+
+
 @backoff.on_exception(backoff.expo, FailedPrecondition, max_time=300)
 def dispose(tuning_job) -> None:  # noqa: ANN001
     if tuning_job._status.name == "PIPELINE_STATE_RUNNING":
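
A minimal usage sketch (not part of the diff above; the embedding values are made up for illustration, not real model output): once vectors have been passed through normalize_embedding, cosine similarity reduces to a plain dot product.

import numpy as np

from normalize_embeddings import normalize_embedding

# Two illustrative embedding vectors (assumed values for demonstration only).
a = normalize_embedding(np.array([0.2, 0.1, 0.4, 0.0, 0.3, 0.1, 0.0, 0.2]))
b = normalize_embedding(np.array([0.1, 0.1, 0.5, 0.1, 0.2, 0.0, 0.1, 0.2]))

# Both vectors now have unit length ...
print(np.linalg.norm(a), np.linalg.norm(b))  # -> 1.0 1.0

# ... so their dot product equals their cosine similarity.
print(float(np.dot(a, b)))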