From 2834558bd8a7418ed5bbd39acffab6503cb82a7a Mon Sep 17 00:00:00 2001 From: Aakash Thatte <84656834+sky-2002@users.noreply.github.com> Date: Thu, 7 Mar 2024 07:56:45 +0530 Subject: [PATCH] Add Nomic embedding integration (#610) * Add nomic embedding integration Signed-off-by: Aakash Thatte * Add imports for nomic integration Signed-off-by: Aakash Thatte * Add example for nomic integration Signed-off-by: Aakash Thatte * Fix pylint issues Signed-off-by: Aakash Thatte * Fix pylint issues * Add test for nomic integration * fix the lint for the embedding init.py file * fix the lint error for the nomic.py file * fix the lint error for the nomic.py file --------- Signed-off-by: Aakash Thatte Co-authored-by: SimFG --- examples/README.md | 14 ++++ gptcache/adapter/api.py | 3 + gptcache/embedding/__init__.py | 8 +++ gptcache/embedding/nomic.py | 82 ++++++++++++++++++++++++ gptcache/utils/__init__.py | 5 ++ tests/unit_tests/embedding/test_nomic.py | 17 +++++ 6 files changed, 129 insertions(+) create mode 100644 gptcache/embedding/nomic.py create mode 100644 tests/unit_tests/embedding/test_nomic.py diff --git a/examples/README.md b/examples/README.md index 5cad5c05..b3b3cbc4 100644 --- a/examples/README.md +++ b/examples/README.md @@ -150,6 +150,20 @@ paddlenlp = PaddleNLP() +
+ + Nomic + +```python +from gptcache.embedding import Nomic + +nm = Nomic(api_key='your-api-key') +# nm.dimension +# nm.to_embeddings +``` + +
+ ### Custom embedding The function has two parameters: the preprocessed string and parameters reserved for user customization. To acquire these parameters, a similar method to the one above is used: `kwargs.get("embedding_func", {})`. diff --git a/gptcache/adapter/api.py b/gptcache/adapter/api.py index 09a2b1e4..9ee5ec10 100644 --- a/gptcache/adapter/api.py +++ b/gptcache/adapter/api.py @@ -18,6 +18,7 @@ Rwkv, PaddleNLP, UForm, + Nomic ) from gptcache.embedding.base import BaseEmbedding from gptcache.manager import manager_factory @@ -285,6 +286,8 @@ def _get_model(model_src, model_config=None): return PaddleNLP(**model_config) if model_src == "uform": return UForm(**model_config) + if model_src == "nomic": + return Nomic(**model_config) def _get_eval(strategy, kws=None): diff --git a/gptcache/embedding/__init__.py b/gptcache/embedding/__init__.py index 08b255c7..65c2dde4 100644 --- a/gptcache/embedding/__init__.py +++ b/gptcache/embedding/__init__.py @@ -30,6 +30,14 @@ rwkv = LazyImport("rwkv", globals(), "gptcache.embedding.rwkv") paddlenlp = LazyImport("paddlenlp", globals(), "gptcache.embedding.paddlenlp") uform = LazyImport("uform", globals(), "gptcache.embedding.uform") +nomic = LazyImport("nomic", globals(), "gptcache.embedding.nomic") + + +def Nomic(model: str = "nomic-embed-text-v1.5", + api_key: str = None, + task_type: str = "search_document", + dimensionality: int = None): + return nomic.Nomic(model, api_key, task_type, dimensionality) def Cohere(model="large", api_key=None): diff --git a/gptcache/embedding/nomic.py b/gptcache/embedding/nomic.py new file mode 100644 index 00000000..78b66731 --- /dev/null +++ b/gptcache/embedding/nomic.py @@ -0,0 +1,82 @@ +"""Nomic embedding integration""" + +import numpy as np + +from gptcache.utils import import_nomic +from gptcache.embedding.base import BaseEmbedding + +import_nomic() + +# import nomic # pylint: disable=C0413 +from nomic import cli # pylint: disable=C0413 +from nomic import embed # pylint: 
disable=C0413 + +class Nomic(BaseEmbedding): + """Generate text embedding for given text using Nomic. + """ + def __init__(self, + model: str = "nomic-embed-text-v1.5", + api_key: str = None, + task_type: str = "search_document", + dimensionality: int = None) -> None: + """Generate text embedding for given text using Nomic embed. + + :param model: model name, defaults to 'nomic-embed-text-v1.5'. + :type model: str + :param api_key: Nomic API Key. + :type api_key: str + :param task_type: Task type in Nomic, defaults to 'search_document'. + :type task_type: str + :param dimensionality: Desired dimension of embeddings. + :type dimensionality: int + + Example: + .. code-block:: python + + import os + from gptcache.embedding import Nomic + + test_sentence = 'Hey this is Nomic embedding integration to gptcache.' + encoder = Nomic(model='nomic-embed-text-v1.5', + api_key=os.getenv("NOMIC_API_KEY"), + dimensionality=64) + embed = encoder.to_embeddings(test_sentence) + """ + # Login to nomic + cli.login(token=api_key) + + self._model = model + self._task_type = task_type + self._dimensionality = dimensionality + + def to_embeddings(self, data, **_): + """Generate embedding given text input + + :param data: text in string. + :type data: str + + :return: a text embedding in shape of (self._dimensionality,). + """ + if not isinstance(data, list): + data = [data] + + # Response will be a dictionary with key 'embeddings' + # and value will be a list of lists + response = embed.text( + texts=data, + model=self._model, + task_type=self._task_type, + dimensionality=self._dimensionality) + embeddings = response["embeddings"] + return np.array(embeddings).astype("float32").squeeze(0) + + @property + def dimension(self): + """Embedding dimension. 
+ + :return: embedding dimension + """ + if not self._dimensionality: + foo_emb = self.to_embeddings("foo") + self._dimensionality = len(foo_emb) + return self._dimensionality diff --git a/gptcache/utils/__init__.py b/gptcache/utils/__init__.py index 093fd354..7cc50d3d 100644 --- a/gptcache/utils/__init__.py +++ b/gptcache/utils/__init__.py @@ -3,6 +3,7 @@ "import_milvus_lite", "import_sbert", "import_cohere", + "import_nomic", "import_fasttext", "import_huggingface", "import_uform", @@ -80,6 +81,10 @@ def import_cohere(): _check_library("cohere") +def import_nomic(): + _check_library("nomic") + + def import_fasttext(): _check_library("fasttext", package="fasttext==0.9.2") diff --git a/tests/unit_tests/embedding/test_nomic.py b/tests/unit_tests/embedding/test_nomic.py new file mode 100644 index 00000000..059c2faf --- /dev/null +++ b/tests/unit_tests/embedding/test_nomic.py @@ -0,0 +1,17 @@ +import os +import types +from unittest.mock import patch +from gptcache.utils import import_nomic +from gptcache.embedding import Nomic +from gptcache.adapter.api import _get_model + +import_nomic() + +def test_nomic(): + t = Nomic(model='nomic-embed-text-v1.5', api_key=os.getenv("NOMIC_API_KEY"), dimensionality=64) + data = t.to_embeddings("foo") + assert len(data) == t.dimension, f"{len(data)}, {t.dimension}" + + t = _get_model(model_src="nomic", model_config={"model": "nomic-embed-text-v1.5"}) + data = t.to_embeddings("foo") + assert len(data) == t.dimension, f"{len(data)}, {t.dimension}" \ No newline at end of file