Add Nomic embedding integration (#610)
* Add nomic embedding integration

Signed-off-by: Aakash Thatte <[email protected]>

* Add imports for nomic integration

Signed-off-by: Aakash Thatte <[email protected]>

* Add example for nomic integration

Signed-off-by: Aakash Thatte <[email protected]>

* Fix pylint issues

Signed-off-by: Aakash Thatte <[email protected]>

* Fix pylint issues

* Add test for nomic integration

* fix the lint for the embedding init.py file

* fix the lint error for the nomic.py file

* fix the lint error for the nomic.py file

---------

Signed-off-by: Aakash Thatte <[email protected]>
Co-authored-by: SimFG <[email protected]>
sky-2002 and SimFG authored Mar 7, 2024
1 parent 5f110cd commit 2834558
Showing 6 changed files with 129 additions and 0 deletions.
14 changes: 14 additions & 0 deletions examples/README.md
@@ -150,6 +150,20 @@ paddlenlp = PaddleNLP()

</details>

<details>

<summary> Nomic </summary>

```python
from gptcache.embedding import Nomic

nm = Nomic(api_key='your-api-key')
# nm.dimension
# nm.to_embeddings
```
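
For reference, a minimal usage sketch built on the attributes above (it assumes `NOMIC_API_KEY` is set in the environment):

```python
import os
from gptcache.embedding import Nomic

nm = Nomic(api_key=os.getenv("NOMIC_API_KEY"), dimensionality=64)
vec = nm.to_embeddings("hello world")  # numpy array of shape (64,)
print(nm.dimension)  # 64
```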

</details>

### Custom embedding

The function has two parameters: the preprocessed string and parameters reserved for user customization. To retrieve those parameters, use the same approach as above: `kwargs.get("embedding_func", {})`. A minimal sketch follows.
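
The sketch below illustrates the shape of such a custom embedding function; the function name and the fixed-size vector are hypothetical, and a real implementation would encode the input with a model:

```python
import numpy as np

def my_embedding(data, **kwargs):
    # parameters reserved for user customization, passed through by the adapter
    params = kwargs.get("embedding_func", {})
    # hypothetical fixed-size vector; a real implementation would encode `data`
    return np.ones(8, dtype="float32")
```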
3 changes: 3 additions & 0 deletions gptcache/adapter/api.py
@@ -18,6 +18,7 @@
Rwkv,
PaddleNLP,
UForm,
Nomic
)
from gptcache.embedding.base import BaseEmbedding
from gptcache.manager import manager_factory
@@ -285,6 +286,8 @@ def _get_model(model_src, model_config=None):
return PaddleNLP(**model_config)
if model_src == "uform":
return UForm(**model_config)
if model_src == "nomic":
return Nomic(**model_config)


def _get_eval(strategy, kws=None):
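With this branch in place, `_get_model` can build the Nomic encoder from its string identifier. A minimal sketch of the dispatch, mirroring the unit test later in this commit (the API key value is a placeholder):

```python
from gptcache.adapter.api import _get_model

encoder = _get_model(
    model_src="nomic",
    model_config={"model": "nomic-embed-text-v1.5", "api_key": "your-api-key"},
)
```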
8 changes: 8 additions & 0 deletions gptcache/embedding/__init__.py
@@ -30,6 +30,14 @@
rwkv = LazyImport("rwkv", globals(), "gptcache.embedding.rwkv")
paddlenlp = LazyImport("paddlenlp", globals(), "gptcache.embedding.paddlenlp")
uform = LazyImport("uform", globals(), "gptcache.embedding.uform")
nomic = LazyImport("nomic", globals(), "gptcache.embedding.nomic")


def Nomic(model: str = "nomic-embed-text-v1.5",
api_key: str = None,
task_type: str = "search_document",
dimensionality: int = None):
return nomic.Nomic(model, api_key, task_type, dimensionality)


def Cohere(model="large", api_key=None):
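The `LazyImport` indirection means `gptcache.embedding.nomic` (and its `nomic` dependency) is only imported when the `Nomic` factory is first called. A rough sketch of the idea, not the actual `gptcache.utils.lazy_import` implementation:

```python
import importlib

class LazyModule:
    """Defers the real import until an attribute is first accessed."""
    def __init__(self, name: str):
        self._name = name
        self._module = None

    def __getattr__(self, attr):
        if self._module is None:
            self._module = importlib.import_module(self._name)
        return getattr(self._module, attr)

nomic = LazyModule("gptcache.embedding.nomic")  # nothing imported yet
# nomic.Nomic(...) would trigger the import on first use
```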
82 changes: 82 additions & 0 deletions gptcache/embedding/nomic.py
@@ -0,0 +1,82 @@
"""Nomic embedding integration"""

import numpy as np

from gptcache.utils import import_nomic
from gptcache.embedding.base import BaseEmbedding

import_nomic()

from nomic import cli # pylint: disable=C0413
from nomic import embed # pylint: disable=C0413

class Nomic(BaseEmbedding):
"""Generate text embedding for given text using Cohere.
"""
def __init__(self,
model: str = "nomic-embed-text-v1.5",
api_key: str = None,
task_type: str = "search_document",
dimensionality: int = None) -> None:
"""Generate text embedding for given text using Nomic embed.
:param model: model name, defaults to 'nomic-embed-text-v1.5'.
:type model: str
:param api_key: Nomic API Key.
:type api_key: str
:param task_type: Task type in Nomic, defaults to 'search_document'.
:type task_type: str
:param dimensionality: Desired dimension of embeddings.
:type dimensionality: int
Example:
.. code-block:: python
import os
from gptcache.embedding import Nomic
test_sentence = 'Hey this is Nomic embedding integration to gptcache.'
encoder = Nomic(model='nomic-embed-text-v1.5',
api_key=os.getenv("NOMIC_API_KEY"),
dimensionality=64)
embed = encoder.to_embeddings(test_sentence)
"""
# Login to nomic
cli.login(token=api_key)

self._model = model
self._task_type = task_type
self._dimensionality = dimensionality

def to_embeddings(self, data, **_):
"""Generate embedding given text input
:param data: text in string.
:type data: str
:return: a text embedding in shape of (self._dimensionality,).
"""
if not isinstance(data, list):
data = [data]

# Response will be a dictionary with key 'embeddings'
# and value will be a list of lists
response = embed.text(
texts=data,
model=self._model,
task_type=self._task_type,
dimensionality=self._dimensionality)
embeddings = response["embeddings"]
return np.array(embeddings).astype("float32").squeeze(0)

@property
def dimension(self):
"""Embedding dimension.
:return: embedding dimension
"""
if not self._dimensionality:
foo_emb = self.to_embeddings("foo")
self._dimensionality = len(foo_emb)
return self._dimensionality
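
A sketch of wiring this encoder into a GPTCache cache, following the pattern the project README uses for other embeddings (the sqlite/faiss backend choices here are assumptions and require those extras to be installed):

```python
import os
from gptcache import cache
from gptcache.embedding import Nomic
from gptcache.manager import CacheBase, VectorBase, get_data_manager
from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation

nm = Nomic(api_key=os.getenv("NOMIC_API_KEY"), dimensionality=64)
data_manager = get_data_manager(
    CacheBase("sqlite"),
    VectorBase("faiss", dimension=nm.dimension),  # index sized to the embedding
)
cache.init(
    embedding_func=nm.to_embeddings,
    data_manager=data_manager,
    similarity_evaluation=SearchDistanceEvaluation(),
)
```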
5 changes: 5 additions & 0 deletions gptcache/utils/__init__.py
@@ -3,6 +3,7 @@
"import_milvus_lite",
"import_sbert",
"import_cohere",
"import_nomic",
"import_fasttext",
"import_huggingface",
"import_uform",
@@ -80,6 +81,10 @@ def import_cohere():
_check_library("cohere")


def import_nomic():
_check_library("nomic")


def import_fasttext():
_check_library("fasttext", package="fasttext==0.9.2")

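`_check_library` is the existing helper the other `import_*` functions delegate to; it verifies the dependency is importable and installs it otherwise. A simplified sketch of that idiom (the real helper's behavior may differ in details):

```python
import importlib
import subprocess
import sys

def check_library(libname: str, package: str = None):
    """Import `libname`, installing `package` (or `libname`) on failure."""
    try:
        importlib.import_module(libname)
    except ImportError:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", package or libname]
        )
```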
17 changes: 17 additions & 0 deletions tests/unit_tests/embedding/test_nomic.py
@@ -0,0 +1,17 @@
import os
from gptcache.utils import import_nomic
from gptcache.embedding import Nomic
from gptcache.adapter.api import _get_model

import_nomic()

def test_nomic():
t = Nomic(model='nomic-embed-text-v1.5', api_key=os.getenv("NOMIC_API_KEY"), dimensionality=64)
data = t.to_embeddings("foo")
assert len(data) == t.dimension, f"{len(data)}, {t.dimension}"

t = _get_model(model_src="nomic", model_config={"model": "nomic-embed-text-v1.5"})
data = t.to_embeddings("foo")
assert len(data) == t.dimension, f"{len(data)}, {t.dimension}"
