Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Nomic embedding integration #610

Merged
merged 9 commits into from
Mar 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,20 @@ paddlenlp = PaddleNLP()

</details>

<details>

<summary> Nomic </summary>

```python
from gptcache.embedding import Nomic

nm = Nomic(api_key='your-api-key')
# nm.dimension
# nm.to_embeddings
```

</details>

### Custom embedding

The function has two parameters: the preprocessed string and parameters reserved for user customization. To acquire these parameters, a similar method to the one above is used: `kwargs.get("embedding_func", {})`.
Expand Down
3 changes: 3 additions & 0 deletions gptcache/adapter/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
Rwkv,
PaddleNLP,
UForm,
Nomic
)
from gptcache.embedding.base import BaseEmbedding
from gptcache.manager import manager_factory
Expand Down Expand Up @@ -285,6 +286,8 @@ def _get_model(model_src, model_config=None):
return PaddleNLP(**model_config)
if model_src == "uform":
return UForm(**model_config)
if model_src == "nomic":
return Nomic(**model_config)


def _get_eval(strategy, kws=None):
Expand Down
8 changes: 8 additions & 0 deletions gptcache/embedding/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@
rwkv = LazyImport("rwkv", globals(), "gptcache.embedding.rwkv")
paddlenlp = LazyImport("paddlenlp", globals(), "gptcache.embedding.paddlenlp")
uform = LazyImport("uform", globals(), "gptcache.embedding.uform")
nomic = LazyImport("nomic", globals(), "gptcache.embedding.nomic")


def Nomic(model: str = "nomic-embed-text-v1.5",
          api_key: str = None,
          task_type: str = "search_document",
          dimensionality: int = None):
    """Lazily construct a Nomic text-embedding instance.

    Thin factory that defers the real import to the ``nomic`` LazyImport
    module, so the third-party dependency is only loaded on first use.

    :param model: Nomic model name, defaults to 'nomic-embed-text-v1.5'.
    :param api_key: Nomic API key.
    :param task_type: Nomic task type, defaults to 'search_document'.
    :param dimensionality: desired embedding dimension.
    """
    return nomic.Nomic(
        model=model,
        api_key=api_key,
        task_type=task_type,
        dimensionality=dimensionality,
    )


def Cohere(model="large", api_key=None):
Expand Down
82 changes: 82 additions & 0 deletions gptcache/embedding/nomic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""Nomic embedding integration"""

import numpy as np

from gptcache.utils import import_nomic
from gptcache.embedding.base import BaseEmbedding

import_nomic()

# import nomic # pylint: disable=C0413
from nomic import cli # pylint: disable=C0413
from nomic import embed # pylint: disable=C0413

class Nomic(BaseEmbedding):
    """Generate text embedding for given text using Nomic embed.

    Wraps the ``nomic`` client library: logs in with the provided API key at
    construction time and calls ``nomic.embed.text`` to produce embeddings.
    """
    def __init__(self,
                 model: str = "nomic-embed-text-v1.5",
                 api_key: str = None,
                 task_type: str = "search_document",
                 dimensionality: int = None) -> None:
        """Generate text embedding for given text using Nomic embed.

        :param model: model name, defaults to 'nomic-embed-text-v1.5'.
        :type model: str
        :param api_key: Nomic API Key.
        :type api_key: str
        :param task_type: Task type in Nomic, defaults to 'search_document'.
        :type task_type: str
        :param dimensionality: Desired dimension of embeddings.
        :type dimensionality: int

        Example:
            .. code-block:: python

                import os
                from gptcache.embedding import Nomic

                test_sentence = 'Hey this is Nomic embedding integration to gptcache.'
                encoder = Nomic(model='nomic-embed-text-v1.5',
                                api_key=os.getenv("NOMIC_API_KEY"),
                                dimensionality=64)
                embed = encoder.to_embeddings(test_sentence)
        """
        # Login to nomic — NOTE(review): performs a network call at construction time.
        cli.login(token=api_key)

        self._model = model
        self._task_type = task_type
        self._dimensionality = dimensionality

    def to_embeddings(self, data, **_):
        """Generate embedding given text input.

        :param data: text in string (or a list of strings).
        :type data: str

        :return: for a single string input, a float32 array of shape
            (self._dimensionality,); for a list of N strings, shape
            (N, self._dimensionality) — squeeze(0) only removes the
            leading axis when N == 1.
        """
        if not isinstance(data, list):
            data = [data]

        # Response is a dictionary with key 'embeddings' whose value is a
        # list of lists (one embedding per input text).
        response = embed.text(
            texts=data,
            model=self._model,
            task_type=self._task_type,
            dimensionality=self._dimensionality)
        embeddings = response["embeddings"]
        return np.array(embeddings).astype("float32").squeeze(0)

    @property
    def dimension(self):
        """Embedding dimension.

        Lazily determined with a probe request when ``dimensionality`` was
        not supplied at construction time, then cached.

        :return: embedding dimension
        """
        if not self._dimensionality:
            # Probe the API once with a dummy string to discover the
            # model's native output dimension.
            foo_emb = self.to_embeddings("foo")
            self._dimensionality = len(foo_emb)
        return self._dimensionality
5 changes: 5 additions & 0 deletions gptcache/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"import_milvus_lite",
"import_sbert",
"import_cohere",
"import_nomic",
"import_fasttext",
"import_huggingface",
"import_uform",
Expand Down Expand Up @@ -80,6 +81,10 @@ def import_cohere():
_check_library("cohere")


def import_nomic():
    """Ensure the ``nomic`` package is available (delegates to ``_check_library``)."""
    _check_library("nomic")


def import_fasttext():
_check_library("fasttext", package="fasttext==0.9.2")

Expand Down
17 changes: 17 additions & 0 deletions tests/unit_tests/embedding/test_nomic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import os
import types
from unittest.mock import patch
from gptcache.utils import import_nomic
from gptcache.embedding import Nomic
from gptcache.adapter.api import _get_model

import_nomic()

def test_nomic():
    # NOTE(review): this is an integration test, not a pure unit test — the
    # Nomic constructor logs in with NOMIC_API_KEY and to_embeddings hits the
    # live API, so it needs network access and a valid key in the environment.
    # Consider patching nomic.embed.text (unittest.mock.patch is already
    # imported above but unused).
    t = Nomic(model='nomic-embed-text-v1.5', api_key=os.getenv("NOMIC_API_KEY"), dimensionality=64)
    data = t.to_embeddings("foo")
    assert len(data) == t.dimension, f"{len(data)}, {t.dimension}"

    # Also exercise the adapter-level factory path; dimensionality is omitted
    # here, so t.dimension triggers the lazy probe request.
    t = _get_model(model_src="nomic", model_config={"model": "nomic-embed-text-v1.5"})
    data = t.to_embeddings("foo")
    assert len(data) == t.dimension, f"{len(data)}, {t.dimension}"
Loading