-
Notifications
You must be signed in to change notification settings - Fork 520
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Nomic embedding integration (#610)
* Add nomic embedding integration Signed-off-by: Aakash Thatte <[email protected]> * Add imports for nomic integration Signed-off-by: Aakash Thatte <[email protected]> * Add example for nomic integration Signed-off-by: Aakash Thatte <[email protected]> * Fix pylint issues Signed-off-by: Aakash Thatte <[email protected]> * Fix pylint issues * Add test for nomic integration * fix the lint for the embedding init.py file * fix the lint error for the nomic.py file * fix the lint error for the nomic.py file --------- Signed-off-by: Aakash Thatte <[email protected]> Co-authored-by: SimFG <[email protected]>
- Loading branch information
Showing
6 changed files
with
129 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
"""Nomic embedding integration""" | ||
|
||
import numpy as np | ||
|
||
from gptcache.utils import import_nomic | ||
from gptcache.embedding.base import BaseEmbedding | ||
|
||
import_nomic() | ||
|
||
# import nomic # pylint: disable=C0413 | ||
from nomic import cli # pylint: disable=C0413 | ||
from nomic import embed # pylint: disable=C0413 | ||
|
||
class Nomic(BaseEmbedding):
    """Generate text embeddings for given text using Nomic embed.

    :param model: model name, defaults to 'nomic-embed-text-v1.5'.
    :type model: str
    :param api_key: Nomic API Key, forwarded to ``nomic.cli.login``.
    :type api_key: str
    :param task_type: Task type in Nomic, defaults to 'search_document'.
    :type task_type: str
    :param dimensionality: Desired dimension of embeddings. When left as
        ``None`` it is discovered lazily by the ``dimension`` property.
    :type dimensionality: int

    Example:
        .. code-block:: python

            import os
            from gptcache.embedding import Nomic

            test_sentence = 'Hey this is Nomic embedding integration to gptcache.'
            encoder = Nomic(model='nomic-embed-text-v1.5',
                            api_key=os.getenv("NOMIC_API_KEY"),
                            dimensionality=64)
            embed = encoder.to_embeddings(test_sentence)
    """

    def __init__(self,
                 model: str = "nomic-embed-text-v1.5",
                 api_key: str = None,
                 task_type: str = "search_document",
                 dimensionality: int = None) -> None:
        # Login to nomic before any embedding request is issued.
        cli.login(token=api_key)

        self._model = model
        self._task_type = task_type
        self._dimensionality = dimensionality

    def to_embeddings(self, data, **_):
        """Generate embedding(s) for the given text input.

        :param data: a single text, or a list of texts.
        :type data: str or list
        :return: a float32 array. For a single text (or a one-element list)
            the leading batch axis is dropped and the result is one-dimensional;
            for several texts one row per text is returned.
        """
        if not isinstance(data, list):
            data = [data]

        # Response will be a dictionary with key 'embeddings'
        # and value will be a list of lists
        response = embed.text(
            texts=data,
            model=self._model,
            task_type=self._task_type,
            dimensionality=self._dimensionality)
        vectors = np.array(response["embeddings"]).astype("float32")

        # Only drop the batch axis when it really has length one; an
        # unconditional squeeze(0) raises ValueError for multi-text input.
        if vectors.ndim >= 1 and vectors.shape[0] == 1:
            return vectors.squeeze(0)
        return vectors

    @property
    def dimension(self):
        """Embedding dimension.

        If no dimensionality was configured, a probe text is embedded once
        and the length of the result is cached for later calls.

        :return: embedding dimension
        """
        if not self._dimensionality:
            foo_emb = self.to_embeddings("foo")
            self._dimensionality = len(foo_emb)
        return self._dimensionality
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
import os | ||
import types | ||
from unittest.mock import patch | ||
from gptcache.utils import import_nomic | ||
from gptcache.embedding import Nomic | ||
from gptcache.adapter.api import _get_model | ||
|
||
import_nomic() | ||
|
||
def test_nomic():
    """Check that Nomic embeddings have the length reported by ``dimension``.

    The encoder is built twice: once directly through the ``Nomic`` class and
    once through the ``_get_model`` adapter helper. Each encoder is created
    lazily, right before it is used, so the second one is only constructed
    after the first has been fully exercised.
    """
    factories = (
        lambda: Nomic(
            model='nomic-embed-text-v1.5',
            api_key=os.getenv("NOMIC_API_KEY"),
            dimensionality=64,
        ),
        lambda: _get_model(
            model_src="nomic",
            model_config={"model": "nomic-embed-text-v1.5"},
        ),
    )

    for make_encoder in factories:
        t = make_encoder()
        data = t.to_embeddings("foo")
        assert len(data) == t.dimension, f"{len(data)}, {t.dimension}"