From 8428df6a94a4912f22f2105ccfa070267411929e Mon Sep 17 00:00:00 2001
From: Ashwin Mathur <97467100+awinml@users.noreply.github.com>
Date: Mon, 18 Mar 2024 02:14:48 +0530
Subject: [PATCH] feat!: Update embedders to use new `Secret` API (#29)
---
README.md | 50 ++++----
examples/document_embedder_example.py | 3 +-
examples/semantic_search_pipeline_example.py | 7 +-
examples/text_embedder_example.py | 2 +-
pyproject.toml | 44 ++++---
.../embedders}/voyage_embedders/__about__.py | 2 +-
.../embedders/voyage_embedders/__init__.py | 8 ++
.../voyage_document_embedder.py | 94 +++++++++------
.../voyage_embedders/voyage_text_embedder.py | 79 ++++++++-----
src/voyage_embedders/__init__.py | 8 --
tests/test_voyage_document_embedder.py | 108 +++++++++++++++---
tests/test_voyage_text_embedder.py | 76 ++++++++++--
12 files changed, 331 insertions(+), 150 deletions(-)
rename src/{ => haystack_integrations/components/embedders}/voyage_embedders/__about__.py (81%)
create mode 100644 src/haystack_integrations/components/embedders/voyage_embedders/__init__.py
rename src/{ => haystack_integrations/components/embedders}/voyage_embedders/voyage_document_embedder.py (67%)
rename src/{ => haystack_integrations/components/embedders}/voyage_embedders/voyage_text_embedder.py (58%)
delete mode 100644 src/voyage_embedders/__init__.py
diff --git a/README.md b/README.md
index a69d64b..713b5cf 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,13 @@
-[data:image/s3,"s3://crabby-images/4d1f2/4d1f2af1e6a139f7aee0d69143594c64c759d16d" alt="PyPI"](https://pypi.org/project/voyage-embedders-haystack/)
-data:image/s3,"s3://crabby-images/8433f/8433f9fd314135914db2e2be68e7864161747eb0" alt="PyPI - Downloads"
-data:image/s3,"s3://crabby-images/a3bfe/a3bfe8269125422c18585a908f7be92273171924" alt="PyPI - Python Version"
-[data:image/s3,"s3://crabby-images/3701d/3701dee59362f6b0b2f86728d1601772f53f107d" alt="GitHub"](LICENSE)
+[data:image/s3,"s3://crabby-images/4d1f2/4d1f2af1e6a139f7aee0d69143594c64c759d16d" alt="PyPI"](https://pypi.org/project/voyage-embedders-haystack/)
+data:image/s3,"s3://crabby-images/8433f/8433f9fd314135914db2e2be68e7864161747eb0" alt="PyPI - Downloads"
+data:image/s3,"s3://crabby-images/a3bfe/a3bfe8269125422c18585a908f7be92273171924" alt="PyPI - Python Version"
+[data:image/s3,"s3://crabby-images/3701d/3701dee59362f6b0b2f86728d1601772f53f107d" alt="GitHub"](LICENSE)
[data:image/s3,"s3://crabby-images/3eff8/3eff808b27aab3308652f06479dbc59943e0da8d" alt="Actions status"](https://github.com/awinml/voyage-embedders-haystack/actions)
[data:image/s3,"s3://crabby-images/74b62/74b6254afad94bb07e4a356fc53392011de9030b" alt="Coverage Status"](https://coveralls.io/github/awinml/voyage-embedders-haystack?branch=main)
-[data:image/s3,"s3://crabby-images/ddfdd/ddfdde19007cfe942873b879d6eddc6eb258c6e0" alt="Types - Mypy"](https://github.com/python/mypy)
+[data:image/s3,"s3://crabby-images/ddfdd/ddfdde19007cfe942873b879d6eddc6eb258c6e0" alt="Types - Mypy"](https://github.com/python/mypy)
[data:image/s3,"s3://crabby-images/6a099/6a099727a52cf617121ab5d23cc43109ed9fa550" alt="Ruff"](https://github.com/astral-sh/ruff)
-[data:image/s3,"s3://crabby-images/98647/986475842f2907062b79c4bb27fdd075d638e5b9" alt="Code Style - Black"](https://github.com/psf/black)
-
-
+[data:image/s3,"s3://crabby-images/98647/986475842f2907062b79c4bb27fdd075d638e5b9" alt="Code Style - Black"](https://github.com/psf/black)
@@ -17,21 +15,27 @@ Custom component for [Haystack](https://github.com/deepset-ai/haystack) (2.x) fo
Voyage’s embedding models, `voyage-2` and `voyage-2-code`, are state-of-the-art in retrieval accuracy. These models outperform top performing embedding models like `intfloat/e5-mistral-7b-instruct` and `OpenAI/text-embedding-3-large` on the [MTEB Benchmark](https://github.com/embeddings-benchmark/mteb). `voyage-2` is current ranked second on the [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
-
#### What's New
+- **[v1.3.0 - 18/03/24]:**
+
+ - **Breaking Change:** The import path for the embedders has been changed to `haystack_integrations.components.embedders.voyage_embedders`.
+ Please replace all instances of `from voyage_embedders.voyage_document_embedder import VoyageDocumentEmbedder` and `from voyage_embedders.voyage_text_embedder import VoyageTextEmbedder` with
+ `from haystack_integrations.components.embedders.voyage_embedders import VoyageDocumentEmbedder, VoyageTextEmbedder`.
+ - The embedders now use the Haystack `Secret` API for authentication. For more information please see the [Secret Management Documentation](https://docs.haystack.deepset.ai/docs/secret-management).
+
- **[v1.2.0 - 02/02/24]:**
- - **Breaking Change:** `VoyageDocumentEmbedder` and `VoyageTextEmbedder` now accept the `model` parameter instead of `model_name`.
- - The embedders have been use the new `voyageai.Client.embed()` method instead of the deprecated `get_embedding` and `get_embeddings` methods of the global namespace.
- - Support for the new `truncate` parameter has been added.
- - Default embedding model has been changed to "voyage-2" from the deprecated "voyage-01".
- - The embedders now return the total number of tokens used as part of the `"total_tokens"` in the metadata.
+
+ - **Breaking Change:** `VoyageDocumentEmbedder` and `VoyageTextEmbedder` now accept the `model` parameter instead of `model_name`.
+ - The embedders now use the new `voyageai.Client.embed()` method instead of the deprecated `get_embedding` and `get_embeddings` methods of the global namespace.
+ - Support for the new `truncate` parameter has been added.
+ - Default embedding model has been changed to "voyage-2" from the deprecated "voyage-01".
+ - The embedders now return the total number of tokens used under the `"total_tokens"` key in the metadata.
- **[v1.1.0 - 13/12/23]:** Added support for `input_type` parameter in `VoyageTextEmbedder` and `VoyageDocument Embedder`.
- **[v1.0.0 - 21/11/23]:** Added `VoyageTextEmbedder` and `VoyageDocument Embedder` to embed strings and documents.
-
## Installation
```bash
@@ -42,17 +46,18 @@ pip install voyage-embedders-haystack
You can use Voyage Embedding models with two components: [VoyageTextEmbedder](https://github.com/awinml/voyage-embedders-haystack/blob/main/src/voyage_embedders/voyage_text_embedder.py) and [VoyageDocumentEmbedder](https://github.com/awinml/voyage-embedders-haystack/blob/main/src/voyage_embedders/voyage_document_embedder.py).
-To create semantic embeddings for documents, use `VoyageDocumentEmbedder` in your indexing pipeline. For generating embeddings for queries, use `VoyageTextEmbedder`. Once you've selected the suitable component for your specific use case, initialize the component with the model name and VoyageAI API key. You can also
-set the environment variable "VOYAGE_API_KEY" instead of passing the api key as an argument.
+To create semantic embeddings for documents, use `VoyageDocumentEmbedder` in your indexing pipeline. For generating embeddings for queries, use `VoyageTextEmbedder`.
+
+Once you've selected the suitable component for your specific use case, initialize the component with the model name and VoyageAI API key. You can also
+set the environment variable `VOYAGE_API_KEY` instead of passing the API key as an argument.
Information about the supported models, can be found on the [Embeddings Documentation.](https://docs.voyageai.com/embeddings/)
To get an API key, please see the [Voyage AI website.](https://www.voyageai.com/)
-
## Example
-Below is the example Semantic Search pipeline that uses the [Simple Wikipedia](https://huggingface.co/datasets/pszemraj/simple_wikipedia) Dataset from HuggingFace. You can find more examples in the [`examples`](https://github.com/awinml/voyage-embedders-haystack/tree/main/examples) folder.
+Below is the example Semantic Search pipeline that uses the [Simple Wikipedia](https://huggingface.co/datasets/pszemraj/simple_wikipedia) Dataset from HuggingFace. You can find more examples in the [`examples`](https://github.com/awinml/voyage-embedders-haystack/tree/main/examples) folder.
Load the dataset:
@@ -66,8 +71,7 @@ from haystack.dataclasses import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
# Import Voyage Embedders
-from voyage_embedders.voyage_document_embedder import VoyageDocumentEmbedder
-from voyage_embedders.voyage_text_embedder import VoyageTextEmbedder
+from haystack_integrations.components.embedders.voyage_embedders import VoyageDocumentEmbedder, VoyageTextEmbedder
# Load first 100 rows of the Simple Wikipedia Dataset from HuggingFace
dataset = load_dataset("pszemraj/simple_wikipedia", split="validation[:100]")
@@ -101,7 +105,7 @@ text_embedder = VoyageTextEmbedder(model="voyage-2", input_type="query")
indexing_pipeline = Pipeline()
indexing_pipeline.add_component(instance=doc_embedder, name="DocEmbedder")
indexing_pipeline.add_component(instance=doc_writer, name="DocWriter")
-indexing_pipeline.connect(sender="DocEmbedder", receiver="DocWriter")
+indexing_pipeline.connect("DocEmbedder", "DocWriter")
indexing_pipeline.run({"DocEmbedder": {"documents": docs}})
@@ -111,6 +115,7 @@ print(f"Embedding of first Document: {doc_store.filter_documents()[0].embedding}
```
Query the Semantic Search Pipeline using the `InMemoryEmbeddingRetriever` and `VoyageTextEmbedder`:
+
```python
text_embedder = VoyageTextEmbedder(model="voyage-2", input_type="query")
@@ -120,7 +125,6 @@ query_pipeline.add_component(instance=text_embedder, name="TextEmbedder")
query_pipeline.add_component(instance=retriever, name="Retriever")
query_pipeline.connect("TextEmbedder.embedding", "Retriever.query_embedding")
-
# Search
results = query_pipeline.run({"TextEmbedder": {"text": "Which year did the Joker movie release?"}})
diff --git a/examples/document_embedder_example.py b/examples/document_embedder_example.py
index 104150b..29cbb7f 100644
--- a/examples/document_embedder_example.py
+++ b/examples/document_embedder_example.py
@@ -1,6 +1,5 @@
from haystack.dataclasses import Document
-
-from voyage_embedders.voyage_document_embedder import VoyageDocumentEmbedder
+from haystack_integrations.components.embedders.voyage_embedders import VoyageDocumentEmbedder
# Text taken from PubMed QA Dataset (https://huggingface.co/datasets/pubmed_qa)
document_list = [
diff --git a/examples/semantic_search_pipeline_example.py b/examples/semantic_search_pipeline_example.py
index e426794..1e69d77 100644
--- a/examples/semantic_search_pipeline_example.py
+++ b/examples/semantic_search_pipeline_example.py
@@ -7,8 +7,7 @@
from haystack.document_stores.in_memory import InMemoryDocumentStore
# Import Voyage Embedders
-from voyage_embedders.voyage_document_embedder import VoyageDocumentEmbedder
-from voyage_embedders.voyage_text_embedder import VoyageTextEmbedder
+from haystack_integrations.components.embedders.voyage_embedders import VoyageDocumentEmbedder, VoyageTextEmbedder
# Load first 100 rows of the Simple Wikipedia Dataset from HuggingFace
dataset = load_dataset("pszemraj/simple_wikipedia", split="validation[:100]")
@@ -32,13 +31,12 @@
model="voyage-2",
input_type="document",
)
-text_embedder = VoyageTextEmbedder(model="voyage-2", input_type="query")
# Indexing Pipeline
indexing_pipeline = Pipeline()
indexing_pipeline.add_component(instance=doc_embedder, name="DocEmbedder")
indexing_pipeline.add_component(instance=doc_writer, name="DocWriter")
-indexing_pipeline.connect(sender="DocEmbedder", receiver="DocWriter")
+indexing_pipeline.connect("DocEmbedder", "DocWriter")
indexing_pipeline.run({"DocEmbedder": {"documents": docs}})
@@ -46,6 +44,7 @@
print(f"First Document: {doc_store.filter_documents()[0]}")
print(f"Embedding of first Document: {doc_store.filter_documents()[0].embedding}")
+text_embedder = VoyageTextEmbedder(model="voyage-2", input_type="query")
# Query Pipeline
query_pipeline = Pipeline()
diff --git a/examples/text_embedder_example.py b/examples/text_embedder_example.py
index 1a220b1..1f9efb3 100644
--- a/examples/text_embedder_example.py
+++ b/examples/text_embedder_example.py
@@ -1,4 +1,4 @@
-from voyage_embedders.voyage_text_embedder import VoyageTextEmbedder
+from haystack_integrations.components.embedders.voyage_embedders import VoyageTextEmbedder
# Example text from the Amazon Reviews Polarity Dataset (https://huggingface.co/datasets/amazon_polarity)
text = (
diff --git a/pyproject.toml b/pyproject.toml
index 3e0d9ef..6a45058 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,10 +36,10 @@ Issues = "https://github.com/awinml/voyage-embedders-haystack/issues"
Source = "https://github.com/awinml/voyage-embedders-haystack"
[tool.hatch.build.targets.wheel]
-packages = ["src/voyage_embedders"]
+packages = ["src/haystack_integrations"]
[tool.hatch.version]
-path = "src/voyage_embedders/__about__.py"
+path = "src/haystack_integrations/components/embedders/voyage_embedders/__about__.py"
[tool.hatch.envs.default]
dependencies = ["coverage[toml]>=6.5", "coveralls", "pytest", "datasets"]
@@ -52,7 +52,6 @@ cov = ["test-cov", "cov-report"]
example-text-embedder = "python examples/text_embedder_example.py"
example-doc-embedder = "python examples/document_embedder_example.py"
example-semantic-search = "python examples/semantic_search_pipeline_example.py"
-
test-examples = [
"example-text-embedder",
"example-doc-embedder",
@@ -65,10 +64,11 @@ python = ["3.8", "3.9", "3.10", "3.11", "3.12"]
[tool.hatch.envs.lint]
detached = true
dependencies = ["black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"]
+
[tool.hatch.envs.lint.scripts]
-typing = "mypy --install-types --non-interactive {args:src/voyage_embedders tests}"
-style = ["ruff {args:.}", "black --check --diff {args:.}"]
-fmt = ["black {args:.}", "ruff --fix {args:.}", "style"]
+typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}"
+style = ["ruff check {args:.}", "black --check --diff {args:.}"]
+fmt = ["black {args:.}", "ruff check --fix {args:.}", "style"]
all = ["fmt", "typing"]
[tool.hatch.metadata]
@@ -82,7 +82,7 @@ skip-string-normalization = true
[tool.ruff]
target-version = "py37"
line-length = 120
-select = [
+lint.select = [
"A",
"ARG",
"B",
@@ -109,7 +109,7 @@ select = [
"W",
"YTT",
]
-ignore = [
+lint.ignore = [
# Allow non-abstract empty methods in abstract base classes
"B027",
# Allow boolean positional values in function calls, like `dict.get(... True)`
@@ -126,46 +126,54 @@ ignore = [
"PLR0915",
# Ignore print statements
"T201",
+ # Allow function calls in argument defaults - needed for `Secret.from_env_var(...)` defaults
+ "B008",
]
-unfixable = [
+lint.unfixable = [
# Don't touch unused imports
"F401",
]
-[tool.ruff.isort]
+[tool.ruff.lint.isort]
known-first-party = ["voyage_embedders"]
-[tool.ruff.flake8-tidy-imports]
-ban-relative-imports = "all"
+[tool.ruff.lint.flake8-tidy-imports]
+ban-relative-imports = "parents"
-[tool.ruff.per-file-ignores]
+[tool.ruff.lint.per-file-ignores]
# Tests can use magic values, assertions, and relative imports
"tests/**/*" = ["PLR2004", "S101", "TID252"]
[tool.coverage.run]
-source_pkgs = ["voyage_embedders", "tests"]
+source_pkgs = ["haystack_integrations", "tests"]
branch = true
parallel = true
-omit = ["src/voyage_embedders/__about__.py", "example"]
+omit = [
+ "src/haystack_integrations/components/embedders/voyage_embedders/__about__.py",
+ "example",
+]
[tool.coverage.paths]
voyage_embedders = [
- "src/voyage_embedders",
- "*/voyage_embedders/src/voyage_embedders",
+ "src/haystack_integrations/components/embedders/voyage_embedders",
+ "*/voyage_embedders/src/haystack_integrations/components/embedders/voyage_embedders",
]
tests = ["tests", "*voyage_embedders/tests"]
[tool.coverage.report]
+omit = ["*/__init__.py"]
+show_missing = true
exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
[tool.pytest.ini_options]
minversion = "6.0"
addopts = "-vv"
markers = ["unit: unit tests", "integration: integration tests"]
+log_cli = true
[tool.mypy]
ignore_missing_imports = true
[[tool.mypy.overrides]]
-module = ["haystack.*", "pytest.*"]
+module = ["haystack.*", "haystack_integrations.*", "pytest.*"]
ignore_missing_imports = true
diff --git a/src/voyage_embedders/__about__.py b/src/haystack_integrations/components/embedders/voyage_embedders/__about__.py
similarity index 81%
rename from src/voyage_embedders/__about__.py
rename to src/haystack_integrations/components/embedders/voyage_embedders/__about__.py
index d28709d..dad9de6 100644
--- a/src/voyage_embedders/__about__.py
+++ b/src/haystack_integrations/components/embedders/voyage_embedders/__about__.py
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2023-present Ashwin Mathur <>
#
# SPDX-License-Identifier: Apache-2.0
-__version__ = "1.2.0"
+__version__ = "1.3.0"
diff --git a/src/haystack_integrations/components/embedders/voyage_embedders/__init__.py b/src/haystack_integrations/components/embedders/voyage_embedders/__init__.py
new file mode 100644
index 0000000..ac9f1a3
--- /dev/null
+++ b/src/haystack_integrations/components/embedders/voyage_embedders/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: 2023-present Ashwin Mathur <>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from haystack_integrations.components.embedders.voyage_embedders.voyage_document_embedder import VoyageDocumentEmbedder
+from haystack_integrations.components.embedders.voyage_embedders.voyage_text_embedder import VoyageTextEmbedder
+
+__all__ = ["VoyageDocumentEmbedder", "VoyageTextEmbedder"]
diff --git a/src/voyage_embedders/voyage_document_embedder.py b/src/haystack_integrations/components/embedders/voyage_embedders/voyage_document_embedder.py
similarity index 67%
rename from src/voyage_embedders/voyage_document_embedder.py
rename to src/haystack_integrations/components/embedders/voyage_embedders/voyage_document_embedder.py
index e651b37..8783fe6 100644
--- a/src/voyage_embedders/voyage_document_embedder.py
+++ b/src/haystack_integrations/components/embedders/voyage_embedders/voyage_document_embedder.py
@@ -1,9 +1,7 @@
-import os
from typing import Any, Dict, List, Optional, Tuple
-from haystack.core.component import component
-from haystack.core.serialization import default_to_dict
-from haystack.dataclasses import Document
+from haystack import Document, component, default_from_dict, default_to_dict
+from haystack.utils import Secret, deserialize_secrets_inplace
from tqdm import tqdm
from voyageai import Client
@@ -16,10 +14,10 @@ class VoyageDocumentEmbedder:
Usage example:
```python
- from haystack.preview import Document
- from haystack.preview.components.embedders import VoyageDocumentEmbedder
+ from haystack import Document
+ from haystack_integrations.components.embedders.voyage_embedders import VoyageDocumentEmbedder
- doc = Document(text="I love pizza!")
+ doc = Document(content="I love pizza!")
document_embedder = VoyageDocumentEmbedder()
@@ -32,7 +30,7 @@ class VoyageDocumentEmbedder:
def __init__(
self,
- api_key: Optional[str] = None,
+ api_key: Secret = Secret.from_env_var("VOYAGE_API_KEY"),
model: str = "voyage-2",
input_type: str = "document",
truncate: Optional[bool] = None,
@@ -45,42 +43,43 @@ def __init__(
):
"""
Create a VoyageDocumentEmbedder component.
- :param api_key: The VoyageAI API key. It can be explicitly provided or automatically read from the
- environment variable VOYAGE_API_KEY (recommended).
- :param model: The name of the model to use. Defaults to "voyage-2".
- For more details on the available models,
+
+ :param api_key:
+ The VoyageAI API key as a `Secret`. Pass it explicitly (e.g. `Secret.from_token(...)`) or let it be read
+ from the environment variable `VOYAGE_API_KEY` (the default, recommended).
+ :param model:
+ The name of the model to use. Defaults to "voyage-2".
+ For more details on the available models,
see [Voyage Embeddings documentation](https://docs.voyageai.com/embeddings/).
- :param input_type: Type of the input text. This is used to prepend different prompts to the text.
+ :param input_type:
+ Type of the input text. This is used to prepend different prompts to the text.
- Defaults to `"document"`. This will prepend the text with, "Represent the document for retrieval: ".
- Can be set to `"query"`. For query, the prompt is "Represent the query for retrieving
supporting documents: ".
- Can be set to `None` for no prompt.
- :param truncate: Whether to truncate the input texts to fit within the context length.
+ :param truncate:
+ Whether to truncate the input texts to fit within the context length.
- If `True`, over-length input texts will be truncated to fit within the context length, before vectorized
by the embedding model.
- If False, an error will be raised if any given text exceeds the context length.
- Defaults to `None`, which will truncate the input text before sending it to the embedding model if it
slightly exceeds the context window length. If it significantly exceeds the context window length, an
error will be raised.
- :param prefix: A string to add to the beginning of each text.
- :param suffix: A string to add to the end of each text.
- :param batch_size: Number of Documents to encode at once.
- :param metadata_fields_to_embed: List of meta fields that should be embedded along with the Document text.
- :param embedding_separator: Separator used to concatenate the meta fields to the Document text.
- :param progress_bar: Whether to show a progress bar or not. Can be helpful to disable in production deployments
- to keep the logs clean.
+ :param prefix:
+ A string to add to the beginning of each text.
+ :param suffix:
+ A string to add to the end of each text.
+ :param batch_size:
+ Number of Documents to encode at once.
+ :param metadata_fields_to_embed:
+ List of meta fields that should be embedded along with the Document text.
+ :param embedding_separator:
+ Separator used to concatenate the meta fields to the Document text.
+ :param progress_bar:
+ Whether to show a progress bar or not. Can be helpful to disable in production deployments to keep the logs
+ clean.
"""
- if api_key is None:
- try:
- api_key = os.environ["VOYAGE_API_KEY"]
- except KeyError as e:
- msg = (
- "VoyageDocumentEmbedder expects an VoyageAI API key. "
- "Set the VOYAGE_API_KEY environment variable (recommended) or pass it explicitly."
- )
- raise ValueError(msg) from e
-
- self.client = Client(api_key=api_key)
+ self.api_key = api_key
self.model = model
self.input_type = input_type
self.truncate = truncate
@@ -91,10 +90,14 @@ def __init__(
self.metadata_fields_to_embed = metadata_fields_to_embed or []
self.embedding_separator = embedding_separator
+ self.client = Client(api_key=api_key.resolve_value())
+
def to_dict(self) -> Dict[str, Any]:
"""
- This method overrides the default serializer in order to avoid leaking the `api_key` value passed
- to the constructor.
+ Serializes the component to a dictionary.
+
+ :returns:
+ Dictionary with serialized data.
"""
return default_to_dict(
self,
@@ -107,8 +110,22 @@ def to_dict(self) -> Dict[str, Any]:
progress_bar=self.progress_bar,
metadata_fields_to_embed=self.metadata_fields_to_embed,
embedding_separator=self.embedding_separator,
+ api_key=self.api_key.to_dict(),
)
+ @classmethod
+ def from_dict(cls, data: Dict[str, Any]) -> "VoyageDocumentEmbedder":
+ """
+ Deserializes the component from a dictionary.
+
+ :param data:
+ Dictionary to deserialize from.
+ :returns:
+ Deserialized component.
+ """
+ deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
+ return default_from_dict(cls, data)
+
def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]:
"""
Prepare the texts to embed by concatenating the Document text with the metadata fields to embed.
@@ -155,9 +172,14 @@ def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List
def run(self, documents: List[Document]):
"""
Embed a list of Documents.
- The embedding of each Document is stored in the `embedding` field of the Document.
- :param documents: A list of Documents to embed.
+ :param documents:
+ Documents to embed.
+
+ :returns:
+ A dictionary with the following keys:
+ - `documents`: Documents with embeddings
+ - `meta`: Information about the usage of the model.
"""
if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
msg = (
diff --git a/src/voyage_embedders/voyage_text_embedder.py b/src/haystack_integrations/components/embedders/voyage_embedders/voyage_text_embedder.py
similarity index 58%
rename from src/voyage_embedders/voyage_text_embedder.py
rename to src/haystack_integrations/components/embedders/voyage_embedders/voyage_text_embedder.py
index b156f91..a92f06e 100644
--- a/src/voyage_embedders/voyage_text_embedder.py
+++ b/src/haystack_integrations/components/embedders/voyage_embedders/voyage_text_embedder.py
@@ -1,8 +1,7 @@
-import os
from typing import Any, Dict, List, Optional
-from haystack.core.component import component
-from haystack.core.serialization import default_to_dict
+from haystack import component, default_from_dict, default_to_dict
+from haystack.utils import Secret, deserialize_secrets_inplace
from voyageai import Client
@@ -13,7 +12,7 @@ class VoyageTextEmbedder:
Usage example:
```python
- from haystack.preview.components.embedders import VoyageTextEmbedder
+ from haystack_integrations.components.embedders.voyage_embedders import VoyageTextEmbedder
text_to_embed = "I love pizza!"
@@ -27,7 +26,7 @@ class VoyageTextEmbedder:
def __init__(
self,
- api_key: Optional[str] = None,
+ api_key: Secret = Secret.from_env_var("VOYAGE_API_KEY"),
model: str = "voyage-2",
input_type: str = "query",
truncate: Optional[bool] = None,
@@ -37,51 +36,49 @@ def __init__(
"""
Create an VoyageTextEmbedder component.
- :param api_key: The VoyageAI API key. It can be explicitly provided or automatically read from the
- environment variable VOYAGE_API_KEY (recommended).
- :param model: The name of the Voyage model to use. Defaults to "voyage-2".
+ :param api_key:
+ The VoyageAI API key as a `Secret`. Pass it explicitly (e.g. `Secret.from_token(...)`) or let it be read
+ from the environment variable `VOYAGE_API_KEY` (the default, recommended).
+ :param model:
+ The name of the Voyage model to use. Defaults to "voyage-2".
For more details on the available models,
- see [Voyage Embeddings documentation](https://docs.voyageai.com/embeddings/).
- :param input_type: Type of the input text. This is used to prepend different prompts to the text.
+ see [Voyage Embeddings documentation](https://docs.voyageai.com/embeddings/).
+ :param input_type:
+ Type of the input text. This is used to prepend different prompts to the text.
- Defaults to `"query"`. This will prepend the text with, "Represent the query for retrieving
supporting documents: ".
- Can be set to `"document"`. For document, the prompt is "Represent the document for retrieval: ".
- Can be set to `None` for no prompt.
for the document prompt.
- :param truncate: Whether to truncate the input texts to fit within the context length.
+ :param truncate:
+ Whether to truncate the input texts to fit within the context length.
- If `True`, over-length input texts will be truncated to fit within the context length, before vectorized
by the embedding model.
- If False, an error will be raised if any given text exceeds the context length.
- Defaults to `None`, which will truncate the input text before sending it to the embedding model if it
slightly exceeds the context window length. If it significantly exceeds the context window length, an
error will be raised.
- :param prefix: A string to add to the beginning of each text.
- :param suffix: A string to add to the end of each text.
+ :param prefix:
+ A string to add to the beginning of each text.
+ :param suffix:
+ A string to add to the end of each text.
"""
- # if the user does not provide the API key, check if it is set in the module client
- if api_key is None:
- try:
- api_key = os.environ["VOYAGE_API_KEY"]
- except KeyError as e:
- msg = (
- "VoyageTextEmbedder expects an VoyageAI API key."
- " Set the VOYAGE_API_KEY environment variable (recommended) or pass it explicitly."
- )
- raise ValueError(msg) from e
-
- self.client = Client(api_key=api_key)
+ self.api_key = api_key
self.model = model
self.input_type = input_type
self.truncate = truncate
self.prefix = prefix
self.suffix = suffix
+ self.client = Client(api_key=api_key.resolve_value())
+
def to_dict(self) -> Dict[str, Any]:
"""
- This method overrides the default serializer in order to avoid leaking the `api_key` value passed
- to the constructor.
- """
+ Serializes the component to a dictionary.
+ :returns:
+ Dictionary with serialized data.
+ """
return default_to_dict(
self,
model=self.model,
@@ -89,11 +86,35 @@ def to_dict(self) -> Dict[str, Any]:
truncate=self.truncate,
prefix=self.prefix,
suffix=self.suffix,
+ api_key=self.api_key.to_dict(),
)
+ @classmethod
+ def from_dict(cls, data: Dict[str, Any]) -> "VoyageTextEmbedder":
+ """
+ Deserializes the component from a dictionary.
+
+ :param data:
+ Dictionary to deserialize from.
+ :returns:
+ Deserialized component.
+ """
+ deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
+ return default_from_dict(cls, data)
+
@component.output_types(embedding=List[float], meta=Dict[str, Any])
def run(self, text: str):
- """Embed a string."""
+ """
+ Embed a single string.
+
+ :param text:
+ Text to embed.
+
+ :returns:
+ A dictionary with the following keys:
+ - `embedding`: The embedding of the input text.
+ - `meta`: Information about the usage of the model.
+ """
if not isinstance(text, str):
msg = (
"VoyageTextEmbedder expects a string as an input. "
diff --git a/src/voyage_embedders/__init__.py b/src/voyage_embedders/__init__.py
deleted file mode 100644
index 932dd30..0000000
--- a/src/voyage_embedders/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# SPDX-FileCopyrightText: 2023-present Ashwin Mathur <>
-#
-# SPDX-License-Identifier: Apache-2.0
-
-from voyage_embedders.voyage_document_embedder import VoyageDocumentEmbedder
-from voyage_embedders.voyage_text_embedder import VoyageTextEmbedder
-
-__all__ = ["VoyageDocumentEmbedder", "VoyageTextEmbedder"]
diff --git a/tests/test_voyage_document_embedder.py b/tests/test_voyage_document_embedder.py
index ad05bb7..f7cf5ad 100644
--- a/tests/test_voyage_document_embedder.py
+++ b/tests/test_voyage_document_embedder.py
@@ -1,9 +1,9 @@
import os
import pytest
-from haystack.dataclasses import Document
-
-from voyage_embedders.voyage_document_embedder import VoyageDocumentEmbedder
+from haystack import Document
+from haystack.utils.auth import Secret
+from haystack_integrations.components.embedders.voyage_embedders import VoyageDocumentEmbedder
class TestVoyageDocumentEmbedder:
@@ -26,7 +26,7 @@ def test_init_default(self, monkeypatch):
@pytest.mark.unit
def test_init_with_parameters(self):
embedder = VoyageDocumentEmbedder(
- api_key="fake-api-key",
+ api_key=Secret.from_token("fake-api-key"),
model="model",
input_type="query",
truncate=True,
@@ -52,16 +52,19 @@ def test_init_with_parameters(self):
@pytest.mark.unit
def test_init_fail_wo_api_key(self, monkeypatch):
monkeypatch.delenv("VOYAGE_API_KEY", raising=False)
- with pytest.raises(ValueError, match="VoyageDocumentEmbedder expects an VoyageAI API key"):
+ with pytest.raises(ValueError, match="None of the .* environment variables are set"):
VoyageDocumentEmbedder()
@pytest.mark.unit
- def test_to_dict(self):
- component = VoyageDocumentEmbedder(api_key="fake-api-key")
+ def test_to_dict(self, monkeypatch):
+ monkeypatch.setenv("VOYAGE_API_KEY", "fake-api-key")
+ component = VoyageDocumentEmbedder()
data = component.to_dict()
assert data == {
- "type": "voyage_embedders.voyage_document_embedder.VoyageDocumentEmbedder",
+ "type": "haystack_integrations.components.embedders.voyage_embedders.voyage_document_embedder."
+ "VoyageDocumentEmbedder",
"init_parameters": {
+ "api_key": {"env_vars": ["VOYAGE_API_KEY"], "strict": True, "type": "env_var"},
"model": "voyage-2",
"input_type": "document",
"truncate": None,
@@ -75,9 +78,43 @@ def test_to_dict(self):
}
@pytest.mark.unit
- def test_to_dict_with_custom_init_parameters(self):
+ def test_from_dict(self, monkeypatch):
+ monkeypatch.setenv("VOYAGE_API_KEY", "fake-api-key")
+ data = {
+ "type": "haystack_integrations.components.embedders.voyage_embedders.voyage_document_embedder."
+ "VoyageDocumentEmbedder",
+ "init_parameters": {
+ "api_key": {"env_vars": ["VOYAGE_API_KEY"], "strict": True, "type": "env_var"},
+ "model": "voyage-2",
+ "input_type": "document",
+ "truncate": None,
+ "prefix": "",
+ "suffix": "",
+ "batch_size": 32,
+ "progress_bar": True,
+ "metadata_fields_to_embed": [],
+ "embedding_separator": "\n",
+ },
+ }
+
+ embedder = VoyageDocumentEmbedder.from_dict(data)
+
+ assert embedder.client.api_key == "fake-api-key"
+ assert embedder.model == "voyage-2"
+ assert embedder.input_type == "document"
+ assert embedder.truncate is None
+ assert embedder.prefix == ""
+ assert embedder.suffix == ""
+ assert embedder.batch_size == 32
+ assert embedder.progress_bar is True
+ assert embedder.metadata_fields_to_embed == []
+ assert embedder.embedding_separator == "\n"
+
+ @pytest.mark.unit
+ def test_to_dict_with_custom_init_parameters(self, monkeypatch):
+ monkeypatch.setenv("ENV_VAR", "fake-api-key")
component = VoyageDocumentEmbedder(
- api_key="fake-api-key",
+ api_key=Secret.from_env_var("ENV_VAR", strict=False),
model="model",
input_type="query",
truncate=True,
@@ -90,8 +127,10 @@ def test_to_dict_with_custom_init_parameters(self):
)
data = component.to_dict()
assert data == {
- "type": "voyage_embedders.voyage_document_embedder.VoyageDocumentEmbedder",
+ "type": "haystack_integrations.components.embedders.voyage_embedders.voyage_document_embedder."
+ "VoyageDocumentEmbedder",
"init_parameters": {
+ "api_key": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"},
"model": "model",
"input_type": "query",
"truncate": True,
@@ -104,6 +143,39 @@ def test_to_dict_with_custom_init_parameters(self):
},
}
+ @pytest.mark.unit
+ def test_from_dict_with_custom_init_parameters(self, monkeypatch):
+ monkeypatch.setenv("ENV_VAR", "fake-api-key")
+ data = {
+ "type": "haystack_integrations.components.embedders.voyage_embedders.voyage_document_embedder."
+ "VoyageDocumentEmbedder",
+ "init_parameters": {
+ "api_key": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"},
+ "model": "model",
+ "input_type": "query",
+ "truncate": True,
+ "prefix": "prefix",
+ "suffix": "suffix",
+ "batch_size": 4,
+ "progress_bar": False,
+ "metadata_fields_to_embed": ["test_field"],
+ "embedding_separator": " | ",
+ },
+ }
+
+ embedder = VoyageDocumentEmbedder.from_dict(data)
+
+ assert embedder.client.api_key == "fake-api-key"
+ assert embedder.model == "model"
+ assert embedder.input_type == "query"
+ assert embedder.truncate is True
+ assert embedder.prefix == "prefix"
+ assert embedder.suffix == "suffix"
+ assert embedder.batch_size == 4
+ assert embedder.progress_bar is False
+ assert embedder.metadata_fields_to_embed == ["test_field"]
+ assert embedder.embedding_separator == " | "
+
@pytest.mark.unit
def test_prepare_texts_to_embed_w_metadata(self):
documents = [
@@ -111,12 +183,13 @@ def test_prepare_texts_to_embed_w_metadata(self):
]
embedder = VoyageDocumentEmbedder(
- api_key="fake-api-key", metadata_fields_to_embed=["meta_field"], embedding_separator=" | "
+ api_key=Secret.from_token("fake-api-key"),
+ metadata_fields_to_embed=["meta_field"],
+ embedding_separator=" | ",
)
prepared_texts = embedder._prepare_texts_to_embed(documents)
- # note that newline is replaced by space
assert prepared_texts == [
"meta_value 0 | document number 0: content",
"meta_value 1 | document number 1: content",
@@ -129,7 +202,9 @@ def test_prepare_texts_to_embed_w_metadata(self):
def test_prepare_texts_to_embed_w_suffix(self):
documents = [Document(content=f"document number {i}") for i in range(5)]
- embedder = VoyageDocumentEmbedder(api_key="fake-api-key", prefix="my_prefix ", suffix=" my_suffix")
+ embedder = VoyageDocumentEmbedder(
+ api_key=Secret.from_token("fake-api-key"), prefix="my_prefix ", suffix=" my_suffix"
+ )
prepared_texts = embedder._prepare_texts_to_embed(documents)
@@ -143,9 +218,8 @@ def test_prepare_texts_to_embed_w_suffix(self):
@pytest.mark.unit
def test_run_wrong_input_format(self):
- embedder = VoyageDocumentEmbedder(api_key="fake-api-key")
+ embedder = VoyageDocumentEmbedder(api_key=Secret.from_token("fake-api-key"))
- # wrong formats
string_input = "text"
list_integers_input = [1, 2, 3]
@@ -157,7 +231,7 @@ def test_run_wrong_input_format(self):
@pytest.mark.unit
def test_run_on_empty_list(self):
- embedder = VoyageDocumentEmbedder(api_key="fake-api-key")
+ embedder = VoyageDocumentEmbedder(api_key=Secret.from_token("fake-api-key"))
empty_list_input = []
result = embedder.run(documents=empty_list_input)
diff --git a/tests/test_voyage_text_embedder.py b/tests/test_voyage_text_embedder.py
index 622aac5..4b63037 100644
--- a/tests/test_voyage_text_embedder.py
+++ b/tests/test_voyage_text_embedder.py
@@ -1,8 +1,8 @@
import os
import pytest
-
-from voyage_embedders.voyage_text_embedder import VoyageTextEmbedder
+from haystack.utils.auth import Secret
+from haystack_integrations.components.embedders.voyage_embedders import VoyageTextEmbedder
class TestVoyageTextEmbedder:
@@ -21,7 +21,7 @@ def test_init_default(self, monkeypatch):
@pytest.mark.unit
def test_init_with_parameters(self):
embedder = VoyageTextEmbedder(
- api_key="fake-api-key",
+ api_key=Secret.from_token("fake-api-key"),
model="model",
input_type="document",
truncate=True,
@@ -38,16 +38,19 @@ def test_init_with_parameters(self):
@pytest.mark.unit
def test_init_fail_wo_api_key(self, monkeypatch):
monkeypatch.delenv("VOYAGE_API_KEY", raising=False)
- with pytest.raises(ValueError, match="VoyageTextEmbedder expects an VoyageAI API key"):
+ with pytest.raises(ValueError, match="None of the .* environment variables are set"):
VoyageTextEmbedder()
@pytest.mark.unit
- def test_to_dict(self):
- component = VoyageTextEmbedder(api_key="fake-api-key")
+ def test_to_dict(self, monkeypatch):
+ monkeypatch.setenv("VOYAGE_API_KEY", "fake-api-key")
+ component = VoyageTextEmbedder()
data = component.to_dict()
assert data == {
- "type": "voyage_embedders.voyage_text_embedder.VoyageTextEmbedder",
+ "type": "haystack_integrations.components.embedders.voyage_embedders.voyage_text_embedder."
+ "VoyageTextEmbedder",
"init_parameters": {
+ "api_key": {"env_vars": ["VOYAGE_API_KEY"], "strict": True, "type": "env_var"},
"model": "voyage-2",
"truncate": None,
"input_type": "query",
@@ -57,9 +60,34 @@ def test_to_dict(self):
}
@pytest.mark.unit
- def test_to_dict_with_custom_init_parameters(self):
+ def test_from_dict(self, monkeypatch):
+ monkeypatch.setenv("VOYAGE_API_KEY", "fake-api-key")
+ data = {
+ "type": "haystack_integrations.components.embedders.voyage_embedders.voyage_text_embedder."
+ "VoyageTextEmbedder",
+ "init_parameters": {
+ "api_key": {"env_vars": ["VOYAGE_API_KEY"], "strict": True, "type": "env_var"},
+ "model": "voyage-2",
+ "truncate": None,
+ "input_type": "query",
+ "prefix": "",
+ "suffix": "",
+ },
+ }
+
+ embedder = VoyageTextEmbedder.from_dict(data)
+ assert embedder.client.api_key == "fake-api-key"
+ assert embedder.input_type == "query"
+ assert embedder.model == "voyage-2"
+ assert embedder.truncate is None
+ assert embedder.prefix == ""
+ assert embedder.suffix == ""
+
+ @pytest.mark.unit
+ def test_to_dict_with_custom_init_parameters(self, monkeypatch):
+ monkeypatch.setenv("ENV_VAR", "fake-api-key")
component = VoyageTextEmbedder(
- api_key="fake-api-key",
+ api_key=Secret.from_env_var("ENV_VAR", strict=False),
model="model",
truncate=True,
input_type="document",
@@ -68,8 +96,26 @@ def test_to_dict_with_custom_init_parameters(self):
)
data = component.to_dict()
assert data == {
- "type": "voyage_embedders.voyage_text_embedder.VoyageTextEmbedder",
+ "type": "haystack_integrations.components.embedders.voyage_embedders.voyage_text_embedder."
+ "VoyageTextEmbedder",
+ "init_parameters": {
+ "api_key": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"},
+ "model": "model",
+ "truncate": True,
+ "input_type": "document",
+ "prefix": "prefix",
+ "suffix": "suffix",
+ },
+ }
+
+ @pytest.mark.unit
+ def test_from_dict_with_custom_init_parameters(self, monkeypatch):
+ monkeypatch.setenv("ENV_VAR", "fake-api-key")
+ data = {
+ "type": "haystack_integrations.components.embedders.voyage_embedders.voyage_text_embedder."
+ "VoyageTextEmbedder",
"init_parameters": {
+ "api_key": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"},
"model": "model",
"truncate": True,
"input_type": "document",
@@ -78,6 +124,14 @@ def test_to_dict_with_custom_init_parameters(self):
},
}
+ embedder = VoyageTextEmbedder.from_dict(data)
+ assert embedder.client.api_key == "fake-api-key"
+ assert embedder.model == "model"
+ assert embedder.truncate is True
+ assert embedder.input_type == "document"
+ assert embedder.prefix == "prefix"
+ assert embedder.suffix == "suffix"
+
@pytest.mark.skipif(os.environ.get("VOYAGE_API_KEY", "") == "", reason="VOYAGE_API_KEY is not set")
@pytest.mark.integration
def test_run(self):
@@ -92,7 +146,7 @@ def test_run(self):
@pytest.mark.unit
def test_run_wrong_input_format(self):
- embedder = VoyageTextEmbedder(api_key="fake-api-key")
+ embedder = VoyageTextEmbedder(api_key=Secret.from_token("fake-api-key"))
list_integers_input = [1, 2, 3]