Skip to content

Commit

Permalink
Merge branch 'main' into weaviate-client-v4
Browse files Browse the repository at this point in the history
  • Loading branch information
hsm207 authored Feb 29, 2024
2 parents d45c7b9 + e5ee06e commit 2c3e446
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 63 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,14 @@ class JinaDocumentEmbedder:
Usage example:
```python
from haystack import Document
from jina_haystack import JinaDocumentEmbedder
from haystack_integrations.components.embedders.jina import JinaDocumentEmbedder
doc = Document(content="I love pizza!")
# Make sure that the environment variable JINA_API_KEY is set
document_embedder = JinaDocumentEmbedder()
doc = Document(content="I love pizza!")
result = document_embedder.run([doc])
print(result['documents'][0].embedding)
Expand All @@ -46,8 +48,10 @@ def __init__(
):
"""
Create a JinaDocumentEmbedder component.
:param api_key: The Jina API key.
:param model: The name of the Jina model to use. Check the list of available models on `https://jina.ai/embeddings/`
:param model: The name of the Jina model to use.
Check the list of available models on [Jina documentation](https://jina.ai/embeddings/).
:param prefix: A string to add to the beginning of each text.
:param suffix: A string to add to the end of each text.
:param batch_size: Number of Documents to encode at once.
Expand Down Expand Up @@ -83,8 +87,9 @@ def _get_telemetry_data(self) -> Dict[str, Any]:

def to_dict(self) -> Dict[str, Any]:
"""
This method overrides the default serializer in order to avoid leaking the `api_key` value passed
to the constructor.
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
Expand All @@ -100,6 +105,13 @@ def to_dict(self) -> Dict[str, Any]:

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "JinaDocumentEmbedder":
"""
Deserializes the component from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
return default_from_dict(cls, data)

Expand Down Expand Up @@ -151,10 +163,13 @@ def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List
@component.output_types(documents=List[Document], meta=Dict[str, Any])
def run(self, documents: List[Document]):
"""
Embed a list of Documents.
The embedding of each Document is stored in the `embedding` field of the Document.
Compute the embeddings for a list of Documents.
:param documents: A list of Documents to embed.
:param documents: A list of Documents to embed.
:returns: A dictionary with the following keys:
- `documents`: List of Documents, each with an `embedding` field containing the computed embedding.
- `meta`: A dictionary with metadata including the model name and usage statistics.
:raises TypeError: If the input is not a list of Documents.
"""
if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
msg = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,18 @@
@component
class JinaTextEmbedder:
"""
A component for embedding strings using Jina models.
A component for embedding strings using Jina AI models.
Usage example:
```python
from jina_haystack import JinaTextEmbedder
from haystack_integrations.components.embedders.jina import JinaTextEmbedder
text_to_embed = "I love pizza!"
# Make sure that the environment variable JINA_API_KEY is set
text_embedder = JinaTextEmbedder()
text_to_embed = "I love pizza!"
print(text_embedder.run(text_to_embed))
# {'embedding': [0.017020374536514282, -0.023255806416273117, ...],
Expand All @@ -39,11 +41,12 @@ def __init__(
suffix: str = "",
):
"""
Create an JinaTextEmbedder component.
Create a JinaTextEmbedder component.
:param api_key: The Jina API key. It can be explicitly provided or automatically read from the
environment variable JINA_API_KEY (recommended).
:param model: The name of the Jina model to use. Check the list of available models on `https://jina.ai/embeddings/`
environment variable `JINA_API_KEY` (recommended).
:param model: The name of the Jina model to use.
Check the list of available models on [Jina documentation](https://jina.ai/embeddings/).
:param prefix: A string to add to the beginning of each text.
:param suffix: A string to add to the end of each text.
"""
Expand Down Expand Up @@ -71,22 +74,37 @@ def _get_telemetry_data(self) -> Dict[str, Any]:

def to_dict(self) -> Dict[str, Any]:
"""
This method overrides the default serializer in order to avoid leaking the `api_key` value passed
to the constructor.
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""

return default_to_dict(
self, api_key=self.api_key.to_dict(), model=self.model_name, prefix=self.prefix, suffix=self.suffix
)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "JinaTextEmbedder":
"""
Deserializes the component from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
return default_from_dict(cls, data)

@component.output_types(embedding=List[float], meta=Dict[str, Any])
def run(self, text: str):
"""Embed a string."""
"""
Embed a string.
:param text: The string to embed.
:returns: A dictionary with the following keys:
- `embedding`: The embedding of the input string.
- `meta`: A dictionary with metadata including the model name and usage statistics.
:raises TypeError: If the input is not a string.
"""
if not isinstance(text, str):
msg = (
"JinaTextEmbedder expects a string as an input."
Expand Down
4 changes: 2 additions & 2 deletions integrations/pgvector/examples/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
# git clone https://github.com/anakin87/neural-search-pills

import glob
import os

from haystack import Pipeline
from haystack.components.converters import MarkdownToDocument
Expand All @@ -21,7 +20,8 @@
from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore

os.environ["PG_CONN_STR"] = "postgresql://postgres:postgres@localhost:5432/postgres"
# Set an environment variable `PG_CONN_STR` with the connection string to your PostgreSQL database.
# e.g., "postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"

# Initialize PgvectorDocumentStore
document_store = PgvectorDocumentStore(
Expand Down
1 change: 0 additions & 1 deletion integrations/pgvector/pydoc/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ loaders:
modules: [
"haystack_integrations.components.retrievers.pgvector.embedding_retriever",
"haystack_integrations.document_stores.pgvector.document_store",
"haystack_integrations.document_stores.pgvector.filters",
]
ignore_when_discovered: ["__init__"]
processors:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,47 @@
@component
class PgvectorEmbeddingRetriever:
"""
Retrieves documents from the PgvectorDocumentStore, based on their dense embeddings.
Retrieves documents from the `PgvectorDocumentStore`, based on their dense embeddings.
Needs to be connected to the PgvectorDocumentStore.
Example usage:
```python
from haystack.document_stores import DuplicatePolicy
from haystack import Document, Pipeline
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore
from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever
# Set an environment variable `PG_CONN_STR` with the connection string to your PostgreSQL database.
# e.g., "postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"
document_store = PgvectorDocumentStore(
embedding_dimension=768,
vector_function="cosine_similarity",
recreate_table=True,
)
documents = [Document(content="There are over 7,000 languages spoken around the world today."),
Document(content="Elephants have been observed to behave in a way that indicates..."),
Document(content="In certain places, you can witness the phenomenon of bioluminescent waves.")]
document_embedder = SentenceTransformersDocumentEmbedder()
document_embedder.warm_up()
documents_with_embeddings = document_embedder.run(documents)
document_store.write_documents(documents_with_embeddings.get("documents"), policy=DuplicatePolicy.OVERWRITE)
query_pipeline = Pipeline()
query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder())
query_pipeline.add_component("retriever", PgvectorEmbeddingRetriever(document_store=document_store))
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
query = "How many languages are there?"
res = query_pipeline.run({"text_embedder": {"text": query}})
assert res['retriever']['documents'][0].content == "There are over 7,000 languages spoken around the world today."
```
"""

def __init__(
Expand All @@ -26,23 +64,20 @@ def __init__(
vector_function: Optional[Literal["cosine_similarity", "inner_product", "l2_distance"]] = None,
):
"""
Create the PgvectorEmbeddingRetriever component.
:param document_store: An instance of PgvectorDocumentStore.
:param filters: Filters applied to the retrieved Documents. Defaults to None.
:param top_k: Maximum number of Documents to return, defaults to 10.
:param document_store: An instance of `PgvectorDocumentStore`.
:param filters: Filters applied to the retrieved Documents.
:param top_k: Maximum number of Documents to return.
:param vector_function: The similarity function to use when searching for similar embeddings.
Defaults to the one set in the `document_store` instance.
"cosine_similarity" and "inner_product" are similarity functions and
`"cosine_similarity"` and `"inner_product"` are similarity functions and
higher scores indicate greater similarity between the documents.
"l2_distance" returns the straight-line distance between vectors,
`"l2_distance"` returns the straight-line distance between vectors,
and the most similar documents are the ones with the smallest score.
Important: if the document store is using the "hnsw" search strategy, the vector function
**Important**: if the document store is using the `"hnsw"` search strategy, the vector function
should match the one utilized during index creation to take advantage of the index.
:type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"]
:raises ValueError: If `document_store` is not an instance of PgvectorDocumentStore.
:raises ValueError: If `document_store` is not an instance of `PgvectorDocumentStore` or if `vector_function`
is not one of the valid options.
"""
if not isinstance(document_store, PgvectorDocumentStore):
msg = "document_store must be an instance of PgvectorDocumentStore"
Expand All @@ -58,6 +93,12 @@ def __init__(
self.vector_function = vector_function or document_store.vector_function

def to_dict(self) -> Dict[str, Any]:
"""
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
filters=self.filters,
Expand All @@ -68,6 +109,14 @@ def to_dict(self) -> Dict[str, Any]:

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "PgvectorEmbeddingRetriever":
"""
Deserializes the component from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
doc_store_params = data["init_parameters"]["document_store"]
data["init_parameters"]["document_store"] = PgvectorDocumentStore.from_dict(doc_store_params)
return default_from_dict(cls, data)
Expand All @@ -81,14 +130,14 @@ def run(
vector_function: Optional[Literal["cosine_similarity", "inner_product", "l2_distance"]] = None,
):
"""
Retrieve documents from the PgvectorDocumentStore, based on their embeddings.
Retrieve documents from the `PgvectorDocumentStore`, based on their embeddings.
:param query_embedding: Embedding of the query.
:param filters: Filters applied to the retrieved Documents.
:param top_k: Maximum number of Documents to return.
:param vector_function: The similarity function to use when searching for similar embeddings.
:type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"]
:return: List of Documents similar to `query_embedding`.
:returns: List of Documents similar to `query_embedding`.
"""
filters = filters or self.filters
top_k = top_k or self.top_k
Expand Down
Loading

0 comments on commit 2c3e446

Please sign in to comment.