Skip to content

Commit

Permalink
Merge branch 'main' into weaviate-client-v4
Browse files Browse the repository at this point in the history
  • Loading branch information
hsm207 authored Feb 29, 2024
2 parents d45c7b9 + e5ee06e commit 2c3e446
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 63 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,14 @@ class JinaDocumentEmbedder:
Usage example:
```python
from haystack import Document
from jina_haystack import JinaDocumentEmbedder
from haystack_integrations.components.embedders.jina import JinaDocumentEmbedder
doc = Document(content="I love pizza!")
# Make sure that the environment variable JINA_API_KEY is set
document_embedder = JinaDocumentEmbedder()
doc = Document(content="I love pizza!")
result = document_embedder.run([doc])
print(result['documents'][0].embedding)
Expand All @@ -46,8 +48,10 @@ def __init__(
):
"""
Create a JinaDocumentEmbedder component.
:param api_key: The Jina API key.
:param model: The name of the Jina model to use. Check the list of available models on `https://jina.ai/embeddings/`
:param model: The name of the Jina model to use.
Check the list of available models on [Jina documentation](https://jina.ai/embeddings/).
:param prefix: A string to add to the beginning of each text.
:param suffix: A string to add to the end of each text.
:param batch_size: Number of Documents to encode at once.
Expand Down Expand Up @@ -83,8 +87,9 @@ def _get_telemetry_data(self) -> Dict[str, Any]:

def to_dict(self) -> Dict[str, Any]:
"""
This method overrides the default serializer in order to avoid leaking the `api_key` value passed
to the constructor.
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
Expand All @@ -100,6 +105,13 @@ def to_dict(self) -> Dict[str, Any]:

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "JinaDocumentEmbedder":
"""
Deserializes the component from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
return default_from_dict(cls, data)

Expand Down Expand Up @@ -151,10 +163,13 @@ def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List
@component.output_types(documents=List[Document], meta=Dict[str, Any])
def run(self, documents: List[Document]):
"""
Embed a list of Documents.
The embedding of each Document is stored in the `embedding` field of the Document.
Compute the embeddings for a list of Documents.
:param documents: A list of Documents to embed.
:param documents: A list of Documents to embed.
:returns: A dictionary with the following keys:
- `documents`: List of Documents, each with an `embedding` field containing the computed embedding.
- `meta`: A dictionary with metadata including the model name and usage statistics.
:raises TypeError: If the input is not a list of Documents.
"""
if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
msg = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,18 @@
@component
class JinaTextEmbedder:
"""
A component for embedding strings using Jina models.
A component for embedding strings using Jina AI models.
Usage example:
```python
from jina_haystack import JinaTextEmbedder
from haystack_integrations.components.embedders.jina import JinaTextEmbedder
text_to_embed = "I love pizza!"
# Make sure that the environment variable JINA_API_KEY is set
text_embedder = JinaTextEmbedder()
text_to_embed = "I love pizza!"
print(text_embedder.run(text_to_embed))
# {'embedding': [0.017020374536514282, -0.023255806416273117, ...],
Expand All @@ -39,11 +41,12 @@ def __init__(
suffix: str = "",
):
"""
Create an JinaTextEmbedder component.
Create a JinaTextEmbedder component.
:param api_key: The Jina API key. It can be explicitly provided or automatically read from the
environment variable JINA_API_KEY (recommended).
:param model: The name of the Jina model to use. Check the list of available models on `https://jina.ai/embeddings/`
environment variable `JINA_API_KEY` (recommended).
:param model: The name of the Jina model to use.
Check the list of available models on [Jina documentation](https://jina.ai/embeddings/).
:param prefix: A string to add to the beginning of each text.
:param suffix: A string to add to the end of each text.
"""
Expand Down Expand Up @@ -71,22 +74,37 @@ def _get_telemetry_data(self) -> Dict[str, Any]:

def to_dict(self) -> Dict[str, Any]:
"""
This method overrides the default serializer in order to avoid leaking the `api_key` value passed
to the constructor.
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""

return default_to_dict(
self, api_key=self.api_key.to_dict(), model=self.model_name, prefix=self.prefix, suffix=self.suffix
)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "JinaTextEmbedder":
"""
Deserializes the component from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
return default_from_dict(cls, data)

@component.output_types(embedding=List[float], meta=Dict[str, Any])
def run(self, text: str):
"""Embed a string."""
"""
Embed a string.
:param text: The string to embed.
:returns: A dictionary with the following keys:
- `embedding`: The embedding of the input string.
- `meta`: A dictionary with metadata including the model name and usage statistics.
:raises TypeError: If the input is not a string.
"""
if not isinstance(text, str):
msg = (
"JinaTextEmbedder expects a string as an input."
Expand Down
4 changes: 2 additions & 2 deletions integrations/pgvector/examples/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
# git clone https://github.com/anakin87/neural-search-pills

import glob
import os

from haystack import Pipeline
from haystack.components.converters import MarkdownToDocument
Expand All @@ -21,7 +20,8 @@
from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore

os.environ["PG_CONN_STR"] = "postgresql://postgres:postgres@localhost:5432/postgres"
# Set an environment variable `PG_CONN_STR` with the connection string to your PostgreSQL database.
# e.g., "postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"

# Initialize PgvectorDocumentStore
document_store = PgvectorDocumentStore(
Expand Down
1 change: 0 additions & 1 deletion integrations/pgvector/pydoc/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ loaders:
modules: [
"haystack_integrations.components.retrievers.pgvector.embedding_retriever",
"haystack_integrations.document_stores.pgvector.document_store",
"haystack_integrations.document_stores.pgvector.filters",
]
ignore_when_discovered: ["__init__"]
processors:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,47 @@
@component
class PgvectorEmbeddingRetriever:
"""
Retrieves documents from the PgvectorDocumentStore, based on their dense embeddings.
Retrieves documents from the `PgvectorDocumentStore`, based on their dense embeddings.
Needs to be connected to the PgvectorDocumentStore.
Example usage:
```python
from haystack.document_stores import DuplicatePolicy
from haystack import Document, Pipeline
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore
from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever
# Set an environment variable `PG_CONN_STR` with the connection string to your PostgreSQL database.
# e.g., "postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"
document_store = PgvectorDocumentStore(
embedding_dimension=768,
vector_function="cosine_similarity",
recreate_table=True,
)
documents = [Document(content="There are over 7,000 languages spoken around the world today."),
Document(content="Elephants have been observed to behave in a way that indicates..."),
Document(content="In certain places, you can witness the phenomenon of bioluminescent waves.")]
document_embedder = SentenceTransformersDocumentEmbedder()
document_embedder.warm_up()
documents_with_embeddings = document_embedder.run(documents)
document_store.write_documents(documents_with_embeddings.get("documents"), policy=DuplicatePolicy.OVERWRITE)
query_pipeline = Pipeline()
query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder())
query_pipeline.add_component("retriever", PgvectorEmbeddingRetriever(document_store=document_store))
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
query = "How many languages are there?"
res = query_pipeline.run({"text_embedder": {"text": query}})
assert res['retriever']['documents'][0].content == "There are over 7,000 languages spoken around the world today."
```
"""

def __init__(
Expand All @@ -26,23 +64,20 @@ def __init__(
vector_function: Optional[Literal["cosine_similarity", "inner_product", "l2_distance"]] = None,
):
"""
Create the PgvectorEmbeddingRetriever component.
:param document_store: An instance of PgvectorDocumentStore.
:param filters: Filters applied to the retrieved Documents. Defaults to None.
:param top_k: Maximum number of Documents to return, defaults to 10.
:param document_store: An instance of `PgvectorDocumentStore`.
:param filters: Filters applied to the retrieved Documents.
:param top_k: Maximum number of Documents to return.
:param vector_function: The similarity function to use when searching for similar embeddings.
Defaults to the one set in the `document_store` instance.
"cosine_similarity" and "inner_product" are similarity functions and
`"cosine_similarity"` and `"inner_product"` are similarity functions and
higher scores indicate greater similarity between the documents.
"l2_distance" returns the straight-line distance between vectors,
`"l2_distance"` returns the straight-line distance between vectors,
and the most similar documents are the ones with the smallest score.
Important: if the document store is using the "hnsw" search strategy, the vector function
**Important**: if the document store is using the `"hnsw"` search strategy, the vector function
should match the one utilized during index creation to take advantage of the index.
:type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"]
:raises ValueError: If `document_store` is not an instance of PgvectorDocumentStore.
:raises ValueError: If `document_store` is not an instance of `PgvectorDocumentStore` or if `vector_function`
is not one of the valid options.
"""
if not isinstance(document_store, PgvectorDocumentStore):
msg = "document_store must be an instance of PgvectorDocumentStore"
Expand All @@ -58,6 +93,12 @@ def __init__(
self.vector_function = vector_function or document_store.vector_function

def to_dict(self) -> Dict[str, Any]:
"""
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
filters=self.filters,
Expand All @@ -68,6 +109,14 @@ def to_dict(self) -> Dict[str, Any]:

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "PgvectorEmbeddingRetriever":
"""
Deserializes the component from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
doc_store_params = data["init_parameters"]["document_store"]
data["init_parameters"]["document_store"] = PgvectorDocumentStore.from_dict(doc_store_params)
return default_from_dict(cls, data)
Expand All @@ -81,14 +130,14 @@ def run(
vector_function: Optional[Literal["cosine_similarity", "inner_product", "l2_distance"]] = None,
):
"""
Retrieve documents from the PgvectorDocumentStore, based on their embeddings.
Retrieve documents from the `PgvectorDocumentStore`, based on their embeddings.
:param query_embedding: Embedding of the query.
:param filters: Filters applied to the retrieved Documents.
:param top_k: Maximum number of Documents to return.
:param vector_function: The similarity function to use when searching for similar embeddings.
:type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"]
:return: List of Documents similar to `query_embedding`.
:returns: List of Documents similar to `query_embedding`.
"""
filters = filters or self.filters
top_k = top_k or self.top_k
Expand Down
Loading

0 comments on commit 2c3e446

Please sign in to comment.