From 0d15e3675785a4db745b98a7c53f235ced57c7a2 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Wed, 31 Jan 2024 17:43:14 +0100 Subject: [PATCH] Pgvector - embedding retrieval (#298) * squash * Update integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py Co-authored-by: Massimiliano Pippi * Update integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py Co-authored-by: Massimiliano Pippi * Update integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py Co-authored-by: Massimiliano Pippi * Update integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py Co-authored-by: Massimiliano Pippi * fix fmt --------- Co-authored-by: Massimiliano Pippi --- .../pgvector/document_store.py | 102 +++++++++++++- integrations/pgvector/tests/conftest.py | 2 +- .../tests/test_embedding_retrieval.py | 130 ++++++++++++++++++ 3 files changed, 229 insertions(+), 5 deletions(-) create mode 100644 integrations/pgvector/tests/test_embedding_retrieval.py diff --git a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py index b49bd87c3..0abaaecce 100644 --- a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py +++ b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py @@ -52,8 +52,10 @@ meta = EXCLUDED.meta """ +VALID_VECTOR_FUNCTIONS = ["cosine_similarity", "inner_product", "l2_distance"] + VECTOR_FUNCTION_TO_POSTGRESQL_OPS = { - "cosine_distance": "vector_cosine_ops", + "cosine_similarity": "vector_cosine_ops", "inner_product": "vector_ip_ops", "l2_distance": "vector_l2_ops", } @@ -70,7 +72,7 @@ def __init__( connection_string: str, table_name: str = "haystack_documents", embedding_dimension: int = 768, - vector_function: Literal["cosine_distance", "inner_product", "l2_distance"] = "cosine_distance", + vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"] = "cosine_similarity", recreate_table: bool = False, search_strategy: Literal["exact_nearest_neighbor", "hnsw"] = "exact_nearest_neighbor", hnsw_recreate_index_if_exists: bool = False, @@ -87,12 +89,23 @@ def __init__( :param table_name: The name of the table to use to store Haystack documents. Defaults to "haystack_documents". :param embedding_dimension: The dimension of the embedding. Defaults to 768. :param vector_function: The similarity function to use when searching for similar embeddings. - Defaults to "cosine_distance". Set it to one of the following values: - :type vector_function: Literal["cosine_distance", "inner_product", "l2_distance"] + Defaults to "cosine_similarity". "cosine_similarity" and "inner_product" are similarity functions and + higher scores indicate greater similarity between the documents. + "l2_distance" returns the straight-line distance between vectors, + and the most similar documents are the ones with the smallest score. + + Important: when using the "hnsw" search strategy, an index will be created that depends on the + `vector_function` passed here. Make sure subsequent queries will keep using the same + vector similarity function in order to take advantage of the index. + :type vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"] :param recreate_table: Whether to recreate the table if it already exists. Defaults to False. :param search_strategy: The search strategy to use when searching for similar embeddings. Defaults to "exact_nearest_neighbor". "hnsw" is an approximate nearest neighbor search strategy, which trades off some accuracy for speed; it is recommended for large numbers of documents. + + Important: when using the "hnsw" search strategy, an index will be created that depends on the + `vector_function` passed here. Make sure subsequent queries will keep using the same + vector similarity function in order to take advantage of the index. :type search_strategy: Literal["exact_nearest_neighbor", "hnsw"] :param hnsw_recreate_index_if_exists: Whether to recreate the HNSW index if it already exists. Defaults to False. Only used if search_strategy is set to "hnsw". @@ -107,6 +120,9 @@ def __init__( self.connection_string = connection_string self.table_name = table_name self.embedding_dimension = embedding_dimension + if vector_function not in VALID_VECTOR_FUNCTIONS: + msg = f"vector_function must be one of {VALID_VECTOR_FUNCTIONS}, but got {vector_function}" + raise ValueError(msg) self.vector_function = vector_function self.recreate_table = recreate_table self.search_strategy = search_strategy @@ -423,3 +439,81 @@ def delete_documents(self, document_ids: List[str]) -> None: ) self._execute_sql(delete_sql, error_msg="Could not delete documents from PgvectorDocumentStore") + + def _embedding_retrieval( + self, + query_embedding: List[float], + *, + filters: Optional[Dict[str, Any]] = None, + top_k: int = 10, + vector_function: Optional[Literal["cosine_similarity", "inner_product", "l2_distance"]] = None, + ) -> List[Document]: + """ + Retrieves documents that are most similar to the query embedding using a vector similarity metric. + + This method is not meant to be part of the public interface of + `PgvectorDocumentStore` and it should not be called directly. + `PgvectorEmbeddingRetriever` uses this method directly and is the public interface for it. + :raises ValueError + :return: List of Documents that are most similar to `query_embedding` + """ + + if not query_embedding: + msg = "query_embedding must be a non-empty list of floats" + raise ValueError(msg) + if len(query_embedding) != self.embedding_dimension: + msg = ( + f"query_embedding dimension ({len(query_embedding)}) does not match PgvectorDocumentStore " + f"embedding dimension ({self.embedding_dimension})." + ) + raise ValueError(msg) + + vector_function = vector_function or self.vector_function + if vector_function not in VALID_VECTOR_FUNCTIONS: + msg = f"vector_function must be one of {VALID_VECTOR_FUNCTIONS}, but got {vector_function}" + raise ValueError(msg) + + # the vector must be a string with this format: "'[3,1,2]'" + query_embedding_for_postgres = f"'[{','.join(str(el) for el in query_embedding)}]'" + + # to compute the scores, we use the approach described in pgvector README: + # https://github.com/pgvector/pgvector?tab=readme-ov-file#distances + # cosine_similarity and inner_product are modified from the result of the operator + if vector_function == "cosine_similarity": + score_definition = f"1 - (embedding <=> {query_embedding_for_postgres}) AS score" + elif vector_function == "inner_product": + score_definition = f"(embedding <#> {query_embedding_for_postgres}) * -1 AS score" + elif vector_function == "l2_distance": + score_definition = f"embedding <-> {query_embedding_for_postgres} AS score" + + sql_select = SQL("SELECT *, {score} FROM {table_name}").format( + table_name=Identifier(self.table_name), + score=SQL(score_definition), + ) + + sql_where_clause = SQL("") + params = () + if filters: + sql_where_clause, params = _convert_filters_to_where_clause_and_params(filters) + + # we always want to return the most similar documents first + # so when using l2_distance, the sort order must be ASC + sort_order = "ASC" if vector_function == "l2_distance" else "DESC" + + sql_sort = SQL(" ORDER BY score {sort_order} LIMIT {top_k}").format( + top_k=SQLLiteral(top_k), + sort_order=SQL(sort_order), + ) + + sql_query = sql_select + sql_where_clause + sql_sort + + result = self._execute_sql( + sql_query, + params, + error_msg="Could not retrieve documents from PgvectorDocumentStore.", + cursor=self._dict_cursor, + ) + + records = result.fetchall() + docs = self._from_pg_to_haystack_documents(records) + return docs diff --git a/integrations/pgvector/tests/conftest.py b/integrations/pgvector/tests/conftest.py index 34260f409..743e8de14 100644 --- a/integrations/pgvector/tests/conftest.py +++ b/integrations/pgvector/tests/conftest.py @@ -7,7 +7,7 @@ def document_store(request): connection_string = "postgresql://postgres:postgres@localhost:5432/postgres" table_name = f"haystack_{request.node.name}" embedding_dimension = 768 - vector_function = "cosine_distance" + vector_function = "cosine_similarity" recreate_table = True search_strategy = "exact_nearest_neighbor" diff --git a/integrations/pgvector/tests/test_embedding_retrieval.py b/integrations/pgvector/tests/test_embedding_retrieval.py new file mode 100644 index 000000000..1d5e8e297 --- /dev/null +++ b/integrations/pgvector/tests/test_embedding_retrieval.py @@ -0,0 +1,130 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import List + +import pytest +from haystack.dataclasses.document import Document +from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore +from numpy.random import rand + + +class TestEmbeddingRetrieval: + @pytest.fixture + def document_store_w_hnsw_index(self, request): + connection_string = "postgresql://postgres:postgres@localhost:5432/postgres" + table_name = f"haystack_hnsw_{request.node.name}" + embedding_dimension = 768 + vector_function = "cosine_similarity" + recreate_table = True + search_strategy = "hnsw" + + store = PgvectorDocumentStore( + connection_string=connection_string, + table_name=table_name, + embedding_dimension=embedding_dimension, + vector_function=vector_function, + recreate_table=recreate_table, + search_strategy=search_strategy, + ) + yield store + + store.delete_table() + + @pytest.mark.parametrize("document_store", ["document_store", "document_store_w_hnsw_index"], indirect=True) + def test_embedding_retrieval_cosine_similarity(self, document_store: PgvectorDocumentStore): + query_embedding = [0.1] * 768 + most_similar_embedding = [0.8] * 768 + second_best_embedding = [0.8] * 700 + [0.1] * 3 + [0.2] * 65 + another_embedding = rand(768).tolist() + + docs = [ + Document(content="Most similar document (cosine sim)", embedding=most_similar_embedding), + Document(content="2nd best document (cosine sim)", embedding=second_best_embedding), + Document(content="Not very similar document (cosine sim)", embedding=another_embedding), + ] + + document_store.write_documents(docs) + + results = document_store._embedding_retrieval( + query_embedding=query_embedding, top_k=2, filters={}, vector_function="cosine_similarity" + ) + assert len(results) == 2 + assert results[0].content == "Most similar document (cosine sim)" + assert results[1].content == "2nd best document (cosine sim)" + assert results[0].score > results[1].score + + @pytest.mark.parametrize("document_store", ["document_store", "document_store_w_hnsw_index"], indirect=True) + def test_embedding_retrieval_inner_product(self, document_store: PgvectorDocumentStore): + query_embedding = [0.1] * 768 + most_similar_embedding = [0.8] * 768 + second_best_embedding = [0.8] * 700 + [0.1] * 3 + [0.2] * 65 + another_embedding = rand(768).tolist() + + docs = [ + Document(content="Most similar document (inner product)", embedding=most_similar_embedding), + Document(content="2nd best document (inner product)", embedding=second_best_embedding), + Document(content="Not very similar document (inner product)", embedding=another_embedding), + ] + + document_store.write_documents(docs) + + results = document_store._embedding_retrieval( + query_embedding=query_embedding, top_k=2, filters={}, vector_function="inner_product" + ) + assert len(results) == 2 + assert results[0].content == "Most similar document (inner product)" + assert results[1].content == "2nd best document (inner product)" + assert results[0].score > results[1].score + + @pytest.mark.parametrize("document_store", ["document_store", "document_store_w_hnsw_index"], indirect=True) + def test_embedding_retrieval_l2_distance(self, document_store: PgvectorDocumentStore): + query_embedding = [0.1] * 768 + most_similar_embedding = [0.1] * 765 + [0.15] * 3 + second_best_embedding = [0.1] * 700 + [0.1] * 3 + [0.2] * 65 + another_embedding = rand(768).tolist() + + docs = [ + Document(content="Most similar document (l2 dist)", embedding=most_similar_embedding), + Document(content="2nd best document (l2 dist)", embedding=second_best_embedding), + Document(content="Not very similar document (l2 dist)", embedding=another_embedding), + ] + + document_store.write_documents(docs) + + results = document_store._embedding_retrieval( + query_embedding=query_embedding, top_k=2, filters={}, vector_function="l2_distance" + ) + assert len(results) == 2 + assert results[0].content == "Most similar document (l2 dist)" + assert results[1].content == "2nd best document (l2 dist)" + assert results[0].score < results[1].score + + @pytest.mark.parametrize("document_store", ["document_store", "document_store_w_hnsw_index"], indirect=True) + def test_embedding_retrieval_with_filters(self, document_store: PgvectorDocumentStore): + docs = [Document(content=f"Document {i}", embedding=rand(768).tolist()) for i in range(10)] + + for i in range(10): + docs[i].meta["meta_field"] = "custom_value" if i % 2 == 0 else "other_value" + + document_store.write_documents(docs) + + query_embedding = [0.1] * 768 + filters = {"field": "meta.meta_field", "operator": "==", "value": "custom_value"} + + results = document_store._embedding_retrieval(query_embedding=query_embedding, top_k=3, filters=filters) + assert len(results) == 3 + for result in results: + assert result.meta["meta_field"] == "custom_value" + assert results[0].score > results[1].score > results[2].score + + def test_empty_query_embedding(self, document_store: PgvectorDocumentStore): + query_embedding: List[float] = [] + with pytest.raises(ValueError): + document_store._embedding_retrieval(query_embedding=query_embedding) + + def test_query_embedding_wrong_dimension(self, document_store: PgvectorDocumentStore): + query_embedding = [0.1] * 4 + with pytest.raises(ValueError): + document_store._embedding_retrieval(query_embedding=query_embedding)