diff --git a/examples/pg_vectorstore.ipynb b/examples/pg_vectorstore.ipynb
index 2c20e90..a6e3837 100644
--- a/examples/pg_vectorstore.ipynb
+++ b/examples/pg_vectorstore.ipynb
@@ -359,7 +359,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "To enable search with filters, it is necessary to declare the columns that you want to filter on when creating the table. The vectorstore supports a set of filters that can be applied against the metadata fields of the documents.\n",
+    "To achieve performant search with filters, declare the columns you want to filter on in `metadata_columns` when creating the table; filtering directly on these columns is far more efficient than filtering on fields inside the metadata JSON column. The vectorstore supports a set of filters that can be applied against the metadata fields of the documents.\n",
     "\n",
     "`PGVectorStore` currently supports the following operators.\n",
     "\n",
diff --git a/examples/pg_vectorstore_how_to.ipynb b/examples/pg_vectorstore_how_to.ipynb
index 2c5e75a..fb38bfa 100644
--- a/examples/pg_vectorstore_how_to.ipynb
+++ b/examples/pg_vectorstore_how_to.ipynb
@@ -530,7 +530,7 @@
    "source": [
     "### Search for documents with metadata filter\n",
     "\n",
-    "A Vector Store can take advantage of relational data to filter similarity searches. The vectorstore supports a set of filters that can be applied against the metadata fields of the documents. See the [migration guide](https://github.com/langchain-ai/langchain-postgres/blob/main/examples/migrate_pgvector_to_pgvectorstore.ipynb) for details on how to migrate to use metadata columns.\n",
+    "A Vector Store can take advantage of relational data to filter similarity searches. The vectorstore supports a set of filters that can be applied against the metadata fields of the documents. See the [migration guide](https://github.com/langchain-ai/langchain-postgres/blob/main/examples/migrate_pgvector_to_pgvectorstore.ipynb) for details on how to migrate to use metadata columns for efficient filtering.\n",
     "\n",
     "`PGVectorStore` currently supports the following operators and all Postgres data types.\n",
     "\n",
@@ -645,7 +645,7 @@
     "\n",
     "- **`metadata_columns=[\"name\", \"category\", \"price_usd\", \"quantity\", \"sku\", \"image_url\"]`**: These columns are treated as metadata for each product. Metadata provides additional information about a product, such as its name, category, price, quantity available, SKU (Stock Keeping Unit), and an image URL. This information is useful for displaying product details in search results or for filtering and categorization.\n",
     "\n",
-    "- **`metadata_json_column=\"metadata\"`**: The `metadata` column can store any additional information about the products in a flexible JSON format. This allows for storing varied and complex data that doesn't fit into the standard columns.\n"
+    "- **`metadata_json_column=\"metadata\"`**: The `metadata` column can store any additional information about the products in a flexible JSON format. This allows for storing varied and complex data that doesn't fit into the standard columns. Note that filtering on fields that exist only in this JSON column, and not in `metadata_columns`, is less efficient.\n"
    ]
   },
   {
diff --git a/langchain_postgres/v2/async_vectorstore.py b/langchain_postgres/v2/async_vectorstore.py
index c83930e..5eaae84 100644
--- a/langchain_postgres/v2/async_vectorstore.py
+++ b/langchain_postgres/v2/async_vectorstore.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import copy
+import datetime
 import json
 import uuid
 from typing import Any, Callable, Iterable, Optional, Sequence
@@ -54,6 +55,16 @@
     .union(SPECIAL_CASED_OPERATORS)
 )
 
+PYTHON_TO_POSTGRES_TYPE_MAP = {
+    int: "INTEGER",
+    float: "FLOAT",
+    str: "TEXT",
+    bool: "BOOLEAN",
+    datetime.date: "DATE",
+    datetime.datetime: "TIMESTAMP",
+    datetime.time: "TIME",
+}
+
 
 class AsyncPGVectorStore(VectorStore):
     """Postgres Vector Store class"""
@@ -1093,19 +1104,33 @@ def _handle_field_filter(
             operator = "$eq"
             filter_value = value
 
+        field_selector = field
+        if self.metadata_json_column is not None and field not in self.metadata_columns and field not in (
+            self.id_column,
+            self.content_column,
+            self.embedding_column
+        ):
+            filter_value_type = type(filter_value[0]) if (isinstance(filter_value, list) or isinstance(filter_value, tuple)) else type(filter_value)
+            postgres_type = PYTHON_TO_POSTGRES_TYPE_MAP.get(filter_value_type)
+            if postgres_type is None:
+                raise ValueError(f"Unsupported type: {filter_value_type}")
+            field_selector = f"{self.metadata_json_column}->>'{field}'"
+            if postgres_type != "TEXT" and operator != "$exists":
+                field_selector = f"({field_selector})::{postgres_type}"
+
         suffix_id = str(uuid.uuid4()).split("-")[0]
         if operator in COMPARISONS_TO_NATIVE:
             # Then we implement an equality filter
             # native is trusted input
             native = COMPARISONS_TO_NATIVE[operator]
             param_name = f"{field}_{suffix_id}"
-            return f"{field} {native} :{param_name}", {f"{param_name}": filter_value}
+            return f"{field_selector} {native} :{param_name}", {f"{param_name}": filter_value}
         elif operator == "$between":
             # Use AND with two comparisons
             low, high = filter_value
             low_param_name = f"{field}_low_{suffix_id}"
             high_param_name = f"{field}_high_{suffix_id}"
-            return f"({field} BETWEEN :{low_param_name} AND :{high_param_name})", {
+            return f"({field_selector} BETWEEN :{low_param_name} AND :{high_param_name})", {
                 f"{low_param_name}": low,
                 f"{high_param_name}": high,
             }
@@ -1123,18 +1148,18 @@ def _handle_field_filter(
                         )
             param_name = f"{field}_{operator.replace('$', '')}_{suffix_id}"
             if operator == "$in":
-                return f"{field} = ANY(:{param_name})", {f"{param_name}": filter_value}
+                return f"{field_selector} = ANY(:{param_name})", {f"{param_name}": filter_value}
             else:  # i.e. $nin
-                return f"{field} <> ALL (:{param_name})", {
+                return f"{field_selector} <> ALL (:{param_name})", {
                     f"{param_name}": filter_value
                 }
         elif operator in {"$like", "$ilike"}:
             param_name = f"{field}_{operator.replace('$', '')}_{suffix_id}"
             if operator == "$like":
-                return f"({field} LIKE :{param_name})", {f"{param_name}": filter_value}
+                return f"({field_selector} LIKE :{param_name})", {f"{param_name}": filter_value}
             else:  # i.e. $ilike
-                return f"({field} ILIKE :{param_name})", {f"{param_name}": filter_value}
+                return f"({field_selector} ILIKE :{param_name})", {f"{param_name}": filter_value}
         elif operator == "$exists":
             if not isinstance(filter_value, bool):
                 raise ValueError(
@@ -1143,9 +1168,9 @@
                 )
             else:
                 if filter_value:
-                    return f"({field} IS NOT NULL)", {}
+                    return f"({field_selector} IS NOT NULL)", {}
                 else:
-                    return f"({field} IS NULL)", {}
+                    return f"({field_selector} IS NULL)", {}
         else:
             raise NotImplementedError()
 
diff --git a/tests/unit_tests/v2/test_async_pg_vectorstore_search.py b/tests/unit_tests/v2/test_async_pg_vectorstore_search.py
index 84f9d72..c4b7c28 100644
--- a/tests/unit_tests/v2/test_async_pg_vectorstore_search.py
+++ b/tests/unit_tests/v2/test_async_pg_vectorstore_search.py
@@ -27,6 +27,7 @@
 HYBRID_SEARCH_TABLE1 = "test_table_hybrid1" + str(uuid.uuid4()).replace("-", "_")
 HYBRID_SEARCH_TABLE2 = "test_table_hybrid2" + str(uuid.uuid4()).replace("-", "_")
 CUSTOM_FILTER_TABLE = "custom_filter" + str(uuid.uuid4()).replace("-", "_")
+CUSTOM_METADATA_JSON_TABLE = "custom_metadata_json" + str(uuid.uuid4()).replace("-", "_")
 VECTOR_SIZE = 768
 
 sync_method_exception_str = "Sync methods are not implemented for AsyncPGVectorStore. Use PGVectorStore interface instead."
@@ -215,6 +216,24 @@ async def vs_custom_filter(
         await vs_custom_filter.aadd_documents(filter_docs, ids=ids)
         yield vs_custom_filter
 
+    @pytest_asyncio.fixture(scope="class")
+    async def vs_metadata_json(
+        self, engine: PGEngine
+    ) -> AsyncIterator[AsyncPGVectorStore]:
+        await engine._ainit_vectorstore_table(
+            CUSTOM_METADATA_JSON_TABLE,
+            VECTOR_SIZE,
+            store_metadata=True,
+        )
+
+        vs_metadata_json = await AsyncPGVectorStore.create(
+            engine,
+            embedding_service=embeddings_service,
+            table_name=CUSTOM_METADATA_JSON_TABLE,
+        )
+        await vs_metadata_json.aadd_documents(filter_docs, ids=ids)
+        yield vs_metadata_json
+
     async def test_asimilarity_search_score(self, vs: AsyncPGVectorStore) -> None:
         results = await vs.asimilarity_search_with_score("foo")
         assert len(results) == 4
@@ -370,6 +389,19 @@ async def test_vectorstore_with_metadata_filters(
         )
         assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter
 
+    @pytest.mark.parametrize("test_filter, expected_ids", FILTERING_TEST_CASES)
+    async def test_vectorstore_with_json_metadata_filters(
+        self,
+        vs_metadata_json: AsyncPGVectorStore,
+        test_filter: dict,
+        expected_ids: list[str],
+    ) -> None:
+        """Test end to end construction and search on json metadata."""
+        docs = await vs_metadata_json.asimilarity_search(
+            "meow", k=5, filter=test_filter
+        )
+        assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter
+
     async def test_asimilarity_hybrid_search(self, vs: AsyncPGVectorStore) -> None:
         results = await vs.asimilarity_search(
             "foo", k=1, hybrid_search_config=HybridSearchConfig()
         )
diff --git a/tests/unit_tests/v2/test_pg_vectorstore_search.py b/tests/unit_tests/v2/test_pg_vectorstore_search.py
index 7815a25..3fa5108 100644
--- a/tests/unit_tests/v2/test_pg_vectorstore_search.py
+++ b/tests/unit_tests/v2/test_pg_vectorstore_search.py
@@ -27,6 +27,8 @@
 CUSTOM_TABLE = "custom" + str(uuid.uuid4()).replace("-", "_")
 CUSTOM_FILTER_TABLE = "custom_filter" + str(uuid.uuid4()).replace("-", "_")
 CUSTOM_FILTER_TABLE_SYNC = "custom_filter_sync" + str(uuid.uuid4()).replace("-", "_")
+CUSTOM_METADATA_JSON_TABLE = "custom_metadata_json" + str(uuid.uuid4()).replace("-", "_")
+CUSTOM_METADATA_JSON_TABLE_SYNC = "custom_metadata_json_sync" + str(uuid.uuid4()).replace("-", "_")
 VECTOR_SIZE = 768
 
 embeddings_service = DeterministicFakeEmbedding(size=VECTOR_SIZE)
@@ -163,6 +165,24 @@ async def vs_custom_filter(self, engine: PGEngine) -> AsyncIterator[PGVectorStor
         await vs_custom_filter.aadd_documents(filter_docs, ids=ids)
         yield vs_custom_filter
 
+    @pytest_asyncio.fixture(scope="class")
+    async def vs_metadata_json(
+        self, engine: PGEngine
+    ) -> AsyncIterator[PGVectorStore]:
+        await engine.ainit_vectorstore_table(
+            CUSTOM_METADATA_JSON_TABLE,
+            VECTOR_SIZE,
+            store_metadata=True,
+        )
+
+        vs_metadata_json = await PGVectorStore.create(
+            engine,
+            embedding_service=embeddings_service,
+            table_name=CUSTOM_METADATA_JSON_TABLE,
+        )
+        await vs_metadata_json.aadd_documents(filter_docs, ids=ids)
+        yield vs_metadata_json
+
     async def test_asimilarity_search_score(self, vs: PGVectorStore) -> None:
         results = await vs.asimilarity_search_with_score("foo")
         assert len(results) == 4
@@ -265,6 +285,19 @@ async def test_vectorstore_with_metadata_filters(
             "meow", k=5, filter=test_filter
         )
         assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter
+
+    @pytest.mark.parametrize("test_filter, expected_ids", FILTERING_TEST_CASES)
+    async def test_vectorstore_with_json_metadata_filters(
+        self,
+        vs_metadata_json: PGVectorStore,
+        test_filter: dict,
+        expected_ids: list[str],
+    ) -> None:
+        """Test end to end construction and search on json metadata."""
+        docs = await vs_metadata_json.asimilarity_search(
+            "meow", k=5, filter=test_filter
+        )
+        assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter
 
     async def test_asimilarity_hybrid_search(self, vs: PGVectorStore) -> None:
         results = await vs.asimilarity_search(
@@ -375,6 +408,24 @@ async def vs_custom_filter_sync(
         vs_custom_filter_sync.add_documents(filter_docs, ids=ids)
         yield vs_custom_filter_sync
 
+    @pytest_asyncio.fixture(scope="class")
+    async def vs_metadata_json_sync(
+        self, engine_sync: PGEngine
+    ) -> AsyncIterator[PGVectorStore]:
+        engine_sync.init_vectorstore_table(
+            CUSTOM_METADATA_JSON_TABLE_SYNC,
+            VECTOR_SIZE,
+            store_metadata=True,
+        )
+
+        vs_metadata_json_sync = await PGVectorStore.create(
+            engine_sync,
+            embedding_service=embeddings_service,
+            table_name=CUSTOM_METADATA_JSON_TABLE_SYNC,
+        )
+        vs_metadata_json_sync.add_documents(filter_docs, ids=ids)
+        yield vs_metadata_json_sync
+
     def test_similarity_search_score(self, vs_custom: PGVectorStore) -> None:
         results = vs_custom.similarity_search_with_score("foo")
         assert len(results) == 4
@@ -429,6 +480,19 @@ def test_sync_vectorstore_with_metadata_filters(
         docs = vs_custom_filter_sync.similarity_search("meow", k=5, filter=test_filter)
         assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter
 
+    @pytest.mark.parametrize("test_filter, expected_ids", FILTERING_TEST_CASES)
+    def test_sync_vectorstore_with_json_metadata_filters(
+        self,
+        vs_metadata_json_sync: PGVectorStore,
+        test_filter: dict,
+        expected_ids: list[str],
+    ) -> None:
+        """Test end to end construction and search on json metadata."""
+        docs = vs_metadata_json_sync.similarity_search(
+            "meow", k=5, filter=test_filter
+        )
+        assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter
+
     @pytest.mark.parametrize("test_filter", NEGATIVE_TEST_CASES)
     def test_metadata_filter_negative_tests(
         self, vs_custom_filter_sync: PGVectorStore, test_filter: dict
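
For reference, below is a minimal sketch (not part of the patch) of the behavior this change enables, mirroring the new `vs_metadata_json` fixtures and tests: a store created with `store_metadata=True` and no declared `metadata_columns` can now be filtered on a field that lives only in the JSON metadata column. The connection URL, table name, and the `PGEngine.from_connection_string(...)` setup are illustrative assumptions; only the calls exercised by the added tests (`ainit_vectorstore_table`, `PGVectorStore.create`, `aadd_documents`, `asimilarity_search`) are otherwise relied on.

```python
import asyncio

from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_postgres import PGEngine, PGVectorStore

VECTOR_SIZE = 768


async def main() -> None:
    # Placeholder DSN; the from_connection_string setup is an assumption here.
    engine = PGEngine.from_connection_string(
        url="postgresql+asyncpg://user:password@localhost:5432/vectordb"
    )
    # store_metadata=True creates the default JSON metadata column; no
    # metadata_columns are declared, so every metadata field lives in the JSON.
    await engine.ainit_vectorstore_table(
        "products_json_demo", VECTOR_SIZE, store_metadata=True
    )

    store = await PGVectorStore.create(
        engine,
        embedding_service=DeterministicFakeEmbedding(size=VECTOR_SIZE),
        table_name="products_json_demo",
    )
    await store.aadd_documents(
        [
            Document(page_content="wireless mouse", metadata={"price_usd": 24.99}),
            Document(page_content="desk lamp", metadata={"price_usd": 12.50}),
        ]
    )

    # "price_usd" is not a declared metadata column, so the new field_selector
    # logic rewrites this filter against the JSON column, roughly as
    # (<metadata_json_column>->>'price_usd')::FLOAT >= :param.
    docs = await store.asimilarity_search(
        "mouse", k=2, filter={"price_usd": {"$gte": 20.0}}
    )
    print([doc.metadata for doc in docs])


asyncio.run(main())
```

With the `$gte: 20.0` filter, only the first document is a candidate for the vector ranking, since the second is excluded in the WHERE clause before similarity ordering; filtering this way works but remains slower than filtering on a dedicated metadata column.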