Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/pg_vectorstore.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"To enable search with filters, it is necessary to declare the columns that you want to filter on when creating the table. The vectorstore supports a set of filters that can be applied against the metadata fields of the documents.\n",
"To achieve performant search with filters, it is crucial to declare the columns you want to filter on within the `metadata_columns` when creating the table, as filtering directly on these columns is far more efficient than attempting to filter on fields within a metadata JSON column. The vectorstore supports a set of filters that can be applied against the metadata fields of the documents.\n",
"\n",
"`PGVectorStore` currently supports the following operators.\n",
"\n",
Expand Down
4 changes: 2 additions & 2 deletions examples/pg_vectorstore_how_to.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,7 @@
"source": [
"### Search for documents with metadata filter\n",
"\n",
"A Vector Store can take advantage of relational data to filter similarity searches. The vectorstore supports a set of filters that can be applied against the metadata fields of the documents. See the [migration guide](https://github.com/langchain-ai/langchain-postgres/blob/main/examples/migrate_pgvector_to_pgvectorstore.ipynb) for details on how to migrate to use metadata columns.\n",
"A Vector Store can take advantage of relational data to filter similarity searches. The vectorstore supports a set of filters that can be applied against the metadata fields of the documents. See the [migration guide](https://github.com/langchain-ai/langchain-postgres/blob/main/examples/migrate_pgvector_to_pgvectorstore.ipynb) for details on how to migrate to use metadata columns for efficient filtering.\n",
"\n",
"`PGVectorStore` currently supports the following operators and all Postgres data types.\n",
"\n",
Expand Down Expand Up @@ -645,7 +645,7 @@
"\n",
"- **`metadata_columns=[\"name\", \"category\", \"price_usd\", \"quantity\", \"sku\", \"image_url\"]`**: These columns are treated as metadata for each product. Metadata provides additional information about a product, such as its name, category, price, quantity available, SKU (Stock Keeping Unit), and an image URL. This information is useful for displaying product details in search results or for filtering and categorization.\n",
"\n",
"- **`metadata_json_column=\"metadata\"`**: The `metadata` column can store any additional information about the products in a flexible JSON format. This allows for storing varied and complex data that doesn't fit into the standard columns.\n"
"- **`metadata_json_column=\"metadata\"`**: The `metadata` column can store any additional information about the products in a flexible JSON format. This allows for storing varied and complex data that doesn't fit into the standard columns. Note that filtering on fields within the JSON but not in `metadata_columns` will be less efficient.\n"
]
},
{
Expand Down
41 changes: 33 additions & 8 deletions langchain_postgres/v2/async_vectorstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import annotations

import copy
import datetime
import json
import uuid
from typing import Any, Callable, Iterable, Optional, Sequence
Expand Down Expand Up @@ -54,6 +55,16 @@
.union(SPECIAL_CASED_OPERATORS)
)

# Maps a Python value's type to the Postgres type used when casting a
# JSON-extracted metadata value (``metadata->>'field'`` yields TEXT) so
# that comparison filters are evaluated with the correct type semantics.
# NOTE(review): None is intentionally absent — NULL checks go through the
# $exists operator, which never casts.
PYTHON_TO_POSTGRES_TYPE_MAP: dict[type, str] = {
    bool: "BOOLEAN",
    int: "INTEGER",
    float: "FLOAT",
    str: "TEXT",
    datetime.date: "DATE",
    datetime.datetime: "TIMESTAMP",
    datetime.time: "TIME",
}


class AsyncPGVectorStore(VectorStore):
"""Postgres Vector Store class"""
Expand Down Expand Up @@ -1093,19 +1104,33 @@ def _handle_field_filter(
operator = "$eq"
filter_value = value

field_selector = field
if self.metadata_json_column is not None and field not in self.metadata_columns and field not in (
self.id_column,
self.content_column,
self.embedding_column
):
filter_value_type = type(filter_value[0]) if (isinstance(filter_value, list) or isinstance(filter_value, tuple)) else type(filter_value)
postgres_type = PYTHON_TO_POSTGRES_TYPE_MAP.get(filter_value_type)
if postgres_type is None:
raise ValueError(f"Unsupported type: {filter_value_type}")
field_selector = f"{self.metadata_json_column}->>'{field}'"
if postgres_type != "TEXT" and operator != "$exists":
field_selector = f"({field_selector})::{postgres_type}"

suffix_id = str(uuid.uuid4()).split("-")[0]
if operator in COMPARISONS_TO_NATIVE:
# Then we implement an equality filter
# native is trusted input
native = COMPARISONS_TO_NATIVE[operator]
param_name = f"{field}_{suffix_id}"
return f"{field} {native} :{param_name}", {f"{param_name}": filter_value}
return f"{field_selector} {native} :{param_name}", {f"{param_name}": filter_value}
elif operator == "$between":
# Use AND with two comparisons
low, high = filter_value
low_param_name = f"{field}_low_{suffix_id}"
high_param_name = f"{field}_high_{suffix_id}"
return f"({field} BETWEEN :{low_param_name} AND :{high_param_name})", {
return f"({field_selector} BETWEEN :{low_param_name} AND :{high_param_name})", {
f"{low_param_name}": low,
f"{high_param_name}": high,
}
Expand All @@ -1123,18 +1148,18 @@ def _handle_field_filter(
)
param_name = f"{field}_{operator.replace('$', '')}_{suffix_id}"
if operator == "$in":
return f"{field} = ANY(:{param_name})", {f"{param_name}": filter_value}
return f"{field_selector} = ANY(:{param_name})", {f"{param_name}": filter_value}
else: # i.e. $nin
return f"{field} <> ALL (:{param_name})", {
return f"{field_selector} <> ALL (:{param_name})", {
f"{param_name}": filter_value
}

elif operator in {"$like", "$ilike"}:
param_name = f"{field}_{operator.replace('$', '')}_{suffix_id}"
if operator == "$like":
return f"({field} LIKE :{param_name})", {f"{param_name}": filter_value}
return f"({field_selector} LIKE :{param_name})", {f"{param_name}": filter_value}
else: # i.e. $ilike
return f"({field} ILIKE :{param_name})", {f"{param_name}": filter_value}
return f"({field_selector} ILIKE :{param_name})", {f"{param_name}": filter_value}
elif operator == "$exists":
if not isinstance(filter_value, bool):
raise ValueError(
Expand All @@ -1143,9 +1168,9 @@ def _handle_field_filter(
)
else:
if filter_value:
return f"({field} IS NOT NULL)", {}
return f"({field_selector} IS NOT NULL)", {}
else:
return f"({field} IS NULL)", {}
return f"({field_selector} IS NULL)", {}
else:
raise NotImplementedError()

Expand Down
32 changes: 32 additions & 0 deletions tests/unit_tests/v2/test_async_pg_vectorstore_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
HYBRID_SEARCH_TABLE1 = "test_table_hybrid1" + str(uuid.uuid4()).replace("-", "_")
HYBRID_SEARCH_TABLE2 = "test_table_hybrid2" + str(uuid.uuid4()).replace("-", "_")
CUSTOM_FILTER_TABLE = "custom_filter" + str(uuid.uuid4()).replace("-", "_")
CUSTOM_METADATA_JSON_TABLE = "custom_metadata_json" + str(uuid.uuid4()).replace("-", "_")
VECTOR_SIZE = 768
sync_method_exception_str = "Sync methods are not implemented for AsyncPGVectorStore. Use PGVectorStore interface instead."

Expand Down Expand Up @@ -215,6 +216,24 @@ async def vs_custom_filter(
await vs_custom_filter.aadd_documents(filter_docs, ids=ids)
yield vs_custom_filter

@pytest_asyncio.fixture(scope="class")
async def vs_metadata_json(
    self, engine: PGEngine
) -> AsyncIterator[AsyncPGVectorStore]:
    """Class-scoped store whose metadata lives only in the JSON column.

    The table is created with ``store_metadata=True`` and no dedicated
    metadata columns, so every metadata filter must resolve against the
    JSON metadata column.
    """
    await engine._ainit_vectorstore_table(
        CUSTOM_METADATA_JSON_TABLE,
        VECTOR_SIZE,
        store_metadata=True,
    )
    store = await AsyncPGVectorStore.create(
        engine,
        embedding_service=embeddings_service,
        table_name=CUSTOM_METADATA_JSON_TABLE,
    )
    await store.aadd_documents(filter_docs, ids=ids)
    yield store

async def test_asimilarity_search_score(self, vs: AsyncPGVectorStore) -> None:
results = await vs.asimilarity_search_with_score("foo")
assert len(results) == 4
Expand Down Expand Up @@ -370,6 +389,19 @@ async def test_vectorstore_with_metadata_filters(
)
assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter

@pytest.mark.parametrize("test_filter, expected_ids", FILTERING_TEST_CASES)
async def test_vectorstore_with_json_metadata_filters(
    self,
    vs_metadata_json: AsyncPGVectorStore,
    test_filter: dict,
    expected_ids: list[str],
) -> None:
    """End-to-end search where filters target fields in the JSON metadata column."""
    results = await vs_metadata_json.asimilarity_search(
        "meow", k=5, filter=test_filter
    )
    actual_codes = [doc.metadata["code"] for doc in results]
    assert actual_codes == expected_ids, test_filter

async def test_asimilarity_hybrid_search(self, vs: AsyncPGVectorStore) -> None:
results = await vs.asimilarity_search(
"foo", k=1, hybrid_search_config=HybridSearchConfig()
Expand Down
64 changes: 64 additions & 0 deletions tests/unit_tests/v2/test_pg_vectorstore_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
CUSTOM_TABLE = "custom" + str(uuid.uuid4()).replace("-", "_")
CUSTOM_FILTER_TABLE = "custom_filter" + str(uuid.uuid4()).replace("-", "_")
CUSTOM_FILTER_TABLE_SYNC = "custom_filter_sync" + str(uuid.uuid4()).replace("-", "_")
CUSTOM_METADATA_JSON_TABLE = "custom_metadata_json" + str(uuid.uuid4()).replace("-", "_")
CUSTOM_METADATA_JSON_TABLE_SYNC = "custom_metadata_json_sync" + str(uuid.uuid4()).replace("-", "_")
VECTOR_SIZE = 768

embeddings_service = DeterministicFakeEmbedding(size=VECTOR_SIZE)
Expand Down Expand Up @@ -163,6 +165,24 @@ async def vs_custom_filter(self, engine: PGEngine) -> AsyncIterator[PGVectorStor
await vs_custom_filter.aadd_documents(filter_docs, ids=ids)
yield vs_custom_filter

@pytest_asyncio.fixture(scope="class")
async def vs_metadata_json(
    self, engine: PGEngine
) -> AsyncIterator[PGVectorStore]:
    """Class-scoped store whose metadata lives only in the JSON column.

    Created with ``store_metadata=True`` and no declared metadata columns,
    forcing filters to be applied against the JSON metadata column.
    """
    await engine.ainit_vectorstore_table(
        CUSTOM_METADATA_JSON_TABLE,
        VECTOR_SIZE,
        store_metadata=True,
    )
    store = await PGVectorStore.create(
        engine,
        embedding_service=embeddings_service,
        table_name=CUSTOM_METADATA_JSON_TABLE,
    )
    await store.aadd_documents(filter_docs, ids=ids)
    yield store

async def test_asimilarity_search_score(self, vs: PGVectorStore) -> None:
results = await vs.asimilarity_search_with_score("foo")
assert len(results) == 4
Expand Down Expand Up @@ -265,6 +285,19 @@ async def test_vectorstore_with_metadata_filters(
"meow", k=5, filter=test_filter
)
assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter

@pytest.mark.parametrize("test_filter, expected_ids", FILTERING_TEST_CASES)
async def test_vectorstore_with_json_metadata_filters(
    self,
    vs_metadata_json: PGVectorStore,
    test_filter: dict,
    expected_ids: list[str],
) -> None:
    """End-to-end search where filters target fields in the JSON metadata column."""
    results = await vs_metadata_json.asimilarity_search(
        "meow", k=5, filter=test_filter
    )
    actual_codes = [doc.metadata["code"] for doc in results]
    assert actual_codes == expected_ids, test_filter

async def test_asimilarity_hybrid_search(self, vs: PGVectorStore) -> None:
results = await vs.asimilarity_search(
Expand Down Expand Up @@ -375,6 +408,24 @@ async def vs_custom_filter_sync(
vs_custom_filter_sync.add_documents(filter_docs, ids=ids)
yield vs_custom_filter_sync

@pytest_asyncio.fixture(scope="class")
async def vs_metadata_json_sync(
    self, engine_sync: PGEngine
) -> AsyncIterator[PGVectorStore]:
    """Sync-engine counterpart of the JSON-metadata store fixture.

    Table creation and document insertion use the synchronous engine API;
    only store construction is awaited.
    """
    engine_sync.init_vectorstore_table(
        CUSTOM_METADATA_JSON_TABLE_SYNC,
        VECTOR_SIZE,
        store_metadata=True,
    )
    store = await PGVectorStore.create(
        engine_sync,
        embedding_service=embeddings_service,
        table_name=CUSTOM_METADATA_JSON_TABLE_SYNC,
    )
    store.add_documents(filter_docs, ids=ids)
    yield store

def test_similarity_search_score(self, vs_custom: PGVectorStore) -> None:
results = vs_custom.similarity_search_with_score("foo")
assert len(results) == 4
Expand Down Expand Up @@ -429,6 +480,19 @@ def test_sync_vectorstore_with_metadata_filters(
docs = vs_custom_filter_sync.similarity_search("meow", k=5, filter=test_filter)
assert [doc.metadata["code"] for doc in docs] == expected_ids, test_filter

@pytest.mark.parametrize("test_filter, expected_ids", FILTERING_TEST_CASES)
def test_sync_vectorstore_with_json_metadata_filters(
    self,
    vs_metadata_json_sync: PGVectorStore,
    test_filter: dict,
    expected_ids: list[str],
) -> None:
    """End-to-end sync search where filters target fields in the JSON metadata column."""
    results = vs_metadata_json_sync.similarity_search(
        "meow", k=5, filter=test_filter
    )
    actual_codes = [doc.metadata["code"] for doc in results]
    assert actual_codes == expected_ids, test_filter

@pytest.mark.parametrize("test_filter", NEGATIVE_TEST_CASES)
def test_metadata_filter_negative_tests(
self, vs_custom_filter_sync: PGVectorStore, test_filter: dict
Expand Down