Skip to content

Commit fe88e5f

Browse files
committed
feat(store): add hash storage support for vector data
Add a vector_storage_type configuration option to enable Redis hash storage for improved memory efficiency. Both "json" (the default) and "hash" storage types are supported, with automatic vector serialization.

- Add vector_storage_type field to IndexConfig for storage type selection
- Implement byte string serialization for hash storage using array_to_buffer
- Maintain backward compatibility with JSON storage as default
- Add comprehensive test coverage for hash storage functionality
- Ensure type safety with proper schema copying and type annotations

Hash storage provides memory savings for vector data while maintaining full compatibility with existing vector search operations.
1 parent 33d974b commit fe88e5f

File tree

3 files changed

+276
-8
lines changed

3 files changed

+276
-8
lines changed

langgraph/store/redis/__init__.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -372,17 +372,38 @@ def _batch_put_ops(
372372

373373
vector_docs: list[dict[str, Any]] = []
374374
vector_keys: list[str] = []
375+
376+
# Check if we're using hash storage for vectors
377+
vector_storage_type = "json" # default
378+
if self.index_config:
379+
index_dict = dict(self.index_config)
380+
vector_storage_type = index_dict.get("vector_storage_type", "json")
381+
375382
for (ns, key, path, _), vector in zip(text_params, vectors):
376383
vector_key: tuple[str, str] = (ns, key)
377384
doc_id = doc_ids[vector_key]
385+
386+
# Prepare vector based on storage type
387+
if vector_storage_type == "hash":
388+
# For hash storage, convert vector to byte string
389+
from redisvl.redis.utils import array_to_buffer
390+
391+
vector_list = (
392+
vector.tolist() if hasattr(vector, "tolist") else vector
393+
)
394+
embedding_value = array_to_buffer(vector_list, "float32")
395+
else:
396+
# For JSON storage, keep as list
397+
embedding_value = (
398+
vector.tolist() if hasattr(vector, "tolist") else vector
399+
)
400+
378401
vector_docs.append(
379402
{
380403
"prefix": ns,
381404
"key": key,
382405
"field_name": path,
383-
"embedding": (
384-
vector.tolist() if hasattr(vector, "tolist") else vector
385-
),
406+
"embedding": embedding_value,
386407
"created_at": datetime.now(timezone.utc).timestamp(),
387408
"updated_at": datetime.now(timezone.utc).timestamp(),
388409
}

langgraph/store/redis/base.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,22 @@
22

33
from __future__ import annotations
44

5+
import copy
56
import logging
67
import threading
78
from collections import defaultdict
89
from datetime import datetime, timedelta, timezone
9-
from typing import Any, Generic, Iterable, Optional, Sequence, TypedDict, TypeVar, Union
10+
from typing import (
11+
Any,
12+
Dict,
13+
Generic,
14+
Iterable,
15+
Optional,
16+
Sequence,
17+
TypedDict,
18+
TypeVar,
19+
Union,
20+
)
1021

1122
from langgraph.store.base import (
1223
GetOp,
@@ -222,7 +233,15 @@ def __init__(
222233

223234
# Configure vector index if needed
224235
if self.index_config:
225-
vector_schema = self.SCHEMAS[1].copy()
236+
# Get storage type from index config, default to "json" for backward compatibility
237+
# Cast to dict to safely access potential extra fields
238+
index_dict = dict(self.index_config)
239+
vector_storage_type = index_dict.get("vector_storage_type", "json")
240+
241+
vector_schema: Dict[str, Any] = copy.deepcopy(self.SCHEMAS[1])
242+
# Update storage type in schema
243+
vector_schema["index"]["storage_type"] = vector_storage_type
244+
226245
vector_fields = vector_schema.get("fields", [])
227246
vector_field = None
228247
for f in vector_fields:
@@ -243,14 +262,14 @@ def __init__(
243262
"l2": "L2",
244263
}[
245264
_ensure_string_or_literal(
246-
self.index_config.get("distance_type", "cosine")
265+
index_dict.get("distance_type", "cosine")
247266
)
248267
],
249268
}
250269

251270
# Apply any additional vector type config
252-
if "ann_index_config" in self.index_config:
253-
vector_field["attrs"].update(self.index_config["ann_index_config"])
271+
if "ann_index_config" in index_dict:
272+
vector_field["attrs"].update(index_dict["ann_index_config"])
254273

255274
self.vector_index = SearchIndex.from_dict(
256275
vector_schema, redis_client=self._redis

tests/test_store.py

Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -642,3 +642,231 @@ def mock_echo(self, message):
642642
finally:
643643
client.close()
644644
client.connection_pool.disconnect()
645+
646+
647+
def test_vector_storage_json(redis_url, fake_embeddings: CharacterEmbeddings) -> None:
    """Exercise vector search and key lookup with the default JSON storage.

    ``vector_storage_type`` is deliberately left out of the index config so
    the store falls back to its default ("json").
    """
    namespace = ("test_json",)
    documents = {
        "doc1": {"text": "hello world"},
        "doc2": {"text": "hello universe"},
        "doc3": {"text": "goodbye world"},
    }

    # No "vector_storage_type" key here: the default is under test.
    index_settings = dict(
        dims=fake_embeddings.dims,
        embed=fake_embeddings,
        distance_type="cosine",
        fields=["text"],
    )
    ttl_settings = dict(default_ttl=2, refresh_on_read=True)

    with RedisStore.from_conn_string(
        redis_url, index=index_settings, ttl=ttl_settings
    ) as store:
        store.setup()

        # Populate the namespace.
        for doc_key, doc_value in documents.items():
            store.put(namespace, doc_key, doc_value)

        # Semantic search must surface both "hello" documents.
        hits = store.search(namespace, query="hello")
        assert len(hits) >= 2, "Vector search failed for JSON storage"

        found_keys = [hit.key for hit in hits]
        assert "doc1" in found_keys, "doc1 not found in JSON storage"
        assert "doc2" in found_keys, "doc2 not found in JSON storage"

        # Only the two expected matches are required to have a positive score.
        for hit in hits:
            if hit.key not in ["doc1", "doc2"]:
                continue
            assert hit.score > 0, f"Invalid score for JSON storage: {hit.score}"

        # Direct retrieval by key must still return the stored value.
        fetched = store.get(namespace, "doc1")
        assert fetched is not None, "Get operation failed for JSON storage"
        assert (
            fetched.value["text"] == "hello world"
        ), "Retrieved wrong value for JSON storage"
697+
698+
699+
def test_vector_storage_hash(redis_url, fake_embeddings: CharacterEmbeddings) -> None:
    """Exercise vector search and key lookup with hash vector storage.

    Same scenario as the JSON storage test, but the index config explicitly
    sets ``vector_storage_type`` to "hash".
    """
    namespace = ("test_hash",)
    documents = {
        "doc1": {"text": "hello world"},
        "doc2": {"text": "hello universe"},
        "doc3": {"text": "goodbye world"},
    }

    index_settings = dict(
        dims=fake_embeddings.dims,
        embed=fake_embeddings,
        distance_type="cosine",
        fields=["text"],
        vector_storage_type="hash",  # opt in to hash storage
    )
    ttl_settings = dict(default_ttl=2, refresh_on_read=True)

    with RedisStore.from_conn_string(
        redis_url, index=index_settings, ttl=ttl_settings
    ) as store:
        store.setup()

        # Populate the namespace.
        for doc_key, doc_value in documents.items():
            store.put(namespace, doc_key, doc_value)

        # Semantic search must surface both "hello" documents.
        hits = store.search(namespace, query="hello")
        assert len(hits) >= 2, "Vector search failed for hash storage"

        found_keys = [hit.key for hit in hits]
        assert "doc1" in found_keys, "doc1 not found in hash storage"
        assert "doc2" in found_keys, "doc2 not found in hash storage"

        # Only the two expected matches are required to have a positive score.
        for hit in hits:
            if hit.key not in ["doc1", "doc2"]:
                continue
            assert hit.score > 0, f"Invalid score for hash storage: {hit.score}"

        # Direct retrieval by key must still return the stored value.
        fetched = store.get(namespace, "doc1")
        assert fetched is not None, "Get operation failed for hash storage"
        assert (
            fetched.value["text"] == "hello world"
        ), "Retrieved wrong value for hash storage"
749+
750+
751+
def test_vector_search_hash(redis_url, fake_embeddings: CharacterEmbeddings) -> None:
    """Check that a text query finds the expected documents under hash storage."""
    namespace = ("test",)
    index_settings = dict(
        dims=fake_embeddings.dims,
        embed=fake_embeddings,
        distance_type="cosine",
        fields=["text"],
        vector_storage_type="hash",
    )
    ttl_settings = dict(default_ttl=2, refresh_on_read=True)

    with RedisStore.from_conn_string(
        redis_url, index=index_settings, ttl=ttl_settings
    ) as store:
        store.setup()

        # Documents whose "text" field gets embedded on put.
        documents = {
            "doc1": {"text": "short text"},
            "doc2": {"text": "longer text document"},
            "doc3": {"text": "longest text document here"},
        }
        for doc_key, doc_value in documents.items():
            store.put(namespace, doc_key, doc_value)

        hits = store.search(namespace, query="longer text")
        assert len(hits) >= 2

        # doc2 and doc3 are expected among the matches for "longer text".
        found_keys = [hit.key for hit in hits]
        assert "doc2" in found_keys
        assert "doc3" in found_keys
786+
787+
788+
def test_vector_search_with_filters_hash(
    redis_url, fake_embeddings: CharacterEmbeddings
) -> None:
    """Check that a text query combined with a field filter works under hash storage."""
    namespace = ("test",)
    index_settings = dict(
        dims=fake_embeddings.dims,
        embed=fake_embeddings,
        distance_type="cosine",
        fields=["text"],
        vector_storage_type="hash",
    )
    ttl_settings = dict(default_ttl=2, refresh_on_read=True)

    with RedisStore.from_conn_string(
        redis_url, index=index_settings, ttl=ttl_settings
    ) as store:
        store.setup()

        documents = {
            "doc1": {"text": "red apple", "color": "red", "score": 4.5},
            "doc2": {"text": "red car", "color": "red", "score": 3.0},
            "doc3": {"text": "green apple", "color": "green", "score": 4.0},
            "doc4": {"text": "blue car", "color": "blue", "score": 3.5},
        }
        for doc_key, doc_value in documents.items():
            store.put(namespace, doc_key, doc_value)

        red_only = {"color": "red"}

        # "apple" restricted to red items: doc1 should rank first.
        apple_hits = store.search(namespace, query="apple", filter=red_only)
        assert len(apple_hits) >= 1
        assert apple_hits[0].key == "doc1"

        # "car" restricted to red items: doc2 should rank first.
        car_hits = store.search(namespace, query="car", filter=red_only)
        assert len(car_hits) >= 1
        assert car_hits[0].key == "doc2"
829+
830+
831+
def test_vector_update_with_score_verification_hash(
    redis_url, fake_embeddings: CharacterEmbeddings
) -> None:
    """Check that overwriting an item re-embeds it when hash storage is used.

    doc1 is first stored with text matching the query, then overwritten with
    unrelated text; its score for the original query is expected to drop.
    """

    def score_for(results, wanted_key):
        # Score of the first result with the given key, or None if absent.
        return next((r.score for r in results if r.key == wanted_key), None)

    namespace = ("test",)
    index_settings = dict(
        dims=fake_embeddings.dims,
        embed=fake_embeddings,
        distance_type="cosine",
        fields=["text"],
        vector_storage_type="hash",
    )
    ttl_settings = dict(default_ttl=2, refresh_on_read=True)

    with RedisStore.from_conn_string(
        redis_url, index=index_settings, ttl=ttl_settings
    ) as store:
        store.setup()

        store.put(namespace, "doc1", {"text": "zany zebra xylophone"})
        store.put(namespace, "doc2", {"text": "something about dogs"})

        # Baseline: the query resembles doc1's initial content.
        before = store.search(namespace, query="zany xylophone")
        assert len(before) >= 1
        top_hit = before[0]
        assert top_hit.key == "doc1"
        initial_score = top_hit.score

        # Overwrite doc1 so that it is about dogs instead.
        store.put(namespace, "doc1", {"text": "new text about dogs"})

        # The original query should now match doc1 less strongly; the
        # comparison is skipped if doc1 no longer appears in the results.
        after = store.search(namespace, query="zany xylophone")
        assert len(after) >= 1
        after_score = score_for(after, "doc1")
        if after_score is not None:
            assert after_score < initial_score

        # A dog-related query should still return doc1.
        dog_hits = store.search(namespace, query="dogs text")
        doc1_score = score_for(dog_hits, "doc1")
        assert doc1_score is not None

0 commit comments

Comments
 (0)