wip improved object retrieval #10513

Merged
merged 7 commits into from
Feb 16, 2024
Changes from 1 commit
27 changes: 25 additions & 2 deletions llama_index/core/base_retriever.py
@@ -1,4 +1,5 @@
"""Base retriever."""

from abc import abstractmethod
from typing import Any, Dict, List, Optional

@@ -72,7 +73,29 @@ def _retrieve_from_object(
f"Retrieving from object {obj.__class__.__name__} with query {query_bundle.query_str}\n",
color="llama_pink",
)
if isinstance(obj, NodeWithScore):

if isinstance(obj, str):
return [
NodeWithScore(
node=TextNode(text=obj),
score=score,
)
]
elif isinstance(obj, dict):
from llama_index.storage.docstore.utils import json_to_doc

# check if it's a node, else assume string
try:
node = json_to_doc(obj)
return [NodeWithScore(node=node, score=score)]
except Exception:
return [
NodeWithScore(
node=TextNode(text=str(obj)),
score=score,
)
]
elif isinstance(obj, NodeWithScore):
return [obj]
elif isinstance(obj, BaseNode):
return [NodeWithScore(node=obj, score=score)]
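
For readability, the new branching in `_retrieve_from_object` can be read as a standalone helper. This is a minimal sketch that mirrors the hunk above; `wrap_object` is a hypothetical name, the import paths follow the legacy layout used in this PR, and the real method goes on to handle other object types (retrievers, query engines) that are omitted here.

```python
from typing import Any, List

from llama_index.schema import BaseNode, NodeWithScore, TextNode


def wrap_object(obj: Any, score: float = 1.0) -> List[NodeWithScore]:
    """Hypothetical mirror of the new type dispatch in _retrieve_from_object."""
    if isinstance(obj, str):
        # Plain strings are wrapped in a TextNode so they flow through the normal pipeline.
        return [NodeWithScore(node=TextNode(text=obj), score=score)]
    elif isinstance(obj, dict):
        from llama_index.storage.docstore.utils import json_to_doc

        # A dict may be a serialized node; if not, fall back to a stringified TextNode.
        try:
            return [NodeWithScore(node=json_to_doc(obj), score=score)]
        except Exception:
            return [NodeWithScore(node=TextNode(text=str(obj)), score=score)]
    elif isinstance(obj, NodeWithScore):
        return [obj]
    elif isinstance(obj, BaseNode):
        return [NodeWithScore(node=obj, score=score)]
    # The real method continues with retriever/query-engine objects below this point.
    raise ValueError(f"Unsupported object type: {type(obj)}")
```

With these branches, a raw string or a serialized-node dict attached to an IndexNode retrieves cleanly instead of falling through the earlier isinstance checks.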
@@ -144,7 +167,7 @@ def _handle_recursive_retrieval(
node = n.node
score = n.score or 1.0
if isinstance(node, IndexNode):
obj = self.object_map.get(node.index_id, None)
obj = node.obj or self.object_map.get(node.index_id, None)
Collaborator

help me understand, is object_map now only used for retrievers/query engines?

If it's a Node it should now be serialized/deserialized directly on the IndexNode right?

at a high-level once we make retrievers/query engine serializable i was thinking object_map would go away, and we'd replace with a proper docstore

Collaborator Author

Yes if query engines/retrievers were serializable, this would go away.

Right now, unserializable index nodes have to be passed in under the objects kwarg -- from there, we can build a map of index id to object

Then we can serialize and retrieve the index node without the object.

If an index node is retrieved, the object map is checked to see if we have its object.

if obj is not None:
if self._verbose:
print_text(
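To make the lookup-order change above concrete, here is a minimal sketch of the new fallback, with a plain dict standing in for the retriever's `object_map` attribute. It assumes the legacy `llama_index.schema` import path and the `obj: Any = None` default introduced in schema.py below.

```python
from llama_index.schema import IndexNode


def resolve_object(node: IndexNode, object_map: dict) -> object:
    """Illustrative mirror of the new lookup: prefer node.obj, then the object map."""
    return node.obj or object_map.get(node.index_id, None)


# An IndexNode whose obj survived (de)serialization resolves directly from the node...
node_with_obj = IndexNode(text="calculator tool", index_id="calc", obj={"tool": "calculator"})
assert resolve_object(node_with_obj, {}) == {"tool": "calculator"}

# ...while one whose obj was cleared at index time falls back to the map, as before.
bare_node = IndexNode(text="calculator tool", index_id="calc")
assert resolve_object(bare_node, {"calc": {"tool": "calculator"}}) == {"tool": "calculator"}
```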
7 changes: 6 additions & 1 deletion llama_index/indices/base.py
@@ -1,4 +1,5 @@
"""Base index classes."""

import logging
from abc import ABC, abstractmethod
from typing import Any, Dict, Generic, List, Optional, Sequence, Type, TypeVar, cast
@@ -66,7 +67,11 @@ def __init__(
self._graph_store = self._storage_context.graph_store

objects = objects or []
self._object_map = {obj.index_id: obj.obj for obj in objects}
self._object_map = {}
for obj in objects:
self._object_map[obj.index_id] = obj.obj
obj.obj = None  # clear the object to avoid serialization issues

with self._service_context.callback_manager.as_trace("index_construction"):
if index_struct is None:
nodes = nodes or []
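On the construction side, the loop above moves each attached object into `_object_map` and then blanks `node.obj` so the IndexNode itself stays serializable. A minimal sketch of that bookkeeping, using a plain dict and a stand-in object rather than the real index internals:

```python
from typing import Dict, List

from llama_index.schema import IndexNode


def build_object_map(objects: List[IndexNode]) -> Dict[str, object]:
    """Illustrative version of the new loop in the base index __init__."""
    object_map: Dict[str, object] = {}
    for obj_node in objects:
        object_map[obj_node.index_id] = obj_node.obj
        obj_node.obj = None  # clear the live object to avoid serialization issues
    return object_map


query_engine = object()  # stand-in for an unserializable object such as a query engine
objects = [IndexNode(text="sub-index over 2022 filings", index_id="filings_2022", obj=query_engine)]

object_map = build_object_map(objects)
assert object_map["filings_2022"] is query_engine
assert objects[0].obj is None  # the node no longer carries the unserializable object
```

This matches the author's note above: unserializable objects ride in on the `objects` kwarg, and only the id-keyed map holds the live references.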
27 changes: 26 additions & 1 deletion llama_index/schema.py
@@ -1,4 +1,5 @@
"""Base schema for data structures."""

import json
import textwrap
import uuid
@@ -501,7 +502,31 @@ class IndexNode(TextNode):
"""

index_id: str
obj: Any = Field(exclude=True)
obj: Any = None

def dict(self, **kwargs: Any) -> Dict[str, Any]:
from llama_index.storage.docstore.utils import doc_to_json

data = super().dict(**kwargs)

is_obj_serializable = False
try:
if self.obj is None:
data["obj"] = None
elif isinstance(self.obj, BaseNode):
data["obj"] = doc_to_json(self.obj)
elif isinstance(self.obj, BaseModel):
data["obj"] = self.obj.dict()
else:
data["obj"] = json.dumps(self.obj)
is_obj_serializable = True
except Exception:
pass

if not is_obj_serializable:
raise ValueError("IndexNode obj is not serializable: " + str(self.obj))

return data

@classmethod
def from_text_node(
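The new `IndexNode.dict()` serializes `obj` by type: a nested `BaseNode` goes through `doc_to_json`, a pydantic `BaseModel` through `.dict()`, and anything else through `json.dumps`, with a `ValueError` raised when none of those succeed. A small round-trip check, assuming the legacy `llama_index.schema` import path and pydantic-v1-style `.dict()` as used in this PR:

```python
import json

from llama_index.schema import IndexNode, TextNode

# A JSON-serializable obj (here a plain dict) is stored as a JSON string.
node = IndexNode(text="pointer", index_id="dict_obj", obj={"k": "v"})
assert json.loads(node.dict()["obj"]) == {"k": "v"}

# A nested node goes through doc_to_json, so the serialized payload is itself a dict.
node = IndexNode(text="pointer", index_id="node_obj", obj=TextNode(text="inner"))
assert isinstance(node.dict()["obj"], dict)

# Anything json.dumps cannot handle now raises instead of being dropped silently.
try:
    IndexNode(text="pointer", index_id="bad_obj", obj=object()).dict()
except ValueError as err:
    print(err)  # IndexNode obj is not serializable: <object object at ...>
```

Together with the `json_to_doc` branch added to `_retrieve_from_object`, a nested node attached as `obj` can round-trip through serialization and still come back as a node at retrieval time.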
4 changes: 4 additions & 0 deletions llama_index/vector_stores/qdrant.py
@@ -4,6 +4,7 @@
An index that is built on top of an existing Qdrant collection.

"""

import logging
from typing import Any, List, Optional, Tuple, cast

@@ -716,6 +717,9 @@ def parse_to_query_result(self, response: List[Any]) -> VectorStoreQueryResult:
similarities = []
ids = []

import pdb

pdb.set_trace()
for point in response:
payload = cast(Payload, point.payload)
try: