Embeddings Model and Chunking Engine (Preliminary PR for evaluation purposes only) #354

Closed
wants to merge 7 commits
4 changes: 4 additions & 0 deletions .gitignore
@@ -6,6 +6,10 @@ __pycache__/
*.py[cod]
*$py.class

# Ignore my entrypoint
test.py
doctest/
Comment on lines +9 to +11
Member: Flag to revert them before merge.


# C extensions
*.so

56 changes: 55 additions & 1 deletion ragna/core/_components.py
@@ -4,7 +4,18 @@
import enum
import functools
import inspect
from typing import AsyncIterable, AsyncIterator, Iterator, Optional, Type, Union
import warnings
from typing import (
AsyncIterable,
AsyncIterator,
Iterator,
Optional,
Type,
Union,
get_args,
get_origin,
get_type_hints,
)

import pydantic
import pydantic.utils
@@ -76,6 +87,12 @@ def _protocol_model(cls) -> Type[pydantic.BaseModel]:
return merge_models(cls.display_name(), *cls._protocol_models().values())


# Just for demo purposes. We need to move the actual class here.
# See https://github.com/Quansight/ragna/pull/354#discussion_r1526235318
class Embedding:
pass


class Source(pydantic.BaseModel):
"""Data class for sources stored inside a source storage.

@@ -98,6 +115,43 @@ class Source(pydantic.BaseModel):

class SourceStorage(Component, abc.ABC):
__ragna_protocol_methods__ = ["store", "retrieve"]
__ragna_input_type__: Union[Document, Embedding]

def __init_subclass__(cls):
if inspect.isabstract(cls):
return

valid_input_types = get_args(get_type_hints(cls)["__ragna_input_type__"])

input_parameter_name = list(inspect.signature(cls.store).parameters.keys())[1]
input_parameter_annotation = get_type_hints(cls.store).get(input_parameter_name)

if input_parameter_annotation is None:
input_type = None
else:

def extract_input_type():
origin = get_origin(input_parameter_annotation)
if origin is None:
return None

args = get_args(input_parameter_annotation)
if len(args) != 1:
return None

input_type = args[0]
if not issubclass(input_type, valid_input_types):
return None

return input_type

input_type = extract_input_type()

if input_type is None:
warnings.warn("ADDME")
input_type = Document

cls.__ragna_input_type__ = input_type

@abc.abstractmethod
def store(self, documents: list[Document]) -> None:
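Not from this diff, but a rough sketch of how a concrete source storage would opt into embeddings under the __init_subclass__ logic above. It assumes the real Embedding class (which, judging from the _chroma.py usage further down, pairs an embedding vector with a chunk carrying text, document_id, page_numbers, and num_tokens) eventually becomes importable from ragna.core, as the placeholder comment suggests; class and parameter names are illustrative.

    # Sketch only; assumes Embedding is moved into (and re-exported from) ragna.core.
    from ragna.core import Document, Embedding, Source, SourceStorage


    class EmbeddingStore(SourceStorage):
        # Annotating the first parameter of store() with list[Embedding] is what
        # __init_subclass__ inspects; it sets __ragna_input_type__ to Embedding.
        # An un-annotated or unrecognized signature falls back to Document with
        # a warning.
        def store(self, embeddings: list[Embedding]) -> None:
            ...

        def retrieve(self, documents: list[Document], prompt: str) -> list[Source]:
            ...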
28 changes: 26 additions & 2 deletions ragna/core/_rag.py
@@ -25,6 +25,9 @@
from ._document import Document, LocalDocument
from ._utils import RagnaException, default_user, merge_models

from ragna.source_storages._embedding_model import GenericEmbeddingModel
from ragna.source_storages._chunking_model import GenericChunkingModel

T = TypeVar("T")
C = TypeVar("C", bound=Component)

@@ -80,6 +83,8 @@ def chat(
documents: Iterable[Any],
source_storage: Union[Type[SourceStorage], SourceStorage],
assistant: Union[Type[Assistant], Assistant],
embedding_model: Union[Type[GenericEmbeddingModel], GenericEmbeddingModel],
chunking_model: Union[Type[GenericChunkingModel], GenericChunkingModel],
**params: Any,
) -> Chat:
"""Create a new [ragna.core.Chat][].
@@ -89,13 +94,17 @@
[ragna.core.LocalDocument.from_path][] is invoked on it.
source_storage: Source storage to use.
assistant: Assistant to use.
embedding_model: Embedding model to use.
chunking_model: Chunking model to use.
**params: Additional parameters passed to the source storage and assistant.
"""
return Chat(
self,
documents=documents,
source_storage=source_storage,
assistant=assistant,
embedding_model=embedding_model,
chunking_model=chunking_model,
**params,
)
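Not from this diff, but a rough usage sketch of the extended chat() signature, using the demo assistant and a placeholder document path; the source storage, embedding model, and chunking model classes are the ones exported from ragna.source_storages in this PR.

    from ragna import Rag
    from ragna.assistants import RagnaDemoAssistant
    from ragna.source_storages import Chroma, MiniLML6v2, NLTKChunkingModel

    # The new embedding_model / chunking_model arguments are passed alongside
    # the existing source_storage and assistant; prepare() and answer() are
    # then used as before.
    chat = Rag().chat(
        documents=["document.txt"],
        source_storage=Chroma,
        assistant=RagnaDemoAssistant,
        embedding_model=MiniLML6v2,
        chunking_model=NLTKChunkingModel,
    )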

@@ -148,10 +157,15 @@ def __init__(
documents: Iterable[Any],
source_storage: Union[Type[SourceStorage], SourceStorage],
assistant: Union[Type[Assistant], Assistant],
embedding_model: Union[Type[GenericEmbeddingModel], GenericEmbeddingModel],
chunking_model: Union[Type[GenericChunkingModel], GenericChunkingModel],
**params: Any,
) -> None:
self._rag = rag

self.embedding_model = cast(GenericEmbeddingModel, self._rag._load_component(embedding_model))
self.chunking_model = cast(GenericChunkingModel, self._rag._load_component(chunking_model))

self.documents = self._parse_documents(documents)
self.source_storage = cast(
SourceStorage, self._rag._load_component(source_storage)
@@ -188,7 +202,14 @@ async def prepare(self) -> Message:
detail=RagnaException.EVENT,
)

await self._run(self.source_storage.store, self.documents)
if list[Document] in inspect.signature(self.source_storage.store).parameters.values():
Member:
1. We should have this kind of logic as a class attribute on the SourceStorage itself. Otherwise, how are we going to communicate this to the REST API / web UI? This needs to be known, because it makes no sense to force the user to select an embedding model when it is unused by the backend.
2. This check needs to be more strict. We should only check the first argument rather than the whole signature.
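As a sketch of the first point (not code from this PR): with __ragna_input_type__ available as a class attribute, as the _components.py change above provides, the REST API / web UI could decide up front whether an embedding model needs to be selected at all. The helper name and the import path are hypothetical.

    from ragna.core import Embedding, SourceStorage  # assumes Embedding is re-exported


    def needs_embedding_model(source_storage_cls: type[SourceStorage]) -> bool:
        # Only storages whose store() takes embeddings need an embedding model;
        # storages that accept raw documents handle (or skip) embedding themselves.
        return source_storage_cls.__ragna_input_type__ is Embedding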

await self._run(self.source_storage.store, self.documents)
else:
# Here we need to generate the list of embeddings
chunks = self.chunking_model.chunk_documents(self.documents)
embeddings = self.embedding_model.embed_chunks(chunks)
Comment on lines +208 to +210
Member: The source storage should also be able to just request chunks. Meaning, we have three distinct cases and cannot group these two. However, if we split this PR as suggested above, this distinction will only come in the follow-up PR.

Contributor Author: So something like?

        if type(self.source_storage).__ragna_input_type__ == Document:
            await self._run(self.source_storage.store, self.documents)
        else:
            chunks = self.chunking_model.chunk_documents(documents=self.documents)
            if type(self.source_storage).__ragna_input_type__ == Chunk:
                await self._run(self.source_storage.store, chunks)
            else:
                embeddings = self.embedding_model.embed_chunks(chunks)
                await self._run(self.source_storage.store, embeddings)

await self._run(self.source_storage.store, embeddings)

self._prepared = True

welcome = Message(
@@ -218,7 +239,10 @@ async def answer(self, prompt: str, *, stream: bool = False) -> Message:

self._messages.append(Message(content=prompt, role=MessageRole.USER))

sources = await self._run(self.source_storage.retrieve, self.documents, prompt)
if list[Document] in inspect.signature(self.source_storage.store).parameters.values():
Member: This hits a point that I didn't consider before: we are currently passing the documents again to the retrieve function. See the part about BC in #256 (comment) for a reason why. This will likely change when we implement #256. However, in the meantime we need to decide if we want the same "input switching" here as for store. I think this is ok, but want to hear your thoughts.

Member: Ooh, I only understand the logic here in hindsight. Of course we need to be able to embed the prompt. So this is correct.

sources = await self._run(self.source_storage.retrieve, self.documents, prompt)
else:
sources = await self._run(self.source_storage.retrieve, self.documents, self.embedding_model.embed_text(prompt))

answer = Message(
content=self._run_gen(self.assistant.answer, prompt, sources),
6 changes: 6 additions & 0 deletions ragna/source_storages/__init__.py
@@ -2,11 +2,17 @@
"Chroma",
"LanceDB",
"RagnaDemoSourceStorage",
"MiniLML6v2",
"NLTKChunkingModel",
"SpacyChunkingModel",
"TokenChunkingModel"
]

from ._chroma import Chroma
from ._demo import RagnaDemoSourceStorage
from ._lancedb import LanceDB
from ._embedding_model import MiniLML6v2
from ._chunking_model import NLTKChunkingModel, SpacyChunkingModel, TokenChunkingModel

# isort: split

55 changes: 31 additions & 24 deletions ragna/source_storages/_chroma.py
@@ -8,6 +8,9 @@

from ._vector_database import VectorDatabaseSourceStorage

from ._embedding_model import MiniLML6v2, Embedding
from ._chunking_model import NLTKChunkingModel


class Chroma(VectorDatabaseSourceStorage):
"""[Chroma vector database](https://www.trychroma.com/)
@@ -25,6 +28,9 @@ def __init__(self) -> None:

import chromadb

self._embedding_model = MiniLML6v2()
self._chunking_model = NLTKChunkingModel()
Comment on lines +31 to +32
Member: Why would the source storage need any of these?

Contributor Author: It doesn't, and these have been removed.


self._client = chromadb.Client(
chromadb.config.Settings(
is_persistent=True,
@@ -33,58 +39,59 @@
)
)

self._tokens = 0
self._embeddings = 0
Comment on lines +42 to +43
Member: These will change with every .store call. Why are they instance attributes rather than local variables?

Contributor Author: These variables represent the average length of a chunk within the source storage; it is aggregated across calls to store. I'm not sure what scope you're referring to, but I don't think they can be local for them to work.
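In other words (a sketch of the arithmetic, not code from the PR), the two counters give a running average chunk size, which stands in for the fixed chunk_size previously used to size the query in retrieve():

    # Average tokens per stored chunk, aggregated across store() calls.
    avg_chunk_size = self._tokens / self._embeddings

    # retrieve() keeps the same factor-of-two overestimate as before:
    #     max(int(num_tokens * 2 / chunk_size), 100)
    # becomes
    #     max(int(num_tokens * 2 / self._tokens * self._embeddings), 100)
    # which is equivalent to
    n_results = max(int(num_tokens * 2 / avg_chunk_size), 100)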


def store(
self,
documents: list[Document],
documents: list[Embedding],
*,
chat_id: uuid.UUID,
chunk_size: int = 500,
chunk_overlap: int = 250,
) -> None:
collection = self._client.create_collection(
str(chat_id), embedding_function=self._embedding_function
str(chat_id)
)

ids = []
texts = []
embeddings = []
metadatas = []
for document in documents:
for chunk in self._chunk_pages(
document.extract_pages(),
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
):
ids.append(str(uuid.uuid4()))
texts.append(chunk.text)
metadatas.append(
{
"document_id": str(document.id),
"page_numbers": self._page_numbers_to_str(chunk.page_numbers),
"num_tokens": chunk.num_tokens,
}
)
for embedding in documents:
self._tokens += embedding.chunk.num_tokens
self._embeddings += 1

ids.append(str(uuid.uuid4()))
texts.append(embedding.chunk.text)
metadatas.append(
{
"document_id": str(embedding.chunk.document_id),
"page_numbers": self._page_numbers_to_str(embedding.chunk.page_numbers),
"num_tokens": embedding.chunk.num_tokens,
}
)
embeddings.append(embedding.embedding)

collection.add(
ids=ids,
embeddings=embeddings,
documents=texts,
metadatas=metadatas, # type: ignore[arg-type]
)

def retrieve(
self,
documents: list[Document],
prompt: str,
prompt: list[float],
*,
chat_id: uuid.UUID,
chunk_size: int = 500,
num_tokens: int = 1024,
) -> list[Source]:
collection = self._client.get_collection(
str(chat_id), embedding_function=self._embedding_function
str(chat_id)
)

result = collection.query(
query_texts=prompt,
query_embeddings=prompt,
n_results=min(
# We cannot retrieve source by a maximum number of tokens. Thus, we
# estimate how many sources we have to query. We overestimate by a
@@ -97,7 +104,7 @@ def retrieve(
# Instead of just querying more documents here, we should use the
# appropriate index parameters when creating the collection. However,
# they are undocumented for now.
max(int(num_tokens * 2 / chunk_size), 100),
max(int(num_tokens * 2 / self._tokens * self._embeddings), 100),
collection.count(),
),
include=["distances", "metadatas", "documents"],