leaner metadata storage, unique param for save
vprelovac committed Oct 24, 2023
1 parent 3f1a3f9 commit 8909e61
Showing 4 changed files with 96 additions and 30 deletions.
53 changes: 32 additions & 21 deletions README.md
@@ -21,17 +21,22 @@ from vectordb import Memory
memory = Memory()
text = "Hello world"
metadata = {'url':'https://example.com'}
-# memory.save(texts, metadata, memory_file)
-memory.save("Hello world")
+# Save text with metadata
+# This will automatically embed content
+memory.save(text, metadata)
+# You can also save content as a list with associated metadata as dict
+memory.save(
+    ["apples", "oranges"],
+    [{"url": "https://apples.com"}, {"url": "https://oranges.com"}],
+)
# Search for top n relevant chunks
# We will automatically use the fastest vector search backend
query="hello"
query = "hello"
results = memory.search(query, top_n=1)
print(results)
```
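The removed comment above (`# memory.save(texts, metadata, memory_file)`) hints at persistence, which still works through the constructor: when a `memory_file` path is given, saves are written to disk and reloaded automatically. A minimal sketch, assuming a hypothetical file name:

```python
from vectordb import Memory

# "memory.bin" is a hypothetical path, not a name the library prescribes.
memory = Memory(memory_file="memory.bin")

# Each save also persists the updated memory to disk.
memory.save("Hello world", {"url": "https://example.com"})

# A fresh Memory pointed at the same file reloads the saved entries.
memory2 = Memory(memory_file="memory.bin")
print(memory2.search("hello", top_n=1))
```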

## Methods
@@ -62,6 +67,7 @@ embeddings="normal")**
- Searches for the most similar chunks to the given query in memory.
- **query** (str): Query text.
- **top_n** (int): Number of most similar chunks to return (default: 5).
+- **unique** (bool): Return at most one chunk per unique original text (additional chunks from the same text are ignored). Note that this may return fewer chunks than requested (default: False). See the usage sketch after this list.
- Returns: List of dictionaries containing the top_n most similar chunks and their associated metadata.
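
A usage sketch for the new `unique` flag, assuming each short text below produces a single chunk:

```python
from vectordb import Memory

memory = Memory()
memory.save(
    ["apples are a sweet orchard fruit", "oranges are a sour citrus fruit"],
    [{"url": "https://apples.com"}, {"url": "https://oranges.com"}],
)

# With unique=True, at most one chunk per original text is returned,
# so fewer than top_n results may come back.
results = memory.search("fruit", top_n=5, unique=True)
print(results)
```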

**clear(self)**
@@ -79,9 +85,12 @@ embeddings="normal")**
```
from vectordb import Memory
-memory = Memory(chunking_strategy={'mode':'sliding_window', 'window_size': 128, 'overlap': 16})
+memory = Memory(
+    chunking_strategy={"mode": "sliding_window", "window_size": 128, "overlap": 16}
+)
text = """
texts = [
"""
Machine learning is a method of data analysis that automates analytical model building.
It is a branch of artificial intelligence based on the idea that systems can learn from data,
@@ -101,14 +110,8 @@ Clustering: Finding groups of similar data points. For example, a machine learni
Anomaly detection: Finding data points that are different from the rest of the data. For example, a machine learning algorithm could be used to find fraudulent credit card transactions.
Machine learning is a powerful tool that can be used to solve a wide variety of problems. As the amount of data available continues to grow, machine learning is likely to become even more important in the future.
"""
metadata = {"title": "Introduction to Machine Learning", "url": "https://example.com/introduction-to-machine-learning"}
memory.save(text, metadata)
text2 = """
""",
"""
Artificial intelligence (AI) is the simulation of human intelligence in machines
that are programmed to think like humans and mimic their actions.
@@ -135,16 +138,24 @@ Weaponization: AI could be used to develop new weapons that are more powerful an
Loss of control: If AI becomes too powerful, we may lose control over it, with potentially disastrous consequences.
It is important to weigh the potential benefits and risks of AI carefully as we continue to develop this technology. With careful planning and oversight, AI has the potential to make the world a better place. However, if we are not careful, it could also lead to serious problems.
"""
""",
]
metadata2 = {"title": "Introduction to Artificial Intelligence", "url": "https://example.com/introduction-to-artificial-intelligence"}
metadata_list = [
{
"title": "Introduction to Machine Learning",
"url": "https://example.com/introduction-to-machine-learning",
},
{
"title": "Introduction to Artificial Intelligence",
"url": "https://example.com/introduction-to-artificial-intelligence",
},
]
-memory.save(text2, metadata2)
+memory.save(texts, metadata_list)
query = "What is the relationship between AI and machine learning?"
results = memory.search(query, top_n=3)
print(results)
```
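
Per the `search` implementation in this commit, each result is a dict pairing a matched chunk with the metadata of the text it came from, so the results above can be consumed like this:

```python
# Each result carries the chunk text plus the saved metadata dict.
for result in results:
    print(result["metadata"]["title"], "->", result["chunk"][:80])
```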

4 changes: 4 additions & 0 deletions vectordb/chunking.py
@@ -71,6 +71,10 @@ def sliding_window_chunking(self, text: str) -> List[str]:

        tokens = text.split()

+        # If the text contains fewer tokens than window_size, return the text as a single chunk.
+        if len(tokens) < self.window_size:
+            return [text]

        # Use a list comprehension to create chunks from windows
        step = self.window_size - self.overlap
        # Ensure the range covers the entire length of the tokens
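
For reference, a standalone sketch of the sliding-window arithmetic this guard protects, assuming the same `window_size`/`overlap` semantics; the committed method lives on the chunker class and may bound its range differently:

```python
from typing import List


def sliding_window_chunks(text: str, window_size: int = 128, overlap: int = 16) -> List[str]:
    """Split text into overlapping token windows (illustrative sketch)."""
    tokens = text.split()

    # Mirror the new guard: short inputs come back as a single chunk.
    if len(tokens) < window_size:
        return [text]

    step = window_size - overlap
    # Step so consecutive windows share `overlap` tokens and the last
    # window still reaches the end of the token list.
    return [
        " ".join(tokens[i : i + window_size])
        for i in range(0, len(tokens) - overlap, step)
    ]
```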
2 changes: 1 addition & 1 deletion vectordb/embedding.py
@@ -41,7 +41,7 @@ def __init__(self, model_name: str = "normal"):
        if model_name == "normal":
            model_name = "BAAI/bge-small-en-v1.5"
        elif model_name == "best":
-            model_name = "BAAI/bge-base-en-v1.5"
+            model_name = "BAAI/bge-large-en-v1.5"


        self.model = SentenceTransformer(model_name)
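
The change only affects the `"best"` alias; callers select it through the `embeddings` argument of `Memory`:

```python
from vectordb import Memory

# "best" now resolves to BAAI/bge-large-en-v1.5 (previously bge-base-en-v1.5);
# "normal" still resolves to BAAI/bge-small-en-v1.5.
memory = Memory(embeddings="best")
```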
67 changes: 59 additions & 8 deletions vectordb/memory.py
@@ -33,13 +33,18 @@ def __init__(
        :param embedding_model: a string containing the name of the pre-trained model to be used for embeddings (default: "sentence-transformers/all-MiniLM-L6-v2").
        """
        self.memory_file = memory_file

        self.memory = (
            [] if memory_file is None else Storage(memory_file).load_from_disk()
        )
        if chunking_strategy is None:
            chunking_strategy = {"mode": "sliding_window"}
        self.chunker = Chunker(chunking_strategy)

+        self.metadata_memory = []
+        self.metadata_index_counter = 0
+        self.text_index_counter = 0

        if isinstance(embeddings, str):
            self.embedder = Embedder(embeddings)
        elif isinstance(embeddings, BaseEmbedder):
@@ -66,6 +71,7 @@ def save(
        if not isinstance(texts, list):
            texts = [texts]


        if metadata is None:
            metadata = []
        elif not isinstance(metadata, list):
@@ -74,37 +80,54 @@ def save(
        # Extend metadata to be the same length as texts, if it's shorter.
        metadata += [{}] * (len(texts) - len(metadata))

+        for meta in metadata:
+            self.metadata_memory.append(meta)
+
+        meta_index_start = self.metadata_index_counter  # Starting index for this save operation
+        self.metadata_index_counter += len(metadata)  # Update the counter for future save operations


        if memory_file is None:
            memory_file = self.memory_file

        text_chunks = [self.chunker(text) for text in texts]

        chunks_size = [len(chunks) for chunks in text_chunks]

        flatten_chunks = list(itertools.chain.from_iterable(text_chunks))
        embeddings = self.embedder.embed_text(flatten_chunks)


+        text_index_start = self.text_index_counter  # Starting index for this save operation
+        self.text_index_counter += len(texts)  # Update the counter for future save operations


        # accumulated size is end_index of each chunk
-        for size, end_index, chunks, meta in zip(
+        for size, end_index, chunks, meta_index, text_index in zip(
            chunks_size,
            itertools.accumulate(chunks_size),
            text_chunks,
-            metadata
+            range(meta_index_start, self.metadata_index_counter),
+            range(text_index_start, self.text_index_counter),
        ):
            start_index = end_index - size
-            chunks_embedding = embeddings[start_index: end_index]
+            chunks_embedding = embeddings[start_index:end_index]

            for chunk, embedding in zip(chunks, chunks_embedding):
                print(chunk)
                entry = {
                    "chunk": chunk,
                    "embedding": embedding,
-                    "metadata": meta,
+                    "metadata_index": meta_index,
+                    "text_index": text_index,
                }
                self.memory.append(entry)
            text_index += 1

        if memory_file is not None:
            Storage(memory_file).save_to_disk(self.memory)
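
This is the "leaner metadata storage" from the commit title: each metadata dict is stored once in `metadata_memory`, and chunk entries carry only integer indices back to it. An illustrative sketch of the resulting layout after saving two one-chunk texts (embeddings abbreviated):

```python
# Hypothetical state, not produced by running the library verbatim.
metadata_memory = [
    {"url": "https://apples.com"},
    {"url": "https://oranges.com"},
]
memory_entries = [
    {"chunk": "apples", "embedding": [0.1, 0.2], "metadata_index": 0, "text_index": 0},
    {"chunk": "oranges", "embedding": [0.3, 0.4], "metadata_index": 1, "text_index": 1},
]

# A search hit at index i resolves its metadata via the stored index:
print(metadata_memory[memory_entries[0]["metadata_index"]])
```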

-    def search(self, query: str, top_n: int = 5) -> List[Dict[str, Any]]:
+    def search(self, query: str, top_n: int = 5, unique=False) -> List[Dict[str, Any]]:
"""
Searches for the most similar chunks to the given query in memory.
@@ -115,8 +138,28 @@ def search(self, query: str, top_n: int = 5, unique=False) -> List[Dict[str, Any]]:
        query_embedding = self.embedder.embed_text([query])[0]
        embeddings = [entry["embedding"] for entry in self.memory]
        indices = self.vector_search.search_vectors(query_embedding, embeddings, top_n)

+        if unique:
+            unique_indices = []
+            seen_text_indices = set()
+            for i in indices:
+                # Deduplicate on text_index (not metadata_index) so extra
+                # chunks from the same original text are dropped.
+                text_index = self.memory[i]["text_index"]
+                if text_index not in seen_text_indices:
+                    unique_indices.append(i)
+                    seen_text_indices.add(text_index)
+            indices = unique_indices

        results = [
-            {"chunk": self.memory[i]["chunk"], "metadata": self.memory[i]["metadata"]}
+            {
+                "chunk": self.memory[i]["chunk"],
+                "metadata": self.metadata_memory[self.memory[i]["metadata_index"]],
+            }
            for i in indices
        ]
        return results
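
The dedup logic above, factored into a standalone helper for readability; a sketch, not the committed API. Assuming `indices` arrives ranked by similarity, the first chunk kept for each text is also its best-scoring one:

```python
from typing import Any, Dict, List


def first_chunk_per_text(indices: List[int], entries: List[Dict[str, Any]]) -> List[int]:
    """Keep only the best-ranked chunk index for each original text."""
    seen_text_indices = set()
    kept = []
    for i in indices:
        text_index = entries[i]["text_index"]
        if text_index not in seen_text_indices:
            seen_text_indices.add(text_index)
            kept.append(i)
    return kept
```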
@@ -125,9 +168,13 @@ def clear(self):
"""
Clears the memory.
"""
self.memory = []
self.metadata_memory = []
self.metadata_index_counter = 0
self.text_index_counter = 0

if self.memory_file is not None:
Storage(self.memory_file).save_to_disk(self.memory)
self.memory = []

    def dump(self):
        """
@@ -136,6 +183,10 @@ def dump(self):
        for entry in self.memory:
            print("Chunk:", entry["chunk"])
            print("Embedding Length:", len(entry["embedding"]))
-            print("Metadata:", entry["metadata"])
+            print("Metadata:", self.metadata_memory[entry["metadata_index"]])
            print("-" * 40)

        print("Total entries: ", len(self.memory))
+        print("Total metadata: ", len(self.metadata_memory))

