leaner metadata storage, unique param for save
vprelovac committed Oct 24, 2023
1 parent 3f1a3f9 commit 8909e61
Showing 4 changed files with 96 additions and 30 deletions.
53 changes: 32 additions & 21 deletions README.md
@@ -21,17 +21,22 @@ from vectordb import Memory
memory = Memory()
text = "Hello world"
metadata = {'url':'https://example.com'}
-# memory.save(texts, metadata, memory_file)
-memory.save("Hello world")
+# Save text with metadata
+# This will automatically embed content
+memory.save(text, metadata)
+# You can also save content as a list with associated metadata as dict
+memory.save(
+    ["apples", "oranges"],
+    [{"url": "https://apples.com"}, {"url": "https://oranges.com"}],
+)
# Search for top n relevant chunks
# We will automatically use the fastest vector search backend
query="hello"
query = "hello"
results = memory.search(query, top_n=1)
print(results)
```
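The removed comment above (`# memory.save(texts, metadata, memory_file)`) hints at persistence, which still works through the constructor: when a `memory_file` path is given, saves are written to disk and reloaded automatically. A minimal sketch, assuming a hypothetical file name:

```python
from vectordb import Memory

# "memory.bin" is a hypothetical path, not a name the library prescribes.
memory = Memory(memory_file="memory.bin")

# Each save also persists the updated memory to disk.
memory.save("Hello world", {"url": "https://example.com"})

# A fresh Memory pointed at the same file reloads the saved entries.
memory2 = Memory(memory_file="memory.bin")
print(memory2.search("hello", top_n=1))
```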

## Methods
@@ -62,6 +67,7 @@ embeddings="normal")**
- Searches for the most similar chunks to the given query in memory.
- **query** (str): Query text.
- **top_n** (int): Number of most similar chunks to return (default: 5).
+- **unique** (bool): Return at most one chunk per unique original text (additional chunks from the same text are ignored). Note that this may return fewer chunks than requested (default: False). See the usage sketch after this list.
- Returns: List of dictionaries containing the top_n most similar chunks and their associated metadata.
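
A usage sketch for the new `unique` flag, assuming each short text below produces a single chunk:

```python
from vectordb import Memory

memory = Memory()
memory.save(
    ["apples are a sweet orchard fruit", "oranges are a sour citrus fruit"],
    [{"url": "https://apples.com"}, {"url": "https://oranges.com"}],
)

# With unique=True, at most one chunk per original text is returned,
# so fewer than top_n results may come back.
results = memory.search("fruit", top_n=5, unique=True)
print(results)
```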

**clear(self)**
@@ -79,9 +85,12 @@ embeddings="normal")**
```
from vectordb import Memory
-memory = Memory(chunking_strategy={'mode':'sliding_window', 'window_size': 128, 'overlap': 16})
+memory = Memory(
+    chunking_strategy={"mode": "sliding_window", "window_size": 128, "overlap": 16}
+)
text = """
texts = [
"""
Machine learning is a method of data analysis that automates analytical model building.
It is a branch of artificial intelligence based on the idea that systems can learn from data,
@@ -101,14 +110,8 @@ Clustering: Finding groups of similar data points. For example, a machine learni
Anomaly detection: Finding data points that are different from the rest of the data. For example, a machine learning algorithm could be used to find fraudulent credit card transactions.
Machine learning is a powerful tool that can be used to solve a wide variety of problems. As the amount of data available continues to grow, machine learning is likely to become even more important in the future.
"""
metadata = {"title": "Introduction to Machine Learning", "url": "https://example.com/introduction-to-machine-learning"}
memory.save(text, metadata)
text2 = """
""",
"""
Artificial intelligence (AI) is the simulation of human intelligence in machines
that are programmed to think like humans and mimic their actions.
@@ -135,16 +138,24 @@ Weaponization: AI could be used to develop new weapons that are more powerful an
Loss of control: If AI becomes too powerful, we may lose control over it, with potentially disastrous consequences.
It is important to weigh the potential benefits and risks of AI carefully as we continue to develop this technology. With careful planning and oversight, AI has the potential to make the world a better place. However, if we are not careful, it could also lead to serious problems.
"""
""",
]
metadata2 = {"title": "Introduction to Artificial Intelligence", "url": "https://example.com/introduction-to-artificial-intelligence"}
metadata_list = [
{
"title": "Introduction to Machine Learning",
"url": "https://example.com/introduction-to-machine-learning",
},
{
"title": "Introduction to Artificial Intelligence",
"url": "https://example.com/introduction-to-artificial-intelligence",
},
]
-memory.save(text2, metadata2)
+memory.save(texts, metadata_list)
query = "What is the relationship between AI and machine learning?"
results = memory.search(query, top_n=3)
print(results)
```
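
Per the `search` implementation in this commit, each result is a dict pairing a matched chunk with the metadata of the text it came from, so the results above can be consumed like this:

```python
# Each result carries the chunk text plus the saved metadata dict.
for result in results:
    print(result["metadata"]["title"], "->", result["chunk"][:80])
```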

4 changes: 4 additions & 0 deletions vectordb/chunking.py
@@ -71,6 +71,10 @@ def sliding_window_chunking(self, text: str) -> List[str]:

        tokens = text.split()

+        # If the text contains fewer tokens than window_size, return the text as a single chunk.
+        if len(tokens) < self.window_size:
+            return [text]

        # Use a list comprehension to create chunks from windows
        step = self.window_size - self.overlap
        # Ensure the range covers the entire length of the tokens
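
For reference, a standalone sketch of the sliding-window arithmetic this guard protects, assuming the same `window_size`/`overlap` semantics; the committed method lives on the chunker class and may bound its range differently:

```python
from typing import List


def sliding_window_chunks(text: str, window_size: int = 128, overlap: int = 16) -> List[str]:
    """Split text into overlapping token windows (illustrative sketch)."""
    tokens = text.split()

    # Mirror the new guard: short inputs come back as a single chunk.
    if len(tokens) < window_size:
        return [text]

    step = window_size - overlap
    # Step so consecutive windows share `overlap` tokens and the last
    # window still reaches the end of the token list.
    return [
        " ".join(tokens[i : i + window_size])
        for i in range(0, len(tokens) - overlap, step)
    ]
```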
2 changes: 1 addition & 1 deletion vectordb/embedding.py
@@ -41,7 +41,7 @@ def __init__(self, model_name: str = "normal"):
        if model_name == "normal":
            model_name = "BAAI/bge-small-en-v1.5"
        elif model_name == "best":
-            model_name = "BAAI/bge-base-en-v1.5"
+            model_name = "BAAI/bge-large-en-v1.5"


        self.model = SentenceTransformer(model_name)
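
The change only affects the `"best"` alias; callers select it through the `embeddings` argument of `Memory`:

```python
from vectordb import Memory

# "best" now resolves to BAAI/bge-large-en-v1.5 (previously bge-base-en-v1.5);
# "normal" still resolves to BAAI/bge-small-en-v1.5.
memory = Memory(embeddings="best")
```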
67 changes: 59 additions & 8 deletions vectordb/memory.py
@@ -33,13 +33,18 @@ def __init__(
        :param embedding_model: a string containing the name of the pre-trained model to be used for embeddings (default: "sentence-transformers/all-MiniLM-L6-v2").
        """
        self.memory_file = memory_file

        self.memory = (
            [] if memory_file is None else Storage(memory_file).load_from_disk()
        )
        if chunking_strategy is None:
            chunking_strategy = {"mode": "sliding_window"}
        self.chunker = Chunker(chunking_strategy)

+        self.metadata_memory = []
+        self.metadata_index_counter = 0
+        self.text_index_counter = 0

        if isinstance(embeddings, str):
            self.embedder = Embedder(embeddings)
        elif isinstance(embeddings, BaseEmbedder):
@@ -66,6 +71,7 @@ def save(
        if not isinstance(texts, list):
            texts = [texts]


        if metadata is None:
            metadata = []
        elif not isinstance(metadata, list):
@@ -74,37 +80,54 @@ def save(
        # Extend metadata to be the same length as texts, if it's shorter.
        metadata += [{}] * (len(texts) - len(metadata))

+        for meta in metadata:
+            self.metadata_memory.append(meta)
+
+        meta_index_start = self.metadata_index_counter  # Starting index for this save operation
+        self.metadata_index_counter += len(metadata)  # Update the counter for future save operations


        if memory_file is None:
            memory_file = self.memory_file

        text_chunks = [self.chunker(text) for text in texts]

        chunks_size = [len(chunks) for chunks in text_chunks]

        flatten_chunks = list(itertools.chain.from_iterable(text_chunks))
        embeddings = self.embedder.embed_text(flatten_chunks)


+        text_index_start = self.text_index_counter  # Starting index for this save operation
+        self.text_index_counter += len(texts)  # Update the counter for future save operations


        # accumulated size is end_index of each chunk
-        for size, end_index, chunks, meta in zip(
+        for size, end_index, chunks, meta_index, text_index in zip(
            chunks_size,
            itertools.accumulate(chunks_size),
            text_chunks,
-            metadata
+            range(meta_index_start, self.metadata_index_counter),
+            range(text_index_start, self.text_index_counter),
        ):
            start_index = end_index - size
-            chunks_embedding = embeddings[start_index: end_index]
+            chunks_embedding = embeddings[start_index:end_index]

            for chunk, embedding in zip(chunks, chunks_embedding):
                print(chunk)
                entry = {
                    "chunk": chunk,
                    "embedding": embedding,
-                    "metadata": meta,
+                    "metadata_index": meta_index,
+                    "text_index": text_index,
                }
                self.memory.append(entry)
            text_index += 1

        if memory_file is not None:
            Storage(memory_file).save_to_disk(self.memory)
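
This is the "leaner metadata storage" from the commit title: each metadata dict is stored once in `metadata_memory`, and chunk entries carry only integer indices back to it. An illustrative sketch of the resulting layout after saving two one-chunk texts (embeddings abbreviated):

```python
# Hypothetical state, not produced by running the library verbatim.
metadata_memory = [
    {"url": "https://apples.com"},
    {"url": "https://oranges.com"},
]
memory_entries = [
    {"chunk": "apples", "embedding": [0.1, 0.2], "metadata_index": 0, "text_index": 0},
    {"chunk": "oranges", "embedding": [0.3, 0.4], "metadata_index": 1, "text_index": 1},
]

# A search hit at index i resolves its metadata via the stored index:
print(metadata_memory[memory_entries[0]["metadata_index"]])
```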

-    def search(self, query: str, top_n: int = 5) -> List[Dict[str, Any]]:
+    def search(self, query: str, top_n: int = 5, unique=False) -> List[Dict[str, Any]]:
"""
Searches for the most similar chunks to the given query in memory.
@@ -115,8 +138,28 @@ def search(self, query: str, top_n: int = 5, unique=False) -> List[Dict[str, Any]]:
        query_embedding = self.embedder.embed_text([query])[0]
        embeddings = [entry["embedding"] for entry in self.memory]
        indices = self.vector_search.search_vectors(query_embedding, embeddings, top_n)

+        if unique:
+            unique_indices = []
+            seen_text_indices = set()
+            for i in indices:
+                # Deduplicate on text_index (not metadata_index) so extra
+                # chunks from the same original text are dropped.
+                text_index = self.memory[i]["text_index"]
+                if text_index not in seen_text_indices:
+                    unique_indices.append(i)
+                    seen_text_indices.add(text_index)
+            indices = unique_indices

        results = [
-            {"chunk": self.memory[i]["chunk"], "metadata": self.memory[i]["metadata"]}
+            {
+                "chunk": self.memory[i]["chunk"],
+                "metadata": self.metadata_memory[self.memory[i]["metadata_index"]],
+            }
            for i in indices
        ]
        return results
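
The dedup logic above, factored into a standalone helper for readability; a sketch, not the committed API. Assuming `indices` arrives ranked by similarity, the first chunk kept for each text is also its best-scoring one:

```python
from typing import Any, Dict, List


def first_chunk_per_text(indices: List[int], entries: List[Dict[str, Any]]) -> List[int]:
    """Keep only the best-ranked chunk index for each original text."""
    seen_text_indices = set()
    kept = []
    for i in indices:
        text_index = entries[i]["text_index"]
        if text_index not in seen_text_indices:
            seen_text_indices.add(text_index)
            kept.append(i)
    return kept
```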
@@ -125,9 +168,13 @@ def clear(self):
"""
Clears the memory.
"""
self.memory = []
self.metadata_memory = []
self.metadata_index_counter = 0
self.text_index_counter = 0

if self.memory_file is not None:
Storage(self.memory_file).save_to_disk(self.memory)
self.memory = []

    def dump(self):
        """
@@ -136,6 +183,10 @@ def dump(self):
        for entry in self.memory:
            print("Chunk:", entry["chunk"])
            print("Embedding Length:", len(entry["embedding"]))
-            print("Metadata:", entry["metadata"])
+            print("Metadata:", self.metadata_memory[entry["metadata_index"]])
            print("-" * 40)

        print("Total entries: ", len(self.memory))
+        print("Total metadata: ", len(self.metadata_memory))

