From 66f10aff15731a8bd7956e201f728c4cbf7be546 Mon Sep 17 00:00:00 2001 From: jeffrey Date: Fri, 26 Apr 2024 14:20:25 +0900 Subject: [PATCH 1/2] test for large embeddings and version control for latest chromaDB --- requirements.txt | 2 +- .../autorag/nodes/retrieval/test_vectordb.py | 26 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f3499ba22..e546c3c51 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ sacrebleu # for bleu score evaluate # for meteor and other scores rouge_score # for rouge score rich # for pretty logging -chromadb # for vectordb retrieval +chromadb>=0.5.0 # for vectordb retrieval click # for cli fastapi # for api server uvicorn # for api server diff --git a/tests/autorag/nodes/retrieval/test_vectordb.py b/tests/autorag/nodes/retrieval/test_vectordb.py index e002faa3f..dbfc6ebd2 100644 --- a/tests/autorag/nodes/retrieval/test_vectordb.py +++ b/tests/autorag/nodes/retrieval/test_vectordb.py @@ -2,7 +2,9 @@ import pathlib import shutil import tempfile +import uuid from datetime import datetime +from unittest.mock import patch import chromadb import pandas as pd @@ -32,6 +34,15 @@ def ingested_vectordb(): yield collection +@pytest.fixture +def empty_chromadb(): + with tempfile.TemporaryDirectory() as chroma_path: + db = chromadb.PersistentClient(path=chroma_path) + collection = db.create_collection(name="test_vectordb_retrieval", metadata={"hnsw:space": "cosine"}) + + yield collection + + @pytest.fixture def project_dir_for_vectordb_node(): with tempfile.TemporaryDirectory() as test_project_dir: @@ -88,3 +99,18 @@ def test_long_text_vectordb_ingest(ingested_vectordb): vectordb_ingest(ingested_vectordb, new_corpus_df, embedding_model) assert ingested_vectordb.count() == 7 + + +def mock_get_text_embedding_batch(self, texts, **kwargs): + return [[3.0, 4.1, 3.2] for _ in range(len(texts))] + + +@patch.object(OpenAIEmbedding, 'get_text_embedding_batch', 
mock_get_text_embedding_batch) +def test_long_ids_ingest(empty_chromadb): + embedding_model = OpenAIEmbedding() + content_df = pd.DataFrame({ + 'doc_id': [str(uuid.uuid4()) for _ in range(50_000)], + 'contents': ['havertz' for _ in range(50_000)], + 'metadata': [{'last_modified': datetime.now()} for _ in range(50_000)], + }) + vectordb_ingest(empty_chromadb, content_df, embedding_model) From 45ab34f9b597dc7b579436622f0eb22f6d2bc029 Mon Sep 17 00:00:00 2001 From: jeffrey Date: Fri, 26 Apr 2024 14:51:43 +0900 Subject: [PATCH 2/2] resolve error using create_batches --- autorag/nodes/retrieval/vectordb.py | 7 ++++++- tests/autorag/nodes/retrieval/test_vectordb.py | 6 +++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/autorag/nodes/retrieval/vectordb.py b/autorag/nodes/retrieval/vectordb.py index 3b54b1a92..6f2fd9484 100644 --- a/autorag/nodes/retrieval/vectordb.py +++ b/autorag/nodes/retrieval/vectordb.py @@ -3,6 +3,7 @@ import chromadb import pandas as pd +from chromadb.utils.batch_utils import create_batches from llama_index.core.embeddings import BaseEmbedding from llama_index.embeddings.openai import OpenAIEmbedding @@ -95,4 +96,8 @@ def vectordb_ingest(collection: chromadb.Collection, corpus_data: pd.DataFrame, new_ids = new_passage['doc_id'].tolist() embedded_contents = embedding_model.get_text_embedding_batch(new_contents, show_progress=True) - collection.add(ids=new_ids, embeddings=embedded_contents) + input_batches = create_batches(api=collection._client, ids=new_ids, embeddings=embedded_contents) + for batch in input_batches: + ids = batch[0] + embed_content = batch[1] + collection.add(ids=ids, embeddings=embed_content) diff --git a/tests/autorag/nodes/retrieval/test_vectordb.py b/tests/autorag/nodes/retrieval/test_vectordb.py index dbfc6ebd2..ddb14f204 100644 --- a/tests/autorag/nodes/retrieval/test_vectordb.py +++ b/tests/autorag/nodes/retrieval/test_vectordb.py @@ -109,8 +109,8 @@ def mock_get_text_embedding_batch(self, texts, **kwargs): 
def test_long_ids_ingest(empty_chromadb): embedding_model = OpenAIEmbedding() content_df = pd.DataFrame({ - 'doc_id': [str(uuid.uuid4()) for _ in range(50_000)], - 'contents': ['havertz' for _ in range(50_000)], - 'metadata': [{'last_modified': datetime.now()} for _ in range(50_000)], + 'doc_id': [str(uuid.uuid4()) for _ in range(100_000)], + 'contents': ['havertz' for _ in range(100_000)], + 'metadata': [{'last_modified_datetime': datetime.now()} for _ in range(100_000)], }) vectordb_ingest(empty_chromadb, content_df, embedding_model)