From ccd6f74833b91adeecd802ea444a45adf5a62cd5 Mon Sep 17 00:00:00 2001 From: Rahul Bansal Date: Sat, 30 Dec 2023 16:00:14 +0530 Subject: [PATCH 1/2] Added code ---
Review notes for PATCH 1/2 (commentary only; text in this position after the
three-dash separator is not part of the commit message or of any hunk):
  * Purpose: adds a `vecs` dependency, a PGVectorConfig model, a
    VectorDBType.PGVector enum member, and a new PGVector(VectorDB) class in
    vocode/streaming/vector_db/pg_vector.py with add_texts() and
    similarity_search_with_score().
  * SECURITY: PGVector.__init__ as added here falls back to a literal database
    password and a literal Supabase host name when neither the PG_VECTOR_*
    environment variable nor the config value is set. PATCH 2/2 deletes both
    fallbacks, but the literals stay in this commit. Squash the series before
    merging and treat that credential as exposed.
  * pg_url is built by interpolating user/password/host/port/database_name
    straight into a postgresql:// URL with no percent-encoding.
    NOTE(review): a password containing URL-reserved characters would
    presumably produce a malformed URL - confirm and encode if so.
  * VectorDBConfig.embedding_dimension is added without a type annotation,
    unlike embeddings_model on the line above it.
  * VectorDBType.PGVector is mixed-case; the existing members shown in the
    hunk context (BASE, PINECONE) are upper-case.
  * similarity_search_with_score passes limit=5 to the query.
    PGVectorConfig.top_k (default 3) is declared in this patch but never read
    in pg_vector.py.
  * The similarity_search_with_score docstring says the default namespace is
    '' but the code substitutes "docs" when namespace is None.
  * The logger.warning f-string opens a backtick before {self._text_key} and
    never closes it.
  * add_texts writes the text into the caller-supplied metadata dicts in
    place (metadata[self._text_key] = text).
  * When ids is not supplied, add_texts builds the default ids by iterating
    `texts` once and then iterates `texts` again in the embedding loop. A
    one-shot iterator passed as `texts` would return ids but upsert no
    records.
  * `response` (the upsert result) is assigned and never read.
  * The trailing commented-out block constructs
    PGVectorConfig(collection_name=..., dimension=...); neither field is
    declared on PGVectorConfig in this patch. PATCH 2/2 removes the block.
.gitignore | 1 + pyproject.toml | 1 + vocode/streaming/models/vector_db.py | 15 ++- vocode/streaming/vector_db/pg_vector.py | 121 ++++++++++++++++++++++++ 4 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 vocode/streaming/vector_db/pg_vector.py diff --git a/.gitignore b/.gitignore index e0ff3f5c6..51b6a91d2 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ benchmark_results/ private.key dump.rdb .idea +.venv/ diff --git a/pyproject.toml b/pyproject.toml index 4b0d71378..6be953a14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ opentelemetry-sdk = "^1.17.0" janus = "^1.0.0" scipy = "^1.10.1" anthropic = "^0.7.1" +vecs = "0.4.2" elevenlabs = {version = "^0.2.6", optional = true} google-cloud-texttospeech = {version = "^2.14.1", optional = true} diff --git a/vocode/streaming/models/vector_db.py b/vocode/streaming/models/vector_db.py index 719c9ef02..91077a1f2 100644 --- a/vocode/streaming/models/vector_db.py +++ b/vocode/streaming/models/vector_db.py @@ -2,16 +2,18 @@ from typing import Optional from .model import TypedModel -DEFAULT_EMBEDDINGS_MODEL = "text-embedding-ada-002" +DEFAULT_EMBEDDINGS_MODEL = {"name": "text-embedding-ada-002", "dimension": 1536} class VectorDBType(str, Enum): BASE = "vector_db_base" PINECONE = "vector_db_pinecone" + PGVector = "vector_db_pgvector" class VectorDBConfig(TypedModel, type=VectorDBType.BASE.value): - embeddings_model: str = DEFAULT_EMBEDDINGS_MODEL + embeddings_model: str = DEFAULT_EMBEDDINGS_MODEL["name"] + embedding_dimension = DEFAULT_EMBEDDINGS_MODEL["dimension"] class PineconeConfig(VectorDBConfig, type=VectorDBType.PINECONE.value): @@ -19,3 +21,12 @@ class PineconeConfig(VectorDBConfig, type=VectorDBType.PINECONE.value): api_key: Optional[str] api_environment: Optional[str] top_k: int = 3 + + +class 
PGVectorConfig(VectorDBConfig, type=VectorDBType.PGVector.value): + top_k: int = 3 + password: str = "" + user: str = "postgres" + port: int = 5432 + host: str = "" + database_name: str = "postgres" diff --git a/vocode/streaming/vector_db/pg_vector.py b/vocode/streaming/vector_db/pg_vector.py new file mode 100644 index 000000000..ee5635642 --- /dev/null +++ b/vocode/streaming/vector_db/pg_vector.py @@ -0,0 +1,121 @@ +import logging +from typing import Iterable, List, Optional, Tuple +import uuid +import vecs +import asyncio +from langchain.docstore.document import Document +from vocode import getenv +from vocode.streaming.models.vector_db import PGVectorConfig +from vocode.streaming.vector_db.base_vector_db import VectorDB + +logger = logging.getLogger(__name__) + + +class PGVector(VectorDB): + def __init__(self, config: PGVectorConfig, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.config = config + self.password = ( + getenv("PG_VECTOR_PASSWORD") or self.config.password or "3RQ&D!h34d8rbGu" + ) + self.host = ( + getenv("PG_VECTOR_HOST") + or self.config.host + or "db.wanqiiptqkundxqgetqt.supabase.co" + ) + self.database_name = ( + getenv("PG_VECTOR_DATABASE_NAME") or self.config.database_name + ) + self.user = getenv("PG_VECTOR_USER") or self.config.user + self.port = getenv("PG_VECTOR_PORT") or self.config.port + self.pg_url = f"postgresql://{self.user}:{self.password}@{self.host}:{self.port}/{self.database_name}" + self.dimension = config.embedding_dimension + self.vecs = vecs.create_client(self.pg_url) + self._text_key = "text" + + async def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + namespace: Optional[str] = None, + ) -> List[str]: + """Run more texts through the embeddings and add to the vectorstore. + + Args: + texts: Iterable of strings to add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. 
+ ids: Optional list of ids to associate with the texts. + namespace: Optional namespace to add the texts to, by default it will be docs + + Returns: + List of ids from adding the texts into the vectorstore. + """ + if namespace is None: + namespace = "docs" + # Embed and create the documents + docs = [] + ids = ids or [str(uuid.uuid4()) for _ in texts] + for i, text in enumerate(texts): + embedding = await self.create_openai_embedding(text) + metadata = metadatas[i] if metadatas else {} + metadata[self._text_key] = text + docs.append((ids[i], embedding, metadata)) + + self.docs = self.vecs.get_or_create_collection( + name=namespace, dimension=self.dimension + ) + response = self.docs.upsert(records=docs) + return ids + + async def similarity_search_with_score( + self, + query: str, + filter: Optional[dict] = None, + namespace: Optional[str] = None, + ) -> List[Tuple[Document, float]]: + """Return PGVector documents most similar to query, along with scores. + + Args: + query: Text to look up documents similar to. + filter: Dictionary of argument(s) to filter on metadata + namespace: Namespace to search in. Default will search in '' namespace. + + Returns: + List of Documents most similar to the query and score for each + """ + if namespace is None: + namespace = "docs" + query_obj = await self.create_openai_embedding(query) + self.docs = self.vecs.get_or_create_collection( + name=namespace, dimension=self.dimension + ) + results = self.docs.query( + data=query_obj, + filters=filter, + limit=5, + include_value=True, + include_metadata=True, + ) + docs = [] + for id, score, metadata in results: + if self._text_key in metadata: + text = metadata.pop(self._text_key) + docs.append((Document(page_content=text, metadata=metadata), score)) + else: + logger.warning( + f"Found document with no `{self._text_key} key. 
Skipping" + ) + return docs + + +# config = PGVectorConfig(collection_name="docs", dimension=1536) +# vector = PGVector(config=config) +# # async def +# asyncio.run( +# vector.add_texts(texts=["Hello How are you?"]) +# ) +# asyncio.run( +# vector.similarity_search_with_score(query = "How r you?") +# ) +# vector.add_texts(texts=["Hello How are you"]) From 7f7d4b6ae1622e2e89d8e70ba438fba9f6fee3f4 Mon Sep 17 00:00:00 2001 From: Rahul Bansal Date: Thu, 11 Jan 2024 23:01:34 +0530 Subject: [PATCH 2/2] wrapped the calls in the thread ---
Review notes for PATCH 2/2 (commentary only; text in this position after the
three-dash separator is not part of the commit message or of any hunk):
  * Purpose: moves the self.docs.upsert(...) and self.docs.query(...) calls
    onto loop.run_in_executor(None, ...) so they are awaited rather than run
    inline in the coroutine.
  * Besides what the subject line says, this patch also removes the
    hard-coded password/host fallbacks from PGVector.__init__ and deletes the
    commented-out scratch block at the end of the file.
  * self.vecs.get_or_create_collection(...) is still called directly inside
    both coroutines. NOTE(review): if that call performs database I/O (not
    visible here - confirm against the vecs client), it still blocks the
    event loop.
  * upsert is now invoked positionally (self.docs.upsert, docs), where
    PATCH 1/2 used records=docs. NOTE(review): confirm `records` is the first
    positional parameter in the pinned vecs 0.4.2.
  * asyncio.get_event_loop() is called from inside coroutines. The asyncio
    documentation prefers asyncio.get_running_loop() in that position.
  * self.docs is reassigned on the instance by every call, and the query
    lambda reads self.docs only when the executor runs it.
    NOTE(review): two overlapping calls with different namespaces could
    presumably see each other's collection - consider a local variable.
  * `response` is still assigned and never read.
  * limit=5 is carried over unchanged; PGVectorConfig.top_k is still not
    read.
vocode/streaming/vector_db/pg_vector.py | 41 +++++++++---------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/vocode/streaming/vector_db/pg_vector.py b/vocode/streaming/vector_db/pg_vector.py index ee5635642..3c63823a1 100644 --- a/vocode/streaming/vector_db/pg_vector.py +++ b/vocode/streaming/vector_db/pg_vector.py @@ -15,14 +15,8 @@ class PGVector(VectorDB): def __init__(self, config: PGVectorConfig, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.config = config - self.password = ( - getenv("PG_VECTOR_PASSWORD") or self.config.password or "3RQ&D!h34d8rbGu" - ) - self.host = ( - getenv("PG_VECTOR_HOST") - or self.config.host - or "db.wanqiiptqkundxqgetqt.supabase.co" - ) + self.password = getenv("PG_VECTOR_PASSWORD") or self.config.password + self.host = getenv("PG_VECTOR_HOST") or self.config.host self.database_name = ( getenv("PG_VECTOR_DATABASE_NAME") or self.config.database_name ) @@ -65,7 +59,8 @@ async def add_texts( self.docs = self.vecs.get_or_create_collection( name=namespace, dimension=self.dimension ) - response = self.docs.upsert(records=docs) + loop = asyncio.get_event_loop() + response = await loop.run_in_executor(None, self.docs.upsert, docs) return ids async def similarity_search_with_score( @@ -90,12 +85,16 @@ async def similarity_search_with_score( self.docs = self.vecs.get_or_create_collection( name=namespace, dimension=self.dimension ) - results = self.docs.query( - 
data=query_obj, - filters=filter, - limit=5, - include_value=True, - include_metadata=True, + loop = asyncio.get_event_loop() + results = await loop.run_in_executor( + None, + lambda: self.docs.query( + data=query_obj, + filters=filter, + limit=5, + include_value=True, + include_metadata=True, + ), ) docs = [] for id, score, metadata in results: @@ -107,15 +106,3 @@ async def similarity_search_with_score( f"Found document with no `{self._text_key} key. Skipping" ) return docs - - -# config = PGVectorConfig(collection_name="docs", dimension=1536) -# vector = PGVector(config=config) -# # async def -# asyncio.run( -# vector.add_texts(texts=["Hello How are you?"]) -# ) -# asyncio.run( -# vector.similarity_search_with_score(query = "How r you?") -# ) -# vector.add_texts(texts=["Hello How are you"])