From ccd6f74833b91adeecd802ea444a45adf5a62cd5 Mon Sep 17 00:00:00 2001
From: Rahul Bansal <bansal.rahul14@gmail.com>
Date: Sat, 30 Dec 2023 16:00:14 +0530
Subject: [PATCH 1/2] Add PGVector vector DB backend built on vecs

---
 .gitignore                              |   1 +
 pyproject.toml                          |   1 +
 vocode/streaming/models/vector_db.py    |  15 ++-
 vocode/streaming/vector_db/pg_vector.py | 121 ++++++++++++++++++++++++
 4 files changed, 136 insertions(+), 2 deletions(-)
 create mode 100644 vocode/streaming/vector_db/pg_vector.py

diff --git a/.gitignore b/.gitignore
index e0ff3f5c6..51b6a91d2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,4 @@ benchmark_results/
 private.key
 dump.rdb
 .idea
+.venv/
diff --git a/pyproject.toml b/pyproject.toml
index 4b0d71378..6be953a14 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,7 @@ opentelemetry-sdk = "^1.17.0"
 janus = "^1.0.0"
 scipy = "^1.10.1"
 anthropic = "^0.7.1"
+vecs = "0.4.2"
 
 elevenlabs = {version = "^0.2.6", optional = true}
 google-cloud-texttospeech = {version = "^2.14.1", optional = true}
diff --git a/vocode/streaming/models/vector_db.py b/vocode/streaming/models/vector_db.py
index 719c9ef02..91077a1f2 100644
--- a/vocode/streaming/models/vector_db.py
+++ b/vocode/streaming/models/vector_db.py
@@ -2,16 +2,18 @@
 from typing import Optional
 from .model import TypedModel
 
-DEFAULT_EMBEDDINGS_MODEL = "text-embedding-ada-002"
+DEFAULT_EMBEDDINGS_MODEL = {"name": "text-embedding-ada-002", "dimension": 1536}
 
 
 class VectorDBType(str, Enum):
     BASE = "vector_db_base"
     PINECONE = "vector_db_pinecone"
+    PGVector = "vector_db_pgvector"
 
 
 class VectorDBConfig(TypedModel, type=VectorDBType.BASE.value):
-    embeddings_model: str = DEFAULT_EMBEDDINGS_MODEL
+    embeddings_model: str = DEFAULT_EMBEDDINGS_MODEL["name"]
+    embedding_dimension: int = DEFAULT_EMBEDDINGS_MODEL["dimension"]  # embedding vector length (1536 for the default model)
 
 
 class PineconeConfig(VectorDBConfig, type=VectorDBType.PINECONE.value):
@@ -19,3 +21,12 @@ class PineconeConfig(VectorDBConfig, type=VectorDBType.PINECONE.value):
     api_key: Optional[str]
     api_environment: Optional[str]
     top_k: int = 3
+
+
+class PGVectorConfig(VectorDBConfig, type=VectorDBType.PGVector.value):  # Postgres connection settings read by PGVector
+    top_k: int = 3  # presumably the max results per query — confirm against PGVector usage
+    password: str = ""  # used when getenv("PG_VECTOR_PASSWORD") is empty
+    user: str = "postgres"  # used when getenv("PG_VECTOR_USER") is empty
+    port: int = 5432  # used when getenv("PG_VECTOR_PORT") is empty
+    host: str = ""  # used when getenv("PG_VECTOR_HOST") is empty
+    database_name: str = "postgres"  # used when getenv("PG_VECTOR_DATABASE_NAME") is empty
diff --git a/vocode/streaming/vector_db/pg_vector.py b/vocode/streaming/vector_db/pg_vector.py
new file mode 100644
index 000000000..ee5635642
--- /dev/null
+++ b/vocode/streaming/vector_db/pg_vector.py
@@ -0,0 +1,121 @@
+import logging
+from typing import Iterable, List, Optional, Tuple
+import uuid
+import vecs
+import asyncio
+from langchain.docstore.document import Document
+from vocode import getenv
+from vocode.streaming.models.vector_db import PGVectorConfig
+from vocode.streaming.vector_db.base_vector_db import VectorDB
+
+logger = logging.getLogger(__name__)
+
+
+class PGVector(VectorDB):
+    def __init__(self, config: PGVectorConfig, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.config = config
+        self.password = (
+            getenv("PG_VECTOR_PASSWORD") or self.config.password or "3RQ&D!h34d8rbGu"
+        )
+        self.host = (
+            getenv("PG_VECTOR_HOST")
+            or self.config.host
+            or "db.wanqiiptqkundxqgetqt.supabase.co"
+        )
+        self.database_name = (
+            getenv("PG_VECTOR_DATABASE_NAME") or self.config.database_name
+        )
+        self.user = getenv("PG_VECTOR_USER") or self.config.user  # getenv value wins over config
+        self.port = getenv("PG_VECTOR_PORT") or self.config.port  # getenv value wins over config
+        self.pg_url = f"postgresql://{self.user}:{self.password}@{self.host}:{self.port}/{self.database_name}"  # NOTE(review): parts are interpolated unescaped — confirm credentials never contain URL-reserved characters
+        self.dimension = config.embedding_dimension  # passed as `dimension` when collections are fetched/created
+        self.vecs = vecs.create_client(self.pg_url)  # vecs client built from the connection URL
+        self._text_key = "text"  # metadata key under which each record's source text is stored
+
+    async def add_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None,
+        namespace: Optional[str] = None,
+    ) -> List[str]:
+        """Embed each text and upsert it into the namespace's collection.
+
+        Args:
+            texts: Iterable of strings to add; consumed once, so generators work.
+            metadatas: Optional per-text metadata dicts; copied, never mutated.
+            ids: Optional list of ids to associate with the texts.
+            namespace: Optional collection name to add the texts to; defaults to "docs".
+
+        Returns:
+            List of ids from adding the texts into the vectorstore.
+        """
+        if namespace is None:
+            namespace = "docs"
+        texts = list(texts)  # materialize: a generator would be drained by the id pass below
+        docs = []
+        ids = ids or [str(uuid.uuid4()) for _ in texts]
+        for i, text in enumerate(texts):
+            embedding = await self.create_openai_embedding(text)
+            metadata = dict(metadatas[i]) if metadatas else {}  # copy so the caller's dict is untouched
+            metadata[self._text_key] = text
+            docs.append((ids[i], embedding, metadata))
+
+        self.docs = self.vecs.get_or_create_collection(
+            name=namespace, dimension=self.dimension
+        )
+        response = self.docs.upsert(records=docs)
+        return ids
+
+    async def similarity_search_with_score(
+        self,
+        query: str,
+        filter: Optional[dict] = None,
+        namespace: Optional[str] = None,
+    ) -> List[Tuple[Document, float]]:
+        """Return PGVector documents most similar to query, along with scores.
+
+        Args:
+            query: Text to embed and look up similar documents for.
+            filter: Dictionary of argument(s) to filter on metadata.
+            namespace: Collection name to search in. Defaults to "docs".
+
+        Returns:
+            List of (Document, score) tuples; records lacking the text key are skipped.
+        """
+        if namespace is None:
+            namespace = "docs"
+        query_obj = await self.create_openai_embedding(query)  # embedding of the query text
+        self.docs = self.vecs.get_or_create_collection(
+            name=namespace, dimension=self.dimension
+        )
+        results = self.docs.query(
+            data=query_obj,
+            filters=filter,
+            limit=5,
+            include_value=True,
+            include_metadata=True,
+        )
+        docs = []
+        for id, score, metadata in results:
+            if self._text_key in metadata:
+                text = metadata.pop(self._text_key)  # move the text out of metadata into page_content
+                docs.append((Document(page_content=text, metadata=metadata), score))
+            else:
+                logger.warning(
+                    f"Found document with no `{self._text_key} key. Skipping"
+                )
+        return docs
+
+
+# config = PGVectorConfig(collection_name="docs", dimension=1536)
+# vector = PGVector(config=config)
+# # async def
+# asyncio.run(
+#     vector.add_texts(texts=["Hello How are you?"])
+# )
+# asyncio.run(
+#     vector.similarity_search_with_score(query = "How r you?")
+# )
+# vector.add_texts(texts=["Hello How are you"])

From 7f7d4b6ae1622e2e89d8e70ba438fba9f6fee3f4 Mon Sep 17 00:00:00 2001
From: Rahul Bansal <bansal.rahul14@gmail.com>
Date: Thu, 11 Jan 2024 23:01:34 +0530
Subject: [PATCH 2/2] Run vecs calls in executor; drop hard-coded credentials

---
 vocode/streaming/vector_db/pg_vector.py | 41 +++++++++----------------
 1 file changed, 14 insertions(+), 27 deletions(-)

diff --git a/vocode/streaming/vector_db/pg_vector.py b/vocode/streaming/vector_db/pg_vector.py
index ee5635642..3c63823a1 100644
--- a/vocode/streaming/vector_db/pg_vector.py
+++ b/vocode/streaming/vector_db/pg_vector.py
@@ -15,14 +15,8 @@ class PGVector(VectorDB):
     def __init__(self, config: PGVectorConfig, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
         self.config = config
-        self.password = (
-            getenv("PG_VECTOR_PASSWORD") or self.config.password or "3RQ&D!h34d8rbGu"
-        )
-        self.host = (
-            getenv("PG_VECTOR_HOST")
-            or self.config.host
-            or "db.wanqiiptqkundxqgetqt.supabase.co"
-        )
+        self.password = getenv("PG_VECTOR_PASSWORD") or self.config.password
+        self.host = getenv("PG_VECTOR_HOST") or self.config.host
         self.database_name = (
             getenv("PG_VECTOR_DATABASE_NAME") or self.config.database_name
         )
@@ -65,7 +59,8 @@ async def add_texts(
         self.docs = self.vecs.get_or_create_collection(
             name=namespace, dimension=self.dimension
         )
-        response = self.docs.upsert(records=docs)
+        loop = asyncio.get_running_loop()
+        await loop.run_in_executor(None, self.docs.upsert, docs)  # blocking DB write runs off the event loop
         return ids
 
     async def similarity_search_with_score(
@@ -90,12 +85,16 @@ async def similarity_search_with_score(
         self.docs = self.vecs.get_or_create_collection(
             name=namespace, dimension=self.dimension
         )
-        results = self.docs.query(
-            data=query_obj,
-            filters=filter,
-            limit=5,
-            include_value=True,
-            include_metadata=True,
+        loop = asyncio.get_running_loop()
+        results = await loop.run_in_executor(
+            None,
+            lambda collection=self.docs: collection.query(  # bind now; self.docs is shared and may be reassigned
+                data=query_obj,
+                filters=filter,
+                limit=self.config.top_k,  # honor the configured result count instead of a fixed 5
+                include_value=True,
+                include_metadata=True,
+            ),
         )
         docs = []
         for id, score, metadata in results:
@@ -107,15 +106,3 @@ async def similarity_search_with_score(
                     f"Found document with no `{self._text_key} key. Skipping"
                 )
         return docs
-
-
-# config = PGVectorConfig(collection_name="docs", dimension=1536)
-# vector = PGVector(config=config)
-# # async def
-# asyncio.run(
-#     vector.add_texts(texts=["Hello How are you?"])
-# )
-# asyncio.run(
-#     vector.similarity_search_with_score(query = "How r you?")
-# )
-# vector.add_texts(texts=["Hello How are you"])