Commit 810b043: Add embedding
Signed-off-by: Aisuko <[email protected]>
Aisuko committed Jul 21, 2024
1 parent da02a1e commit 810b043
Showing 5 changed files with 62 additions and 26 deletions.
2 changes: 2 additions & 0 deletions Makefile
@@ -64,6 +64,7 @@ INFERENCE_ENG:=llamacpp
INFERENCE_ENG_PORT:=8080
INFERENCE_ENG_VERSION:=server--b1-2321a5e
NUM_CPU_CORES:=8.00
NUM_CPU_CORES_EMBEDDING:=4.00

# Language model, default is phi3-mini-4k-instruct-q4.gguf
# https://github.com/SkywardAI/llama.cpp/blob/9b2f16f8055265c67e074025350736adc1ea0666/tests/test-chat-template.cpp#L91-L92
@@ -121,6 +122,7 @@ env:
@echo "INFERENCE_ENG_PORT=$(INFERENCE_ENG_PORT)">> $(FILE_NAME)
@echo "INFERENCE_ENG_VERSION=$(INFERENCE_ENG_VERSION)">> $(FILE_NAME)
@echo "NUM_CPU_CORES=$(NUM_CPU_CORES)">> $(FILE_NAME)
@echo "NUM_CPU_CORES_EMBEDDING=$(NUM_CPU_CORES_EMBEDDING)" >> $(FILE_NAME)
@echo "LANGUAGE_MODEL_NAME=$(LANGUAGE_MODEL_NAME)">> $(FILE_NAME)
@echo "ADMIN_USERNAME=$(ADMIN_USERNAME)">> $(FILE_NAME)
@echo "ADMIN_EMAIL=$(ADMIN_EMAIL)">> $(FILE_NAME)
6 changes: 6 additions & 0 deletions backend/src/repository/inference_eng.py
@@ -60,3 +60,9 @@ def instruct_infer_url(cls) -> str:
str: URL for the inference engine
"""
return f"http://{cls.infer_eng_url}:{cls.infer_eng_port}/completion"

@classmethod
def instruct_embedding_url(cls) -> str:
"""
"""
return f"http://embedding_eng:8082/embedding"

Check failure on line 68 in backend/src/repository/inference_eng.py
GitHub Actions / Code-Quality 💎 (ubuntu-latest, 3.11), Ruff (F541):
backend/src/repository/inference_eng.py:68:16: F541 f-string without any placeholders
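For reference, a minimal sketch of how a caller could hit the endpoint the new helper returns, mirroring the request and response shape used later in this commit (JSON body with a `content` field, `embedding` field in the response). The module path and the standalone `embed` helper are assumptions for illustration, not part of this commit.

```python
# Sketch only: assumes the module path src.repository.inference_eng and that the
# llama.cpp /embedding route accepts {"content": ...} and returns an "embedding"
# field, as the chat.py hunk in this commit expects.
import httpx

from src.repository.inference_eng import InferenceHelper


async def embed(text: str) -> list[float] | None:
    """POST text to the embedding engine and return the embedding vector (or None)."""
    async with httpx.AsyncClient() as client:
        res = await client.post(
            InferenceHelper.instruct_embedding_url(),
            headers={"Content-Type": "application/json"},
            json={"content": text},
            timeout=httpx.Timeout(timeout=None),
        )
        res.raise_for_status()
        return res.json().get("embedding")
```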
29 changes: 16 additions & 13 deletions backend/src/repository/rag/chat.py
@@ -126,26 +126,29 @@ async def get_context_by_question(input_msg: str):
"""

# tokenized_input
async with httpx.AsyncClient() as client:
try:
res=await client.post(
InferenceHelper.tokenizer_url(),
json={"content": input_msg},
)
res.raise_for_status()
tokenized_input = res.json().get("tokens")
except Exception as e:
pass

try:
res=await httpx_kit.async_client.post(
InferenceHelper.instruct_embedding_url(),
headers={"Content-Type": "application/json"},
json={"content": input_msg},
timeout=httpx.Timeout(timeout=None)
)
res.raise_for_status()
tokenized_input = res.json().get("embedding")

Check failure on line 138 in backend/src/repository/rag/chat.py
GitHub Actions / Code-Quality 💎 (ubuntu-latest, 3.11), Ruff (F841):
backend/src/repository/rag/chat.py:138:17: F841 Local variable `tokenized_input` is assigned to but never used
except Exception as e:
loguru.logger.error(e)
# search the context in the vector database
result=await vector_db.search(tokenized_input, 1, collection_name="aisuko_squad01")
# context=vector_db.search(tokenized_input, 1, collection_name="aisuko_squad01")
context=""
# combine the context with the input message
context = ""
return context or InferenceHelper.instruction

current_context = await get_context_by_question(input_msg)

Check failure on line 147 in backend/src/repository/rag/chat.py
GitHub Actions / Code-Quality 💎 (ubuntu-latest, 3.11), Ruff (F841):
backend/src/repository/rag/chat.py:147:9: F841 Local variable `current_context` is assigned to but never used


data_with_context = {
"prompt": self.format_prompt(input_msg, current_context),
"prompt": self.format_prompt(input_msg, current_context=""),
"temperature": temperature,
"top_k": top_k,
"top_p": top_p,
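This hunk swaps the tokenizer call for the new embedding call, but it still passes `current_context=""` to `format_prompt`, and the Ruff F841 failures above flag the unused `tokenized_input` and `current_context`. Below is a hedged sketch of the flow the hunk appears to be building toward, assuming `httpx`, `loguru`, `httpx_kit`, `vector_db`, and `InferenceHelper` are the module-level imports already present in chat.py, and that `vector_db.search` remains the synchronous method defined in vector_database.py.

```python
# Sketch, not the committed code: return the nearest stored title as context,
# falling back to the default instruction when embedding or search fails.
async def get_context_by_question(input_msg: str) -> str:
    try:
        res = await httpx_kit.async_client.post(
            InferenceHelper.instruct_embedding_url(),
            headers={"Content-Type": "application/json"},
            json={"content": input_msg},
            timeout=httpx.Timeout(timeout=None),
        )
        res.raise_for_status()
        embedding = res.json().get("embedding")
        # search() is a plain (non-async) method in this commit, so it is not awaited.
        # Depending on the /embedding response shape, the vector may need to be
        # wrapped in a list, since Milvus expects a list of query vectors.
        sentences = vector_db.search(embedding, 1, collection_name="aisuko_squad01")
        if sentences:
            return sentences[0]
    except Exception as e:
        loguru.logger.error(e)
    return InferenceHelper.instruction
```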
34 changes: 21 additions & 13 deletions backend/src/repository/vector_database.py
@@ -60,19 +60,27 @@ def insert_list(self, collection_name: str = DEFAULT_COLLECTION, data_list: list

def search(self, data, n_results, collection_name=DEFAULT_COLLECTION):
search_params = {"metric_type": "COSINE", "params": {}}
res = self.client.search(
collection_name=collection_name,
data=data,
limit=n_results,
search_params=search_params,
output_fields=["title"],
)
loguru.logger.info(f"Vector Database --- Result: {res}")
sentences = []
for hits in res:
for hit in hits:
sentences.append(hit.get("entity").get("title"))
return sentences
try:

res = self.client.search(
collection_name=collection_name,
data=data,
limit=n_results,
search_params=search_params,
output_fields=["title"],
)

loguru.logger.info(f"Vector Database --- Result: {res}")
sentences = []
for hits in res:
for hit in hits:
sentences.append(hit.get("entity").get("title"))
return sentences
except Exception as e:
loguru.logger.error(e)
return None



def create_index(self, index_name, index_params, collection_name=DEFAULT_COLLECTION):
self.client.create_index(collection_name, index_name, index_params)
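With this change, `search` logs and swallows exceptions and returns None instead of raising, so callers have to handle both outcomes. A hypothetical usage sketch follows; the query vector, its dimension, and the collection name are placeholders, and `vector_db` is assumed to be the client instance exposed by this module.

```python
# Hypothetical caller; a 384-dimensional zero vector stands in for a real embedding.
query_vectors = [[0.0] * 384]  # MilvusClient.search expects a list of query vectors
titles = vector_db.search(query_vectors, 1, collection_name="aisuko_squad01")
if titles is None:
    # search() already logged the underlying error and returned None; fall back gracefully.
    print("vector search failed")
else:
    print(titles)  # e.g. ["Some matching document title"]
```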
17 changes: 17 additions & 0 deletions docker-compose.yaml
@@ -87,6 +87,7 @@ services:
- ETCD_AUTO_COMPACTION_RETENTION=${ETCD_AUTO_COMPACTION_RETENTION}
- ETCD_QUOTA_BACKEND_BYTES=${ETCD_QUOTA_BACKEND_BYTES}
- NUM_CPU_CORES=${NUM_CPU_CORES}
- NUM_CPU_CORES_EMBEDDING=${NUM_CPU_CORES_EMBEDDING}
volumes:
- ./backend/:/app/
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/models:/models
@@ -163,6 +164,22 @@ services:
- 8080:8080
command: ["-m", "models/${LANGUAGE_MODEL_NAME}","-c","8192"]

embedding_eng:
container_name: embedding_eng
image: gclub/llama.cpp:${INFERENCE_ENG_VERSION}
restart: always
deploy: # https://github.com/compose-spec/compose-spec/blob/master/deploy.md
resources:
reservations:
cpus: "${NUM_CPU_CORES_EMBEDDING}"
volumes:
- "${DOCKER_VOLUME_DIRECTORY:-.}/volumes/models:/models"
expose:
- 8080
ports:
- 8082:8080
command: ["-m", "models/${LANGUAGE_MODEL_NAME}","--embeddings","--pooling","mean","-c","512"]

rebel:
container_name: rebel
image: ghcr.io/skywardai/rebel:v0.1.6
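Once the stack is up, the new embedding_eng service can be smoke-tested from the host through the published port (8082 on the host maps to 8080 in the container). A sketch assuming the llama.cpp server's /embedding route returns an `embedding` field, as the backend code in this commit expects.

```python
# Host-side smoke test for the embedding_eng container; the endpoint shape is
# assumed from the backend changes in this commit.
import httpx

res = httpx.post(
    "http://localhost:8082/embedding",
    headers={"Content-Type": "application/json"},
    json={"content": "hello world"},
    timeout=None,
)
res.raise_for_status()
embedding = res.json().get("embedding")
print(len(embedding) if embedding else "no embedding field in response")
```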
