Merge pull request #8 from n4ze3m/next
cohere, huggingface embedding
n4ze3m authored Jun 9, 2023
2 parents 2ee1e7b + 14f7c24 commit 102b577
Showing 15 changed files with 173 additions and 143 deletions.
5 changes: 2 additions & 3 deletions README.md
@@ -101,14 +101,13 @@ and more...

- [x] OpenAI
- [ ] Anthropic
- [ ] Falcon-7B

### Embedding models

- [X] OpenAI
- [X] TensorFlow
- [ ] HuggingFace
- [ ] Cohere
- [X] HuggingFace
- [X] Cohere


### Application
3 changes: 2 additions & 1 deletion app/ui/src/utils/embeddings.ts
@@ -1,6 +1,7 @@
export const availableEmbeddingTypes = [
{ value: "openai", label: "OpenAI" },
{ value: "tensorflow", label: "Tensorflow" },
// { value: "cohere", label: "Cohere"}
{ value: "cohere", label: "Cohere"},
{ value: "huggingface-api", label: "HuggingFace (Inference)"}
];
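
Since these dropdown values must line up with the server-side `createBotSchema` enum, a small guard like the sketch below can catch drift between the two lists; `isSupported` is a hypothetical helper, not part of the repo:

import { availableEmbeddingTypes } from "./embeddings";

// Hypothetical guard: checks a value against the UI's supported list
// before it is sent to the server.
const isSupported = (value: string): boolean =>
  availableEmbeddingTypes.some((t) => t.value === value);

console.log(isSupported("huggingface-api")); // true after this change
console.log(isSupported("anthropic"));       // false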

4 changes: 3 additions & 1 deletion docker/imp.env
@@ -4,4 +4,6 @@ OPENAI_API_KEY=""
# DB_SECRET_KEY is used for JWT token generation; please change it to your own secret key
DB_SECRET_KEY="super-secret-key"
# Cohere API key -> https://dashboard.cohere.ai/api-keys
# COHERE_API_KEY=""
COHERE_API_KEY=""
# Huggingface Hub API key -> https://huggingface.co/settings/token
HUGGINGFACEHUB_API_KEY=""
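
For context, LangChain's Cohere and HuggingFace embedding clients can take these keys explicitly; a minimal sketch, assuming the env file above has been loaded into `process.env` (when omitted, the constructors can also pick the same variables up from the environment on their own):

import { CohereEmbeddings } from "langchain/embeddings/cohere";
import { HuggingFaceInferenceEmbeddings } from "langchain/embeddings/hf";

// Wire the keys from docker/imp.env explicitly rather than relying on
// the constructors' environment-variable fallback.
const cohere = new CohereEmbeddings({ apiKey: process.env.COHERE_API_KEY });
const hf = new HuggingFaceInferenceEmbeddings({
  apiKey: process.env.HUGGINGFACEHUB_API_KEY,
});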
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "dialoqbase",
"version": "0.0.3",
"version": "0.0.4",
"description": "Create chatbots with ease",
"scripts": {
"ui:dev": "pnpm run --filter ui dev",
1 change: 1 addition & 0 deletions server/package.json
@@ -28,6 +28,7 @@
"@fastify/multipart": "^7.6.0",
"@fastify/sensible": "^5.0.0",
"@fastify/static": "^6.10.2",
"@huggingface/inference": "1",
"@prisma/client": "4.15.0",
"@tensorflow-models/universal-sentence-encoder": "^1.3.3",
"@tensorflow/tfjs-backend-cpu": "^4.7.0",
2 changes: 2 additions & 0 deletions server/prisma/schema.prisma
@@ -1,10 +1,12 @@
generator client {
provider = "prisma-client-js"
previewFeatures = ["postgresqlExtensions"]
}

datasource db {
provider = "postgresql"
url = env("DATABASE_URL")
extensions = [pgvector(map: "vector", schema: "extensions")]
}

model Bot {
1 change: 1 addition & 0 deletions server/src/app.ts
@@ -4,6 +4,7 @@ import { FastifyPluginAsync } from "fastify";
import cors from "@fastify/cors";
import fastifyStatic from "@fastify/static";
import fastifyMultipart from "@fastify/multipart";

export type AppOptions = {} & Partial<AutoloadPluginOptions>;

const options: AppOptions = {};
248 changes: 125 additions & 123 deletions server/src/queue/index.ts
@@ -18,145 +18,147 @@ export const queue = new Queue("vector", process.env.DB_REDIS_URL!, {});
export const queueHandler = async (job: Job, done: DoneCallback) => {
  const data = job.data as QSource[];

  console.log("Processing queue");
  try {
    for (const source of data) {
      try {
        if (source.type.toLowerCase() === "website") {
          await prisma.botSource.update({
            where: {
              id: source.id,
            },
            data: {
              status: "PROCESSING",
            },
          });

          const loader = new CheerioWebBaseLoader(source.content!);
          const docs = await loader.load();

          const textSplitter = new RecursiveCharacterTextSplitter({
            chunkSize: 1000,
            chunkOverlap: 200,
          });
          const chunks = await textSplitter.splitDocuments(docs);

          await DialoqbaseVectorStore.fromDocuments(
            chunks,
            embeddings(source.embedding),
            {
              botId: source.botId,
              sourceId: source.id,
            },
          );

          await prisma.botSource.update({
            where: {
              id: source.id,
            },
            data: {
              status: "FINISHED",
              isPending: false,
            },
          });
        } else if (source.type.toLowerCase() === "text") {
          await prisma.botSource.update({
            where: {
              id: source.id,
            },
            data: {
              status: "PROCESSING",
            },
          });

          const textSplitter = new RecursiveCharacterTextSplitter({
            chunkSize: 1000,
            chunkOverlap: 200,
          });
          const chunks = await textSplitter.splitDocuments([
            {
              pageContent: source.content!,
              metadata: {
                source: `text-${source.id}`,
              },
            },
          ]);

          await DialoqbaseVectorStore.fromDocuments(
            chunks,
            embeddings(source.embedding),
            {
              botId: source.botId,
              sourceId: source.id,
            },
          );

          await prisma.botSource.update({
            where: {
              id: source.id,
            },
            data: {
              status: "FINISHED",
              isPending: false,
            },
          });
        } else if (source.type.toLowerCase() === "pdf") {
          console.log("loading pdf");
          await prisma.botSource.update({
            where: {
              id: source.id,
            },
            data: {
              status: "PROCESSING",
            },
          });

          const location = source.location!;
          const loader = new PDFLoader(location);
          const docs = await loader.load();

          const textSplitter = new RecursiveCharacterTextSplitter({
            chunkSize: 1000,
            chunkOverlap: 200,
          });
          const chunks = await textSplitter.splitDocuments(docs);

          await DialoqbaseVectorStore.fromDocuments(
            chunks,
            embeddings(source.embedding),
            {
              botId: source.botId,
              sourceId: source.id,
            },
          );

          await prisma.botSource.update({
            where: {
              id: source.id,
            },
            data: {
              status: "FINISHED",
              isPending: false,
            },
          });
        }
      } catch (e) {
        console.log(e);

        await prisma.botSource.update({
          where: {
            id: source.id,
          },
          data: {
            status: "FAILED",
            isPending: false,
          },
        });
      }
    }
  } catch (e) {
    console.log(e);
  } finally {
    done();
  }
};

queue.process(queueHandler);
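
The substantive change above is that `done()` now runs in a `finally` block and failed sources are marked FAILED instead of FINISHED, so a throwing source can no longer leave the Bull job unacknowledged. A minimal sketch of the same pattern, with `doWork` as a hypothetical stand-in for the per-source processing:

import Queue, { Job, DoneCallback } from "bull";

const exampleQueue = new Queue("example");

// Hypothetical stand-in for the per-source processing, which may throw.
const doWork = async (data: unknown) => {};

exampleQueue.process(async (job: Job, done: DoneCallback) => {
  try {
    await doWork(job.data);
  } catch (e) {
    console.log(e); // log, but don't rethrow past the worker
  } finally {
    done(); // always acknowledge so the queue keeps draining
  }
});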

5 changes: 3 additions & 2 deletions server/src/routes/api/v1/bot/handlers/index.ts
@@ -129,9 +129,10 @@ export const createBotPDFHandler = async (
...botSource,
embedding: bot.embedding,
}]);
return {
  id: bot.id,
};

return reply.status(200).send({
  id: bot.id,
});
} catch (err) {
return reply.status(500).send({
message: "Upload failed due to internal server error",
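
Fastify would also serialize a plain object returned from an async handler, but routing the response through `reply` makes the status code explicit and keeps the success and error paths symmetrical; a minimal sketch of the shape (route path and payload are illustrative):

fastify.post("/api/v1/bot/pdf", async (request, reply) => {
  try {
    // ... create the bot and queue its sources ...
    return reply.status(200).send({ id: "bot-id" });
  } catch (err) {
    return reply.status(500).send({
      message: "Upload failed due to internal server error",
    });
  }
});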
2 changes: 1 addition & 1 deletion server/src/routes/api/v1/bot/handlers/schema.ts
@@ -17,7 +17,7 @@ export const createBotSchema: FastifySchema = {
},
embedding: {
type: "string",
enum: ["tensorflow", "openai", "cohere"],
enum: ["tensorflow", "openai", "cohere", "huggingface-api"],
}
},
},
2 changes: 2 additions & 0 deletions server/src/routes/bot/root.ts
@@ -11,6 +11,8 @@ const root: FastifyPluginAsync = async (fastify, _): Promise<void> => {
fastify.get("/:id", async (request, reply) => {
return reply.sendFile('bot.html')
});


};

export default root;
8 changes: 7 additions & 1 deletion server/src/utils/embeddings.ts
@@ -1,14 +1,20 @@
import "@tensorflow/tfjs-backend-cpu";
import { TensorFlowEmbeddings } from "langchain/embeddings/tensorflow";
import { OpenAIEmbeddings } from "langchain/embeddings/openai";
import { CohereEmbeddings } from "langchain/embeddings/cohere";
import { HuggingFaceInferenceEmbeddings } from "langchain/embeddings/hf";

export const embeddings = (embeddingsType: string) => {
switch (embeddingsType) {
case "tensorflow":
return new TensorFlowEmbeddings();
case "openai":
return new OpenAIEmbeddings();
case "cohere":
return new CohereEmbeddings();
case "huggingface-api":
return new HuggingFaceInferenceEmbeddings();
default:
return new OpenAIEmbeddings();
}
};
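
With this switch in place, callers select an embedder by its string key and get back a LangChain Embeddings instance; `embedDocuments` and `embedQuery` below are standard methods on that interface (the sample strings are illustrative):

import { embeddings } from "./embeddings";

const run = async () => {
  // Unknown keys fall back to OpenAI, per the default branch above.
  const model = embeddings("huggingface-api");

  const vectors = await model.embedDocuments(["hello", "world"]); // number[][]
  const query = await model.embedQuery("hello"); // number[]
  console.log(vectors.length, query.length);
};

run();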