Skip to content

Commit

Permalink
Merge pull request #273 from ChatWithPDF/embedding_addition
Browse files Browse the repository at this point in the history
added retriever query to embedding
  • Loading branch information
Gautam-Rajeev authored Nov 6, 2023
2 parents 62a81e5 + 219237a commit 6667d8a
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 21 deletions.
33 changes: 24 additions & 9 deletions src/embeddings/instructor/local/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import aiohttp
import pandas as pd
import io
from quart import jsonify

app = Quart(__name__)

Expand All @@ -15,23 +16,37 @@ async def startup():
global model
model = Model(app)


@app.route('/', methods=['POST'])
async def embed():
    """Embedding endpoint.

    Accepts either a multipart CSV upload (form field ``file``) whose rows
    are embedded in bulk, or a JSON body whose keys are forwarded to
    ``ModelRequest`` (e.g. a single ``query`` string).

    Returns:
        - For CSV uploads: the embedded CSV as a file attachment.
        - For JSON requests: the raw model response.
        - 400 with a JSON error body when the input has no usable
          ``content`` rows or the model reports null rows.
    """
    global model
    data = await request.get_json()
    files = await request.files  # request.files is awaitable in Quart
    uploaded_file = files.get('file')

    if uploaded_file:
        df = pd.read_csv(uploaded_file.stream)
        # Reject empty CSVs, CSVs without a 'content' column, or null
        # content cells *before* doing any expensive model work.
        # (Checking 'content' membership first avoids a KeyError.)
        if df.empty or 'content' not in df.columns or df['content'].isnull().any():
            return jsonify({'error': 'There are nonzero null rows'}), 400

        req = ModelRequest(df=df)
        response = await model.inference(req)

        # The model signals bad rows with a sentinel string rather than an
        # exception; translate it into a 400 response.
        if response == 'There are nonzero null rows':
            return jsonify({'error': response}), 400

        # Otherwise the response is a CSV string: materialize it to
        # 'output.csv' and send the file back.
        df = pd.read_csv(io.StringIO(response))
        df.to_csv('output.csv', index=False)
        return await send_file('output.csv', mimetype='text/csv',
                               as_attachment=True, attachment_filename='output.csv')

    # No file uploaded: treat the JSON payload as ModelRequest kwargs.
    req = ModelRequest(**data)
    response = await model.inference(req)

    # Handle the model's sentinel error the same way as the upload path.
    if response == 'There are nonzero null rows':
        return jsonify({'error': response}), 400

    return response
45 changes: 34 additions & 11 deletions src/embeddings/instructor/local/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import wget
import pandas as pd
import os
from quart import jsonify # Import jsonify to send JSON responses


class Model():
def __new__(cls, context):
Expand All @@ -19,10 +21,20 @@ async def inference(self, request: ModelRequest):
corpus_instruction = "Represent the document for retrieval:"
query_instruction = 'Represent the question for retrieving supporting documents: '
query = request.query
query_type = request.query_type

if(query != None):
# print('Query Encoding Process :-')
query_embeddings = self.model.encode(
if query_type == 'retrieval':
query_embeddings = self.model.encode(
[[corpus_instruction, query]],
show_progress_bar=False,
batch_size=32,
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
)

else :
query_embeddings = self.model.encode(
[[query_instruction, query]],
show_progress_bar=False,
batch_size=32,
Expand All @@ -33,15 +45,26 @@ async def inference(self, request: ModelRequest):
if not request.df.empty:
# print('Text corpus Encoding Process :-')
data = request.df

text_corpus = data.loc[:,'content'].to_list()
corpus_embeddings = self.model.encode(
[[corpus_instruction, text] for text in text_corpus],
show_progress_bar=False,
batch_size=32,
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
)
data['embeddings'] = corpus_embeddings.tolist()
csv_string = data.to_csv(index=False)
data = data.loc[~pd.isnull(data['content']),:]
data['content'] = data['content'].astype(str)

if data.empty or data['content'].isnull().any():
return 'There are nonzero null rows'

else :
text_corpus = data.loc[:,'content'].to_list()

if not text_corpus:
corpus_embeddings = self.model.encode(
[[corpus_instruction, text] for text in text_corpus],
show_progress_bar=False,
batch_size=32,
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
)
data['embeddings'] = corpus_embeddings.tolist()
csv_string = data.to_csv(index=False)
else:
return 'There are nonzero null rows'


return str(csv_string)
3 changes: 2 additions & 1 deletion src/embeddings/instructor/local/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@


class ModelRequest():
    """Container for one embedding request.

    Holds either a free-text ``query`` (optionally tagged with a
    ``query_type``) or a DataFrame ``df`` of documents to embed in bulk.
    """

    def __init__(self, query=None, df=None, query_type=None):
        # Url to download csv file
        self.query = query  # Optional[str]: the text to embed
        # 'retrieval' selects the corpus instruction when encoding the
        # query; anything else (including None) uses the question
        # instruction — TODO confirm against model.inference.
        self.query_type = query_type
        # Build a fresh empty DataFrame per request instead of using a
        # mutable default argument (a single shared pd.DataFrame() would be
        # evaluated once and leak in-place mutations across requests).
        self.df = pd.DataFrame() if df is None else df

def to_json(self):
Expand Down
Empty file.

0 comments on commit 6667d8a

Please sign in to comment.