From 219237a19dfb2cf709ca2de95aa71a78bb133ed1 Mon Sep 17 00:00:00 2001
From: GautamR-Samagra
Date: Mon, 6 Nov 2023 06:14:04 +0000
Subject: [PATCH] added retriever query to embedding

---
 src/embeddings/instructor/local/api.py     | 33 ++++++++++----
 src/embeddings/instructor/local/model.py   | 45 ++++++++++++++-----
 src/embeddings/instructor/local/request.py |  3 +-
 .../convo_starter_orgbot/README.md         |  0
 4 files changed, 60 insertions(+), 21 deletions(-)
 create mode 100644 src/text_classification/convo_starter_orgbot/README.md

diff --git a/src/embeddings/instructor/local/api.py b/src/embeddings/instructor/local/api.py
index cee4645..0da3b0f 100644
--- a/src/embeddings/instructor/local/api.py
+++ b/src/embeddings/instructor/local/api.py
@@ -4,6 +4,7 @@
 import aiohttp
 import pandas as pd
 import io
+from quart import jsonify
 
 app = Quart(__name__)
 
@@ -15,23 +16,37 @@ async def startup():
     global model
     model = Model(app)
 
+
 @app.route('/', methods=['POST'])
 async def embed():
     global model
     data = await request.get_json()
-    files = await request.files  # await the coroutine
-    uploaded_file = files.get('file')  # now you can use .get()
+    files = await request.files
+    uploaded_file = files.get('file')
 
     if uploaded_file:
         df = pd.read_csv(uploaded_file.stream)
+        if df.empty or df['content'].isnull().any():
+            return jsonify({'error': 'There are nonzero null rows'}), 400  # Return a 400 Bad Request response with the error message
+
-        req = ModelRequest(df=df)  # Pass the DataFrame to ModelRequest
+        req = ModelRequest(df=df)
         response = await model.inference(req)
-        df = pd.read_csv(io.StringIO(response))  # Convert the CSV string back to a DataFrame
-        # Save the DataFrame to a CSV file
-        df.to_csv('output.csv', index=False)
+        # If the response from the model is an error message, return it with a 400 status
+        if response == 'There are nonzero null rows':
+            return jsonify({'error': response}), 400
+
+        # Otherwise, assume response is a CSV string
+        df = pd.read_csv(io.StringIO(response))
+        df.to_csv('output.csv', index=False)
 
         return await send_file('output.csv', mimetype='text/csv', as_attachment=True, attachment_filename='output.csv')
-
-    else:
+    else:
         req = ModelRequest(**data)
-        return await model.inference(req)
+        response = await model.inference(req)
+
+        # Handle potential error from model inference in a similar way
+        if response == 'There are nonzero null rows':
+            return jsonify({'error': response}), 400
+
+        # Otherwise, send back the model's response
+        return response
\ No newline at end of file
diff --git a/src/embeddings/instructor/local/model.py b/src/embeddings/instructor/local/model.py
index be05c15..58793f5 100644
--- a/src/embeddings/instructor/local/model.py
+++ b/src/embeddings/instructor/local/model.py
@@ -4,6 +4,8 @@
 import wget
 import pandas as pd
 import os
+from quart import jsonify  # Import jsonify to send JSON responses
+
 
 class Model():
     def __new__(cls, context):
@@ -19,10 +21,20 @@ async def inference(self, request: ModelRequest):
         corpus_instruction = "Represent the document for retrieval:"
         query_instruction = 'Represent the question for retrieving supporting documents: '
         query = request.query
+        query_type = request.query_type
 
         if(query != None):
             # print('Query Encoding Process :-')
-            query_embeddings = self.model.encode(
+            if query_type == 'retrieval':
+                query_embeddings = self.model.encode(
+                    [[corpus_instruction, query]],
+                    show_progress_bar=False,
+                    batch_size=32,
+                    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+                )
+
+            else :
+                query_embeddings = self.model.encode(
                 [[query_instruction, query]],
                 show_progress_bar=False,
                 batch_size=32,
@@ -33,15 +45,26 @@ async def inference(self, request: ModelRequest):
         if not request.df.empty:
             # print('Text corpus Encoding Process :-')
             data = request.df
-
-            text_corpus = data.loc[:,'content'].to_list()
-            corpus_embeddings = self.model.encode(
-                [[corpus_instruction, text] for text in text_corpus],
-                show_progress_bar=False,
-                batch_size=32,
-                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-            )
-            data['embeddings'] = corpus_embeddings.tolist()
-            csv_string = data.to_csv(index=False)
+            data = data.loc[~pd.isnull(data['content']),:]
+            data['content'] = data['content'].astype(str)
+
+            if data.empty or data['content'].isnull().any():
+                return 'There are nonzero null rows'
+
+            else :
+                text_corpus = data.loc[:,'content'].to_list()
+
+                if text_corpus:
+                    corpus_embeddings = self.model.encode(
+                        [[corpus_instruction, text] for text in text_corpus],
+                        show_progress_bar=False,
+                        batch_size=32,
+                        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+                    )
+                    data['embeddings'] = corpus_embeddings.tolist()
+                    csv_string = data.to_csv(index=False)
+                else:
+                    return 'There are nonzero null rows'
+
         return str(csv_string)
 
diff --git a/src/embeddings/instructor/local/request.py b/src/embeddings/instructor/local/request.py
index 96169b2..23dc54e 100644
--- a/src/embeddings/instructor/local/request.py
+++ b/src/embeddings/instructor/local/request.py
@@ -3,9 +3,10 @@
 
 
 class ModelRequest():
-    def __init__(self, query=None, df = pd.DataFrame()):
+    def __init__(self, query=None, df = pd.DataFrame(), query_type = None):
         # Url to download csv file
         self.query = query # String
+        self.query_type = query_type
         self.df = df
 
     def to_json(self):
diff --git a/src/text_classification/convo_starter_orgbot/README.md b/src/text_classification/convo_starter_orgbot/README.md
new file mode 100644
index 0000000..e69de29