Commit 13d3c7f

Merge pull request #271 from ChatWithPDF/search_tf

added search tf

Gautam-Rajeev authored Nov 1, 2023
2 parents ec77faa + 2355d7b
Showing 8 changed files with 151 additions and 0 deletions.
21 changes: 21 additions & 0 deletions src/search/tf_search/local/Dockerfile
@@ -0,0 +1,21 @@
# Use an official Python runtime as a parent image
FROM python:3.9-slim

WORKDIR /app

# Install Python dependencies
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

# Download the CSV from Google Drive and store it in the "content" directory
RUN apt-get update && apt-get install -y curl && \
mkdir content && \
curl -L 'https://drive.google.com/uc?export=download&id=13aDWCvj7PqFw7aPvPK_Qli3-Ei9mVwaO' -o content/data.csv && \
apt-get remove -y curl && apt-get autoremove -y && rm -rf /var/lib/apt/lists/*

# Copy the rest of the application code to the working directory
COPY . /app/
EXPOSE 8000

# Set the entrypoint for the container
CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
13 changes: 13 additions & 0 deletions src/search/tf_search/local/README copy.md
@@ -0,0 +1,13 @@
# Word Score

## Test Deployment

- Git clone the repo and cd to the project location.
- cd to `local`, i.e., `cd ./src/search/tf_search/local`.
- Replace the link in the Dockerfile with a downloadable CSV file of your choice; the CSV must include the `heading`, `content`, and `chunkId` columns that `api.py` and `model.py` read.
- Start your Docker engine and run `docker build -t word_score .`.
- Run `docker run -p 8000:8000 word_score`.
- Test with `curl -X POST -H "Content-Type: application/json" -d '{"query": "leave policy planned leaves", "k": "6"}' http://localhost:8000/`.
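The response is whatever `DataFrame.to_dict()` produces in `model.py`: one mapping per column (`content`, `count_score`, `chunk_id`), keyed by row index. For a quick smoke test without curl, a minimal Python client sketch along these lines should work (it assumes the container above is running on `localhost:8000` and uses `aiohttp`, which the service already depends on):

```python
import asyncio
import aiohttp

async def main():
    payload = {"query": "leave policy planned leaves", "k": "6"}
    async with aiohttp.ClientSession() as session:
        async with session.post("http://localhost:8000/", json=payload) as resp:
            result = await resp.json()
    # Row indices arrive as strings after the JSON round-trip
    for row, score in result["count_score"].items():
        print(score, result["chunk_id"][row], result["content"][row][:60])

asyncio.run(main())
```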


Empty file.
2 changes: 2 additions & 0 deletions src/search/tf_search/local/__init__.py
@@ -0,0 +1,2 @@
from .request import *
from .model import *
58 changes: 58 additions & 0 deletions src/search/tf_search/local/api.py
@@ -0,0 +1,58 @@
from model import Model
from request import ModelRequest
from quart import Quart, request
import aiohttp
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

app = Quart(__name__)

# Global variable for the dataframe
df = None
vectorizer = None
content_matrix = None

# Load the first CSV from ./content at import time (startup() reloads it below)
files = os.listdir("./content")
df = pd.read_csv(os.path.join("./content", files[0]))
print(df.columns)


@app.before_serving
async def startup():
    app.client = aiohttp.ClientSession()

    # Load the dataframe during startup
    global df
    global content_matrix
    global vectorizer
    files = os.listdir("./content")
    df = pd.read_csv(os.path.join("./content", files[0]))
    print(df.columns)

    # Initialize a CountVectorizer over unigrams and bigrams, with binary counts and English stop words removed
    vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 2), binary=True, stop_words='english')

    # Fit the vectorizer on heading + content, concatenated per row (no separator)
    content_matrix = vectorizer.fit_transform(df['heading'] + df['content'])
    print("Type of vectorizer:", type(vectorizer))

@app.route('/', methods=['POST'])
async def translate():
    global vectorizer, content_matrix
    data = await request.get_json()
    req = ModelRequest(**data)

    print("Inside translate function, type of vectorizer:", type(vectorizer))

    # Pass the dataframe as an argument to the Model class
    model = Model(df, content_matrix, vectorizer, req)
    return await model.inference(req)

@app.route('/', methods=['GET'])
async def hi():
    # df.columns is a pandas Index, which is not JSON-serializable; return a plain list
    return {'columns': list(df.columns)}


if __name__ == "__main__":
    app.run(debug=True, port=8000)
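Because the vectorizer is binary and `linear_kernel` is a plain dot product, a document's score is simply the number of distinct query n-grams (unigrams and bigrams, stop words removed) it shares with the document. A minimal standalone sketch of the same scoring, with made-up documents:

```python
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel

docs = pd.Series([
    "leave policy for planned leaves",
    "expense reimbursement policy",
])
vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 2), binary=True, stop_words='english')
matrix = vectorizer.fit_transform(docs)

query_vector = vectorizer.transform(["planned leave policy"])
# With binary=True, the dot product counts shared n-gram features:
# doc 1 shares 'planned', 'leave', 'policy', and 'leave policy' with the query; doc 2 shares only 'policy'
print(linear_kernel(query_vector, matrix).flatten())  # [4. 1.]
```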
40 changes: 40 additions & 0 deletions src/search/tf_search/local/model.py
@@ -0,0 +1,40 @@
from request import ModelRequest

import pandas as pd
from cache import AsyncTTL
from sklearn.metrics.pairwise import linear_kernel


class Model:
    def __init__(self, df1, matrix, vec, req: ModelRequest):
        self.df = df1
        self.content_matrix = matrix
        self.vectorizer = vec

        print("Inside Model's constructor, type of vectorizer:", type(self.vectorizer))

    @AsyncTTL(time_to_live=600000, maxsize=1024)
    async def inference(self, request: ModelRequest):
        query = [request.query]
        query_vector = self.vectorizer.transform(query)
        k = int(request.k)  # k is the number of top-scoring results to return

        # Dot product of the binary query vector against every document vector
        count_scores = linear_kernel(query_vector, self.content_matrix).flatten()

        # Create a new DataFrame with content, count scores, and chunk ids
        result_df = pd.DataFrame({
            'content': self.df['content'],
            'count_score': count_scores,
            'chunk_id': self.df['chunkId']
        })

        # Sort by count score in descending order and keep the top k rows
        sorted_df = result_df.sort_values(by='count_score', ascending=False)
        sorted_df = sorted_df.head(k)

        return sorted_df.to_dict()
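For reference, a standalone sketch of driving `Model` outside Quart (hypothetical toy data; the column names mirror what `api.py` expects):

```python
import asyncio

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

from model import Model
from request import ModelRequest

df = pd.DataFrame({
    'heading': ['Leave policy ', 'Expenses '],
    'content': ['planned leaves need approval', 'submit receipts monthly'],
    'chunkId': [1, 2],
})
vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 2), binary=True, stop_words='english')
# As in api.py, heading and content are concatenated per row before fitting
content_matrix = vectorizer.fit_transform(df['heading'] + df['content'])

req = ModelRequest(query='planned leaves', k=1)
model = Model(df, content_matrix, vectorizer, req)
# Prints a dict of column -> {row index: value}, e.g. content, count_score, chunk_id
print(asyncio.run(model.inference(req)))
```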

12 changes: 12 additions & 0 deletions src/search/tf_search/local/request.py
@@ -0,0 +1,12 @@
import json


class ModelRequest:
    def __init__(self, query, k=4):
        self.query = query
        self.k = k

    def to_json(self):
        return json.dumps(self, default=lambda o: o.__dict__,
                          sort_keys=True, indent=4)
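For illustration, `to_json` serializes the request's `__dict__` with sorted keys and 4-space indentation:

```python
req = ModelRequest(query="leave policy", k=6)
print(req.to_json())
# {
#     "k": 6,
#     "query": "leave policy"
# }
```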
5 changes: 5 additions & 0 deletions src/search/tf_search/local/requirements.txt
@@ -0,0 +1,5 @@
quart
aiohttp
async-cache==1.1.1
pandas
scikit-learn
