diff --git a/src/search/tf_search/local/Dockerfile b/src/search/tf_search/local/Dockerfile
new file mode 100644
index 0000000..645ec30
--- /dev/null
+++ b/src/search/tf_search/local/Dockerfile
@@ -0,0 +1,21 @@
+# Use an official Python runtime as a parent image
+FROM python:3.9-slim
+
+WORKDIR /app
+
+#install requirements
+COPY requirements.txt requirements.txt
+RUN pip3 install -r requirements.txt
+
+# Download the CSV from Google Drive and store it in the "content" directory
+RUN apt-get update && apt-get install -y curl && \
+    mkdir content && \
+    curl -L 'https://drive.google.com/uc?export=download&id=13aDWCvj7PqFw7aPvPK_Qli3-Ei9mVwaO' -o content/data.csv && \
+    apt-get remove -y curl && apt-get autoremove -y && rm -rf /var/lib/apt/lists/*
+
+# Copy the rest of the application code to the working directory
+COPY . /app/
+EXPOSE 8000
+
+# Set the entrypoint for the container
+CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
diff --git a/src/search/tf_search/local/README copy.md b/src/search/tf_search/local/README copy.md
new file mode 100644
index 0000000..617b65b
--- /dev/null
+++ b/src/search/tf_search/local/README copy.md
@@ -0,0 +1,13 @@
+# Word Score
+
+## Test Deployment
+
+- Git clone the repo and cd to the project location.
+- cd to `local`, i.e., `cd ./src/search/word_score/local`.
+- Replace the link in the Dockerfile to a downloadable csv file of your choice, but the data column should be named `tags`.
+- Start your docker engine and `docker build -t word_score .`.
+- Do `docker run -p 8000:8000 word_score`.
+- `curl -X POST -H "Content-Type: application/json" -d '{"query": "leave policy planned leaves", "k": "6"}' http://localhost:8000/`.
+`
+
+
diff --git a/src/search/tf_search/local/README.md b/src/search/tf_search/local/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/src/search/tf_search/local/__init__.py b/src/search/tf_search/local/__init__.py
new file mode 100644
index 0000000..541dca1
--- /dev/null
+++ b/src/search/tf_search/local/__init__.py
@@ -0,0 +1,2 @@
+from .request import *
+from .model import *
\ No newline at end of file
diff --git a/src/search/tf_search/local/api.py b/src/search/tf_search/local/api.py
new file mode 100644
index 0000000..097a8cd
--- /dev/null
+++ b/src/search/tf_search/local/api.py
@@ -0,0 +1,58 @@
+from model import Model
+from request import ModelRequest
+from quart import Quart, request
+import aiohttp
+import os
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics.pairwise import linear_kernel
+
+app = Quart(__name__)
+
+# Global variable for the dataframe
+df = None
+vectorizer = None
+content_matrix = None
+
+files = os.listdir("./content")
+df = pd.read_csv(os.path.join("./content", files[0]))
+print(df.columns)
+
+
+@app.before_serving
+async def startup():
+    app.client = aiohttp.ClientSession()
+
+    # Load the dataframe during startup
+    global df
+    global content_matrix
+    global vectorizer
+    files = os.listdir("./content")
+    df = pd.read_csv(os.path.join("./content", files[0]))
+    print(df.columns)
+    # Initialize a CountVectorizer with additional parameters
+    vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 2), binary=True, stop_words='english')
+
+    # Fit the vectorizer on the content column and transform it
+    content_matrix = vectorizer.fit_transform(df['heading'] + df['content'])
+    print("Type of vectorizer:", type(vectorizer))
+
+@app.route('/', methods=['POST'])
+async def translate():
+    global vectorizer, content_matrix
+    data = await request.get_json()
+    req = ModelRequest(**data)
+
+    print("Inside translate function, type of vectorizer:", type(vectorizer))
+
+    # Pass the dataframe as an argument to the Model class
+    model = Model(df, content_matrix, vectorizer, req)
+    return await model.inference(req)
+
+@app.route('/', methods=['GET'])
+async def hi():
+    return {"columns": list(df.columns)}
+
+
+if __name__ == "__main__":
+    app.run(debug=True, port=8000)
diff --git a/src/search/tf_search/local/model.py b/src/search/tf_search/local/model.py
new file mode 100644
index 0000000..104aa80
--- /dev/null
+++ b/src/search/tf_search/local/model.py
@@ -0,0 +1,40 @@
+from request import ModelRequest
+
+import numpy as np
+from cache import AsyncTTL
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics.pairwise import linear_kernel
+
+
+class Model:
+    def __init__(self, df1, matrix, vec, req: ModelRequest):
+        self.df = df1
+        self.content_matrix = matrix
+        self.vectorizer = vec
+
+        print("Inside Model's constructor, type of vectorizer:", type(self.vectorizer))
+
+
+    @AsyncTTL(time_to_live=600000, maxsize=1024)
+    async def inference(self, request: ModelRequest):
+
+        query = [request.query]
+        query_vector = self.vectorizer.transform(query)
+        k = int(request.k)  # k is the number of top k words to consider for the score
+
+        count_scores = linear_kernel(query_vector, self.content_matrix).flatten()
+
+        # Create a new DataFrame with content and count scores
+        result_df = pd.DataFrame({
+            'content': self.df['content'],
+            'count_score': count_scores,
+            'chunk_id' : self.df['chunkId']
+        })
+
+        # Sort the DataFrame based on count scores in descending order
+        sorted_df = result_df.sort_values(by='count_score', ascending=False)
+        sorted_df = sorted_df.head(k)
+
+        return sorted_df.to_dict()
+
\ No newline at end of file
diff --git a/src/search/tf_search/local/request.py b/src/search/tf_search/local/request.py
new file mode 100644
index 0000000..b1f8e29
--- /dev/null
+++ b/src/search/tf_search/local/request.py
@@ -0,0 +1,12 @@
+import json
+
+
+class ModelRequest():
+    def __init__(self, query, k=4):
+        self.query = query
+        self.k = k
+
+
+    def to_json(self):
+        return json.dumps(self, default=lambda o: o.__dict__,
+                          sort_keys=True, indent=4)
diff --git a/src/search/tf_search/local/requirements.txt b/src/search/tf_search/local/requirements.txt
new file mode 100644
index 0000000..b75ebd0
--- /dev/null
+++ b/src/search/tf_search/local/requirements.txt
@@ -0,0 +1,6 @@
+quart
+hypercorn
+aiohttp
+async-cache==1.1.1
+pandas
+scikit-learn
\ No newline at end of file