-
Notifications
You must be signed in to change notification settings - Fork 112
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #271 from ChatWithPDF/search_tf
added search tf
- Loading branch information
Showing
8 changed files
with
151 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Use an official Python runtime as a parent image
FROM python:3.9-slim

WORKDIR /app

# Install requirements before copying the application code so this layer is
# reused until requirements.txt changes. --no-cache-dir keeps pip's download
# cache out of the image.
COPY requirements.txt requirements.txt
RUN pip3 install --no-cache-dir -r requirements.txt

# Download the CSV from Google Drive and store it in the "content" directory.
# curl is only needed for this step, so it is installed and removed in the same layer.
# --fail aborts the build on an HTTP error instead of saving the error page as data.csv.
RUN apt-get update && apt-get install -y curl && \
    mkdir content && \
    curl --fail -L 'https://drive.google.com/uc?export=download&id=13aDWCvj7PqFw7aPvPK_Qli3-Ei9mVwaO' -o content/data.csv && \
    apt-get remove -y curl && apt-get autoremove -y && rm -rf /var/lib/apt/lists/*

# Copy the rest of the application code to the working directory
COPY . /app/
EXPOSE 8000

# Set the entrypoint for the container
CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Word Score | ||
|
||
## Test Deployment | ||
|
||
- Git clone the repo and cd to the project location. | ||
- cd to `local`, i.e., `cd ./src/search/word_score/local`. | ||
- Replace the link in the Dockerfile with a link to a downloadable CSV file of your choice; the file must contain the columns `heading`, `content`, and `chunkId`, which the service reads. | ||
- Start your Docker engine and build the image with `docker build -t word_score .`. | ||
- Run the container with `docker run -p 8000:8000 word_score`. | ||
- Send a test query: `curl -X POST -H "Content-Type: application/json" -d '{"query": "leave policy planned leaves", "k": "6"}' http://localhost:8000/`. | ||
|
||
|
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from .request import * | ||
from .model import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
from model import Model | ||
from request import ModelRequest | ||
from quart import Quart, request | ||
import aiohttp | ||
import os | ||
import pandas as pd | ||
from sklearn.feature_extraction.text import CountVectorizer | ||
from sklearn.metrics.pairwise import linear_kernel | ||
|
||
app = Quart(__name__)

# Module-level state shared by the request handlers. startup() reassigns
# df, vectorizer and content_matrix before the server begins serving.
vectorizer = None
content_matrix = None

# Read the first entry of ./content at import time so `df` is populated
# as soon as the module is loaded.
files = os.listdir("./content")
df = pd.read_csv(os.path.join("./content", files[0]))
print(df.columns)
|
||
|
||
@app.before_serving
async def startup():
    """Prepare the shared state before the server starts handling requests.

    Opens the aiohttp client session stored on ``app.client``, reloads the
    first file found in ``./content`` into the global ``df``, and fits the
    global ``vectorizer`` / ``content_matrix`` on each row's heading and
    content text.
    """
    global df
    global content_matrix
    global vectorizer

    app.client = aiohttp.ClientSession()

    # Load the dataframe during startup
    files = os.listdir("./content")
    df = pd.read_csv(os.path.join("./content", files[0]))
    print(df.columns)
    # Initialize a CountVectorizer with additional parameters
    vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 2), binary=True, stop_words='english')

    # Join heading and content with a space so the last heading word and the
    # first content word remain separate tokens. Missing cells become empty
    # strings because CountVectorizer rejects NaN documents.
    documents = df['heading'].fillna('').astype(str) + ' ' + df['content'].fillna('').astype(str)

    # Fit the vectorizer on the combined text and transform it
    content_matrix = vectorizer.fit_transform(documents)
    print("Type of vectorizer:", type(vectorizer))


@app.after_serving
async def shutdown():
    """Close the aiohttp session opened in ``startup`` so it is not leaked."""
    await app.client.close()
|
||
@app.route('/', methods=['POST'])
async def translate():
    """Handle a search request and return the top-scoring rows.

    The body must be a JSON object whose keys match the arguments of
    ``ModelRequest`` (``query`` and, optionally, ``k``). Responds with
    HTTP 400 when the body is not a JSON object or when its keys do not fit
    ``ModelRequest``; otherwise returns whatever ``Model.inference`` produces.
    """
    data = await request.get_json()
    if not isinstance(data, dict):
        # Covers a missing body, a non-JSON content type, or a JSON array/scalar.
        return {"error": "Request body must be a JSON object."}, 400

    try:
        req = ModelRequest(**data)
    except TypeError as exc:
        # Raised when ``query`` is missing or an unexpected key is supplied.
        return {"error": f"Invalid request fields: {exc}"}, 400

    print("Inside translate function, type of vectorizer:", type(vectorizer))

    # Pass the dataframe as an argument to the Model class
    model = Model(df, content_matrix, vectorizer, req)
    return await model.inference(req)
|
||
@app.route('/', methods=['GET'])
async def hi():
    """Return the column names of the loaded dataframe as a JSON object."""
    # A pandas Index is not a valid response value, so send the names as
    # plain strings inside a dict, which the framework serialises to JSON.
    return {"columns": [str(name) for name in df.columns]}
|
||
|
||
if __name__ == "__main__":
    # Direct-execution entry point: serve the app on port 8000 in debug mode.
    app.run(port=8000, debug=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
from request import ModelRequest | ||
|
||
import numpy as np | ||
from cache import AsyncTTL | ||
import pandas as pd | ||
from sklearn.feature_extraction.text import CountVectorizer | ||
from sklearn.metrics.pairwise import linear_kernel | ||
|
||
|
||
class Model:
    """Ranks the rows of a dataframe against a query with a fitted vectorizer."""

    def __init__(self, df1, matrix, vec, req: ModelRequest):
        # ``req`` is accepted but not stored; inference() is handed the request itself.
        self.df, self.content_matrix, self.vectorizer = df1, matrix, vec

        print("Inside Model's constructor, type of vectorizer:", type(self.vectorizer))

    @AsyncTTL(time_to_live=600000, maxsize=1024)
    async def inference(self, request: ModelRequest):
        """Score every row against ``request.query`` and return the best ``request.k``.

        The score of a row is the linear kernel between the vectorized query
        and that row of ``content_matrix``. The result is ``DataFrame.to_dict()``
        over the columns ``content``, ``count_score`` and ``chunk_id``, limited
        to the ``k`` highest-scoring rows.
        """
        encoded_query = self.vectorizer.transform([request.query])
        top_k = int(request.k)  # number of highest-scoring rows to keep

        scores = linear_kernel(encoded_query, self.content_matrix).flatten()

        # Pair each row's content and chunk id with its score, best score first
        ranked = pd.DataFrame({
            'content': self.df['content'],
            'count_score': scores,
            'chunk_id': self.df['chunkId'],
        }).sort_values(by='count_score', ascending=False)

        return ranked.head(top_k).to_dict()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import json | ||
|
||
|
||
class ModelRequest:
    """Payload of a search request: the query text and how many results to return."""

    def __init__(self, query, k=4):
        self.query = query
        self.k = k

    def to_json(self):
        """Serialize the request as a 4-space-indented JSON string with sorted keys."""

        def attributes_of(obj):
            # Fallback for objects json cannot encode natively: dump their attributes.
            return obj.__dict__

        return json.dumps(self, indent=4, sort_keys=True, default=attributes_of)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
quart | ||
aiohttp | ||
async-cache==1.1.1 | ||
pandas | ||
scikit-learn |