Commit 13d3c7f

Merge pull request #271 from ChatWithPDF/search_tf

added search tf

Gautam-Rajeev authored Nov 1, 2023
2 parents ec77faa + 2355d7b
Showing 8 changed files with 151 additions and 0 deletions.
21 changes: 21 additions & 0 deletions src/search/tf_search/local/Dockerfile
@@ -0,0 +1,21 @@
# Use an official Python runtime as a parent image
FROM python:3.9-slim

WORKDIR /app

# Install Python dependencies
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

# Download the CSV from Google Drive and store it in the "content" directory
RUN apt-get update && apt-get install -y curl && \
mkdir content && \
curl -L 'https://drive.google.com/uc?export=download&id=13aDWCvj7PqFw7aPvPK_Qli3-Ei9mVwaO' -o content/data.csv && \
apt-get remove -y curl && apt-get autoremove -y && rm -rf /var/lib/apt/lists/*

# Copy the rest of the application code to the working directory
COPY . /app/
EXPOSE 8000

# Set the entrypoint for the container
CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
13 changes: 13 additions & 0 deletions src/search/tf_search/local/README copy.md
@@ -0,0 +1,13 @@
# Word Score

## Test Deployment

- Git clone the repo and cd to the project location.
- cd to `local`, i.e., `cd ./src/search/tf_search/local`.
- Replace the link in the Dockerfile with a downloadable CSV file of your choice; the CSV must include the `heading`, `content`, and `chunkId` columns that `api.py` and `model.py` read.
- Start your Docker engine and run `docker build -t word_score .`.
- Run `docker run -p 8000:8000 word_score`.
- Test with `curl -X POST -H "Content-Type: application/json" -d '{"query": "leave policy planned leaves", "k": "6"}' http://localhost:8000/`.
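The response is whatever `DataFrame.to_dict()` produces in `model.py`: one mapping per column (`content`, `count_score`, `chunk_id`), keyed by row index. For a quick smoke test without curl, a minimal Python client sketch along these lines should work (it assumes the container above is running on `localhost:8000` and uses `aiohttp`, which the service already depends on):

```python
import asyncio
import aiohttp

async def main():
    payload = {"query": "leave policy planned leaves", "k": "6"}
    async with aiohttp.ClientSession() as session:
        async with session.post("http://localhost:8000/", json=payload) as resp:
            result = await resp.json()
    # Row indices arrive as strings after the JSON round-trip
    for row, score in result["count_score"].items():
        print(score, result["chunk_id"][row], result["content"][row][:60])

asyncio.run(main())
```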


Empty file.
2 changes: 2 additions & 0 deletions src/search/tf_search/local/__init__.py
@@ -0,0 +1,2 @@
from .request import *
from .model import *
58 changes: 58 additions & 0 deletions src/search/tf_search/local/api.py
@@ -0,0 +1,58 @@
from model import Model
from request import ModelRequest
from quart import Quart, request
import aiohttp
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

app = Quart(__name__)

# Global variable for the dataframe
df = None
vectorizer = None
content_matrix = None

# Load the first CSV from ./content at import time (startup() reloads it below)
files = os.listdir("./content")
df = pd.read_csv(os.path.join("./content", files[0]))
print(df.columns)


@app.before_serving
async def startup():
    app.client = aiohttp.ClientSession()

    # Load the dataframe during startup
    global df
    global content_matrix
    global vectorizer
    files = os.listdir("./content")
    df = pd.read_csv(os.path.join("./content", files[0]))
    print(df.columns)

    # Initialize a CountVectorizer over unigrams and bigrams, with binary counts and English stop words removed
    vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 2), binary=True, stop_words='english')

    # Fit the vectorizer on heading + content, concatenated per row (no separator)
    content_matrix = vectorizer.fit_transform(df['heading'] + df['content'])
    print("Type of vectorizer:", type(vectorizer))

@app.route('/', methods=['POST'])
async def translate():
    global vectorizer, content_matrix
    data = await request.get_json()
    req = ModelRequest(**data)

    print("Inside translate function, type of vectorizer:", type(vectorizer))

    # Pass the dataframe as an argument to the Model class
    model = Model(df, content_matrix, vectorizer, req)
    return await model.inference(req)

@app.route('/', methods=['GET'])
async def hi():
    # df.columns is a pandas Index, which is not JSON-serializable; return a plain list
    return {'columns': list(df.columns)}


if __name__ == "__main__":
    app.run(debug=True, port=8000)
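Because the vectorizer is binary and `linear_kernel` is a plain dot product, a document's score is simply the number of distinct query n-grams (unigrams and bigrams, stop words removed) it shares with the document. A minimal standalone sketch of the same scoring, with made-up documents:

```python
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel

docs = pd.Series([
    "leave policy for planned leaves",
    "expense reimbursement policy",
])
vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 2), binary=True, stop_words='english')
matrix = vectorizer.fit_transform(docs)

query_vector = vectorizer.transform(["planned leave policy"])
# With binary=True, the dot product counts shared n-gram features:
# doc 1 shares 'planned', 'leave', 'policy', and 'leave policy' with the query; doc 2 shares only 'policy'
print(linear_kernel(query_vector, matrix).flatten())  # [4. 1.]
```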
40 changes: 40 additions & 0 deletions src/search/tf_search/local/model.py
@@ -0,0 +1,40 @@
from request import ModelRequest

import pandas as pd
from cache import AsyncTTL
from sklearn.metrics.pairwise import linear_kernel


class Model:
    def __init__(self, df1, matrix, vec, req: ModelRequest):
        self.df = df1
        self.content_matrix = matrix
        self.vectorizer = vec

        print("Inside Model's constructor, type of vectorizer:", type(self.vectorizer))

    @AsyncTTL(time_to_live=600000, maxsize=1024)
    async def inference(self, request: ModelRequest):
        query = [request.query]
        query_vector = self.vectorizer.transform(query)
        k = int(request.k)  # k is the number of top-scoring results to return

        # Dot product of the binary query vector against every document vector
        count_scores = linear_kernel(query_vector, self.content_matrix).flatten()

        # Create a new DataFrame with content, count scores, and chunk ids
        result_df = pd.DataFrame({
            'content': self.df['content'],
            'count_score': count_scores,
            'chunk_id': self.df['chunkId']
        })

        # Sort by count score in descending order and keep the top k rows
        sorted_df = result_df.sort_values(by='count_score', ascending=False)
        sorted_df = sorted_df.head(k)

        return sorted_df.to_dict()
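For reference, a standalone sketch of driving `Model` outside Quart (hypothetical toy data; the column names mirror what `api.py` expects):

```python
import asyncio

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

from model import Model
from request import ModelRequest

df = pd.DataFrame({
    'heading': ['Leave policy ', 'Expenses '],
    'content': ['planned leaves need approval', 'submit receipts monthly'],
    'chunkId': [1, 2],
})
vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 2), binary=True, stop_words='english')
# As in api.py, heading and content are concatenated per row before fitting
content_matrix = vectorizer.fit_transform(df['heading'] + df['content'])

req = ModelRequest(query='planned leaves', k=1)
model = Model(df, content_matrix, vectorizer, req)
# Prints a dict of column -> {row index: value}, e.g. content, count_score, chunk_id
print(asyncio.run(model.inference(req)))
```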

12 changes: 12 additions & 0 deletions src/search/tf_search/local/request.py
@@ -0,0 +1,12 @@
import json


class ModelRequest:
    def __init__(self, query, k=4):
        self.query = query
        self.k = k

    def to_json(self):
        return json.dumps(self, default=lambda o: o.__dict__,
                          sort_keys=True, indent=4)
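For illustration, `to_json` serializes the request's `__dict__` with sorted keys and 4-space indentation:

```python
req = ModelRequest(query="leave policy", k=6)
print(req.to_json())
# {
#     "k": 6,
#     "query": "leave policy"
# }
```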
5 changes: 5 additions & 0 deletions src/search/tf_search/local/requirements.txt
@@ -0,0 +1,5 @@
quart
aiohttp
async-cache==1.1.1
pandas
scikit-learn
