From d891b91584c4633d964e0664264e346f77115909 Mon Sep 17 00:00:00 2001
From: sarthak247
Date: Wed, 17 Jul 2024 02:10:16 +0930
Subject: [PATCH] Add support for multiple PDF files

---
 app.py   | 23 ++++++++++++++---------
 main.py  | 45 +++++++++++++++++++++++++++++++++------------
 utils.py | 31 +++++++++++++++++++++++--------
 3 files changed, 70 insertions(+), 29 deletions(-)

diff --git a/app.py b/app.py
index 34beed3..b1616b2 100644
--- a/app.py
+++ b/app.py
@@ -3,12 +3,12 @@
 Description: Implements functions and methods needed for interacting with NekoPDF
 Run: streamlit run app.py
 """
+import json
 import streamlit as st
 from dotenv import load_dotenv
 from streamlit_extras.add_vertical_space import add_vertical_space
-from utils import read_pdf, split_into_chunks
 import requests
-import json
+from utils import read_pdf, split_into_chunks, get_store_name
 
 # Favicon and Title
 st.set_page_config(page_title="NekoPDF 📖 - Chat with PDF",
@@ -42,12 +42,13 @@
     k = st.slider('Top K', 1, 5, 1)
 
     # Upload PDF File
-    pdf = st.file_uploader("Upload your PDF", type = 'pdf')
+    pdfs = st.file_uploader("Upload your PDF", type = 'pdf', accept_multiple_files=True)
+    store_name = get_store_name(pdfs)
 
     # Read PDF
-    if pdf is not None:
+    if pdfs:
         # Read PDF content
-        content = read_pdf(pdf)
+        content = read_pdf(pdfs)
 
         # Build chunks of text
         chunks = split_into_chunks(content)
@@ -56,17 +57,21 @@
         query = st.text_input("Ask questions about your PDF File: ")
         if option == 'GPT 3.5 - Turbo':
             # Check for existing store or create new one
-            store_name = pdf.name[:-4] + '.openai.faiss'
+            store_name = store_name + 'openai.faiss'
             payload = {'chunks': chunks, 'store_name' : store_name, 'query' : query, 'k': k}
             if query:
-                response = requests.post(url='http://127.0.0.1:8000/qa/openai', data = json.dumps(payload))
+                response = requests.post(url='http://127.0.0.1:8000/qa/openai',
+                                         data = json.dumps(payload),
+                                         timeout=120)
                 st.write(response.text)
         elif option == 'LLama 2 7B':
             # Check for existing store or create one
-            store_name = pdf.name[:-4] + '.llama.faiss'
+            store_name = store_name + 'llama.faiss'
             payload = {'chunks' : chunks, 'store_name' : store_name, 'query' : query, 'k' : k}
             if query:
-                response = requests.post(url='http://127.0.0.1:8000/qa/llama', data = json.dumps(payload))
+                response = requests.post(url='http://127.0.0.1:8000/qa/llama',
+                                         data = json.dumps(payload),
+                                         timeout=120)
                 st.write(response.text)
 
 if __name__ == '__main__':
diff --git a/main.py b/main.py
index 8be9c0e..f37e02b 100644
--- a/main.py
+++ b/main.py
@@ -1,29 +1,50 @@
+"""
+Filename: main.py
+Description: Implements the endpoints the NekoPDF frontend interacts with
+Run: uvicorn main:app --reload
+"""
+from typing import List
 from fastapi import FastAPI
 from pydantic import BaseModel
 from llama_chat import get_llama_embeddings, get_llama_answers
-from typing import List
 from openai_chat import get_openai_answers, get_openai_embeddings
 
 app = FastAPI()
 
 class QA(BaseModel):
+    """
+    QA class used by the endpoints to accept and validate HTTP request data
+    Parameters:
+    chunks: Chunks of text to embed
+    store_name: Name of the vectorstore to load from or save to
+    query: Query to be run on the vectorstore
+    k: Top K results for the similarity search
+    """
     chunks : List
     store_name : str
     query : str
     k: int
 
 @app.post('/qa/openai')
-def openai_response(input: QA):
-    vectorstore = get_openai_embeddings(input.chunks, input.store_name)
-    if input.query:
-        response = get_openai_answers(vectorstore, input.query, input.k)
+def openai_response(inputs: QA):
+    """
+    Parameters:
+    inputs: Instance of QA class used to accept inputs for generating response
+    Returns: Response generated by OpenAI
+    """
+    vectorstore = get_openai_embeddings(inputs.chunks, inputs.store_name)
+    if inputs.query:
+        response = get_openai_answers(vectorstore, inputs.query, inputs.k)
     return response
-
+
 @app.post('/qa/llama')
-def llama_response(input: QA):
-    vectorstore = get_llama_embeddings(input.chunks, input.store_name)
-    if input.query:
-        response = get_llama_answers(vectorstore, input.query, input.k)
+def llama_response(inputs: QA):
+    """
+    Parameters:
+    inputs: Instance of QA class used to accept inputs for generating response
+    Returns: Response generated by Llama
+    """
+    vectorstore = get_llama_embeddings(inputs.chunks, inputs.store_name)
+    if inputs.query:
+        response = get_llama_answers(vectorstore, inputs.query, inputs.k)
     return response
-
-
\ No newline at end of file
diff --git a/utils.py b/utils.py
index a2b766c..9d20f55 100644
--- a/utils.py
+++ b/utils.py
@@ -6,17 +6,18 @@
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
-def read_pdf(pdf):
+def read_pdf(pdfs):
     """
     Parameters:
-    - pdf: path to the PDF file
-    Return: Returns the contents of the PDF file
+    - pdfs: list of uploaded PDF files
+    Return: Returns the combined contents of all PDF files
     """
-    pdf_reader = PdfReader(pdf)
-    content = ""
-    for page in pdf_reader.pages:
-        content += page.extract_text()
+    content = ""
+    for pdf in pdfs:
+        pdf_reader = PdfReader(pdf)
+        for page in pdf_reader.pages:
+            content += page.extract_text()
     return content
 
 def split_into_chunks(content):
     """
@@ -31,4 +32,18 @@
         length_function = len
     )
     chunks = text_splitter.split_text(text = content)
-    return chunks
\ No newline at end of file
+    return chunks
+
+def get_store_name(pdfs):
+    """
+    Construct a unique store name for the uploaded PDF files. The files are
+    sorted by name so the store name stays the same for the same set of files.
+    Parameters:
+    - pdfs: list of uploaded PDF files
+    Return: store name built from the sorted file names
+    """
+    pdfs = sorted(pdfs, key = lambda file: file.name)
+    store_name = ''
+    for pdf in pdfs:
+        store_name += (pdf.name[:-4] + '.')
+    return store_name
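
A quick way to sanity-check the multi-file flow introduced here is to call the API directly, without the Streamlit UI. The sketch below is a minimal example and not part of the patch: it assumes a local server started with `uvicorn main:app --reload`, the file names, chunk text, and query are placeholders, and the store name simply mirrors what get_store_name() in utils.py plus the 'openai.faiss' suffix in app.py would produce:

import json
import requests

# Placeholder chunks, standing in for split_into_chunks() output in utils.py
chunks = ["NekoPDF lets you chat with your PDF files.",
          "Upload one or more PDFs and ask questions about them."]

# Mirrors get_store_name(): names sorted, '.pdf' stripped, joined with '.',
# so the same set of files always maps to the same vectorstore
file_names = sorted(['notes.pdf', 'paper.pdf'])
store_name = ''.join(name[:-4] + '.' for name in file_names) + 'openai.faiss'
# -> 'notes.paper.openai.faiss'

payload = {'chunks': chunks, 'store_name': store_name,
           'query': 'What is NekoPDF?', 'k': 1}
response = requests.post('http://127.0.0.1:8000/qa/openai',
                         data=json.dumps(payload), timeout=120)
print(response.text)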