Feature implementation: Add support for Multiple PDF files
sarthak247 committed Jul 16, 2024
1 parent 2e01038 commit d891b91
Showing 3 changed files with 66 additions and 26 deletions.
23 changes: 14 additions & 9 deletions app.py
@@ -3,12 +3,12 @@
Description: Implements functions and methods needed for interacting with NekoPDF
Run: streamlit run app.py
"""
- import json
import streamlit as st
from dotenv import load_dotenv
from streamlit_extras.add_vertical_space import add_vertical_space
- from utils import read_pdf, split_into_chunks
import requests
+ import json
+ from utils import read_pdf, split_into_chunks, get_store_name

# Favicon and Title
st.set_page_config(page_title="NekoPDF 📖 - Chat with PDF",
@@ -42,12 +42,13 @@ def main():
k = st.slider('Top K', 1, 5, 1)

# Upload PDF File
- pdf = st.file_uploader("Upload your PDF", type = 'pdf')
+ pdfs = st.file_uploader("Upload your PDF", type = 'pdf', accept_multiple_files=True)
+ store_name = get_store_name(pdfs)

# Read PDF
- if pdf is not None:
+ if pdfs:
# Read PDF content
- content = read_pdf(pdf)
+ content = read_pdf(pdfs)

# Build chunks of text
chunks = split_into_chunks(content)
@@ -56,17 +57,21 @@ def main():
query = st.text_input("Ask questions about your PDF File: ")
if option == 'GPT 3.5 - Turbo':
# Check for existing store or create new one
- store_name = pdf.name[:-4] + '.openai.faiss'
+ store_name = store_name + 'openai.faiss'
payload = {'chunks': chunks, 'store_name' : store_name, 'query' : query, 'k': k}
if query:
- response = requests.post(url='http://127.0.0.1:8000/qa/openai', data = json.dumps(payload))
+ response = requests.post(url='http://127.0.0.1:8000/qa/openai',
+ data = json.dumps(payload),
+ timeout=120)
st.write(response.text)
elif option == 'LLama 2 7B':
# Check for existing store or create one
- store_name = pdf.name[:-4] + '.llama.faiss'
+ store_name = store_name + 'llama.faiss'
payload = {'chunks' : chunks, 'store_name' : store_name, 'query' : query, 'k' : k}
if query:
- response = requests.post(url='http://127.0.0.1:8000/qa/llama', data = json.dumps(payload))
+ response = requests.post(url='http://127.0.0.1:8000/qa/llama',
+ data = json.dumps(payload),
+ timeout=120)
st.write(response.text)

if __name__ == '__main__':
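A note on the request style above: sending data=json.dumps(payload) works because FastAPI parses the raw request body, but requests can serialize the dict itself through its json parameter, which also sets the Content-Type header. A minimal sketch of the equivalent call, with placeholder chunks, store name, and query:

import requests

# Placeholder payload with the same shape app.py builds
payload = {'chunks': ['first chunk of text', 'second chunk of text'],
           'store_name': 'alpha.beta.openai.faiss',
           'query': 'What do these documents cover?',
           'k': 1}

# json= serializes the dict and sets Content-Type: application/json,
# so the explicit json.dumps call becomes unnecessary
response = requests.post('http://127.0.0.1:8000/qa/openai',
                         json=payload,
                         timeout=120)
print(response.text)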
41 changes: 31 additions & 10 deletions main.py
@@ -1,29 +1,50 @@
"""
Filename: main.py
Description: Implements endpoints for NekoPDF frontend to interact with
Run: uvicorn main:app --reload
"""
from typing import List
from fastapi import FastAPI
from pydantic import BaseModel
from llama_chat import get_llama_embeddings, get_llama_answers
from typing import List
from openai_chat import get_openai_answers, get_openai_embeddings

app = FastAPI()

class QA(BaseModel):
"""
QA Class used by methods for accepting and validating HTTP data transfer
Parameters:
chunks: Chunks of text to embed
store_name: Name of the vectorstore from which to load or save to
query: Query to be run on the vectorstore
k: Top K similarity search
"""
chunks : List
store_name : str
query : str
k: int

@app.post('/qa/openai')
- def openai_response(input: QA):
- vectorstore = get_openai_embeddings(input.chunks, input.store_name)
+ def openai_response(inputs: QA):
+ """
+ Parameters:
+ inputs: Instance of QA class used to accept inputs for generating response
+ Returns: Response generated by OpenAI
+ """
+ vectorstore = get_openai_embeddings(inputs.chunks, inputs.store_name)
- if input.query:
+ if inputs.query:
- response = get_openai_answers(vectorstore, input.query, input.k)
+ response = get_openai_answers(vectorstore, inputs.query, inputs.k)
return response

@app.post('/qa/llama')
- def llama_response(input: QA):
- vectorstore = get_llama_embeddings(input.chunks, input.store_name)
+ def llama_response(inputs: QA):
+ """
+ Parameters:
+ inputs: Instance of QA class used to accept inputs for generating response
+ Returns: Response generated by Llama
+ """
+ vectorstore = get_llama_embeddings(inputs.chunks, inputs.store_name)
- if input.query:
+ if inputs.query:
- response = get_llama_answers(vectorstore, input.query, input.k)
+ response = get_llama_answers(vectorstore, inputs.query, inputs.k)
return response
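For reference, these endpoints can be exercised without a live server through FastAPI's TestClient. A minimal sketch, assuming main.py and its embedding backends are importable and configured; the payload values are placeholders matching the QA model's fields:

from fastapi.testclient import TestClient
from main import app

client = TestClient(app)

# Placeholder body; field names mirror the QA model above
payload = {
    'chunks': ['first chunk of text', 'second chunk of text'],
    'store_name': 'alpha.beta.openai.faiss',
    'query': 'What do these documents cover?',
    'k': 1,
}
response = client.post('/qa/openai', json=payload)
print(response.text)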


28 changes: 21 additions & 7 deletions utils.py
@@ -6,17 +6,17 @@
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter

- def read_pdf(pdf):
+ def read_pdf(pdfs):
"""
Parameters:
- - pdf: path to the PDF file
+ - pdfs: list of uploaded PDF files
Return: Returns the combined text content of the PDF files
"""
- pdf_reader = PdfReader(pdf)
-
content = ""
- for page in pdf_reader.pages:
- content += page.extract_text()
+ for pdf in pdfs:
+ pdf_reader = PdfReader(pdf)
+ for page in pdf_reader.pages:
+ content += page.extract_text()
return content

def split_into_chunks(content):
@@ -31,4 +31,18 @@ def split_into_chunks(content):
length_function = len
)
chunks = text_splitter.split_text(text = content)
- return chunks
\ No newline at end of file
+ return chunks

+ def get_store_name(pdfs):
+ """
+ Construct a unique store name for the uploaded PDF files. The files are sorted by
+ name so that the store name stays the same for the same set of files.
+ Parameters:
+ pdfs: list of uploaded PDF files
+ Return: store name built from the sorted file names
+ """
+ pdfs = sorted(pdfs, key = lambda file: file.name)
+ store_name = ''
+ for pdf in pdfs:
+ store_name += (pdf.name[:-4] + '.')
+ return store_name
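To illustrate the determinism the sorting provides, a small sketch with stand-in upload objects (get_store_name only reads the .name attribute; the real inputs are Streamlit UploadedFile objects):

from types import SimpleNamespace

from utils import get_store_name

# Stand-ins for Streamlit's UploadedFile; only .name is accessed
alpha = SimpleNamespace(name='alpha.pdf')
beta = SimpleNamespace(name='beta.pdf')

print(get_store_name([alpha, beta]))  # alpha.beta.
print(get_store_name([beta, alpha]))  # alpha.beta. (upload order does not matter)

app.py then appends a backend suffix, so the two models keep separate stores: 'alpha.beta.openai.faiss' and 'alpha.beta.llama.faiss'.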
