Feature implementation: Add support for Multiple PDF files #7

Merged 1 commit on Jul 16, 2024
23 changes: 14 additions & 9 deletions app.py
@@ -3,12 +3,12 @@
Description: Implements functions and methods needed for interacting with NekoPDF
Run: streamlit run app.py
"""
+import json
import streamlit as st
from dotenv import load_dotenv
from streamlit_extras.add_vertical_space import add_vertical_space
-from utils import read_pdf, split_into_chunks
import requests
-import json
+from utils import read_pdf, split_into_chunks, get_store_name

# Favicon and Title
st.set_page_config(page_title="NekoPDF 📖 - Chat with PDF",
@@ -42,12 +42,13 @@ def main():
    k = st.slider('Top K', 1, 5, 1)

    # Upload PDF files
-    pdf = st.file_uploader("Upload your PDF", type = 'pdf')
+    pdfs = st.file_uploader("Upload your PDF", type = 'pdf', accept_multiple_files=True)
+    store_name = get_store_name(pdfs)

    # Read PDFs (with accept_multiple_files=True, st.file_uploader returns a list,
    # so truthiness is the right emptiness check)
-    if pdf is not None:
+    if pdfs:
        # Read PDF content
-        content = read_pdf(pdf)
+        content = read_pdf(pdfs)

        # Build chunks of text
        chunks = split_into_chunks(content)
@@ -56,17 +57,21 @@ def main():
        query = st.text_input("Ask questions about your PDF file: ")
        if option == 'GPT 3.5 - Turbo':
            # Check for existing store or create a new one
-            store_name = pdf.name[:-4] + '.openai.faiss'
+            store_name = store_name + 'openai.faiss'
            payload = {'chunks': chunks, 'store_name' : store_name, 'query' : query, 'k': k}
            if query:
-                response = requests.post(url='http://127.0.0.1:8000/qa/openai', data = json.dumps(payload))
+                response = requests.post(url='http://127.0.0.1:8000/qa/openai',
+                                         data = json.dumps(payload),
+                                         timeout=120)
                st.write(response.text)
        elif option == 'LLama 2 7B':
            # Check for existing store or create a new one
-            store_name = pdf.name[:-4] + '.llama.faiss'
+            store_name = store_name + 'llama.faiss'
            payload = {'chunks' : chunks, 'store_name' : store_name, 'query' : query, 'k' : k}
            if query:
-                response = requests.post(url='http://127.0.0.1:8000/qa/llama', data = json.dumps(payload))
+                response = requests.post(url='http://127.0.0.1:8000/qa/llama',
+                                         data = json.dumps(payload),
+                                         timeout=120)
                st.write(response.text)

if __name__ == '__main__':
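For context, a minimal sketch of how the new store name is composed (the file names and the stand-in upload objects are hypothetical; Streamlit's real UploadedFile objects expose the same .name attribute):

# Sketch: mirrors utils.get_store_name from this PR. Uploads are sorted by
# name, the '.pdf' extension is stripped, and the base names are joined with
# dots, so the same set of files always maps to the same vectorstore name.
from types import SimpleNamespace

def get_store_name(pdfs):
    pdfs = sorted(pdfs, key=lambda file: file.name)
    store_name = ''
    for pdf in pdfs:
        store_name += (pdf.name[:-4] + '.')
    return store_name

uploads = [SimpleNamespace(name='b.pdf'), SimpleNamespace(name='a.pdf')]
print(get_store_name(uploads) + 'openai.faiss')  # -> a.b.openai.faiss

Because of the sort, upload order does not affect the store name, so an existing FAISS index is reused whenever the same set of files is uploaded again.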
41 changes: 31 additions & 10 deletions main.py
@@ -1,29 +1,50 @@
"""
Filename: main.py
Description: Implements endpoints for NekoPDF frontend to interact with
Run: uvicorn main:app --reload
"""
from typing import List
from fastapi import FastAPI
from pydantic import BaseModel
from llama_chat import get_llama_embeddings, get_llama_answers
from typing import List
from openai_chat import get_openai_answers, get_openai_embeddings

app = FastAPI()

class QA(BaseModel):
+    """
+    QA class used by the endpoints to accept and validate HTTP request data
+    Parameters:
+        chunks: chunks of text to embed
+        store_name: name of the vectorstore to load from or save to
+        query: query to be run on the vectorstore
+        k: top K for the similarity search
+    """
    chunks : List
    store_name : str
    query : str
    k: int

@app.post('/qa/openai')
-def openai_response(input: QA):
-    vectorstore = get_openai_embeddings(input.chunks, input.store_name)
+def openai_response(inputs: QA):
+    """
+    Parameters:
+        inputs: instance of the QA class carrying the inputs for generating a response
+    Returns: response generated by OpenAI
+    """
+    vectorstore = get_openai_embeddings(inputs.chunks, inputs.store_name)
    if inputs.query:
-        response = get_openai_answers(vectorstore, input.query, input.k)
+        response = get_openai_answers(vectorstore, inputs.query, inputs.k)
    return response

@app.post('/qa/llama')
-def llama_response(input: QA):
-    vectorstore = get_llama_embeddings(input.chunks, input.store_name)
+def llama_response(inputs: QA):
+    """
+    Parameters:
+        inputs: instance of the QA class carrying the inputs for generating a response
+    Returns: response generated by Llama
+    """
+    vectorstore = get_llama_embeddings(inputs.chunks, inputs.store_name)
    if inputs.query:
-        response = get_llama_answers(vectorstore, input.query, input.k)
+        response = get_llama_answers(vectorstore, inputs.query, inputs.k)
    return response


28 changes: 21 additions & 7 deletions utils.py
@@ -6,17 +6,17 @@
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter

-def read_pdf(pdf):
+def read_pdf(pdfs):
    """
    Parameters:
-        - pdf: path to the PDF file
+        - pdfs: the uploaded PDF files
    Return: Returns the combined contents of the PDF files
    """
-    pdf_reader = PdfReader(pdf)
-
    content = ""
-    for page in pdf_reader.pages:
-        content += page.extract_text()
+    for pdf in pdfs:
+        pdf_reader = PdfReader(pdf)
+        for page in pdf_reader.pages:
+            content += page.extract_text()
    return content

def split_into_chunks(content):
@@ -31,4 +31,18 @@ def split_into_chunks(content):
        length_function = len
    )
    chunks = text_splitter.split_text(text = content)
-    return chunks
\ No newline at end of file
+    return chunks

+def get_store_name(pdfs):
+    """
+    Construct a unique store name for the uploaded PDF files. The files are
+    sorted so that the store name stays the same for the same set of files.
+    Parameters:
+        pdfs: the uploaded PDF files
+    Return: store name built from the sorted file names
+    """
+    pdfs = sorted(pdfs, key = lambda file: file.name)
+    store_name = ''
+    for pdf in pdfs:
+        store_name += (pdf.name[:-4] + '.')
+    return store_name
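Putting the utils.py pieces together, a minimal sketch of the new multi-file flow (the file names are hypothetical; PdfReader accepts paths as well as file-like objects, so local paths stand in for Streamlit uploads here):

# Sketch: read two PDFs into a single string, then split it into chunks,
# just as app.py now does for multiple uploads. Run from the repo root so
# that utils.py is importable.
from utils import read_pdf, split_into_chunks

content = read_pdf(['report1.pdf', 'report2.pdf'])  # concatenated page text
chunks = split_into_chunks(content)                 # overlapping text chunks
print(f'{len(chunks)} chunks from {len(content)} characters')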