# flask_pinecone.py
import os

from dotenv import load_dotenv
from flask import Flask, request, render_template
from flask_cors import CORS
import pinecone

from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.vectorstores import Pinecone
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
app = Flask(__name__)
CORS(app, resources={r"/*": {"origins": "*"}})
app.config['SECRET_KEY'] = 'your-secret-key'
# Load environment variables from the .env file
load_dotenv('.env')
openai_api_key = os.environ.get('API_KEY')
hugging_face_api = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
pinecone_env = os.environ.get('PINECONE_API_ENV')
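
# A minimal .env sketch for the variables read above (values are placeholders,
# not real credentials):
#   API_KEY=sk-...
#   HUGGINGFACEHUB_API_TOKEN=hf_...
#   PINECONE_API_KEY=...
#   PINECONE_API_ENV=us-east-1-gcp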
# Prompt template: retrieved chunks fill {context}; the user's question fills {query}
template = """
Based on the provided context, answer the question under each of the headings below.
Context: {context}
Question: {query}
Answer under the following headings:
- Introduction
- Key Points (in bullets)
- Conclusion
Answer:
"""
# Prompt template object wired to the two input variables
script_template = PromptTemplate(
    input_variables=['query', 'context'],
    template=template,
)
# LLM shared by the prompt chain and the QA chain; the API key is passed
# explicitly because it is loaded from API_KEY rather than OPENAI_API_KEY
model_name = "gpt-3.5-turbo-16k"
llm = ChatOpenAI(temperature=0.8, model_name=model_name, max_tokens=5000,
                 openai_api_key=openai_api_key)
script_chain = LLMChain(llm=llm, prompt=script_template, output_key='Answer', verbose=True)
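# The chain is invoked later with both input variables at once, e.g.:
#   script_chain.run({'query': 'Summarize the document', 'context': docs})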
@app.route('/')
def main():
    return render_template("upload.html")
# Question-answering chain that "stuffs" all retrieved documents into one prompt
chain = load_qa_chain(llm, chain_type="stuff")

ALLOWED_EXTENSIONS = {'txt', 'pdf'}

def allowed_file(filename):
    return '.' in filename and \
        filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
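# Examples of the extension check:
#   allowed_file('report.pdf')  -> True
#   allowed_file('notes.txt')   -> True
#   allowed_file('image.png')   -> False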
@app.route('/upload', methods=['POST', 'GET'])
def upload():
    if request.method == 'POST':
        # Get the list of files submitted through the upload form
        files = request.files.getlist("file")
        documents = []
        # Save each uploaded file and load it with the loader matching its extension
        for file in files:
            if not allowed_file(file.filename):
                raise ValueError(f"Unsupported file type: {file.filename}")
            file.save(file.filename)
            _, file_extension = os.path.splitext(file.filename)
            if file_extension.lower() == '.pdf':
                # Load the PDF file with PyPDFLoader
                loader = PyPDFLoader(file.filename)
            else:
                # Load the text file with TextLoader
                loader = TextLoader(file.filename, encoding='utf8')
            documents.extend(loader.load())
        # Split the documents into overlapping chunks
        text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100)
        docs = text_splitter.split_documents(documents)
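        # With chunk_size=1000 and chunk_overlap=100, consecutive chunks share about
        # 100 characters, so answers that straddle a chunk boundary stay retrievable.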
        # Embeddings
        embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key, show_progress_bar=True)
        # Connect to the existing Pinecone project
        pinecone.init(
            api_key=pinecone_api_key,  # find at app.pinecone.io
            environment=pinecone_env,  # next to the API key in the console
        )
        index_name = "testing"
        namespace = 'docs'
        # Keep each chunk's text as metadata alongside its embedding
        meta = [{'text': str(doc.page_content)} for doc in docs]
        # Embed every chunk once and upsert (id, vector, metadata) tuples
        # into the selected namespace of the index
        index = pinecone.Index(index_name)
        vectors = embeddings.embed_documents([doc.page_content for doc in docs])
        to_upsert = [(f"doc-{i}", vectors[i], meta[i]) for i in range(len(docs))]
        index.upsert(vectors=to_upsert, namespace=namespace)
        # Wrap the populated index in a LangChain vector store for similarity search
        # (from_documents would embed and upsert everything a second time)
        index = Pinecone.from_existing_index(index_name, embeddings, namespace=namespace)
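        # Raw-client equivalent of a search against this index (a sketch, assuming
        # the pinecone v2 client API used above):
        #   pinecone.Index(index_name).query(
        #       vector=embeddings.embed_query("some question"),
        #       top_k=5, namespace=namespace, include_metadata=True)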
        def get_similar_docs(query, top_k=5, score=False):
            # The vector store embeds the query string itself before searching,
            # so the raw text is passed straight through
            if score:
                similar_docs = index.similarity_search_with_score(query, k=top_k)
            else:
                similar_docs = index.similarity_search(query, k=top_k)
            return similar_docs

        def get_answer(query):
            # Retrieve the most similar chunks and run the prompt chain over them
            similar_docs = get_similar_docs(query)
            inputs = {'query': query, 'context': similar_docs}
            answer = script_chain.run(inputs)
            return answer
        if request.form.get('go'):
            query = request.form.get('question')
            return get_answer(query)
    return render_template('question.html')

if __name__ == '__main__':
    app.run(debug=True)
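
# Example requests once the server is running (hypothetical filenames and the
# default Flask port; the question form must resubmit the file because the
# route re-indexes on every POST):
#   curl -F "file=@report.pdf" http://localhost:5000/upload
#   curl -F "file=@report.pdf" -F "go=1" -F "question=What is the conclusion?" \
#        http://localhost:5000/upload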