From f4512ce0954d9aca50a12a62c5ffd6ff00c7e62d Mon Sep 17 00:00:00 2001 From: ARYA CHAKRABORTY Date: Thu, 9 May 2024 10:10:22 +0530 Subject: [PATCH 1/3] added streamlit app in the examples folder and modified .gitignore --- .gitignore | 3 ++ examples/streamlit_app/app.py | 68 ++++++++++++++++++++++++++++++++ examples/streamlit_app/ingest.py | 49 +++++++++++++++++++++++ 3 files changed, 120 insertions(+) create mode 100644 examples/streamlit_app/app.py create mode 100644 examples/streamlit_app/ingest.py diff --git a/.gitignore b/.gitignore index b3ca752..c28837a 100644 --- a/.gitignore +++ b/.gitignore @@ -159,3 +159,6 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + +# Streamlit +.DS_Store \ No newline at end of file diff --git a/examples/streamlit_app/app.py b/examples/streamlit_app/app.py new file mode 100644 index 0000000..2ae7ca0 --- /dev/null +++ b/examples/streamlit_app/app.py @@ -0,0 +1,68 @@ +import os +import streamlit as st +from beyondllm import generator + +from beyondllm.llms import GeminiModel +from beyondllm import generator +from ingest import get_retriever + +st.title("Chat with CSV file") + +st.text("Enter Google API Key") +google_api_key = st.text_input("Google API Key:", type="password") +os.environ['GOOGLE_API_KEY'] = google_api_key + +vectordb_options = ['Chroma', 'Pinecone'] +with st.sidebar: + st.title("VectorDB Options") + vectordb_type = st.selectbox("Select VectorDB Type", + vectordb_options, + index=0) + + if vectordb_type == 'Pinecone': + # get the pinecone api key and index name + pinecone_api_key = st.text_input("Pinecone API Key:", + type="password") + pinecone_index_name = st.text_input("Pinecone Index Name:") + + # choose whether to use an existing index or create a new one + st.subheader("Pinecone Options") + pinecone_option = st.radio("Choose Option", + ('Existing', 'Create New'), + index=0) + +if google_api_key: + st.success("Google API Key entered successfully!") + uploaded_file = st.file_uploader("Choose a CSV file", type=['csv']) + if uploaded_file is not None: + st.success("file uploaded successfully!") + question = st.text_input("Enter your question") + + if uploaded_file is not None and question: + # Get the retriever + if vectordb_type == 'Pinecone': + retriever = get_retriever(uploaded_file, + google_api_key, + vector_db=vectordb_type.lower(), + pinecone_api_key=pinecone_api_key, + pinecone_index_name=pinecone_index_name, + pinecone_option=pinecone_option) + elif vectordb_type == 'Chroma': + retriever = get_retriever(uploaded_file, + google_api_key, + vector_db=vectordb_type.lower()) + # Initialize the LLM + llm = GeminiModel(model_name="gemini-pro", + google_api_key = os.environ.get('GOOGLE_API_KEY')) + # Initialize the system prompt + system_prompt = "You are an AI assistant, who answers questions based on uploaded csv files. You can answer anything about the data." + # Initialize the generator + pipeline = generator.Generate(question=question, + retriever=retriever, + llm=llm, + system_prompt=system_prompt) + # Generate the response + response = pipeline.call() + # display the response + st.write(response) + diff --git a/examples/streamlit_app/ingest.py b/examples/streamlit_app/ingest.py new file mode 100644 index 0000000..9269ffe --- /dev/null +++ b/examples/streamlit_app/ingest.py @@ -0,0 +1,49 @@ +import os +from beyondllm.retrieve import auto_retriever +from beyondllm.vectordb import ChromaVectorDb, PineconeVectorDb +from beyondllm.embeddings import GeminiEmbeddings +from beyondllm import source + +def get_retriever(uploaded_file, google_api_key, vector_db='chroma', pinecone_api_key=None, pinecone_index_name=None, pinecone_option=None): + if google_api_key: + # Save the uploaded file + save_path = "./uploaded_files" # change this to your desired path or leave it as is + if not os.path.exists(save_path): + os.makedirs(save_path) + file_path = os.path.join(save_path, uploaded_file.name) + with open(file_path, "wb") as f: + f.write(uploaded_file.getbuffer()) + + # Fit the data + data = source.fit(file_path, dtype="csv", + chunk_size=512, + chunk_overlap=50) + # Initialize your embedding model + embed_model = GeminiEmbeddings(api_key=google_api_key, + model_name="models/embedding-001") + # Initialize your vector store + if vector_db == 'chroma': + vector_store = ChromaVectorDb(collection_name='my_persistent_collection', # change this to your desired collection name + persist_directory='./db/chroma/') + elif vector_db == 'pinecone': + if pinecone_option == 'Existing': + # Initialize an existing Pinecone index + vector_store = PineconeVectorDb(api_key=pinecone_api_key, + index_name=pinecone_index_name) + else: + # Create a new serverless Pinecone index + vector_store = PineconeVectorDb( + create=True, + api_key=pinecone_api_key, + index_name=pinecone_index_name, + embedding_dim=768, + metric="cosine", + cloud="aws", + region="us-east-1", + ) + # Initialize the retriever + retriever = auto_retriever(data=data, embed_model=embed_model, type="normal", top_k=5, vectordb=vector_store) + + return retriever + + return None From 436f88cce5a6c84b24f68a8dcbd00c33a2d5aafc Mon Sep 17 00:00:00 2001 From: ARYA CHAKRABORTY Date: Thu, 9 May 2024 20:04:27 +0530 Subject: [PATCH 2/3] added custom options for pinecone and updated .gitignore file again --- .gitignore | 3 ++- examples/streamlit_app/app.py | 25 ++++++++++++++++++++++++- examples/streamlit_app/ingest.py | 10 +++++----- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index c28837a..b32a269 100644 --- a/.gitignore +++ b/.gitignore @@ -161,4 +161,5 @@ cython_debug/ #.idea/ # Streamlit -.DS_Store \ No newline at end of file +.DS_Store +uploaded_files/ \ No newline at end of file diff --git a/examples/streamlit_app/app.py b/examples/streamlit_app/app.py index 2ae7ca0..7e8ca35 100644 --- a/examples/streamlit_app/app.py +++ b/examples/streamlit_app/app.py @@ -30,6 +30,25 @@ pinecone_option = st.radio("Choose Option", ('Existing', 'Create New'), index=0) + + # choose whether you want to use the default embedding dimension or specify your own + pinecone_embedding_dim = st.number_input("Embedding Dimension", + min_value=1, + max_value=2048, + value=768) + + # choose whether you want to use the default metric or specify your own, choose between "cosine" and "euclidean" + pinecone_metric = st.selectbox("Metric", + ["cosine", "euclidean"], + index=0) + + # choose whether you want to use the default cloud or specify your own + pinecone_cloud = st.selectbox("Cloud", + ["aws", "gcp", "azure"], + index=0) + + # put the name of the region you want to use + pinecone_region = st.text_input("Region:") if google_api_key: st.success("Google API Key entered successfully!") @@ -46,7 +65,11 @@ vector_db=vectordb_type.lower(), pinecone_api_key=pinecone_api_key, pinecone_index_name=pinecone_index_name, - pinecone_option=pinecone_option) + pinecone_option=pinecone_option, + pinecone_embedding_dim=pinecone_embedding_dim, + pinecone_metric=pinecone_metric, + pinecone_cloud=pinecone_cloud, + pinecone_region=pinecone_region) elif vectordb_type == 'Chroma': retriever = get_retriever(uploaded_file, google_api_key, diff --git a/examples/streamlit_app/ingest.py b/examples/streamlit_app/ingest.py index 9269ffe..f4351f1 100644 --- a/examples/streamlit_app/ingest.py +++ b/examples/streamlit_app/ingest.py @@ -4,7 +4,7 @@ from beyondllm.embeddings import GeminiEmbeddings from beyondllm import source -def get_retriever(uploaded_file, google_api_key, vector_db='chroma', pinecone_api_key=None, pinecone_index_name=None, pinecone_option=None): +def get_retriever(uploaded_file, google_api_key, vector_db='chroma', pinecone_api_key=None, pinecone_index_name=None, pinecone_option=None, pinecone_embedding_dim=None, pinecone_metric=None, pinecone_cloud=None, pinecone_region=None): if google_api_key: # Save the uploaded file save_path = "./uploaded_files" # change this to your desired path or leave it as is @@ -36,10 +36,10 @@ def get_retriever(uploaded_file, google_api_key, vector_db='chroma', pinecone_ap create=True, api_key=pinecone_api_key, index_name=pinecone_index_name, - embedding_dim=768, - metric="cosine", - cloud="aws", - region="us-east-1", + embedding_dim=pinecone_embedding_dim, + metric=pinecone_metric, + cloud=pinecone_cloud, + region=pinecone_region, ) # Initialize the retriever retriever = auto_retriever(data=data, embed_model=embed_model, type="normal", top_k=5, vectordb=vector_store) From 44827ef3475a841fd9370d74806aa70b85ce350b Mon Sep 17 00:00:00 2001 From: ARYA CHAKRABORTY Date: Mon, 13 May 2024 10:00:20 +0530 Subject: [PATCH 3/3] removed genai stack from contributing.md --- CONTRIBUTING.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 04c689a..650896b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -5,7 +5,7 @@ helps, and credit will always be given. ## Get Started! -Ready to contribute? Here's how to set up `genai_stack` for local development. +Ready to contribute? Here's how to set up `beyondllm` for local development. 1. Fork the `beyondllm` repo on GitHub. 2. Clone your fork locally @@ -71,8 +71,8 @@ and "help wanted" is open to whoever wants to implement it. **Write Documentation** -GenAI Stack could always use more documentation, whether as part of the -official GenAI Stack docs, in docstrings, or even on the web in blog posts, +BeyondLLM could always use more documentation, whether as part of the +official BeyondLLM docs, in docstrings, or even on the web in blog posts, articles, and such. **Submit Feedback**