diff --git a/.gitignore b/.gitignore
index b3ca752..b32a269 100644
--- a/.gitignore
+++ b/.gitignore
@@ -159,3 +159,7 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+# Streamlit
+.DS_Store
+uploaded_files/
\ No newline at end of file
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 04c689a..650896b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -5,7 +5,7 @@ helps, and credit will always be given.
 
 ## Get Started!
 
-Ready to contribute? Here's how to set up `genai_stack` for local development.
+Ready to contribute? Here's how to set up `beyondllm` for local development.
 
 1. Fork the `beyondllm` repo on GitHub.
 2. Clone your fork locally
@@ -71,8 +71,8 @@ and "help wanted" is open to whoever wants to implement it.
 
 **Write Documentation**
 
-GenAI Stack could always use more documentation, whether as part of the
-official GenAI Stack docs, in docstrings, or even on the web in blog posts,
+BeyondLLM could always use more documentation, whether as part of the
+official BeyondLLM docs, in docstrings, or even on the web in blog posts,
 articles, and such.
 
 **Submit Feedback**
diff --git a/examples/streamlit_app/app.py b/examples/streamlit_app/app.py
new file mode 100644
index 0000000..7e8ca35
--- /dev/null
+++ b/examples/streamlit_app/app.py
@@ -0,0 +1,91 @@
+import os
+
+import streamlit as st
+
+from beyondllm import generator
+from beyondllm.llms import GeminiModel
+from ingest import get_retriever
+
+st.title("Chat with CSV file")
+
+st.text("Enter Google API Key")
+google_api_key = st.text_input("Google API Key:", type="password")
+os.environ['GOOGLE_API_KEY'] = google_api_key
+
+vectordb_options = ['Chroma', 'Pinecone']
+with st.sidebar:
+    st.title("VectorDB Options")
+    vectordb_type = st.selectbox("Select VectorDB Type",
+                                 vectordb_options,
+                                 index=0)
+
+    if vectordb_type == 'Pinecone':
+        # Get the Pinecone API key and index name
+        pinecone_api_key = st.text_input("Pinecone API Key:",
+                                         type="password")
+        pinecone_index_name = st.text_input("Pinecone Index Name:")
+
+        # Choose whether to use an existing index or create a new one
+        st.subheader("Pinecone Options")
+        pinecone_option = st.radio("Choose Option",
+                                   ('Existing', 'Create New'),
+                                   index=0)
+
+        # Use the default embedding dimension or specify your own
+        pinecone_embedding_dim = st.number_input("Embedding Dimension",
+                                                 min_value=1,
+                                                 max_value=2048,
+                                                 value=768)
+
+        # Use the default metric or choose between "cosine" and "euclidean"
+        pinecone_metric = st.selectbox("Metric",
+                                       ["cosine", "euclidean"],
+                                       index=0)
+
+        # Use the default cloud or specify your own
+        pinecone_cloud = st.selectbox("Cloud",
+                                      ["aws", "gcp", "azure"],
+                                      index=0)
+
+        # Name of the region you want to use
+        pinecone_region = st.text_input("Region:")
+
+if google_api_key:
+    st.success("Google API Key entered successfully!")
+    uploaded_file = st.file_uploader("Choose a CSV file", type=['csv'])
+    if uploaded_file is not None:
+        st.success("File uploaded successfully!")
+        question = st.text_input("Enter your question")
+
+    if uploaded_file is not None and question:
+        # Get the retriever
+        if vectordb_type == 'Pinecone':
+            retriever = get_retriever(uploaded_file,
+                                      google_api_key,
+                                      vector_db=vectordb_type.lower(),
+                                      pinecone_api_key=pinecone_api_key,
+                                      pinecone_index_name=pinecone_index_name,
+                                      pinecone_option=pinecone_option,
+                                      pinecone_embedding_dim=pinecone_embedding_dim,
+                                      pinecone_metric=pinecone_metric,
+                                      pinecone_cloud=pinecone_cloud,
+                                      pinecone_region=pinecone_region)
+        elif vectordb_type == 'Chroma':
+            retriever = get_retriever(uploaded_file,
+                                      google_api_key,
+                                      vector_db=vectordb_type.lower())
+        # Initialize the LLM
+        llm = GeminiModel(model_name="gemini-pro",
+                          google_api_key=os.environ.get('GOOGLE_API_KEY'))
+        # Initialize the system prompt
+        system_prompt = "You are an AI assistant who answers questions based on uploaded CSV files. You can answer anything about the data."
+        # Initialize the generator
+        pipeline = generator.Generate(question=question,
+                                      retriever=retriever,
+                                      llm=llm,
+                                      system_prompt=system_prompt)
+        # Generate the response
+        response = pipeline.call()
+        # Display the response
+        st.write(response)
+
diff --git a/examples/streamlit_app/ingest.py b/examples/streamlit_app/ingest.py
new file mode 100644
index 0000000..f4351f1
--- /dev/null
+++ b/examples/streamlit_app/ingest.py
@@ -0,0 +1,49 @@
+import os
+from beyondllm.retrieve import auto_retriever
+from beyondllm.vectordb import ChromaVectorDb, PineconeVectorDb
+from beyondllm.embeddings import GeminiEmbeddings
+from beyondllm import source
+
+def get_retriever(uploaded_file, google_api_key, vector_db='chroma', pinecone_api_key=None, pinecone_index_name=None, pinecone_option=None, pinecone_embedding_dim=None, pinecone_metric=None, pinecone_cloud=None, pinecone_region=None):
+    if google_api_key:
+        # Save the uploaded file
+        save_path = "./uploaded_files"  # change this to your desired path or leave it as is
+        if not os.path.exists(save_path):
+            os.makedirs(save_path)
+        file_path = os.path.join(save_path, uploaded_file.name)
+        with open(file_path, "wb") as f:
+            f.write(uploaded_file.getbuffer())
+
+        # Fit the data
+        data = source.fit(file_path, dtype="csv",
+                          chunk_size=512,
+                          chunk_overlap=50)
+        # Initialize your embedding model
+        embed_model = GeminiEmbeddings(api_key=google_api_key,
+                                       model_name="models/embedding-001")
+        # Initialize your vector store
+        if vector_db == 'chroma':
+            vector_store = ChromaVectorDb(collection_name='my_persistent_collection',  # change this to your desired collection name
+                                          persist_directory='./db/chroma/')
+        elif vector_db == 'pinecone':
+            if pinecone_option == 'Existing':
+                # Initialize an existing Pinecone index
+                vector_store = PineconeVectorDb(api_key=pinecone_api_key,
+                                                index_name=pinecone_index_name)
+            else:
+                # Create a new serverless Pinecone index
+                vector_store = PineconeVectorDb(
+                    create=True,
+                    api_key=pinecone_api_key,
+                    index_name=pinecone_index_name,
+                    embedding_dim=pinecone_embedding_dim,
+                    metric=pinecone_metric,
+                    cloud=pinecone_cloud,
+                    region=pinecone_region,
+                )
+        # Initialize the retriever
+        retriever = auto_retriever(data=data, embed_model=embed_model, type="normal", top_k=5, vectordb=vector_store)
+
+        return retriever
+
+    return None
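For reference, the retrieval-and-generation flow that app.py and ingest.py wire together can be exercised outside Streamlit. The sketch below reuses only the beyondllm calls that appear in the diff above; the ./data.csv path, the example question, and the assumption that GOOGLE_API_KEY is already exported are illustrative placeholders, not part of this PR:

    # Standalone sketch of the pipeline from app.py/ingest.py (hypothetical usage).
    import os

    from beyondllm import generator, source
    from beyondllm.embeddings import GeminiEmbeddings
    from beyondllm.llms import GeminiModel
    from beyondllm.retrieve import auto_retriever
    from beyondllm.vectordb import ChromaVectorDb

    # Placeholder: export GOOGLE_API_KEY before running.
    google_api_key = os.environ["GOOGLE_API_KEY"]

    # Chunk and embed the CSV, as ingest.py does ("./data.csv" is a placeholder).
    data = source.fit("./data.csv", dtype="csv", chunk_size=512, chunk_overlap=50)
    embed_model = GeminiEmbeddings(api_key=google_api_key,
                                   model_name="models/embedding-001")
    vector_store = ChromaVectorDb(collection_name="my_persistent_collection",
                                  persist_directory="./db/chroma/")
    retriever = auto_retriever(data=data, embed_model=embed_model,
                               type="normal", top_k=5, vectordb=vector_store)

    # Answer a single question, as app.py does on each Streamlit interaction.
    llm = GeminiModel(model_name="gemini-pro", google_api_key=google_api_key)
    pipeline = generator.Generate(question="Which columns does the file contain?",
                                  retriever=retriever,
                                  llm=llm,
                                  system_prompt="You are an AI assistant who answers "
                                                "questions based on uploaded CSV files.")
    print(pipeline.call())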