Added v3 prototype frontend and notebooks
yevkim committed Aug 6, 2024
1 parent bcedb91 commit a17458b
Showing 18 changed files with 8,367 additions and 0 deletions.
876 changes: 876 additions & 0 deletions notebooks/master_webscraping.ipynb

Large diffs are not rendered by default.

6,944 changes: 6,944 additions & 0 deletions notebooks/model-creation-transformer-laiss.ipynb

Large diffs are not rendered by default.

Binary file added schemesv3_prototype_frontend/.DS_Store
Binary file not shown.
14 changes: 14 additions & 0 deletions schemesv3_prototype_frontend/Dockerfile
@@ -0,0 +1,14 @@
FROM python:3.10.6-buster

WORKDIR /app

COPY requirements.txt .
RUN pip install -r requirements.txt

COPY . .

EXPOSE 8080

HEALTHCHECK CMD curl --fail http://localhost:8080/_stcore/health

ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8080", "--server.address=0.0.0.0"]
Binary file added schemesv3_prototype_frontend/Logo.webp
Binary file not shown.
62 changes: 62 additions & 0 deletions schemesv3_prototype_frontend/Makefile
@@ -0,0 +1,62 @@
##### Front end - - - - - - - - - - - - - - - - - - - - - - - - -

run_app_locally:
	streamlit run app.py

##### Docker - - - - - - - - - - - - - - - - - - - - - - - - -

docker_build_dev:
	docker build --tag=$(GAR_IMAGE):dev .

docker_run_dev:
	docker run -it -e PORT=8080 -e GOOGLE_APPLICATION_CREDENTIALS=/tmp/keys/FILE_NAME.json \
		-v $(GOOGLE_APPLICATION_CREDENTIALS):/tmp/keys/FILE_NAME.json:ro -p 8080:8080 --env-file .env $(GAR_IMAGE):dev

docker_run_dev_noenv:
	docker run -it -e PORT=8080 -e GOOGLE_APPLICATION_CREDENTIALS=/tmp/keys/FILE_NAME.json \
		-v $(GOOGLE_APPLICATION_CREDENTIALS):/tmp/keys/FILE_NAME.json:ro -p 8080:8080 $(GAR_IMAGE):dev

docker_run_dev_sh:
	docker run -it --entrypoint sh -e PORT=8501 -e GOOGLE_APPLICATION_CREDENTIALS=/tmp/keys/FILE_NAME.json \
		-v $(GOOGLE_APPLICATION_CREDENTIALS):/tmp/keys/FILE_NAME.json:ro \
		-p 8080:8080 $(GAR_IMAGE):dev

##### GCP - - - - - - - - - - - - - - - - - - - - - - - - -

build_gcr_image:
	docker build --platform linux/amd64 -t $(GCP_REGION)-docker.pkg.dev/$(GCP_PROJECT)/$(GAR_IMAGE)/$(GAR_IMAGE):prod .

gcr_new_ar:
	gcloud artifacts repositories create $(GAR_IMAGE) --repository-format=docker --location=$(GCP_REGION) --description="Repository for storing $(GAR_IMAGE) images"

push_gcr_image:
	docker push $(GCP_REGION)-docker.pkg.dev/$(GCP_PROJECT)/$(GAR_IMAGE)/$(GAR_IMAGE):prod

gcr_deploy:
	gcloud run deploy --image $(GCP_REGION)-docker.pkg.dev/$(GCP_PROJECT)/$(GAR_IMAGE)/$(GAR_IMAGE):prod --memory $(GAR_MEMORY) --region $(GCP_REGION) --env-vars-file .env.yaml

rebuild_and_deploy:
	make build_gcr_image
	make push_gcr_image
	make gcr_deploy

##### BigQuery - - - - - - - - - - - - - - - - - - - - - - - - -

create_bq_env:
	make create_bq_ds
	make create_bq_tables

create_bq_ds:
	bq mk --project_id $(GCP_PROJECT) --data_location $(BQ_REGION) $(BQ_DATASET)

create_bq_tables:
	bq mk --location=$(BQ_REGION) --schema=user_queries_schema.json $(BQ_DATASET).$(BQ_USER_QUERY_T)
	bq mk --location=$(BQ_REGION) --schema=chat_history_schema.json $(BQ_DATASET).$(BQ_CHAT_HISTORY_T)

delete_bq_ds:
	bq rm -r -f -d --project_id $(GCP_PROJECT) $(BQ_DATASET)

reset_bq:
	make delete_bq_ds
	make create_bq_ds
	make create_bq_tables
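
These targets read $(GCP_PROJECT), $(GAR_IMAGE), and the other variables from the environment; the Makefile never loads .env itself, so they presumably need to be exported in the shell first (for example via direnv). A typical deploy session might look like this, with all values as placeholders:

export GCP_PROJECT=my-project-id
export GCP_REGION=asia-southeast1
export GAR_IMAGE=schemesv2-app
export GAR_MEMORY=2Gi
make rebuild_and_deploy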
35 changes: 35 additions & 0 deletions schemesv3_prototype_frontend/README_frontend_docker.md
@@ -0,0 +1,35 @@
# How to set up the Docker image for the Schemes frontend

## Create the .env file
1. Ensure your terminal is inside the frontend directory!
2. Ensure python-dotenv is installed in your local Python environment
3. Use .env.sample as a template to create a .env file
4. Update the values in the .env file; these should already be set up from the ML Ops > Predict in Production exercise (a sketch of the expected variables follows this list)
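
The .env.sample file itself is not rendered in this diff. Judging from the variables the Makefile and the Python modules read, the .env presumably looks something like this; all values are placeholders:

GCP_PROJECT=my-project-id
GCP_REGION=asia-southeast1
GAR_IMAGE=schemesv2-app
GAR_MEMORY=2Gi
BQ_REGION=asia-southeast1
BQ_DATASET=schemes
BQ_USER_QUERY_T=user_queries
BQ_CHAT_HISTORY_T=chat_history
GOOGLE_API_KEY=your-gemini-api-key
GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json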

## Set up BigQuery
1. Assuming you have completed the ML Ops > Cloud Training exercise, the steps below should just work
2. Run `make create_bq_env` to create the BigQuery dataset and tables for Schemes

## Docker build
1. Ensure the Docker application is running on your computer
2. Run `make build_gcr_image` to build the Docker image (this can take a while)

## Set up a new artifact repository in GCP
1. If you already set up the artifact repository before, you can skip this step
2. Assuming you have completed the ML Ops > Predict in Production exercise, the steps below should just work
3. Create a new artifact repository in GCP for this project by running `make gcr_new_ar`
4. If that doesn't work, run this command directly in the terminal:
`gcloud artifacts repositories create $GAR_IMAGE --repository-format=docker --location=$GCP_REGION --description="Repository for storing $GAR_IMAGE images"`

## Push (upload) the Docker image to the GCP artifact repository
1. Ensure the Docker image has finished building
2. Run `make push_gcr_image` to push the Docker image to GCP

## Deploy the Docker image to Cloud Run
1. Ensure the Docker image has finished uploading
2. Create a .env.yaml file which contains all the .env variables (a sketch follows this list)
3. Run `make gcr_deploy` to deploy the image on Cloud Run
4. Go to "https://console.cloud.google.com/run" in your browser and click on schemesv2-app
5. On the schemesv2-app page you should see a URL; this is the URL for your app
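
`gcloud run deploy --env-vars-file` expects a YAML mapping rather than dotenv syntax. The app reads GCP_PROJECT, BQ_DATASET, BQ_USER_QUERY_T, BQ_CHAT_HISTORY_T, and GOOGLE_API_KEY at runtime, so a minimal .env.yaml might look like this; all values are placeholders:

GCP_PROJECT: "my-project-id"
BQ_DATASET: "schemes"
BQ_USER_QUERY_T: "user_queries"
BQ_CHAT_HISTORY_T: "chat_history"
GOOGLE_API_KEY: "your-gemini-api-key"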

ALL DONE!!!
3 binary files added (file names not rendered in this excerpt; contents not shown).
58 changes: 58 additions & 0 deletions schemesv3_prototype_frontend/app.py
@@ -0,0 +1,58 @@
import streamlit as st
from PIL import Image
import requests
from dotenv import load_dotenv
from chat import *
import pandas as pd
import os

# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./schemesv2-frontend-write-bq-6f2c58dec79d.json"


# Set page tab display
st.set_page_config(
    page_title="Schemes V2",
    page_icon='🔍',
    layout="centered",
    # layout="wide",
    # initial_sidebar_state="expanded",
)

# Title of the webpage
col1, col2 = st.columns([1, 3])
with col1:
    st.image("Logo.webp")
with col2:
    st.write("")
    st.title("Schemes V2", anchor=False)

st.markdown("""
Hello! I'm your virtual assistant here to help you find schemes in Singapore for financial aid, healthcare, and more. I aim to provide relevant information and guide you to the best resources. As a volunteer-powered AI, I ensure safe and respectful advice based on the provided scheme data. Please verify the details with official sources for accuracy.
""")

# Define a key for the text area widget
query_key = 'query_text'

# Initialize the query text in session state if not already present
if query_key not in st.session_state:
    st.session_state[query_key] = ""

# Display the text area with session state value
query = st.text_area(
    'Tell us the help you need. Don\'t give personal info.',
    value=st.session_state[query_key],
    key=query_key,
    placeholder='E.g. I am a dialysis patient in need of financial assistance and food support after being retrenched due to COVID 19'
)

rel_score_key = "rel_score_key"
if rel_score_key not in st.session_state:
    st.session_state[rel_score_key] = 25

rel_score = st.slider(
    'Show me schemes above relevance score of (0 - most lenient, 100 - most strict):',
    0, 100,
    value=st.session_state[rel_score_key],
    step=25
)
st.session_state[rel_score_key] = rel_score

if st.button(type="primary", label="🤖 Search & Chat", disabled=query == ""):
    st.switch_page("pages/chat_page.py")
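
app.py hands off to pages/chat_page.py, which is not rendered in this excerpt. Based on what the other modules expect (chatbot() takes a DataFrame with the Scheme, Agency, Description, Link, and 'Scraped Text' columns, and save_chat_history() reads st.session_state['query_id']), a minimal sketch of that page might look like the following. The search_schemes helper and the stub row it returns are assumptions, not part of this commit:

# pages/chat_page.py -- hypothetical sketch, not part of this diff
import streamlit as st
import pandas as pd

from bqlogic import save_query
from chat import chatbot

st.set_page_config(page_title="Schemes V2", page_icon='🔍')

query = st.session_state.get('query_text', '')
rel_score = st.session_state.get('rel_score_key', 25)


def search_schemes(query: str, rel_score: int) -> pd.DataFrame:
    # Hypothetical helper: the real page would call the search backend here.
    # Returning a stub row with the columns chat.dataframe_to_text() expects,
    # so the sketch runs end to end.
    return pd.DataFrame([{
        'Scheme': 'Sample Scheme',
        'Agency': 'Sample Agency',
        'Description': 'Placeholder description',
        'Link': 'https://example.com',
        'Scraped Text': 'Placeholder scraped text',
    }])


top_schemes = search_schemes(query, rel_score)

# Log the search once and keep its id for save_chat_history()
if 'query_id' not in st.session_state:
    st.session_state['query_id'] = save_query(query, top_schemes.to_dict('records'))

chatbot(top_schemes)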
69 changes: 69 additions & 0 deletions schemesv3_prototype_frontend/bqlogic.py
@@ -0,0 +1,69 @@
import os
from google.cloud import bigquery
import pandas as pd
from datetime import datetime, timezone
import uuid
import json

# Initialize a BigQuery client
project = os.getenv('GCP_PROJECT')
client = bigquery.Client(project=project)


def save_query(query_text, response):
    dataset_id = os.getenv('BQ_DATASET')
    table_id = os.getenv('BQ_USER_QUERY_T')
    table_ref = client.dataset(dataset_id).table(table_id)
    table = client.get_table(table_ref)  # Make an API request to fetch the table
    query_id = uuid.uuid4()
    json_string = json.dumps(response)  # Convert JSON to a string

    rows_to_insert = [
        {"user_id": str(query_id), "query_text": query_text, "query_timestamp": datetime.now(timezone.utc).isoformat(), "schemes_response": json_string}
    ]

    errors = client.insert_rows_json(table, rows_to_insert)  # Make an API request
    if errors == []:
        print("New rows have been added.")
    else:
        print("Encountered errors while inserting rows: {}".format(errors))

    return query_id


def save_chat_history(query_uuid, chat_text):
    dataset_id = os.getenv('BQ_DATASET')
    table_id = os.getenv('BQ_CHAT_HISTORY_T')
    full_table_id = f"{client.project}.{dataset_id}.{table_id}"

    # Ensure query_uuid is a string
    query_uuid_str = str(query_uuid)

    # Prepare the chat history JSON and ensure it is properly escaped
    chat_history_json = json.dumps(chat_text, ensure_ascii=False)

    # Construct a parameterized query
    query = f"""
    MERGE `{full_table_id}` T
    USING (SELECT @query_uuid AS query_uuid, @chat_text AS chat_text, @chat_timestamp AS chat_timestamp) S
    ON T.query_uuid = S.query_uuid
    WHEN MATCHED THEN
        UPDATE SET T.chat_text = CONCAT(T.chat_text, '\\n', S.chat_text), T.chat_timestamp = S.chat_timestamp
    WHEN NOT MATCHED THEN
        INSERT (query_uuid, chat_text, chat_timestamp) VALUES (query_uuid, chat_text, chat_timestamp)
    """

    # Set up the query parameters
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("query_uuid", "STRING", query_uuid_str),
            bigquery.ScalarQueryParameter("chat_text", "STRING", chat_history_json),
            bigquery.ScalarQueryParameter("chat_timestamp", "TIMESTAMP", datetime.now(timezone.utc).isoformat())
        ]
    )

    # Execute the query
    job = client.query(query, job_config=job_config)
    job.result()  # Wait for the query to finish

    print("Chat history saved or updated.")
119 changes: 119 additions & 0 deletions schemesv3_prototype_frontend/chat.py
@@ -0,0 +1,119 @@
import streamlit as st
import random
import time
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import HumanMessage, AIMessage
import os
from dotenv import load_dotenv
from cleantext import clean_scraped_text
from bqlogic import save_chat_history

load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")


def dataframe_to_text(df):
    # Convert each row of the top-schemes DataFrame into a text summary
    text_summary = ''
    for index, row in df.iterrows():
        cleanScrape = row['Scraped Text']
        sentence = clean_scraped_text(cleanScrape)

        text_summary += f"Scheme Name: {row['Scheme']}, Agency: {row['Agency']}, Description: {row['Description']}, Link: {row['Link']}, Scraped Text from website: {sentence}\n"
    return text_summary


# Streaming the responses
def response_generator(query, chat_history, topschemes):
    top_schemes_text = dataframe_to_text(topschemes)
    template = """
As a virtual assistant, I'm dedicated to helping user navigate through the available schemes. User has done initial search based on their needs and system has provided top schemes relevant to the search. Now, my job is to advise on the follow up user queries based on the schemes data available by analyzing user query and extracting relevant answers from the top scheme data. Top Schemes Information includes scheme name, agency, Link to website, and may include text directly scraped from scheme website.
In responding to user queries, I will adhere to the following principles:
1. **Continuity in Conversation**: Each new question may build on the ongoing conversation. I'll consider the chat history to ensure a coherent and contextual dialogue.
2. **Role Clarity**: My purpose is to guide user by leveraging the scheme information provided. My responses aim to advise based on this data, without undertaking any actions outside these confines.
3. **Language Simplicity**: I commit to using simple, accessible English, ensuring my responses are understandable to all users, without losing the essence or accuracy of the scheme information.
4. **Safety and Respect**: Maintaining a safe, respectful interaction is paramount. I will not entertain or generate harmful, disrespectful, or irrelevant content. Should any query diverge from the discussion on schemes, I will gently redirect the focus back to how I can assist with scheme-related inquiries.
5. **Avoidance of Fabrication**: My responses will solely rely on the information from the scheme details provided, avoiding any speculative or unfounded content. I will not alter or presume any specifics not clearly indicated in the scheme descriptions.
**User Query:**
{user_question}
**Chat History:**
{chat_history}
**Top Schemes Information:**
""" + top_schemes_text

    # print(template)

    prompt = ChatPromptTemplate.from_template(template)
    llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro-latest", google_api_key=GOOGLE_API_KEY, temperature=0.3)

    chain = prompt | llm | StrOutputParser()

    return chain.stream({
        "chat_history": chat_history,
        "user_question": query
    })


def chatbot(topschemes):
    ai_intro = """
🌟 Welcome to Scheme Support Chat! 🌟 Feel free to ask me questions like:
- "Can you tell me more about Scheme X?"
- "How can I apply for support from Scheme X?"
To get started, just type your question below. I'm here to help explore schemes results 🚀
"""

    # Initialize chat history
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = [AIMessage(ai_intro)]
        # with st.chat_message("AI"):
        #     st.markdown(AIMessage(ai_intro))

    for message in st.session_state.chat_history:
        if isinstance(message, HumanMessage):
            with st.chat_message("Human"):
                st.markdown(message.content)
        else:
            with st.chat_message("AI"):
                st.markdown(message.content)

    # Chat log
    prompt = st.chat_input("Ask any clarification questions for the search results...")

    if prompt is not None and prompt != "":
        # Append Human input
        st.session_state.chat_history.append(HumanMessage(prompt))

        # Display user message in chat message container
        with st.chat_message("Human"):
            st.markdown(prompt)

        # Display AI response in chat message container
        with st.chat_message("AI"):
            ai_response = st.write_stream(response_generator(prompt, st.session_state.chat_history, topschemes))

        # Append AI response
        st.session_state.chat_history.append(AIMessage(ai_response))

        chat_exch = f"{HumanMessage(prompt).pretty_repr()} {AIMessage(ai_response).pretty_repr()}"
        print(f"======= Chat exchange ====== {chat_exch} =====")
        # Store the chat history in BigQuery
        save_chat_history(st.session_state['query_id'], chat_exch)


### Create a native Streamlit reset button
# st.markdown("### ")
17 changes: 17 additions & 0 deletions schemesv3_prototype_frontend/chat_history_schema.json
@@ -0,0 +1,17 @@
[
  {
    "mode": "NULLABLE",
    "name": "query_uuid",
    "type": "STRING"
  },
  {
    "mode": "NULLABLE",
    "name": "chat_text",
    "type": "STRING"
  },
  {
    "mode": "NULLABLE",
    "name": "chat_timestamp",
    "type": "TIMESTAMP"
  }
]
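
The Makefile's create_bq_tables target also references user_queries_schema.json, which is not rendered in this excerpt. Judging from the row that save_query() inserts in bqlogic.py, it presumably mirrors the schema above:

[
  {
    "mode": "NULLABLE",
    "name": "user_id",
    "type": "STRING"
  },
  {
    "mode": "NULLABLE",
    "name": "query_text",
    "type": "STRING"
  },
  {
    "mode": "NULLABLE",
    "name": "query_timestamp",
    "type": "TIMESTAMP"
  },
  {
    "mode": "NULLABLE",
    "name": "schemes_response",
    "type": "STRING"
  }
]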
(The remaining changed files in this commit were not rendered.)
