Added v3 prototype frontend and notebooks
yevkim committed Aug 6, 2024
1 parent bcedb91 commit a17458b
Showing 18 changed files with 8,367 additions and 0 deletions.
876 changes: 876 additions & 0 deletions notebooks/master_webscraping.ipynb

Large diffs are not rendered by default.

6,944 changes: 6,944 additions & 0 deletions notebooks/model-creation-transformer-laiss.ipynb

Large diffs are not rendered by default.

Binary file added schemesv3_prototype_frontend/.DS_Store
Binary file not shown.
14 changes: 14 additions & 0 deletions schemesv3_prototype_frontend/Dockerfile
@@ -0,0 +1,14 @@
FROM python:3.10.6-buster

WORKDIR /app

COPY requirements.txt .
RUN pip install -r requirements.txt

COPY . .

EXPOSE 8080

HEALTHCHECK CMD curl --fail http://localhost:8080/_stcore/health

ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8080", "--server.address=0.0.0.0"]
Binary file added schemesv3_prototype_frontend/Logo.webp
Binary file not shown.
62 changes: 62 additions & 0 deletions schemesv3_prototype_frontend/Makefile
@@ -0,0 +1,62 @@
##### Front end - - - - - - - - - - - - - - - - - - - - - - - - -

run_app_locally:
	streamlit run app.py

##### Docker - - - - - - - - - - - - - - - - - - - - - - - - -

docker_build_dev:
	docker build --tag=$(GAR_IMAGE):dev .

docker_run_dev:
	docker run -it -e PORT=8080 -e GOOGLE_APPLICATION_CREDENTIALS=/tmp/keys/FILE_NAME.json \
		-v $(GOOGLE_APPLICATION_CREDENTIALS):/tmp/keys/FILE_NAME.json:ro -p 8080:8080 --env-file .env $(GAR_IMAGE):dev

docker_run_dev_noenv:
	docker run -it -e PORT=8080 -e GOOGLE_APPLICATION_CREDENTIALS=/tmp/keys/FILE_NAME.json \
		-v $(GOOGLE_APPLICATION_CREDENTIALS):/tmp/keys/FILE_NAME.json:ro -p 8080:8080 $(GAR_IMAGE):dev

docker_run_dev_sh:
	docker run -it --entrypoint sh -e PORT=8501 -e GOOGLE_APPLICATION_CREDENTIALS=/tmp/keys/FILE_NAME.json \
		-v $(GOOGLE_APPLICATION_CREDENTIALS):/tmp/keys/FILE_NAME.json:ro \
		-p 8080:8080 $(GAR_IMAGE):dev

##### GCP - - - - - - - - - - - - - - - - - - - - - - - - -

build_gcr_image:
	docker build --platform linux/amd64 -t $(GCP_REGION)-docker.pkg.dev/$(GCP_PROJECT)/$(GAR_IMAGE)/$(GAR_IMAGE):prod .

gcr_new_ar:
	gcloud artifacts repositories create $(GAR_IMAGE) --repository-format=docker --location=$(GCP_REGION) --description="Repository for storing $(GAR_IMAGE) images"

push_gcr_image:
	docker push $(GCP_REGION)-docker.pkg.dev/$(GCP_PROJECT)/$(GAR_IMAGE)/$(GAR_IMAGE):prod

gcr_deploy:
	gcloud run deploy --image $(GCP_REGION)-docker.pkg.dev/$(GCP_PROJECT)/$(GAR_IMAGE)/$(GAR_IMAGE):prod --memory $(GAR_MEMORY) --region $(GCP_REGION) --env-vars-file .env.yaml

rebuild_and_deploy:
	make build_gcr_image
	make push_gcr_image
	make gcr_deploy

##### BigQuery - - - - - - - - - - - - - - - - - - - - - - - - -

create_bq_env:
	make create_bq_ds
	make create_bq_tables

create_bq_ds:
	bq mk --project_id $(GCP_PROJECT) --data_location $(BQ_REGION) $(BQ_DATASET)

create_bq_tables:
	bq mk --location=$(BQ_REGION) --schema=user_queries_schema.json $(BQ_DATASET).$(BQ_USER_QUERY_T)
	bq mk --location=$(BQ_REGION) --schema=chat_history_schema.json $(BQ_DATASET).$(BQ_CHAT_HISTORY_T)

delete_bq_ds:
	bq rm -r -f -d --project_id $(GCP_PROJECT) $(BQ_DATASET)

reset_bq:
	make delete_bq_ds
	make create_bq_ds
	make create_bq_tables
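
These targets read $(GCP_PROJECT), $(GAR_IMAGE), and the other variables from the environment; the Makefile never loads .env itself, so they presumably need to be exported in the shell first (for example via direnv). A typical deploy session might look like this, with all values as placeholders:

export GCP_PROJECT=my-project-id
export GCP_REGION=asia-southeast1
export GAR_IMAGE=schemesv2-app
export GAR_MEMORY=2Gi
make rebuild_and_deploy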
35 changes: 35 additions & 0 deletions schemesv3_prototype_frontend/README_frontend_docker.md
@@ -0,0 +1,35 @@
# How to set up the Docker image for the Schemes frontend

## Create the .env file
1. Ensure your terminal is inside the frontend directory!
2. Ensure python-dotenv is installed in your local Python environment
3. Use .env.sample as a template to create a .env file
4. Update the values in the .env file; these should already be set up from the ML Ops > Predict in Production exercise (a sketch of the expected variables follows this list)
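
The .env.sample file itself is not rendered in this diff. Judging from the variables the Makefile and the Python modules read, the .env presumably looks something like this; all values are placeholders:

GCP_PROJECT=my-project-id
GCP_REGION=asia-southeast1
GAR_IMAGE=schemesv2-app
GAR_MEMORY=2Gi
BQ_REGION=asia-southeast1
BQ_DATASET=schemes
BQ_USER_QUERY_T=user_queries
BQ_CHAT_HISTORY_T=chat_history
GOOGLE_API_KEY=your-gemini-api-key
GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json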

## Set up BigQuery
1. Assuming you have completed the ML Ops > Cloud Training exercise, the steps below should just work
2. Run `make create_bq_env` to create the BigQuery dataset and tables for Schemes

## Docker build
1. Ensure the Docker application is running on your computer
2. Run `make build_gcr_image` to build the Docker image (this can take a while)

## Set up a new artifact repository in GCP
1. If you already set up the artifact repository before, you can skip this step
2. Assuming you have completed the ML Ops > Predict in Production exercise, the steps below should just work
3. Create a new artifact repository in GCP for this project by running `make gcr_new_ar`
4. If that doesn't work, run this command directly in the terminal:
`gcloud artifacts repositories create $GAR_IMAGE --repository-format=docker --location=$GCP_REGION --description="Repository for storing $GAR_IMAGE images"`

## Push (upload) the Docker image to the GCP artifact repository
1. Ensure the Docker image has finished building
2. Run `make push_gcr_image` to push the Docker image to GCP

## Deploy the Docker image to Cloud Run
1. Ensure the Docker image has finished uploading
2. Create a .env.yaml file which contains all the .env variables (a sketch follows this list)
3. Run `make gcr_deploy` to deploy the image on Cloud Run
4. Go to "https://console.cloud.google.com/run" in your browser and click on schemesv2-app
5. On the schemesv2-app page you should see a URL; this is the URL for your app
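
`gcloud run deploy --env-vars-file` expects a YAML mapping rather than dotenv syntax. The app reads GCP_PROJECT, BQ_DATASET, BQ_USER_QUERY_T, BQ_CHAT_HISTORY_T, and GOOGLE_API_KEY at runtime, so a minimal .env.yaml might look like this; all values are placeholders:

GCP_PROJECT: "my-project-id"
BQ_DATASET: "schemes"
BQ_USER_QUERY_T: "user_queries"
BQ_CHAT_HISTORY_T: "chat_history"
GOOGLE_API_KEY: "your-gemini-api-key"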

ALL DONE!!!
3 binary files added (file names not rendered in this excerpt; contents not shown).
58 changes: 58 additions & 0 deletions schemesv3_prototype_frontend/app.py
@@ -0,0 +1,58 @@
import streamlit as st
from PIL import Image
import requests
from dotenv import load_dotenv
from chat import *
import pandas as pd
import os

# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./schemesv2-frontend-write-bq-6f2c58dec79d.json"


# Set page tab display
st.set_page_config(
    page_title="Schemes V2",
    page_icon='🔍',
    layout="centered",
    # layout="wide",
    # initial_sidebar_state="expanded",
)

# Title of the webpage
col1, col2 = st.columns([1, 3])
with col1:
    st.image("Logo.webp")
with col2:
    st.write("")
    st.title("Schemes V2", anchor=False)

st.markdown("""
Hello! I'm your virtual assistant here to help you find schemes in Singapore for financial aid, healthcare, and more. I aim to provide relevant information and guide you to the best resources. As a volunteer-powered AI, I ensure safe and respectful advice based on the provided scheme data. Please verify the details with official sources for accuracy.
""")

# Define a key for the text area widget
query_key = 'query_text'

# Initialize the query text in session state if not already present
if query_key not in st.session_state:
    st.session_state[query_key] = ""

# Display the text area with session state value
query = st.text_area(
    'Tell us the help you need. Don\'t give personal info.',
    value=st.session_state[query_key],
    key=query_key,
    placeholder='E.g. I am a dialysis patient in need of financial assistance and food support after being retrenched due to COVID 19'
)

rel_score_key = "rel_score_key"
if rel_score_key not in st.session_state:
    st.session_state[rel_score_key] = 25

rel_score = st.slider(
    'Show me schemes above relevance score of (0 - most lenient, 100 - most strict):',
    0, 100,
    value=st.session_state[rel_score_key],
    step=25
)
st.session_state[rel_score_key] = rel_score

if st.button(type="primary", label="🤖 Search & Chat", disabled=query == ""):
    st.switch_page("pages/chat_page.py")
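
app.py hands off to pages/chat_page.py, which is not rendered in this excerpt. Based on what the other modules expect (chatbot() takes a DataFrame with the Scheme, Agency, Description, Link, and 'Scraped Text' columns, and save_chat_history() reads st.session_state['query_id']), a minimal sketch of that page might look like the following. The search_schemes helper and the stub row it returns are assumptions, not part of this commit:

# pages/chat_page.py -- hypothetical sketch, not part of this diff
import streamlit as st
import pandas as pd

from bqlogic import save_query
from chat import chatbot

st.set_page_config(page_title="Schemes V2", page_icon='🔍')

query = st.session_state.get('query_text', '')
rel_score = st.session_state.get('rel_score_key', 25)


def search_schemes(query: str, rel_score: int) -> pd.DataFrame:
    # Hypothetical helper: the real page would call the search backend here.
    # Returning a stub row with the columns chat.dataframe_to_text() expects,
    # so the sketch runs end to end.
    return pd.DataFrame([{
        'Scheme': 'Sample Scheme',
        'Agency': 'Sample Agency',
        'Description': 'Placeholder description',
        'Link': 'https://example.com',
        'Scraped Text': 'Placeholder scraped text',
    }])


top_schemes = search_schemes(query, rel_score)

# Log the search once and keep its id for save_chat_history()
if 'query_id' not in st.session_state:
    st.session_state['query_id'] = save_query(query, top_schemes.to_dict('records'))

chatbot(top_schemes)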
69 changes: 69 additions & 0 deletions schemesv3_prototype_frontend/bqlogic.py
@@ -0,0 +1,69 @@
import os
from google.cloud import bigquery
import pandas as pd
from datetime import datetime, timezone
import uuid
import json

# Initialize a BigQuery client
project = os.getenv('GCP_PROJECT')
client = bigquery.Client(project=project)


def save_query(query_text, response):
    dataset_id = os.getenv('BQ_DATASET')
    table_id = os.getenv('BQ_USER_QUERY_T')
    table_ref = client.dataset(dataset_id).table(table_id)
    table = client.get_table(table_ref)  # Make an API request to fetch the table
    query_id = uuid.uuid4()
    json_string = json.dumps(response)  # Convert JSON to a string

    rows_to_insert = [
        {"user_id": str(query_id), "query_text": query_text, "query_timestamp": datetime.now(timezone.utc).isoformat(), "schemes_response": json_string}
    ]

    errors = client.insert_rows_json(table, rows_to_insert)  # Make an API request
    if errors == []:
        print("New rows have been added.")
    else:
        print("Encountered errors while inserting rows: {}".format(errors))

    return query_id


def save_chat_history(query_uuid, chat_text):
    dataset_id = os.getenv('BQ_DATASET')
    table_id = os.getenv('BQ_CHAT_HISTORY_T')
    full_table_id = f"{client.project}.{dataset_id}.{table_id}"

    # Ensure query_uuid is a string
    query_uuid_str = str(query_uuid)

    # Prepare the chat history JSON and ensure it is properly escaped
    chat_history_json = json.dumps(chat_text, ensure_ascii=False)

    # Construct a parameterized query
    query = f"""
    MERGE `{full_table_id}` T
    USING (SELECT @query_uuid AS query_uuid, @chat_text AS chat_text, @chat_timestamp AS chat_timestamp) S
    ON T.query_uuid = S.query_uuid
    WHEN MATCHED THEN
        UPDATE SET T.chat_text = CONCAT(T.chat_text, '\\n', S.chat_text), T.chat_timestamp = S.chat_timestamp
    WHEN NOT MATCHED THEN
        INSERT (query_uuid, chat_text, chat_timestamp) VALUES (query_uuid, chat_text, chat_timestamp)
    """

    # Set up the query parameters
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("query_uuid", "STRING", query_uuid_str),
            bigquery.ScalarQueryParameter("chat_text", "STRING", chat_history_json),
            bigquery.ScalarQueryParameter("chat_timestamp", "TIMESTAMP", datetime.now(timezone.utc).isoformat())
        ]
    )

    # Execute the query
    job = client.query(query, job_config=job_config)
    job.result()  # Wait for the query to finish

    print("Chat history saved or updated.")
119 changes: 119 additions & 0 deletions schemesv3_prototype_frontend/chat.py
@@ -0,0 +1,119 @@
import streamlit as st
import random
import time
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import HumanMessage, AIMessage
import os
from dotenv import load_dotenv
from cleantext import clean_scraped_text
from bqlogic import save_chat_history

load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")


def dataframe_to_text(df):
    # Convert each row of the top-schemes DataFrame into a text summary
    text_summary = ''
    for index, row in df.iterrows():
        cleanScrape = row['Scraped Text']
        sentence = clean_scraped_text(cleanScrape)

        text_summary += f"Scheme Name: {row['Scheme']}, Agency: {row['Agency']}, Description: {row['Description']}, Link: {row['Link']}, Scraped Text from website: {sentence}\n"
    return text_summary


# Streaming the responses
def response_generator(query, chat_history, topschemes):
    top_schemes_text = dataframe_to_text(topschemes)
    template = """
As a virtual assistant, I'm dedicated to helping user navigate through the available schemes. User has done initial search based on their needs and system has provided top schemes relevant to the search. Now, my job is to advise on the follow up user queries based on the schemes data available by analyzing user query and extracting relevant answers from the top scheme data. Top Schemes Information includes scheme name, agency, Link to website, and may include text directly scraped from scheme website.
In responding to user queries, I will adhere to the following principles:
1. **Continuity in Conversation**: Each new question may build on the ongoing conversation. I'll consider the chat history to ensure a coherent and contextual dialogue.
2. **Role Clarity**: My purpose is to guide user by leveraging the scheme information provided. My responses aim to advise based on this data, without undertaking any actions outside these confines.
3. **Language Simplicity**: I commit to using simple, accessible English, ensuring my responses are understandable to all users, without losing the essence or accuracy of the scheme information.
4. **Safety and Respect**: Maintaining a safe, respectful interaction is paramount. I will not entertain or generate harmful, disrespectful, or irrelevant content. Should any query diverge from the discussion on schemes, I will gently redirect the focus back to how I can assist with scheme-related inquiries.
5. **Avoidance of Fabrication**: My responses will solely rely on the information from the scheme details provided, avoiding any speculative or unfounded content. I will not alter or presume any specifics not clearly indicated in the scheme descriptions.
**User Query:**
{user_question}
**Chat History:**
{chat_history}
**Top Schemes Information:**
""" + top_schemes_text

    # print(template)

    prompt = ChatPromptTemplate.from_template(template)
    llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro-latest", google_api_key=GOOGLE_API_KEY, temperature=0.3)

    chain = prompt | llm | StrOutputParser()

    return chain.stream({
        "chat_history": chat_history,
        "user_question": query
    })


def chatbot(topschemes):
    ai_intro = """
🌟 Welcome to Scheme Support Chat! 🌟 Feel free to ask me questions like:
- "Can you tell me more about Scheme X?"
- "How can I apply for support from Scheme X?"
To get started, just type your question below. I'm here to help explore schemes results 🚀
"""

    # Initialize chat history
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = [AIMessage(ai_intro)]
        # with st.chat_message("AI"):
        #     st.markdown(AIMessage(ai_intro))

    for message in st.session_state.chat_history:
        if isinstance(message, HumanMessage):
            with st.chat_message("Human"):
                st.markdown(message.content)
        else:
            with st.chat_message("AI"):
                st.markdown(message.content)

    # Chat log
    prompt = st.chat_input("Ask any clarification questions for the search results...")

    if prompt is not None and prompt != "":
        # Append Human input
        st.session_state.chat_history.append(HumanMessage(prompt))

        # Display user message in chat message container
        with st.chat_message("Human"):
            st.markdown(prompt)

        # Display AI response in chat message container
        with st.chat_message("AI"):
            ai_response = st.write_stream(response_generator(prompt, st.session_state.chat_history, topschemes))

        # Append AI response
        st.session_state.chat_history.append(AIMessage(ai_response))

        chat_exch = f"{HumanMessage(prompt).pretty_repr()} {AIMessage(ai_response).pretty_repr()}"
        print(f"======= Chat exchange ====== {chat_exch} =====")
        # Store the chat history in BigQuery
        save_chat_history(st.session_state['query_id'], chat_exch)


### Create a native Streamlit reset button
# st.markdown("### ")
17 changes: 17 additions & 0 deletions schemesv3_prototype_frontend/chat_history_schema.json
@@ -0,0 +1,17 @@
[
  {
    "mode": "NULLABLE",
    "name": "query_uuid",
    "type": "STRING"
  },
  {
    "mode": "NULLABLE",
    "name": "chat_text",
    "type": "STRING"
  },
  {
    "mode": "NULLABLE",
    "name": "chat_timestamp",
    "type": "TIMESTAMP"
  }
]
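
The Makefile's create_bq_tables target also references user_queries_schema.json, which is not rendered in this excerpt. Judging from the row that save_query() inserts in bqlogic.py, it presumably mirrors the schema above:

[
  {
    "mode": "NULLABLE",
    "name": "user_id",
    "type": "STRING"
  },
  {
    "mode": "NULLABLE",
    "name": "query_text",
    "type": "STRING"
  },
  {
    "mode": "NULLABLE",
    "name": "query_timestamp",
    "type": "TIMESTAMP"
  },
  {
    "mode": "NULLABLE",
    "name": "schemes_response",
    "type": "STRING"
  }
]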
(The remaining changed files in this commit were not rendered.)
