From 0adb6eafdd020b6538db98cda03dd2f45f743a48 Mon Sep 17 00:00:00 2001 From: Xuhui Zhou Date: Wed, 24 Apr 2024 16:31:50 -0400 Subject: [PATCH] Leaderboard and Unified UI (#61) * Update start_app.sh to use gradio instead of python app.py * fixed action typing error --------- Co-authored-by: Jasonqi146 --- README.md | 13 ++ app.py | 379 ++++++--------------------------- data_dir/models_vs_gpt35.jsonl | 4 + requirements.txt | 28 +-- sotopia_space/_header.md | 4 + sotopia_space/benchmark.py | 70 ++++++ sotopia_space/chat.py | 284 ++++++++++++++++++++++++ sotopia_space/constants.py | 39 ++++ sotopia_space/utils.py | 223 +++++++++++++++++++ start_app.sh | 2 +- ui_constants.py | 191 +++++++++++++++++ 11 files changed, 908 insertions(+), 329 deletions(-) create mode 100644 data_dir/models_vs_gpt35.jsonl create mode 100644 sotopia_space/_header.md create mode 100644 sotopia_space/benchmark.py create mode 100644 sotopia_space/chat.py create mode 100644 sotopia_space/constants.py create mode 100644 sotopia_space/utils.py create mode 100644 ui_constants.py diff --git a/README.md b/README.md index 9f7e201..011e9ea 100644 --- a/README.md +++ b/README.md @@ -11,3 +11,16 @@ license: apache-2.0 --- This is a synced repository with a Huggingface Space for the Sotopia project [space](https://huggingface.co/spaces/wdplx/Sotopia-demo) + +## Getting Started + +```bash +conda create -n sotopia-space python=3.11; conda activate sotopia-space +python -m pip install -r requirements.txt +``` + +To run the app, run the following command: + +```bash +bash start_app.sh +``` diff --git a/app.py b/app.py index 2bac33e..8d6f2da 100644 --- a/app.py +++ b/app.py @@ -1,332 +1,83 @@ import os -from collections import defaultdict -import json +import argparse from typing import Literal -import gradio as gr +import gradio as gr # type: ignore +from sotopia_space.chat import chat_introduction, chat_tab, get_sotopia_profiles +from sotopia_space import benchmark +from ui_constants import CITATION_TEXT, 
BANNER -from utils import Environment, Agent, get_context_prompt, dialogue_history_prompt -from functools import cache -from sotopia_pi_generate import prepare_model, generate_action OPENAI_KEY_FILE="./openai_api.key" if os.path.exists(OPENAI_KEY_FILE): with open(OPENAI_KEY_FILE, "r") as f: os.environ["OPENAI_API_KEY"] = f.read().strip() -DEPLOYED = os.getenv("DEPLOYED", "true").lower() == "true" -DEFAULT_MODEL_SELECTION = "gpt-3.5-turbo" -TEMPERATURE = 0.7 -TOP_P = 1 -MAX_TOKENS = 1024 +with open("./sotopia_space/_header.md", "r") as f: + HEADER_MD = f.read() -ENVIRONMENT_PROFILES = "profiles/environment_profiles.jsonl" -AGENT_PROFILES = "profiles/agent_profiles.jsonl" -RELATIONSHIP_PROFILES = "profiles/relationship_profiles.jsonl" - -ACTION_TYPES = ['none', 'action', 'non-verbal communication', 'speak', 'leave'] - -MODEL_OPTIONS = [ - "gpt-3.5-turbo", - "gpt-4", - "gpt-4-turbo", - "cmu-lti/sotopia-pi-mistral-7b-BC_SR", - "cmu-lti/sotopia-pi-mistral-7b-BC_SR_4bit", - "mistralai/Mistral-7B-Instruct-v0.1" - # "mistralai/Mixtral-8x7B-Instruct-v0.1", - # "togethercomputer/llama-2-7b-chat", - # "togethercomputer/llama-2-70b-chat", - # "togethercomputer/mpt-30b-chat", - # "together_ai/togethercomputer/llama-2-7b-chat", - # "together_ai/togethercomputer/falcon-7b-instruct", -] - -@cache -def get_sotopia_profiles(env_file=ENVIRONMENT_PROFILES, agent_file=AGENT_PROFILES, relationship_file=RELATIONSHIP_PROFILES): - with open(env_file, 'r') as f: - data = [json.loads(line) for line in f.readlines()] - - code_names_count = defaultdict(int) - environments = [] - environment_dict = {} - for profile in sorted(data, key=lambda x: x['codename']): - env_obj = Environment(profile) - if profile['codename'] in code_names_count: - environments.append(( - "{}_{:05d}".format(profile['codename'], - code_names_count[profile['codename']] - ), - env_obj._id - )) - else: - environments.append((profile['codename'], env_obj._id)) - environment_dict[env_obj._id] = env_obj - 
code_names_count[profile['codename']] += 1 - - with open(agent_file, 'r') as f: - data = [json.loads(line) for line in f.readlines()] - - agent_dict = {} - for profile in data: - agent_obj = Agent(profile) - agent_dict[agent_obj._id] = agent_obj - - with open(relationship_file, 'r') as f: - data = [json.loads(line) for line in f.readlines()] - - relationship_dict = defaultdict(lambda : defaultdict(list)) - for profile in data: - relationship_dict[profile['relationship']][profile['agent1_id']].append(profile['agent2_id']) - relationship_dict[profile['relationship']][profile['agent2_id']].append(profile['agent1_id']) - - return environments, environment_dict, agent_dict, relationship_dict - - -def introduction(): +def navigation_bar(): with gr.Column(scale=2): - gr.Image( - "images/sotopia.jpg", elem_id="banner-image", show_label=False + toggle_dark = gr.Button(value="Toggle Dark") + toggle_dark.click( + None, + js=""" + () => { + if (document.body.classList.contains('dark')) { + document.body.classList.remove('dark'); + document.querySelector('gradio-app').style.backgroundColor = 'var(--color-background-primary-light)'; + } else { + document.body.classList.add('dark'); + document.querySelector('gradio-app').style.backgroundColor = 'var(--color-background-primary-dark)'; + } + } + """, + ) + +with gr.Blocks( + css="""#chat_container {height: 820px; width: 1000px; margin-left: auto; margin-right: auto;} + #chatbot {height: 600px; overflow: auto;} + #create_container {height: 750px; margin-left: 0px; margin-right: 0px;} + #tokenizer_renderer span {white-space: pre-wrap} + """, + theme="gradio/monochrome", +) as demo: + # with gr.Row(): + # navigation_bar() + gr.Image( + "images/banner.png", elem_id="banner-image", show_label=False ) - with gr.Column(scale=5): - gr.Markdown( - """# Sotopia Space - **Chat with different social agent models including [sotopia-pi](https://github.com/sotopia-lab/sotopia-pi), GPT and so on in sotopia space!** - - ➡️️ **Intended Use**: 
Sotopia space is intended to showcase the social intelligence ability of different social agents in interesting social scenarios. - - ✨ **Guidance**: - - Step (1) Select a social scenario that interests you in "Scenario Selection" - - Step (2) Select a social agent you want to chat with in "Model Selection" - - Step (3) Select which character you and your social agent will play in the scenario in "User Agent Selection" and "Bot Agent Selection" - - Step (4) Negotiate/debate/cooperate with the social agent to see whether your goal or their social goal can be achieved. - - ⚠️ **Limitations**: The social agent can and will produce factually incorrect information, hallucinating facts and potentially offensive actions. It can produce problematic outputs, especially if prompted to do so. - - 🗄️ **Disclaimer**: User prompts and generated replies from the model may be collected solely for the purpose of pure academic research. By using this demo, users implicitly agree to these terms. - """ - ) - -def create_user_agent_dropdown(environment_id): - _, environment_dict, agent_dict, relationship_dict = get_sotopia_profiles() - environment = environment_dict[environment_id] - - user_agents_list = [] - unique_agent_ids = set() - for x, _ in relationship_dict[environment.relationship].items(): - unique_agent_ids.add(x) - - for agent_id in unique_agent_ids: - user_agents_list.append((agent_dict[agent_id].name, agent_id)) - return gr.Dropdown(choices=user_agents_list, value=user_agents_list[0][1] if user_agents_list else None, label="User Agent Selection") - -def create_bot_agent_dropdown(environment_id, user_agent_id): - _, environment_dict, agent_dict, relationship_dict = get_sotopia_profiles() - environment, user_agent = environment_dict[environment_id], agent_dict[user_agent_id] - - bot_agent_list = [] - for neighbor_id in relationship_dict[environment.relationship][user_agent.agent_id]: - bot_agent_list.append((agent_dict[neighbor_id].name, neighbor_id)) - - return 
gr.Dropdown(choices=bot_agent_list, value=bot_agent_list[0][1] if bot_agent_list else None, label="Bot Agent Selection") - -def create_environment_info(environment_dropdown): - _, environment_dict, _, _ = get_sotopia_profiles() - environment = environment_dict[environment_dropdown] - text = environment.scenario - return gr.Textbox(label="Scenario", lines=1, value=text) - -def create_user_info(user_agent_dropdown): - _, _, agent_dict, _ = get_sotopia_profiles() - user_agent = agent_dict[user_agent_dropdown] - text = f"{user_agent.background} {user_agent.personality}" - return gr.Textbox(label="User Agent Profile", lines=4, value=text) - -def create_bot_info(bot_agent_dropdown): - _, _, agent_dict, _ = get_sotopia_profiles() - bot_agent = agent_dict[bot_agent_dropdown] - text = f"{bot_agent.background} {bot_agent.personality}" - return gr.Textbox(label="Bot Agent Profile", lines=4, value=text) - -def create_user_goal(environment_dropdown): - _, environment_dict, _, _ = get_sotopia_profiles() - text = environment_dict[environment_dropdown].agent_goals[0] - text = text.replace('(', '').replace(')', '') - if "" in text: - text = text.replace("", "\n\n") - text = text.replace("", "\n") - if "" in text: - text = text.replace("", "\n\n") - text = text.replace("", "\n") - return gr.Textbox(label="User Agent Goal", lines=4, value=text) - -def create_bot_goal(environment_dropdown): - _, environment_dict, _, _ = get_sotopia_profiles() - text = environment_dict[environment_dropdown].agent_goals[1] - text = text.replace('(', '').replace(')', '') - if "" in text: - text = text.replace("", "\n\n") - text = text.replace("", "\n") - if "" in text: - text = text.replace("", "\n\n") - text = text.replace("", "\n") - return gr.Textbox(label="Bot Agent Goal", lines=4, value=text) - -def sotopia_info_accordion(accordion_visible=True): - environments, _, _, _ = get_sotopia_profiles() - - with gr.Accordion("Create your sotopia space!", open=accordion_visible): - with gr.Row(): - 
environment_dropdown = gr.Dropdown( - choices=environments, - label="Scenario Selection", - value=environments[0][1] if environments else None, - interactive=True, - ) - model_name_dropdown = gr.Dropdown( - choices=MODEL_OPTIONS, - value=DEFAULT_MODEL_SELECTION, - interactive=True, - label="Model Selection" - ) - - with gr.Row(): - user_agent_dropdown = create_user_agent_dropdown(environment_dropdown.value) - bot_agent_dropdown = create_bot_agent_dropdown(environment_dropdown.value, user_agent_dropdown.value) - - with gr.Accordion("Check your social task!", open=accordion_visible): - - scenario_info_display = create_environment_info(environment_dropdown.value) - - with gr.Row(): - bot_goal_display = create_bot_goal(environment_dropdown.value) - user_goal_display = create_user_goal(environment_dropdown.value) - - - - with gr.Row(): - bot_agent_info_display = create_bot_info(bot_agent_dropdown.value) - user_agent_info_display = create_user_info(user_agent_dropdown.value) - - # Update user dropdown when scenario changes - environment_dropdown.change(fn=create_user_agent_dropdown, inputs=[environment_dropdown], outputs=[user_agent_dropdown]) - # Update bot dropdown when user or scenario changes - user_agent_dropdown.change(fn=create_bot_agent_dropdown, inputs=[environment_dropdown, user_agent_dropdown], outputs=[bot_agent_dropdown]) - # Update scenario information when scenario changes - environment_dropdown.change(fn=create_environment_info, inputs=[environment_dropdown], outputs=[scenario_info_display]) - # Update user agent profile when user changes - user_agent_dropdown.change(fn=create_user_info, inputs=[user_agent_dropdown], outputs=[user_agent_info_display]) - # Update bot agent profile when bot changes - bot_agent_dropdown.change(fn=create_bot_info, inputs=[bot_agent_dropdown], outputs=[bot_agent_info_display]) - # Update user goal when scenario changes - environment_dropdown.change(fn=create_user_goal, inputs=[environment_dropdown], 
outputs=[user_goal_display]) - # Update bot goal when scenario changes - environment_dropdown.change(fn=create_bot_goal, inputs=[environment_dropdown], outputs=[bot_goal_display]) - - return model_name_dropdown, environment_dropdown, user_agent_dropdown, bot_agent_dropdown - -def instructions_accordion(instructions, according_visible=False): - with gr.Accordion("Instructions", open=False, visible=according_visible): - instructions = gr.Textbox( - lines=10, - value=instructions, - interactive=False, - placeholder="Instructions", - show_label=False, - max_lines=10, - visible=False, - ) - return instructions - - -def chat_tab(): - # history are input output pairs - _, environment_dict, agent_dict, _ = get_sotopia_profiles() - def run_chat( - message, - history, - environment_selection, - user_agent_dropdown, - bot_agent_dropdown, - model_selection:str - ): - environment = environment_dict[environment_selection] - user_agent = agent_dict[user_agent_dropdown] - bot_agent = agent_dict[bot_agent_dropdown] - - context = get_context_prompt(bot_agent, user_agent, environment) - dialogue_history, next_turn_idx = dialogue_history_prompt(message, history, user_agent, bot_agent) - prompt_history = f"{context}{dialogue_history}" - agent_action = generate_action(model_selection, prompt_history, next_turn_idx, ACTION_TYPES, bot_agent.name, TEMPERATURE) - return agent_action.to_natural_language() - - with gr.Column(): - with gr.Blocks(): - model_name_dropdown, scenario_dropdown, user_agent_dropdown, bot_agent_dropdown = sotopia_info_accordion() - - with gr.Column(): - with gr.Accordion("Start the conversation to achieve your goal!", open=True): - gr.ChatInterface( - fn=run_chat, - chatbot=gr.Chatbot( - height=620, - render=False, - show_label=False, - rtl=False, - avatar_images=( - "images/profile1.jpg", - "images/profile2.jpg", - ), - ), - textbox=gr.Textbox( - placeholder="Write your message here...", - render=False, - scale=7, - rtl=False, - ), - additional_inputs=[ - 
scenario_dropdown, - user_agent_dropdown, - bot_agent_dropdown, - model_name_dropdown, - ], - submit_btn="Send", - stop_btn="Stop", - retry_btn="🔄 Retry", - undo_btn="↩️ Delete", - clear_btn="🗑️ Clear", - ) - - -def main(): - with gr.Blocks( - css="""#chat_container {height: 820px; width: 1000px; margin-left: auto; margin-right: auto;} - #chatbot {height: 600px; overflow: auto;} - #create_container {height: 750px; margin-left: 0px; margin-right: 0px;} - #tokenizer_renderer span {white-space: pre-wrap} - """ - ) as demo: - with gr.Row(): - introduction() - with gr.Row(): - chat_tab() - - return demo - - -def start_demo(): - demo = main() - if DEPLOYED: - demo.queue(api_open=False).launch(show_api=False) - else: - demo.queue() - demo.launch(share=False, server_name="0.0.0.0") + gr.Markdown(HEADER_MD, elem_classes="markdown-text") + with gr.Tabs(elem_classes="tab-buttons") as tabs: + with gr.TabItem("🏅 Leaderboard", elem_id="benchmark-tab-table", id=0): + benchmark.benchmark_table() + with gr.TabItem("💬 Chat", elem_id="chat-tab-interface", id=1): + with gr.Row(): + chat_introduction() + with gr.Row(): + chat_tab() + with gr.Row(): + with gr.Accordion("📙 Citation", open=False, elem_classes="accordion-label"): + gr.Textbox( + value=CITATION_TEXT, + lines=7, + label="Copy the BibTeX snippet to cite this source", + elem_id="citation-button", + show_copy_button=True) + +# def start_demo(): +# demo = main() +# if DEPLOYED: +# demo.queue(api_open=False).launch(show_api=False) +# else: +# demo.queue() +# demo.launch(share=False, server_name="0.0.0.0") if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--result_file", help="Path to results table", default="data_dir/models_vs_gpt35.jsonl") + #benchmark.original_df = pd.read_json(args.result_file, lines=True) get_sotopia_profiles() # prepare_model(DEFAULT_MODEL_SELECTION) - start_demo() \ No newline at end of file + demo.launch() \ No newline at end of file diff --git 
a/data_dir/models_vs_gpt35.jsonl b/data_dir/models_vs_gpt35.jsonl new file mode 100644 index 0000000..1a62d1f --- /dev/null +++ b/data_dir/models_vs_gpt35.jsonl @@ -0,0 +1,4 @@ +{"model_name": "GPT-4", "SOC [-10, 0]": -0.07, "SEC [-10, 0]": -0.14, "FIN [-5, 5]": 0.81, "REL [-5, 5]": 1.94, "KNO [0, 10]": 3.73, "GOAL [0, 10]": 7.62, "BEL [0, 10]": 9.28} +{"model_name": "GPT-3.5", "SOC [-10, 0]": -0.08, "SEC [-10, 0]": -0.08, "FIN [-5, 5]": 0.46, "REL [-5, 5]": 1.23, "KNO [0, 10]": 3.4, "GOAL [0, 10]": 6.45, "BEL [0, 10]": 9.15} +{"model_name": "Llama-2", "SOC [-10, 0]": -0.11, "SEC [-10, 0]": -0.14, "FIN [-5, 5]": 0.4, "REL [-5, 5]": 0.91, "KNO [0, 10]": 3.11, "GOAL [0, 10]": 5.38, "BEL [0, 10]": 8.1} +{"model_name": "MPT", "SOC [-10, 0]": -0.09, "SEC [-10, 0]": -0.07, "FIN [-5, 5]": 0.28, "REL [-5, 5]": 0.58, "KNO [0, 10]": 2.11, "GOAL [0, 10]": 4.1, "BEL [0, 10]": 6.17} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 98fe252..59c1ad1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ annotated-types==0.6.0 anyio==3.7.1 attrs==23.2.0 beartype==0.14.1 -bitsandbytes==0.43.1 +bitsandbytes==0.42.0 certifi==2024.2.2 cffi==1.16.0 charset-normalizer==3.3.2 @@ -68,18 +68,18 @@ mypy-extensions==1.0.0 names==0.3.0 networkx==3.3 numpy==1.26.4 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-nccl-cu12==2.19.3 -nvidia-nvjitlink-cu12==12.4.127 -nvidia-nvtx-cu12==12.1.105 +# nvidia-cublas-cu12==12.1.3.1 +# nvidia-cuda-cupti-cu12==12.1.105 +# nvidia-cuda-nvrtc-cu12==12.1.105 +# nvidia-cuda-runtime-cu12==12.1.105 +# nvidia-cudnn-cu12==8.9.2.26 +# nvidia-cufft-cu12==11.0.2.54 +# nvidia-curand-cu12==10.3.2.106 +# nvidia-cusolver-cu12==11.4.5.107 +# nvidia-cusparse-cu12==12.1.0.106 +# 
nvidia-nccl-cu12==2.19.3 +# nvidia-nvjitlink-cu12==12.4.127 +# nvidia-nvtx-cu12==12.1.105 openai==1.22.0 orjson==3.10.1 packaging==23.2 @@ -129,7 +129,7 @@ toolz==0.12.1 torch==2.2.2 tqdm==4.66.2 transformers==4.40.0 -triton==2.2.0 +# triton==2.2.0 typer==0.12.3 types-cffi==1.16.0.20240331 types-pyOpenSSL==24.0.0.20240417 diff --git a/sotopia_space/_header.md b/sotopia_space/_header.md new file mode 100644 index 0000000..a1a998a --- /dev/null +++ b/sotopia_space/_header.md @@ -0,0 +1,4 @@ +
+ +# Sotopia Space: A Huggingface Space for the Sotopia projects +[⚙️ GitHub](https://github.com/sotopia-lab) | [🤗 HuggingFace](https://huggingface.co/collections/cmu-lti/sotopia-65f312c1bd04a8c4a9225e5b) | [💬 Discussions](https://github.com/orgs/sotopia-lab/discussions) diff --git a/sotopia_space/benchmark.py b/sotopia_space/benchmark.py new file mode 100644 index 0000000..1339be8 --- /dev/null +++ b/sotopia_space/benchmark.py @@ -0,0 +1,70 @@ +import gradio as gr # type: ignore +import pandas as pd +from sotopia_space.constants import MODEL_OPTIONS +from sotopia_space.utils import estimated_win_rate, make_clickable_model, styled_error, styled_warning, styled_message,apply_length_penalty + +LP_MODE = "v2" +original_df, ablation_df = None, None +LP_original_dfs = {} +DEFAULT_LP = 0.5 + +available_models = [] # to be filled in later +original_df, ablation_df = None, None + +def slider_change_main(length_penalty): + global original_df, ablation_df, LP_MODE + adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs) + adjusted_df = adjusted_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]] + adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False) + # adjusted_df = add_winrates(adjusted_df, LP=length_penalty) + # adjusted_df = adjusted_df.drop(columns=["Length"]) + adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df))) + return adjusted_df + +def slider_change_full(length_penalty, show_winrate): + global original_df, ablation_df, LP_MODE + adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs) + # sort the model by the "Task-Avg Elo" column + adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False) + adjusted_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"], inplace=True) + if show_winrate == "none": + adjusted_df.insert(0, "Rank", range(1, 1 + 
len(adjusted_df))) + return adjusted_df + elif show_winrate == "gpt-3.5": + adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-3.5", LP=length_penalty) + elif show_winrate == "gpt-4": + adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4", LP=length_penalty) + adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df))) + return adjusted_df + +def benchmark_table(): + global original_df, ablation_df + global LP_original_dfs, LP_MODE + + gr.Markdown(f"**Version**: sotopia (v1.01; 2024.04.22) | **# Examples**: 7200 | **# Models**: {len(MODEL_OPTIONS)} | **# Comparisons**: x", elem_classes="markdown-text") + + with gr.TabItem("Vs GPT-3.5", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"): + # original_df, ablation_df = skip_empty_original_df, skip_empty_ablation_df + original_df = pd.read_json('data_dir/models_vs_gpt35.jsonl', lines=True) + default_main_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP, mode=LP_MODE, LP_original_dfs=LP_original_dfs) + default_main_df = default_main_df.sort_values(by="GOAL [0, 10]", ascending=False) + # add a Rank column to the first columnn (starting from 1) + default_main_df.insert(0, "Rank", range(1, 1 + len(default_main_df))) + with gr.Row(): + with gr.Column(scale=4): + gr.Markdown("**Vs GPT3.5**: The interlocutors are compared against GPT-3.5, the baseline model.") + with gr.Column(scale=1): + length_penlty_slider = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP, label="Length Penalty", elem_id="length-penalty-slider") + # checkbox_skip_empty = gr.Checkbox(label="Skip empty results", value=False, elem_id="skip-empty-checkbox", scale=2) + TYPES = ["number", "markdown", "number"] + leaderboard_table = gr.components.Dataframe( + value=default_main_df, + datatype=TYPES, + # max_rows=None, + height=1000, + elem_id="leaderboard-table", + interactive=False, + visible=True, + min_width=60, + ) + #length_penlty_slider.change(fn=slider_change_main, 
inputs=[length_penlty_slider], outputs=[leaderboard_table]) \ No newline at end of file diff --git a/sotopia_space/chat.py b/sotopia_space/chat.py new file mode 100644 index 0000000..44df50a --- /dev/null +++ b/sotopia_space/chat.py @@ -0,0 +1,284 @@ +import os +import gradio as gr # type: ignore +# Functions for creating the chat interface +from functools import cache +from typing import Literal +import json +from collections import defaultdict +from utils import Environment, Agent, get_context_prompt, dialogue_history_prompt +from sotopia_pi_generate import prepare_model, generate_action +from sotopia_space.constants import MODEL_OPTIONS + +DEPLOYED = os.getenv("DEPLOYED", "true").lower() == "true" +DEFAULT_MODEL_SELECTION = "gpt-3.5-turbo" +TEMPERATURE = 0.7 +TOP_P = 1 +MAX_TOKENS = 1024 + +ENVIRONMENT_PROFILES = "profiles/environment_profiles.jsonl" +AGENT_PROFILES = "profiles/agent_profiles.jsonl" +RELATIONSHIP_PROFILES = "profiles/relationship_profiles.jsonl" + +Action = Literal['none', 'action', 'non-verbal communication', 'speak', 'leave'] +ACTION_TYPES: list[Action] = ['none', 'action', 'non-verbal communication', 'speak', 'leave'] + + + +@cache +def get_sotopia_profiles(env_file=ENVIRONMENT_PROFILES, agent_file=AGENT_PROFILES, relationship_file=RELATIONSHIP_PROFILES): + with open(env_file, 'r') as f: + data = [json.loads(line) for line in f.readlines()] + + code_names_count = defaultdict(int) + environments = [] + environment_dict = {} + for profile in sorted(data, key=lambda x: x['codename']): + env_obj = Environment(profile) + if profile['codename'] in code_names_count: + environments.append(( + "{}_{:05d}".format(profile['codename'], + code_names_count[profile['codename']] + ), + env_obj._id + )) + else: + environments.append((profile['codename'], env_obj._id)) + environment_dict[env_obj._id] = env_obj + code_names_count[profile['codename']] += 1 + + with open(agent_file, 'r') as f: + data = [json.loads(line) for line in f.readlines()] + + agent_dict = 
{} + for profile in data: + agent_obj = Agent(profile) + agent_dict[agent_obj._id] = agent_obj + + with open(relationship_file, 'r') as f: + data = [json.loads(line) for line in f.readlines()] + + relationship_dict = defaultdict(lambda : defaultdict(list)) + for profile in data: + relationship_dict[profile['relationship']][profile['agent1_id']].append(profile['agent2_id']) + relationship_dict[profile['relationship']][profile['agent2_id']].append(profile['agent1_id']) + + return environments, environment_dict, agent_dict, relationship_dict + +def chat_introduction(): + with gr.Column(scale=2): + gr.Image( + "images/sotopia.jpg", elem_id="banner-image", show_label=False + ) + with gr.Column(scale=5): + gr.Markdown( + """# Sotopia Space + **Chat with different social agent models including [sotopia-pi](https://github.com/sotopia-lab/sotopia-pi), GPT and so on in sotopia space!** + + ➡️️ **Intended Use**: Sotopia space is intended to showcase the social intelligence ability of different social agents in interesting social scenarios. + + ✨ **Guidance**: + + Step (1) Select a social scenario that interests you in "Scenario Selection" + + Step (2) Select a social agent you want to chat with in "Model Selection" + + Step (3) Select which character you and your social agent will play in the scenario in "User Agent Selection" and "Bot Agent Selection" + + Step (4) Negotiate/debate/cooperate with the social agent to see whether your goal or their social goal can be achieved. + + ⚠️ **Limitations**: The social agent can and will produce factually incorrect information, hallucinating facts and potentially offensive actions. It can produce problematic outputs, especially if prompted to do so. + + 🗄️ **Disclaimer**: User prompts and generated replies from the model may be collected solely for the purpose of pure academic research. By using this demo, users implicitly agree to these terms. 
+ """ + ) + # with gr.Column(scale=1): + # toggle_dark = gr.Button(value="Toggle Dark") + +def create_user_agent_dropdown(environment_id): + _, environment_dict, agent_dict, relationship_dict = get_sotopia_profiles() + environment = environment_dict[environment_id] + + user_agents_list = [] + unique_agent_ids = set() + for x, _ in relationship_dict[environment.relationship].items(): + unique_agent_ids.add(x) + + for agent_id in unique_agent_ids: + user_agents_list.append((agent_dict[agent_id].name, agent_id)) + return gr.Dropdown(choices=user_agents_list, value=user_agents_list[0][1] if user_agents_list else None, label="User Agent Selection") + +def create_bot_agent_dropdown(environment_id, user_agent_id): + _, environment_dict, agent_dict, relationship_dict = get_sotopia_profiles() + environment, user_agent = environment_dict[environment_id], agent_dict[user_agent_id] + + bot_agent_list = [] + for neighbor_id in relationship_dict[environment.relationship][user_agent.agent_id]: + bot_agent_list.append((agent_dict[neighbor_id].name, neighbor_id)) + + return gr.Dropdown(choices=bot_agent_list, value=bot_agent_list[0][1] if bot_agent_list else None, label="Bot Agent Selection") + +def create_environment_info(environment_dropdown): + _, environment_dict, _, _ = get_sotopia_profiles() + environment = environment_dict[environment_dropdown] + text = environment.scenario + return gr.Textbox(label="Scenario", lines=1, value=text) + +def create_user_info(user_agent_dropdown): + _, _, agent_dict, _ = get_sotopia_profiles() + user_agent = agent_dict[user_agent_dropdown] + text = f"{user_agent.background} {user_agent.personality}" + return gr.Textbox(label="User Agent Profile", lines=4, value=text) + +def create_bot_info(bot_agent_dropdown): + _, _, agent_dict, _ = get_sotopia_profiles() + bot_agent = agent_dict[bot_agent_dropdown] + text = f"{bot_agent.background} {bot_agent.personality}" + return gr.Textbox(label="Bot Agent Profile", lines=4, value=text) + +def 
create_user_goal(environment_dropdown): + _, environment_dict, _, _ = get_sotopia_profiles() + text = environment_dict[environment_dropdown].agent_goals[0] + text = text.replace('(', '').replace(')', '') + if "" in text: + text = text.replace("", "\n\n") + text = text.replace("", "\n") + if "" in text: + text = text.replace("", "\n\n") + text = text.replace("", "\n") + return gr.Textbox(label="User Agent Goal", lines=4, value=text) + +def create_bot_goal(environment_dropdown): + _, environment_dict, _, _ = get_sotopia_profiles() + text = environment_dict[environment_dropdown].agent_goals[1] + text = text.replace('(', '').replace(')', '') + if "" in text: + text = text.replace("", "\n\n") + text = text.replace("", "\n") + if "" in text: + text = text.replace("", "\n\n") + text = text.replace("", "\n") + return gr.Textbox(label="Bot Agent Goal", lines=4, value=text) + +def sotopia_info_accordion(accordion_visible=True): + environments, _, _, _ = get_sotopia_profiles() + + with gr.Accordion("Create your sotopia space!", open=accordion_visible): + with gr.Row(): + environment_dropdown = gr.Dropdown( + choices=environments, + label="Scenario Selection", + value=environments[0][1] if environments else None, + interactive=True, + ) + model_name_dropdown = gr.Dropdown( + choices=MODEL_OPTIONS, + value=DEFAULT_MODEL_SELECTION, + interactive=True, + label="Model Selection" + ) + + with gr.Row(): + user_agent_dropdown = create_user_agent_dropdown(environment_dropdown.value) + bot_agent_dropdown = create_bot_agent_dropdown(environment_dropdown.value, user_agent_dropdown.value) + + with gr.Accordion("Check your social task!", open=accordion_visible): + + scenario_info_display = create_environment_info(environment_dropdown.value) + + with gr.Row(): + bot_goal_display = create_bot_goal(environment_dropdown.value) + user_goal_display = create_user_goal(environment_dropdown.value) + + + + with gr.Row(): + bot_agent_info_display = create_bot_info(bot_agent_dropdown.value) + 
user_agent_info_display = create_user_info(user_agent_dropdown.value) + + # Update user dropdown when scenario changes + environment_dropdown.change(fn=create_user_agent_dropdown, inputs=[environment_dropdown], outputs=[user_agent_dropdown]) + # Update bot dropdown when user or scenario changes + user_agent_dropdown.change(fn=create_bot_agent_dropdown, inputs=[environment_dropdown, user_agent_dropdown], outputs=[bot_agent_dropdown]) + # Update scenario information when scenario changes + environment_dropdown.change(fn=create_environment_info, inputs=[environment_dropdown], outputs=[scenario_info_display]) + # Update user agent profile when user changes + user_agent_dropdown.change(fn=create_user_info, inputs=[user_agent_dropdown], outputs=[user_agent_info_display]) + # Update bot agent profile when bot changes + bot_agent_dropdown.change(fn=create_bot_info, inputs=[bot_agent_dropdown], outputs=[bot_agent_info_display]) + # Update user goal when scenario changes + environment_dropdown.change(fn=create_user_goal, inputs=[environment_dropdown], outputs=[user_goal_display]) + # Update bot goal when scenario changes + environment_dropdown.change(fn=create_bot_goal, inputs=[environment_dropdown], outputs=[bot_goal_display]) + + return model_name_dropdown, environment_dropdown, user_agent_dropdown, bot_agent_dropdown + +def instructions_accordion(instructions, according_visible=False): + with gr.Accordion("Instructions", open=False, visible=according_visible): + instructions = gr.Textbox( + lines=10, + value=instructions, + interactive=False, + placeholder="Instructions", + show_label=False, + max_lines=10, + visible=False, + ) + return instructions + +def chat_tab(): + # history are input output pairs + _, environment_dict, agent_dict, _ = get_sotopia_profiles() + def run_chat( + message, + history, + environment_selection, + user_agent_dropdown, + bot_agent_dropdown, + model_selection:str + ): + environment = environment_dict[environment_selection] + user_agent = 
# Model identifiers offered as ``choices`` of the "Model Selection" dropdown.
# Every active entry ends with a comma: without it, re-enabling one of the
# commented-out names below would make Python silently concatenate two
# adjacent string literals into a single bogus model name.
MODEL_OPTIONS = [
    "gpt-3.5-turbo",
    "gpt-4",
    "gpt-4-turbo",
    "cmu-lti/sotopia-pi-mistral-7b-BC_SR",
    "cmu-lti/sotopia-pi-mistral-7b-BC_SR_4bit",
    "mistralai/Mistral-7B-Instruct-v0.1",
    # "mistralai/Mixtral-8x7B-Instruct-v0.1",
    # "togethercomputer/llama-2-7b-chat",
    # "togethercomputer/llama-2-70b-chat",
    # "togethercomputer/mpt-30b-chat",
    # "together_ai/togethercomputer/llama-2-7b-chat",
    # "together_ai/togethercomputer/falcon-7b-instruct",
]
"Llama-2-13B-chat", "hf_model_id": "meta-llama/Llama-2-13b-chat-hf"}, + "Llama-2-70b-chat-hf.nosp": {"pretty_name": "Llama-2-70B-chat", "hf_model_id": "meta-llama/Llama-2-70b-chat-hf"}, + "Llama-2-7b-chat-hf.nosp": {"pretty_name": "Llama-2-7B-chat", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"}, + "Llama-2-7b-chat-hf": {"pretty_name": "Llama-2-7B-chat (+sys prmpt)", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"}, + "Mistral-7B-Instruct-v0.1": {"pretty_name": "Mistral-7B-Instruct", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.1"}, + "Mistral-7B-Instruct-v0.2": {"pretty_name": "Mistral-7B-Instruct (v0.2)", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.2"}, + "Mixtral-8x7B-Instruct-v0.1": {"pretty_name": "Mixtral-8x7B-Instruct", "hf_model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1"}, + "Nous-Hermes-2-Mixtral-8x7B-DPO": {"pretty_name": "Nous-Hermes-2-Mixtral-8x7B-DPO", "hf_model_id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"}, + "Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B"}, + "gemini-1.0-pro": {"pretty_name": "gemini-1.0-pro", "hf_model_id": "https://blog.google/technology/ai/google-gemini-ai/"}, + "gemma-7b-it": {"pretty_name": "Gemma-7B-it", "hf_model_id": "google/gemma-7b"}, + "gpt-3.5-turbo-0125": {"pretty_name": "gpt-3.5-turbo-0125", "hf_model_id": "https://platform.openai.com/"}, + "gpt-4-0125-preview": {"pretty_name": "gpt-4-0125-preview", "hf_model_id": "https://platform.openai.com/"}, + "tulu-2-dpo-70b": {"pretty_name": "Tulu-2-dpo-70b", "hf_model_id": "cmu-lti/tulu-2-dpo-70b"}, + "vicuna-13b-v1.5": {"pretty_name": "Vicuna-13b-v1.5", "hf_model_id": "lmsys/vicuna-13b-v1.5"}, + "zephyr-7b-beta": {"pretty_name": "Zephyr-7b-beta", "hf_model_id": "HuggingFaceH4/zephyr-7b-beta"}, + "mistral-large-2402": {"pretty_name": "Mistral-Large", "hf_model_id": "https://mistral.ai/news/mistral-large/"}, + "claude-3-opus-20240229": {"pretty_name": "Claude 3 Opus", "hf_model_id": "https://www.anthropic.com/claude"}, + 
"claude-3-sonnet-20240229": {"pretty_name": "Claude 3 Sonnet", "hf_model_id": "https://www.anthropic.com/claude"}, + "zephyr-7b-gemma-v0.1": {"pretty_name": "Zephyr-7b-Gemma", "hf_model_id": "HuggingFaceH4/zephyr-7b-gemma-v0.1"}, + "Starling-LM-7B-beta": {"pretty_name": "StarlingLM-7B-beta", "hf_model_id": "Nexusflow/Starling-LM-7B-beta"}, + "dbrx-instruct": {"pretty_name": "DBRX Instruct", "hf_model_id": "databricks/dbrx-instruct"} +} diff --git a/sotopia_space/utils.py b/sotopia_space/utils.py new file mode 100644 index 0000000..7864aab --- /dev/null +++ b/sotopia_space/utils.py @@ -0,0 +1,223 @@ +from datasets import load_dataset, Dataset +import os +import json +from datasets import load_dataset +from datasets.utils.logging import disable_progress_bar # type: ignore +from ui_constants import column_names, all_task_types +import random +disable_progress_bar() +import math +from sotopia_space.constants import MODEL_INFO + +id_to_data = None +model_len_info = None + + +def make_clickable_model(model_name): + global MODEL_INFO + if model_name in MODEL_INFO: + if MODEL_INFO[model_name]["hf_model_id"].startswith("http"): + link = MODEL_INFO[model_name]["hf_model_id"] + return f'🔒 {MODEL_INFO[model_name]["pretty_name"]}' + else: + link = f"https://huggingface.co/{MODEL_INFO[model_name]['hf_model_id']}" + return f'🔥 {MODEL_INFO[model_name]["pretty_name"]}' + else: + return model_name + + +def styled_error(error): + return f"

{error}

" + +def styled_warning(warn): + return f"

{warn}

" + +def styled_message(message): + return f"

{message}

" + + +def estimated_win_rate(elo_a, elo_b, LP=0): + """ + Calculate the estimated win rate for player A against player B using their Elo ratings. + :param elo_a: Elo rating of player A + :param elo_b: Elo rating of player B + :return: Estimated win rate for player A + """ + exponent = (elo_b - elo_a)*(10**LP) / 400 + probability_a_wins = 1 / (1 + 10 ** exponent) + return (1-probability_a_wins)*100 + + + +# Formats the columns +def formatter(x): + if type(x) is str: + x = x + else: + x = round(x, 1) + return x + + +def add_winrates(current_df, LP=0): + df = current_df.copy() + elo_column = "Task-Avg Elo" + + # Correct way to filter the DataFrame and get the Elo rating for "gpt-4-0125-preview" + model_a_elo = df[df["Model"].str.contains("gpt-4")][elo_column].iloc[0] + + # Correct way to filter the DataFrame and get the Elo rating for "gpt-3.5-turbo-0125" + model_b_elo = df[df["Model"].str.contains("gpt-3.5")][elo_column].iloc[0] + + + # Calculate the win rate of "gpt-4-0125-preview" against all models + df['Win% vs GPT-4'] = df[elo_column].apply(lambda x: estimated_win_rate(model_a_elo, x, LP=LP)).apply(formatter) + df['Win% vs GPT-3.5T'] = df[elo_column].apply(lambda x: estimated_win_rate(model_b_elo, x, LP=LP)).apply(formatter) + # apply the formatter for the two new columns + cols = list(df.columns) + cols.remove("# battles"); cols.append("# battles") + cols.remove("Length"); cols.append("Length") + df = df[cols] + return df + +def add_winrates_tasks(current_df, ref="gpt-4", LP=0): + new_df = current_df.copy() + for t in all_task_types: + column = column_names[t] + model_a_elo = current_df[current_df["Model"].str.contains(ref)][column].iloc[0] + new_df[column] = current_df[column].apply(lambda x: estimated_win_rate(model_a_elo, x, LP=LP)).apply(formatter) + return new_df + + +def post_processing(df, model_len_info): + if model_len_info: + df["Length"] = df["model name "].apply(lambda x: model_len_info[x]["avg_len"]) + + for col in df.columns: + if col == "model 
def apply_length_penalty(original_df, ablation_df, length_penalty=0.2, mode='v1', LP_original_dfs=None):
    """Return ``original_df`` unchanged; the length-penalty feature is disabled.

    ``ablation_df``, ``length_penalty``, ``mode`` and ``LP_original_dfs`` are
    accepted but currently ignored.  The previous implementation is kept below
    for reference only: it is part of this docstring and never executes.

    Disabled implementation::

        if mode == 'v2' and LP_original_dfs is not None:
            L = f"{length_penalty:.1f}"
            return LP_original_dfs[L]
        original_df = original_df.copy()
        ablation_df = ablation_df.copy()
        # replace all values in original_df with the values as z = x - y * length_penalty where y is from ablation_df at the same row and column
        # except for the "Model" column and the "# battles" column
        # do not assume the order of the rows are the same in both dataframes
        for i, row in original_df.iterrows():
            for col in original_df.columns:
                if col == "Model" or col == "# battles" or col == "Length":
                    continue
                # assert that the model names are the same in both dataframes
                assert original_df.at[i, "Model"] == ablation_df[ablation_df["Model"] == row["Model"]]["Model"].values[0]
                original_df[col] = original_df[col].astype(float)
                if mode == "v1":
                    original_df.at[i, col] = original_df.at[i, col] - ablation_df[ablation_df["Model"] == row["Model"]][col].values[0] * length_penalty
                elif mode == "v1.1":
                    diff = original_df.at[i, col] - ablation_df[ablation_df["Model"] == row["Model"]][col].values[0]
                    original_df.at[i, col] = original_df.at[i, col] * (1-length_penalty) + diff*length_penalty
        # post_processing
        original_df = post_processing(original_df, model_len_info=None)
    """
    return original_df
def sample_an_eval_result(eval_results, model_list=None, tag_list=None):
    """Pick one random evaluation record that passes the model and tag filters.

    The records are shuffled and the first acceptable one is returned.  A
    record is skipped when:

    * ``model_list`` has two or more names and either compared model is not
      in it, or it has exactly one name and neither compared model equals it;
    * ``tag_list`` is non-empty and shares no tag with the record;
    * either model output is empty or whitespace only.

    Session details ("conversation_input", "intent", "checklist") are read
    from the module-level ``id_to_data`` mapping, keyed by session id; that
    mapping must have been populated before this function is called.

    Args:
        eval_results: Iterable of evaluation records (mappings).
        model_list: Optional model names to restrict the sample to.
        tag_list: Optional tags; a record needs at least one of them.

    Returns:
        A dict describing the sampled comparison, or ``None`` when no record
        qualifies (previously this case raised ``UnboundLocalError``).
    """
    # ``None`` defaults instead of shared mutable ``[]`` defaults.
    model_list = list(model_list) if model_list else []
    wanted_tags = set(tag_list) if tag_list else set()

    candidates = list(eval_results)
    random.shuffle(candidates)
    for eval_item in candidates:
        assignment = eval_item['assignment']
        model_1, model_2 = eval_item['model_1'], eval_item['model_2']
        model_A = model_1 if assignment['A'] == model_1 else model_2
        model_B = model_2 if assignment['B'] == model_2 else model_1

        if len(model_list) >= 2:
            if model_A not in model_list or model_B not in model_list:
                continue
        elif len(model_list) == 1:
            if model_A != model_list[0] and model_B != model_list[0]:
                continue

        if wanted_tags and wanted_tags.isdisjoint(eval_item['tags']):
            continue

        winner = eval_item['winner']
        task_type = eval_item['tags'][0]  # primary task type
        chat_history = eval_item['history']
        last_query = eval_item['last_query']

        model_A_output = eval_item['model_1_output'] if model_1 == model_A else eval_item['model_2_output']
        model_B_output = eval_item['model_2_output'] if model_2 == model_B else eval_item['model_1_output']
        if not model_A_output.strip() or not model_B_output.strip():
            continue

        session_id = eval_item['session_id']
        session = id_to_data[session_id]
        return {
            "session_id": session_id,
            "model_A": model_A,
            "model_B": model_B,
            "winner": winner,
            "intent": session["intent"],
            "task_type": task_type,
            "all_tags": eval_item['tags'],
            "chat_history": chat_history,
            "last_query": last_query,
            "conversation_input": session["conversation_input"],
            "model_A_output": model_A_output,
            "model_B_output": model_B_output,
            "reason": eval_item['parsed_result']["reason"],
            "choice": eval_item['parsed_result']["choice"],
            "checklist": session["checklist"],
        }
    # Nothing passed the filters (or the input was empty).
    return None
Banner
' + +TITLE = "

🦁 AI2 sotopia Leaderboard " + +WINRATE_HEATMAP = "
" + +CITATION_TEXT = """@inproceedings{ +zhou2024sotopia, +title={{SOTOPIA}: Interactive Evaluation for Social Intelligence in Language Agents}, +author={Xuhui Zhou and Hao Zhu and Leena Mathur and Ruohong Zhang and Haofei Yu and Zhengyang Qi and Louis-Philippe Morency and Yonatan Bisk and Daniel Fried and Graham Neubig and Maarten Sap}, +booktitle={The Twelfth International Conference on Learning Representations}, +year={2024}, +url={https://openreview.net/forum?id=mM7VurbA4r} +} +""" + + +column_names = { + "model name ": "Model", + "elo overall": "Overall Elo", + 'Information seeking': 'InfoSek', + 'Creative Writing': 'CrtWrt', + 'Coding & Debugging': 'Code', + 'Reasoning': 'Reason', + 'Editing': 'Edit', + 'Math': 'Math', + 'Planning': 'Plan', + 'Brainstorming': 'Brnstrm', + 'Role playing': 'RolPly', + 'Advice seeking': 'AdvSek', + 'Data Analysis': 'DataAna', + 'Others': 'Misc', + "average": "Task-Avg Elo", +} + +all_task_types = [ + 'Information seeking', + 'Creative Writing', + 'Coding & Debugging', + 'Reasoning', + 'Editing', + 'Math', + 'Planning', + 'Brainstorming', + 'Role playing', + 'Advice seeking', + 'Data Analysis', + 'Others' +] + + + +js_light = """ +function refresh() { + const url = new URL(window.location); + if (url.searchParams.get('__theme') !== 'light') { + url.searchParams.set('__theme', 'light'); + window.location.href = url.href; + } +} +""" + +js_code = """ +function scroll_top() { + console.log("Hello from Gradio!"); + const bubbles = document.querySelectorAll('.bubble-wrap'); + bubbles.forEach((bubble, index) => { + setTimeout(() => { + bubble.scrollTop = 0; + }, index * 100); // Delay of 100ms between each iteration + }); +} +""" + + +TASK_TYPE_STR = "**Tasks**: Info seeking (**InfoSek**), Creative Writing (**CrtWrt**), Coding&Debugging (**Code**), Reasoning (**Reason**), Editing (**Edit**), **Math**, Planning (**Plan**), Brainstorming (**Brnstrm**), Role playing (**RolPly**), Advice seeking (**AdvSek**), Data Analysis (**DataAna**)" + 
+css = """ +code { + font-size: large; +} +footer {visibility: hidden} +.top-left-LP{ + margin-top: 6px; + margin-left: 5px; +} +.markdown-text{font-size: 14pt} +.markdown-text-small{font-size: 13pt} +.markdown-text-tiny{font-size: 12pt} +.markdown-text-tiny-red{ + font-size: 12pt; + color: red; + background-color: yellow; + font-color: red; + font-weight: bold; +} +th { + text-align: center; + font-size: 17px; /* Adjust the font size as needed */ +} +td { + font-size: 15px; /* Adjust the font size as needed */ + text-align: center; +} +.sample_button{ + border: 1px solid #000000; + border-radius: 5px; + padding: 5px; + font-size: 15pt; + font-weight: bold; + margin: 5px; +} +.chat-common{ + height: auto; + max-height: 400px; + min-height: 100px; +} +.chat-specific{ + height: auto; + max-height: 600px; + min-height: 200px; +} +#od-benchmark-tab-table-button{ + font-size: 15pt; + font-weight: bold; +} +.btn_boderline{ + border: 1px solid #000000; + border-radius: 5px; + padding: 5px; + margin: 5px; + font-size: 15pt; + font-weight: bold; +} +.btn_boderline_next{ + border: 0.1px solid #000000; + border-radius: 5px; + padding: 5px; + margin: 5px; + font-size: 15pt; + font-weight: bold; +} +.btn_boderline_gray{ + border: 0.5px solid gray; + border-radius: 5px; + padding: 5px; + margin: 5px; + font-size: 15pt; + font-weight: italic; +} +.btn_boderline_selected{ + border: 2px solid purple; + background-color: #f2f2f2; + border-radius: 5px; + padding: 5px; + margin: 5px; + font-size: 15pt; + font-weight: bold; +} +.accordion-label button span{ + font-size: 14pt; + font-weight: bold; +} +#select-models span{ + font-size: 10pt; +} +#select-tasks span{ + font-size: 10pt; +} +.markdown-text-details{ + margin: 10px; + padding: 10px; +} +button.selected[role="tab"][aria-selected="true"] { + font-size: 18px; /* or any other size you prefer */ + font-weight: bold; +} +#od-benchmark-tab-table-ablation-button { + font-size: larger; /* Adjust the font size as needed */ +} 
+.plotly-plot{ + height: auto; + max-height: 600px; + min-height: 600px; +} +""" \ No newline at end of file