Leaderboard and Unified UI (#61)
* Update start_app.sh to use gradio instead of python app.py

* fixed action typing error

---------

Co-authored-by: Jasonqi146 <[email protected]>
XuhuiZhou and Jasonqi146 authored Apr 24, 2024
1 parent 7d4925b commit 0adb6ea
Showing 11 changed files with 908 additions and 329 deletions.
13 changes: 13 additions & 0 deletions README.md
@@ -11,3 +11,16 @@ license: apache-2.0
---

This repository is synced with the Hugging Face Space for the Sotopia project: [Sotopia-demo](https://huggingface.co/spaces/wdplx/Sotopia-demo)

## Getting Started

```bash
conda create -n sotopia-space python=3.11; conda activate sotopia-space
python -m pip install -r requirements.txt
```

To launch the app, run:

```bash
bash start_app.sh
```
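
`start_app.sh` itself is not rendered in this view; per the first commit-message bullet it now launches the app through the `gradio` CLI instead of calling `python app.py` directly. A minimal sketch of such a script, under that assumption (the actual script may set additional options):

```bash
#!/bin/bash
# Hypothetical sketch: launch the Gradio app via the gradio CLI
# (reload mode) rather than invoking python app.py directly.
gradio app.py
```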
379 changes: 65 additions & 314 deletions app.py

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions data_dir/models_vs_gpt35.jsonl
@@ -0,0 +1,4 @@
{"model_name": "GPT-4", "SOC [-10, 0]": -0.07, "SEC [-10, 0]": -0.14, "FIN [-5, 5]": 0.81, "REL [-5, 5]": 1.94, "KNO [0, 10]": 3.73, "GOAL [0, 10]": 7.62, "BEL [0, 10]": 9.28}
{"model_name": "GPT-3.5", "SOC [-10, 0]": -0.08, "SEC [-10, 0]": -0.08, "FIN [-5, 5]": 0.46, "REL [-5, 5]": 1.23, "KNO [0, 10]": 3.4, "GOAL [0, 10]": 6.45, "BEL [0, 10]": 9.15}
{"model_name": "Llama-2", "SOC [-10, 0]": -0.11, "SEC [-10, 0]": -0.14, "FIN [-5, 5]": 0.4, "REL [-5, 5]": 0.91, "KNO [0, 10]": 3.11, "GOAL [0, 10]": 5.38, "BEL [0, 10]": 8.1}
{"model_name": "MPT", "SOC [-10, 0]": -0.09, "SEC [-10, 0]": -0.07, "FIN [-5, 5]": 0.28, "REL [-5, 5]": 0.58, "KNO [0, 10]": 2.11, "GOAL [0, 10]": 4.1, "BEL [0, 10]": 6.17}
28 changes: 14 additions & 14 deletions requirements.txt
@@ -8,7 +8,7 @@ annotated-types==0.6.0
anyio==3.7.1
attrs==23.2.0
beartype==0.14.1
-bitsandbytes==0.43.1
+bitsandbytes==0.42.0
certifi==2024.2.2
cffi==1.16.0
charset-normalizer==3.3.2
@@ -68,18 +68,18 @@ mypy-extensions==1.0.0
names==0.3.0
networkx==3.3
numpy==1.26.4
-nvidia-cublas-cu12==12.1.3.1
-nvidia-cuda-cupti-cu12==12.1.105
-nvidia-cuda-nvrtc-cu12==12.1.105
-nvidia-cuda-runtime-cu12==12.1.105
-nvidia-cudnn-cu12==8.9.2.26
-nvidia-cufft-cu12==11.0.2.54
-nvidia-curand-cu12==10.3.2.106
-nvidia-cusolver-cu12==11.4.5.107
-nvidia-cusparse-cu12==12.1.0.106
-nvidia-nccl-cu12==2.19.3
-nvidia-nvjitlink-cu12==12.4.127
-nvidia-nvtx-cu12==12.1.105
+# nvidia-cublas-cu12==12.1.3.1
+# nvidia-cuda-cupti-cu12==12.1.105
+# nvidia-cuda-nvrtc-cu12==12.1.105
+# nvidia-cuda-runtime-cu12==12.1.105
+# nvidia-cudnn-cu12==8.9.2.26
+# nvidia-cufft-cu12==11.0.2.54
+# nvidia-curand-cu12==10.3.2.106
+# nvidia-cusolver-cu12==11.4.5.107
+# nvidia-cusparse-cu12==12.1.0.106
+# nvidia-nccl-cu12==2.19.3
+# nvidia-nvjitlink-cu12==12.4.127
+# nvidia-nvtx-cu12==12.1.105
openai==1.22.0
orjson==3.10.1
packaging==23.2
@@ -129,7 +129,7 @@ toolz==0.12.1
torch==2.2.2
tqdm==4.66.2
transformers==4.40.0
-triton==2.2.0
+# triton==2.2.0
typer==0.12.3
types-cffi==1.16.0.20240331
types-pyOpenSSL==24.0.0.20240417
4 changes: 4 additions & 0 deletions sotopia_space/_header.md
@@ -0,0 +1,4 @@
<br/>

# Sotopia Space: A Huggingface Space for the Sotopia projects
[⚙️ GitHub](https://github.com/sotopia-lab) | [🤗 HuggingFace](https://huggingface.co/collections/cmu-lti/sotopia-65f312c1bd04a8c4a9225e5b) | [💬 Discussions](https://github.com/orgs/sotopia-lab/discussions)
70 changes: 70 additions & 0 deletions sotopia_space/benchmark.py
@@ -0,0 +1,70 @@
import gradio as gr  # type: ignore
import pandas as pd
from sotopia_space.constants import MODEL_OPTIONS
from sotopia_space.utils import estimated_win_rate, make_clickable_model, styled_error, styled_warning, styled_message, apply_length_penalty

LP_MODE = "v2"
original_df, ablation_df = None, None
LP_original_dfs = {}
DEFAULT_LP = 0.5

available_models = []  # to be filled in later

def slider_change_main(length_penalty):
    global original_df, ablation_df, LP_MODE
    adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
    adjusted_df = adjusted_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
    adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
    # adjusted_df = add_winrates(adjusted_df, LP=length_penalty)
    # adjusted_df = adjusted_df.drop(columns=["Length"])
    adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
    return adjusted_df

def slider_change_full(length_penalty, show_winrate):
    global original_df, ablation_df, LP_MODE
    adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
    # sort the models by the "Overall Elo" column
    adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
    adjusted_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"], inplace=True)
    if show_winrate == "none":
        adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
        return adjusted_df
    elif show_winrate == "gpt-3.5":
        # add_winrates_tasks is expected to be provided elsewhere; it is not defined or imported in this file.
        adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-3.5", LP=length_penalty)
    elif show_winrate == "gpt-4":
        adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4", LP=length_penalty)
    adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
    return adjusted_df

def benchmark_table():
    global original_df, ablation_df
    global LP_original_dfs, LP_MODE

    gr.Markdown(f"**Version**: sotopia (v1.01; 2024.04.22) | **# Examples**: 7200 | **# Models**: {len(MODEL_OPTIONS)} | **# Comparisons**: x", elem_classes="markdown-text")

    with gr.TabItem("Vs GPT-3.5", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
        # original_df, ablation_df = skip_empty_original_df, skip_empty_ablation_df
        original_df = pd.read_json('data_dir/models_vs_gpt35.jsonl', lines=True)
        default_main_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
        default_main_df = default_main_df.sort_values(by="GOAL [0, 10]", ascending=False)
        # add a Rank column as the first column (starting from 1)
        default_main_df.insert(0, "Rank", range(1, 1 + len(default_main_df)))
        with gr.Row():
            with gr.Column(scale=4):
                gr.Markdown("**Vs GPT-3.5**: The interlocutors are compared against GPT-3.5, the baseline model.")
            with gr.Column(scale=1):
                length_penalty_slider = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP, label="Length Penalty", elem_id="length-penalty-slider")
        # checkbox_skip_empty = gr.Checkbox(label="Skip empty results", value=False, elem_id="skip-empty-checkbox", scale=2)
        TYPES = ["number", "markdown", "number"]
        leaderboard_table = gr.components.Dataframe(
            value=default_main_df,
            datatype=TYPES,
            # max_rows=None,
            height=1000,
            elem_id="leaderboard-table",
            interactive=False,
            visible=True,
            min_width=60,
        )
        # length_penalty_slider.change(fn=slider_change_main, inputs=[length_penalty_slider], outputs=[leaderboard_table])
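
For orientation, a hypothetical sketch of how `benchmark_table()` could be mounted in a Gradio Blocks app; the actual wiring lives in `app.py`, whose diff is not rendered above and may differ:

```python
import gradio as gr  # type: ignore

from sotopia_space.benchmark import benchmark_table

# Hypothetical wiring; the real layout in app.py may differ.
with gr.Blocks() as demo:
    with gr.Tabs():
        # benchmark_table() builds the "Vs GPT-3.5" TabItem containing the
        # leaderboard Dataframe and the length-penalty slider.
        benchmark_table()

if __name__ == "__main__":
    demo.launch()
```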
