Leaderboard and Unified UI (#61)
* Update start_app.sh to use gradio instead of python app.py

* fixed action typing error

---------

Co-authored-by: Jasonqi146 <[email protected]>
XuhuiZhou and Jasonqi146 authored Apr 24, 2024
1 parent 7d4925b commit 0adb6ea
Showing 11 changed files with 908 additions and 329 deletions.
13 changes: 13 additions & 0 deletions README.md
@@ -11,3 +11,16 @@ license: apache-2.0
---

This repository is synced with the Hugging Face Space for the Sotopia project: [Sotopia-demo](https://huggingface.co/spaces/wdplx/Sotopia-demo)

## Getting Started

```bash
conda create -n sotopia-space python=3.11; conda activate sotopia-space
python -m pip install -r requirements.txt
```

To launch the app, run:

```bash
bash start_app.sh
```
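
`start_app.sh` itself is not rendered in this view; per the first commit-message bullet it now launches the app through the `gradio` CLI instead of calling `python app.py` directly. A minimal sketch of such a script, under that assumption (the actual script may set additional options):

```bash
#!/bin/bash
# Hypothetical sketch: launch the Gradio app via the gradio CLI
# (reload mode) rather than invoking python app.py directly.
gradio app.py
```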
379 changes: 65 additions & 314 deletions app.py

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions data_dir/models_vs_gpt35.jsonl
@@ -0,0 +1,4 @@
{"model_name": "GPT-4", "SOC [-10, 0]": -0.07, "SEC [-10, 0]": -0.14, "FIN [-5, 5]": 0.81, "REL [-5, 5]": 1.94, "KNO [0, 10]": 3.73, "GOAL [0, 10]": 7.62, "BEL [0, 10]": 9.28}
{"model_name": "GPT-3.5", "SOC [-10, 0]": -0.08, "SEC [-10, 0]": -0.08, "FIN [-5, 5]": 0.46, "REL [-5, 5]": 1.23, "KNO [0, 10]": 3.4, "GOAL [0, 10]": 6.45, "BEL [0, 10]": 9.15}
{"model_name": "Llama-2", "SOC [-10, 0]": -0.11, "SEC [-10, 0]": -0.14, "FIN [-5, 5]": 0.4, "REL [-5, 5]": 0.91, "KNO [0, 10]": 3.11, "GOAL [0, 10]": 5.38, "BEL [0, 10]": 8.1}
{"model_name": "MPT", "SOC [-10, 0]": -0.09, "SEC [-10, 0]": -0.07, "FIN [-5, 5]": 0.28, "REL [-5, 5]": 0.58, "KNO [0, 10]": 2.11, "GOAL [0, 10]": 4.1, "BEL [0, 10]": 6.17}
28 changes: 14 additions & 14 deletions requirements.txt
@@ -8,7 +8,7 @@ annotated-types==0.6.0
anyio==3.7.1
attrs==23.2.0
beartype==0.14.1
-bitsandbytes==0.43.1
+bitsandbytes==0.42.0
certifi==2024.2.2
cffi==1.16.0
charset-normalizer==3.3.2
@@ -68,18 +68,18 @@ mypy-extensions==1.0.0
names==0.3.0
networkx==3.3
numpy==1.26.4
-nvidia-cublas-cu12==12.1.3.1
-nvidia-cuda-cupti-cu12==12.1.105
-nvidia-cuda-nvrtc-cu12==12.1.105
-nvidia-cuda-runtime-cu12==12.1.105
-nvidia-cudnn-cu12==8.9.2.26
-nvidia-cufft-cu12==11.0.2.54
-nvidia-curand-cu12==10.3.2.106
-nvidia-cusolver-cu12==11.4.5.107
-nvidia-cusparse-cu12==12.1.0.106
-nvidia-nccl-cu12==2.19.3
-nvidia-nvjitlink-cu12==12.4.127
-nvidia-nvtx-cu12==12.1.105
+# nvidia-cublas-cu12==12.1.3.1
+# nvidia-cuda-cupti-cu12==12.1.105
+# nvidia-cuda-nvrtc-cu12==12.1.105
+# nvidia-cuda-runtime-cu12==12.1.105
+# nvidia-cudnn-cu12==8.9.2.26
+# nvidia-cufft-cu12==11.0.2.54
+# nvidia-curand-cu12==10.3.2.106
+# nvidia-cusolver-cu12==11.4.5.107
+# nvidia-cusparse-cu12==12.1.0.106
+# nvidia-nccl-cu12==2.19.3
+# nvidia-nvjitlink-cu12==12.4.127
+# nvidia-nvtx-cu12==12.1.105
openai==1.22.0
orjson==3.10.1
packaging==23.2
@@ -129,7 +129,7 @@ toolz==0.12.1
torch==2.2.2
tqdm==4.66.2
transformers==4.40.0
-triton==2.2.0
+# triton==2.2.0
typer==0.12.3
types-cffi==1.16.0.20240331
types-pyOpenSSL==24.0.0.20240417
4 changes: 4 additions & 0 deletions sotopia_space/_header.md
@@ -0,0 +1,4 @@
<br/>

# Sotopia Space: A Huggingface Space for the Sotopia projects
[⚙️ GitHub](https://github.com/sotopia-lab) | [🤗 HuggingFace](https://huggingface.co/collections/cmu-lti/sotopia-65f312c1bd04a8c4a9225e5b) | [💬 Discussions](https://github.com/orgs/sotopia-lab/discussions)
70 changes: 70 additions & 0 deletions sotopia_space/benchmark.py
@@ -0,0 +1,70 @@
import gradio as gr  # type: ignore
import pandas as pd
from sotopia_space.constants import MODEL_OPTIONS
from sotopia_space.utils import estimated_win_rate, make_clickable_model, styled_error, styled_warning, styled_message, apply_length_penalty

LP_MODE = "v2"
original_df, ablation_df = None, None
LP_original_dfs = {}
DEFAULT_LP = 0.5

available_models = []  # to be filled in later

def slider_change_main(length_penalty):
    global original_df, ablation_df, LP_MODE
    adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
    adjusted_df = adjusted_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
    adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
    # adjusted_df = add_winrates(adjusted_df, LP=length_penalty)
    # adjusted_df = adjusted_df.drop(columns=["Length"])
    adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
    return adjusted_df

def slider_change_full(length_penalty, show_winrate):
    global original_df, ablation_df, LP_MODE
    adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
    # sort the models by the "Overall Elo" column
    adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
    adjusted_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"], inplace=True)
    if show_winrate == "none":
        adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
        return adjusted_df
    elif show_winrate == "gpt-3.5":
        # add_winrates_tasks is expected to be provided elsewhere; it is not defined or imported in this file.
        adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-3.5", LP=length_penalty)
    elif show_winrate == "gpt-4":
        adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4", LP=length_penalty)
    adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
    return adjusted_df

def benchmark_table():
    global original_df, ablation_df
    global LP_original_dfs, LP_MODE

    gr.Markdown(f"**Version**: sotopia (v1.01; 2024.04.22) | **# Examples**: 7200 | **# Models**: {len(MODEL_OPTIONS)} | **# Comparisons**: x", elem_classes="markdown-text")

    with gr.TabItem("Vs GPT-3.5", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
        # original_df, ablation_df = skip_empty_original_df, skip_empty_ablation_df
        original_df = pd.read_json('data_dir/models_vs_gpt35.jsonl', lines=True)
        default_main_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
        default_main_df = default_main_df.sort_values(by="GOAL [0, 10]", ascending=False)
        # add a Rank column as the first column (starting from 1)
        default_main_df.insert(0, "Rank", range(1, 1 + len(default_main_df)))
        with gr.Row():
            with gr.Column(scale=4):
                gr.Markdown("**Vs GPT-3.5**: The interlocutors are compared against GPT-3.5, the baseline model.")
            with gr.Column(scale=1):
                length_penalty_slider = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP, label="Length Penalty", elem_id="length-penalty-slider")
        # checkbox_skip_empty = gr.Checkbox(label="Skip empty results", value=False, elem_id="skip-empty-checkbox", scale=2)
        TYPES = ["number", "markdown", "number"]
        leaderboard_table = gr.components.Dataframe(
            value=default_main_df,
            datatype=TYPES,
            # max_rows=None,
            height=1000,
            elem_id="leaderboard-table",
            interactive=False,
            visible=True,
            min_width=60,
        )
        # length_penalty_slider.change(fn=slider_change_main, inputs=[length_penalty_slider], outputs=[leaderboard_table])
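
For orientation, a hypothetical sketch of how `benchmark_table()` could be mounted in a Gradio Blocks app; the actual wiring lives in `app.py`, whose diff is not rendered above and may differ:

```python
import gradio as gr  # type: ignore

from sotopia_space.benchmark import benchmark_table

# Hypothetical wiring; the real layout in app.py may differ.
with gr.Blocks() as demo:
    with gr.Tabs():
        # benchmark_table() builds the "Vs GPT-3.5" TabItem containing the
        # leaderboard Dataframe and the length-penalty slider.
        benchmark_table()

if __name__ == "__main__":
    demo.launch()
```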
