-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Update start_app.sh to use gradio instead of python app.py * fixed action typing error --------- Co-authored-by: Jasonqi146 <[email protected]>
- Loading branch information
1 parent
7d4925b
commit 0adb6ea
Showing
11 changed files
with
908 additions
and
329 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
{"model_name": "GPT-4", "SOC [-10, 0]": -0.07, "SEC [-10, 0]": -0.14, "FIN [-5, 5]": 0.81, "REL [-5, 5]": 1.94, "KNO [0, 10]": 3.73, "GOAL [0, 10]": 7.62, "BEL [0, 10]": 9.28} | ||
{"model_name": "GPT-3.5", "SOC [-10, 0]": -0.08, "SEC [-10, 0]": -0.08, "FIN [-5, 5]": 0.46, "REL [-5, 5]": 1.23, "KNO [0, 10]": 3.4, "GOAL [0, 10]": 6.45, "BEL [0, 10]": 9.15} | ||
{"model_name": "Llama-2", "SOC [-10, 0]": -0.11, "SEC [-10, 0]": -0.14, "FIN [-5, 5]": 0.4, "REL [-5, 5]": 0.91, "KNO [0, 10]": 3.11, "GOAL [0, 10]": 5.38, "BEL [0, 10]": 8.1} | ||
{"model_name": "MPT", "SOC [-10, 0]": -0.09, "SEC [-10, 0]": -0.07, "FIN [-5, 5]": 0.28, "REL [-5, 5]": 0.58, "KNO [0, 10]": 2.11, "GOAL [0, 10]": 4.1, "BEL [0, 10]": 6.17} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
<br/> | ||
|
||
# Sotopia Space: A Huggingface Space for the Sotopia projects | ||
[⚙️ GitHub](https://github.com/sotopia-lab) | [🤗 HuggingFace](https://huggingface.co/collections/cmu-lti/sotopia-65f312c1bd04a8c4a9225e5b) | [💬 Discussions](https://github.com/orgs/sotopia-lab/discussions) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
import gradio as gr # type: ignore | ||
import pandas as pd | ||
from sotopia_space.constants import MODEL_OPTIONS | ||
from sotopia_space.utils import estimated_win_rate, make_clickable_model, styled_error, styled_warning, styled_message,apply_length_penalty | ||
|
||
# Length-penalty configuration shared by all leaderboard tabs.
LP_MODE = "v2"        # version of the length-penalty formula passed to apply_length_penalty
LP_original_dfs = {}  # cache of pre-adjusted dataframes keyed by penalty, reused across slider moves
DEFAULT_LP = 0.5      # initial value of the length-penalty slider

available_models = []  # to be filled in later

# Leaderboard dataframes; populated lazily (e.g. by benchmark_table).
# NOTE: previously assigned twice at module level — the duplicate assignment was removed.
original_df, ablation_df = None, None
|
||
def slider_change_main(length_penalty):
    """Recompute the main leaderboard view for a new length-penalty value.

    Reads the module-level `original_df`/`ablation_df`, applies the
    penalty in `LP_MODE`, and returns the summary columns ranked by
    "Overall Elo" (best first) with a 1-based "Rank" column prepended.
    """
    global original_df, ablation_df, LP_MODE
    table = apply_length_penalty(
        original_df,
        ablation_df,
        length_penalty,
        mode=LP_MODE,
        LP_original_dfs=LP_original_dfs,
    )
    # Keep only the headline columns, strongest model first.
    table = table[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
    table = table.sort_values(by="Overall Elo", ascending=False)
    table.insert(0, "Rank", range(1, 1 + len(table)))
    return table
|
||
def slider_change_full(length_penalty, show_winrate):
    """Recompute the full per-task leaderboard for a new length penalty.

    `show_winrate` selects an optional reference model for win-rate
    columns: "gpt-3.5" or "gpt-4"; any other value (e.g. "none")
    returns the raw per-task scores.
    """
    global original_df, ablation_df, LP_MODE
    table = apply_length_penalty(
        original_df,
        ablation_df,
        length_penalty,
        mode=LP_MODE,
        LP_original_dfs=LP_original_dfs,
    )
    # Order by overall strength, then drop the summary columns so only
    # the per-task scores remain.
    table = table.sort_values(by="Overall Elo", ascending=False)
    table.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"], inplace=True)
    if show_winrate in ("gpt-3.5", "gpt-4"):
        # NOTE(review): add_winrates_tasks is not among this module's
        # visible imports — confirm where it is defined/imported.
        table = add_winrates_tasks(table, ref=show_winrate, LP=length_penalty)
    table.insert(0, "Rank", range(1, 1 + len(table)))
    return table
|
||
def benchmark_table():
    """Build the Sotopia benchmark leaderboard UI.

    Must be called inside an active ``gr.Blocks``/tab context: every
    gradio component constructed here attaches to the enclosing layout.
    Side effect: loads ``data_dir/models_vs_gpt35.jsonl`` into the
    module-level ``original_df``.
    """
    global original_df, ablation_df
    global LP_original_dfs, LP_MODE

    gr.Markdown(f"**Version**: sotopia (v1.01; 2024.04.22) | **# Examples**: 7200 | **# Models**: {len(MODEL_OPTIONS)} | **# Comparisons**: x", elem_classes="markdown-text")

    with gr.TabItem("Vs GPT-3.5", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
        # Load the baseline-comparison results and apply the default penalty.
        original_df = pd.read_json('data_dir/models_vs_gpt35.jsonl', lines=True)
        table = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
        # Best goal-completion score first, with a 1-based Rank column in front.
        table = table.sort_values(by="GOAL [0, 10]", ascending=False)
        table.insert(0, "Rank", range(1, 1 + len(table)))

        with gr.Row():
            with gr.Column(scale=4):
                gr.Markdown("**Vs GPT3.5**: The interlocutors are compared against GPT-3.5, the baseline model.")
            with gr.Column(scale=1):
                length_penalty_slider = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP, label="Length Penalty", elem_id="length-penalty-slider")

        # Column renderers for the Dataframe component (Rank, Model, scores).
        COLUMN_TYPES = ["number", "markdown", "number"]
        leaderboard_table = gr.components.Dataframe(
            value=table,
            datatype=COLUMN_TYPES,
            height=1000,
            elem_id="leaderboard-table",
            interactive=False,
            visible=True,
            min_width=60,
        )
        # Slider wiring kept disabled, as in the original:
        # length_penalty_slider.change(fn=slider_change_main, inputs=[length_penalty_slider], outputs=[leaderboard_table])
Oops, something went wrong.