Skip to content

Commit

Permalink
Reduce repeated Style Control leaderboard code
Browse files Browse the repository at this point in the history
  • Loading branch information
lisadunlap committed Dec 10, 2024
1 parent 7f4c09f commit 4686e7d
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 38 deletions.
64 changes: 33 additions & 31 deletions fastchat/serve/monitor/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,8 +442,11 @@ def build_arena_tab(
for k in key_to_category_name.keys():
if k not in elo_results:
continue
arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
category_elo_results[key_to_category_name[k]] = elo_results[k]
category_name = key_to_category_name[k.replace("_style_control", "")]
if "_style_control" in k:
category_name = f"{category_name} w/ Style Control"
arena_dfs[category_name] = elo_results[k]["leaderboard_table_df"]
category_elo_results[category_name] = elo_results[k]

arena_df = arena_dfs["Overall"]

Expand Down Expand Up @@ -791,7 +794,7 @@ def highlight_top_3(s):
style = style.background_gradient(
cmap="Blues",
subset=category_names,
vmin=1150,
vmin=category_df[category_names].max().max() - 250,
vmax=category_df[category_names].max().max(),
)

Expand All @@ -814,10 +817,6 @@ def build_category_leaderboard_tab(
combined_elo_df, categories, "rating"
)
sort_ranking = lambda _: get_arena_category_table(combined_elo_df, categories)
with gr.Row():
gr.Markdown(
f"""&emsp; <span style='font-weight: bold; font-size: 150%;'>Chatbot Arena Overview</span>"""
)

overall_ranking_leaderboard = gr.Dataframe(
headers=["Model"] + [key_to_category_name[k] for k in categories],
Expand Down Expand Up @@ -852,6 +851,20 @@ def build_category_leaderboard_tab(
]
selected_categories_width = [110, 110, 110, 110, 80, 80, 80, 110, 80, 80]

# Category keys for the vision (VLM) arena leaderboard tab. Each key is looked
# up in key_to_category_name; keys ending in "_style_control" are rendered as
# "<category> w/ Style Control" by the tab-building code.
vision_categories = [
    "full",
    "full_style_control",
    "captioning",
    "captioning_style_control",
    "entity_recognition",
    "ocr",
    "creative_writing_vision",
    "homework",
    "diagram",
    "no_refusal",
]
# Column widths for the vision leaderboard table.
# NOTE(review): assumed to correspond one-to-one (in order) with
# vision_categories, as with selected_categories_width above — confirm.
vision_categories_width = [110, 110, 100, 110, 110, 60, 80, 80, 80, 80]

language_categories = [
"english",
"chinese",
Expand Down Expand Up @@ -963,16 +976,26 @@ def build_leaderboard_tab(
combined_table = get_combined_table(elo_results_text, model_table_df)
build_category_leaderboard_tab(
combined_table,
"Task",
"LLM Task",
selected_categories,
selected_categories_width,
)
build_category_leaderboard_tab(
combined_table,
"Language",
"LLM Language",
language_categories,
language_categories_width,
)
if elo_results_vision is not None:
vision_combined_table = get_combined_table(
elo_results_vision, model_table_df
)
build_category_leaderboard_tab(
vision_combined_table,
"VLM Task",
vision_categories,
vision_categories_width,
)
gr.Markdown(
f"""
***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model.
Expand Down Expand Up @@ -1074,31 +1097,10 @@ def build_demo(elo_results_file, leaderboard_table_file, arena_hard_leaderboard)
from fastchat.serve.gradio_web_server import block_css

text_size = gr.themes.sizes.text_lg
# load theme from theme.json
theme = gr.themes.Default.load("theme.json")
# set text size to large
theme.text_size = text_size
theme.set(
button_large_text_size="20px",
button_small_text_size="20px",
button_large_text_weight="100",
button_small_text_weight="100",
button_shadow="*shadow_drop_lg",
button_shadow_hover="*shadow_drop_lg",
checkbox_label_shadow="*shadow_drop_lg",
button_shadow_active="*shadow_inset",
button_secondary_background_fill="*primary_300",
button_secondary_background_fill_dark="*primary_700",
button_secondary_background_fill_hover="*primary_200",
button_secondary_background_fill_hover_dark="*primary_500",
button_secondary_text_color="*primary_800",
button_secondary_text_color_dark="white",
)

with gr.Blocks(
title="Chatbot Arena Leaderboard",
# theme=gr.themes.Default(text_size=text_size),
theme=theme,
theme=gr.themes.Default(text_size=text_size),
css=block_css,
) as demo:
with gr.Tabs() as tabs:
Expand Down
13 changes: 6 additions & 7 deletions fastchat/serve/monitor/monitor_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,15 @@

key_to_category_name = {
"full": "Overall",
"full_style_control": "Overall w/ Style Control",
"dedup": "De-duplicate Top Redundant Queries (soon to be default)",
"math": "Math",
"if": "Instruction Following",
"multiturn": "Multi-Turn",
"creative_writing": "Creative Writing",
"creative_writing_vision": "Creative Writing",
"coding": "Coding",
"coding_style_control": "Coding w/ Style Control",
"hard_6": "Hard Prompts",
"hard_english_6": "Hard Prompts (English)",
"hard_6_style_control": "Hard Prompts w/ Style Control",
"long_user": "Longer Query",
"english": "English",
"chinese": "Chinese",
Expand All @@ -49,15 +47,12 @@
}
cat_name_to_explanation = {
"Overall": "Overall Questions",
"Overall w/ Style Control": "Overall Leaderboard with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).",
"De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
"Math": "Math",
"Instruction Following": "Instruction Following",
"Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
"Coding": "Coding: whether conversation contains code snippets",
"Coding w/ Style Control": "Coding with Style Control",
"Hard Prompts": "Hard Prompts: details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
"Hard Prompts w/ Style Control": "Hard Prompts with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).",
"Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
"Longer Query": "Longer Query (>= 500 tokens)",
"English": "English Prompts",
Expand Down Expand Up @@ -140,7 +135,11 @@ def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="Overall"
space = "&nbsp;&nbsp;&nbsp;"
total_subset_votes = sum(arena_subset_df["num_battles"]) // 2
total_subset_models = len(arena_subset_df)
leaderboard_md = f"""### {cat_name_to_explanation[name]}
if "w/ Style Control" in name:
explanation = cat_name_to_explanation[name.replace(" w/ Style Control", "")] + " with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/)."
else:
explanation = cat_name_to_explanation[name]
leaderboard_md = f"""### {explanation}
#### {space} #models: **{total_subset_models} ({round(total_subset_models/total_models *100)}%)** {space} #votes: **{"{:,}".format(total_subset_votes)} ({round(total_subset_votes/total_votes * 100)}%)**{space}
"""
return leaderboard_md
Expand Down

0 comments on commit 4686e7d

Please sign in to comment.