Skip to content

Commit

Permalink
Reduce repeated Style Control leaderboard code
Browse files Browse the repository at this point in the history
  • Loading branch information
lisadunlap committed Dec 10, 2024
1 parent 7f4c09f commit 4686e7d
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 38 deletions.
64 changes: 33 additions & 31 deletions fastchat/serve/monitor/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,8 +442,11 @@ def build_arena_tab(
for k in key_to_category_name.keys():
if k not in elo_results:
continue
arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
category_elo_results[key_to_category_name[k]] = elo_results[k]
category_name = key_to_category_name[k.replace("_style_control", "")]
if "_style_control" in k:
category_name = f"{category_name} w/ Style Control"
arena_dfs[category_name] = elo_results[k]["leaderboard_table_df"]
category_elo_results[category_name] = elo_results[k]

arena_df = arena_dfs["Overall"]

Expand Down Expand Up @@ -791,7 +794,7 @@ def highlight_top_3(s):
style = style.background_gradient(
cmap="Blues",
subset=category_names,
vmin=1150,
vmin=category_df[category_names].max().max() - 250,
vmax=category_df[category_names].max().max(),
)

Expand All @@ -814,10 +817,6 @@ def build_category_leaderboard_tab(
combined_elo_df, categories, "rating"
)
sort_ranking = lambda _: get_arena_category_table(combined_elo_df, categories)
with gr.Row():
gr.Markdown(
f"""&emsp; <span style='font-weight: bold; font-size: 150%;'>Chatbot Arena Overview</span>"""
)

overall_ranking_leaderboard = gr.Dataframe(
headers=["Model"] + [key_to_category_name[k] for k in categories],
Expand Down Expand Up @@ -852,6 +851,20 @@ def build_category_leaderboard_tab(
]
selected_categories_width = [110, 110, 110, 110, 80, 80, 80, 110, 80, 80]

# Category keys for the vision (VLM) arena leaderboard tab. Each key is looked
# up in key_to_category_name; keys ending in "_style_control" are rendered as
# "<category> w/ Style Control" by the tab-building code.
vision_categories = [
    "full",
    "full_style_control",
    "captioning",
    "captioning_style_control",
    "entity_recognition",
    "ocr",
    "creative_writing_vision",
    "homework",
    "diagram",
    "no_refusal",
]
# Column widths for the vision leaderboard table.
# NOTE(review): assumed to correspond one-to-one (in order) with
# vision_categories, as with selected_categories_width above — confirm.
vision_categories_width = [110, 110, 100, 110, 110, 60, 80, 80, 80, 80]

language_categories = [
"english",
"chinese",
Expand Down Expand Up @@ -963,16 +976,26 @@ def build_leaderboard_tab(
combined_table = get_combined_table(elo_results_text, model_table_df)
build_category_leaderboard_tab(
combined_table,
"Task",
"LLM Task",
selected_categories,
selected_categories_width,
)
build_category_leaderboard_tab(
combined_table,
"Language",
"LLM Language",
language_categories,
language_categories_width,
)
if elo_results_vision is not None:
vision_combined_table = get_combined_table(
elo_results_vision, model_table_df
)
build_category_leaderboard_tab(
vision_combined_table,
"VLM Task",
vision_categories,
vision_categories_width,
)
gr.Markdown(
f"""
***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model.
Expand Down Expand Up @@ -1074,31 +1097,10 @@ def build_demo(elo_results_file, leaderboard_table_file, arena_hard_leaderboard)
from fastchat.serve.gradio_web_server import block_css

text_size = gr.themes.sizes.text_lg
# load theme from theme.json
theme = gr.themes.Default.load("theme.json")
# set text size to large
theme.text_size = text_size
theme.set(
button_large_text_size="20px",
button_small_text_size="20px",
button_large_text_weight="100",
button_small_text_weight="100",
button_shadow="*shadow_drop_lg",
button_shadow_hover="*shadow_drop_lg",
checkbox_label_shadow="*shadow_drop_lg",
button_shadow_active="*shadow_inset",
button_secondary_background_fill="*primary_300",
button_secondary_background_fill_dark="*primary_700",
button_secondary_background_fill_hover="*primary_200",
button_secondary_background_fill_hover_dark="*primary_500",
button_secondary_text_color="*primary_800",
button_secondary_text_color_dark="white",
)

with gr.Blocks(
title="Chatbot Arena Leaderboard",
# theme=gr.themes.Default(text_size=text_size),
theme=theme,
theme=gr.themes.Default(text_size=text_size),
css=block_css,
) as demo:
with gr.Tabs() as tabs:
Expand Down
13 changes: 6 additions & 7 deletions fastchat/serve/monitor/monitor_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,15 @@

key_to_category_name = {
"full": "Overall",
"full_style_control": "Overall w/ Style Control",
"dedup": "De-duplicate Top Redundant Queries (soon to be default)",
"math": "Math",
"if": "Instruction Following",
"multiturn": "Multi-Turn",
"creative_writing": "Creative Writing",
"creative_writing_vision": "Creative Writing",
"coding": "Coding",
"coding_style_control": "Coding w/ Style Control",
"hard_6": "Hard Prompts",
"hard_english_6": "Hard Prompts (English)",
"hard_6_style_control": "Hard Prompts w/ Style Control",
"long_user": "Longer Query",
"english": "English",
"chinese": "Chinese",
Expand All @@ -49,15 +47,12 @@
}
cat_name_to_explanation = {
"Overall": "Overall Questions",
"Overall w/ Style Control": "Overall Leaderboard with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).",
"De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
"Math": "Math",
"Instruction Following": "Instruction Following",
"Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
"Coding": "Coding: whether conversation contains code snippets",
"Coding w/ Style Control": "Coding with Style Control",
"Hard Prompts": "Hard Prompts: details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
"Hard Prompts w/ Style Control": "Hard Prompts with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).",
"Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
"Longer Query": "Longer Query (>= 500 tokens)",
"English": "English Prompts",
Expand Down Expand Up @@ -140,7 +135,11 @@ def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="Overall"
space = "&nbsp;&nbsp;&nbsp;"
total_subset_votes = sum(arena_subset_df["num_battles"]) // 2
total_subset_models = len(arena_subset_df)
leaderboard_md = f"""### {cat_name_to_explanation[name]}
if "w/ Style Control" in name:
explanation = cat_name_to_explanation[name.replace(" w/ Style Control", "")] + " with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/)."
else:
explanation = cat_name_to_explanation[name]
leaderboard_md = f"""### {explanation}
#### {space} #models: **{total_subset_models} ({round(total_subset_models/total_models *100)}%)** {space} #votes: **{"{:,}".format(total_subset_votes)} ({round(total_subset_votes/total_votes * 100)}%)**{space}
"""
return leaderboard_md
Expand Down

0 comments on commit 4686e7d

Please sign in to comment.