bon eval #111

Closed · wants to merge 8 commits
2 changes: 2 additions & 0 deletions bon_eval/.gitignore
@@ -0,0 +1,2 @@
bon_data/
Collaborator:
Do we need these? Can we just change filepaths to align with existing scripts? Not sure they need to be different.

rm_bon_eval_results/
46 changes: 46 additions & 0 deletions bon_eval/README.md
@@ -0,0 +1,46 @@
# Best-of-N Evaluation with RewardBench

This folder is for evaluating reward models' ability to select the best output from a set of N outputs (i.e., best-of-N sampling, also known as rejection sampling).
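Conceptually, best-of-N selection scores each of the N candidate outputs with the reward model and keeps the highest-scoring one. A minimal sketch (the `reward_fn` scorer here is a hypothetical placeholder, not something provided by this repo):

```python
from typing import Callable, Sequence

def best_of_n(prompt: str, candidates: Sequence[str],
              reward_fn: Callable[[str, str], float]) -> str:
    """Return the candidate that the reward model scores highest for this prompt."""
    # Score every candidate and keep the argmax.
    return max(candidates, key=lambda c: reward_fn(prompt, c))
```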


## Download BoN data

```bash
cd bon_eval
pip install alpaca_eval==0.6.2
python download_bon_data.py
python model_mapping_gen.py
python rm_bon_eval.py
```

## AlpacaEval-BoN (notes)

- LLM: Tulu-2-dpo-13b
- N: 16
- Judge: GPT-4-turbo (same as the standard AlpacaEval 2 with length control)
- Reference: GPT-3.5-turbo (`bon_data/gpt-3.5-turbo-0613.ae.json`)

We sample 16 outputs from the Tulu-2-dpo-13b LLM. The model outputs are divided into 16 files: `bon_data/alpaca_eval_n=16/virtual/tulu-2-dpo-13b.[x].json` where `[x]` is from 0 to 15.
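As a quick sanity check (using the paths above) that all 16 virtual-model files are in place after downloading:

```python
from pathlib import Path

# Expect one output file per virtual model, i.e. indices 0..15.
files = sorted(Path("bon_data/alpaca_eval_n=16/virtual").glob("tulu-2-dpo-13b.*.json"))
assert len(files) == 16, f"expected 16 virtual model files, found {len(files)}"
```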

We use AlpacaEval to evaluate each output with the GPT-4-turbo judge.
The reference for computing win-rates is the GPT-3.5-turbo model.
The AlpacaEval annotations are stored in `bon_data/alpaca_eval_n=16/virtual/annotations_ref=GPT35t/tulu-2-dpo-13b.[x]/weighted_alpaca_eval_gpt4_turbo/annotations.json`.

If you'd like to reproduce the evaluation annotations:
```bash
for i in {0..15}
do
    output_dir="bon_data/alpaca_eval_n=16/virtual/annotations_ref=GPT35t/tulu-2-dpo-13b.$i/"
    mkdir -p $output_dir
    alpaca_eval --reference_outputs "bon_data/gpt-3.5-turbo-0613.ae.json" \
        --model_outputs "bon_data/alpaca_eval_n=16/virtual/tulu-2-dpo-13b.$i.json" \
        --output_path $output_dir
done
```

## Evaluation

<!-- export HF_ENDPOINT=https://hf-mirror.com -->

Run `python rm_bon_eval.py` and you will get a JSON file named `bon_eval_results.json` in the current directory, which will also be uploaded to the Hugging Face Hub.
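To sanity-check the results locally, the file can be loaded like any other JSON. The exact schema is produced by `rm_bon_eval.py` and isn't documented here, so treat the inspection below as illustrative:

```python
import json

with open("bon_eval_results.json") as f:
    results = json.load(f)

# Peek at the top-level structure before relying on specific keys.
if isinstance(results, dict):
    print(list(results.keys())[:10])
else:
    print(f"{len(results)} records; first record: {results[0]}")
```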

File renamed without changes.
183 changes: 183 additions & 0 deletions bon_eval/bon_utils.py
@@ -0,0 +1,183 @@
import logging
from typing import Any, Optional, Sequence, Union
import pandas as pd
from alpaca_eval import annotators, constants, metrics, utils
from alpaca_eval.types import AnyData, AnyLoadableDF, AnyPath

# Source: modified from https://github.com/tatsu-lab/alpaca_eval/blob/c4a4ca716b4cab46701af759c244cb9d05772e15/src/alpaca_eval/main.py#L17C5-L17C5

def evaluate(
    model_outputs: Optional[AnyLoadableDF] = None,
    reference_outputs: AnyLoadableDF = constants.ALPACAEVAL_REFERENCE_OUTPUTS,
    annotators_config: AnyPath = constants.DEFAULT_ANNOTATOR_CONFIG,
    name: Optional[str] = None,
    output_path: Optional[Union[AnyPath, str]] = "auto",
    precomputed_leaderboard: Optional[Union[str, AnyPath, AnyData]] = "auto",
    is_overwrite_leaderboard: bool = False,
    leaderboard_mode_to_print: Optional[Union[str, Sequence[str]]] = "minimal",
    current_leaderboard_mode: str = "community",
    is_return_instead_of_print: bool = False,
    fn_metric: Union[str, callable] = "get_length_controlled_winrate" if constants.IS_ALPACA_EVAL_2 else "get_winrate",
    metric_kwargs: Optional[dict[str, Any]] = None,
    is_recompute_metrics_only: bool = False,
    sort_by: str = "length_controlled_winrate" if constants.IS_ALPACA_EVAL_2 else "win_rate",
    is_cache_leaderboard: Optional[bool] = None,
    max_instances: Optional[int] = None,
    annotation_kwargs: Optional[dict[str, Any]] = None,
    Annotator=annotators.PairwiseAnnotator,
    annotaitons_file: Optional[AnyPath] = None,
    **annotator_kwargs,
):
"""Evaluate a model based on its outputs. This is the default entrypoint if no command is specified.

Parameters
----------
model_outputs : path or data or dict
The outputs of the model to add to the leaderboard. Accepts data (list of dictionary, pd.dataframe,
datasets.Dataset) or a path to read those (json, csv, tsv) or a function to generate those. Each dictionary
(or row of dataframe) should contain the keys that are formatted in the prompts. E.g. by default `instruction`
and `output` with optional `input`. If None, we just print the leaderboard.

reference_outputs : path or data, optional
The outputs of the reference model. Same format as `model_outputs`. If None, the reference outputs are a
specific set of Davinci 003 outputs on the AlpacaEval set:
https://huggingface.co/datasets/tatsu-lab/alpaca_eval.

annotators_config : path or list of dict, optional
The path the (or list of dict of) the annotator's config file. For details see the docstring of
`PairwiseAnnotator`.

name : str, optional
The name of the model to add to the leaderboard. If None we check if `generator is in model_outputs` if not
we use "Current model".

output_path : path, optional
Path to the directory where the new leaderboard and the annotations should be stored. If None we don't save.
If `auto` we use `model_outputs` if it is a path, and otherwise use the directory from which we call the script.

precomputed_leaderboard : path or data, optional
The precomputed leaderboard or a path to it (json, csv, or tsv). The leaderboard should contain at least the
column `win_rate`. If `auto` we will try to use the corresponding leaderboard for the reference outputs (only if
in CORRESPONDING_OUTPUTS_LEADERBOARDS). If `None` we won't add other models from the leaderboard.

is_overwrite_leaderboard : bool, optional
Whether to overwrite the leaderboard if the model is already in it.

leaderboard_mode_to_print : {"minimal", "verified", "community", None} or list, optional
The mode of the leaderboard to use. Only used if the precomputed leaderboard has a column `mode`, in which case
it will filter the leaderboard by this mode. If None keeps all. If a list, will print all the models in the
list.

current_leaderboard_mode : {"minimal", "verified", "community"}, optional
The mode of the leaderboard for the current method.

is_return_instead_of_print : bool, optional
Whether to return the metrics instead of printing the results.

fn_metric : str or callable, optional
The function or function name in `metrics` that will be used to convert preference to metrics. The function
should take a sequence of dict annotations. Each dict has a preference key (1.5 for draw, 1 for base win,
2 when the model to compare wins) and return a dictionary of metrics and the key by which to sort the
leaderboard. Common choices: `get_winrate`, `get_length_controlled_winrate`, `get_length_controlled_elo`.

metric_kwargs : dict, optional
Additional arguments to pass to `fn_metric`.

is_recompute_metrics_only : bool, optional
Whether to recompute the metrics. Useful if all you want to recompute the metrics without reannotating.

sort_by : str, optional
The key by which to sort the leaderboard.

is_cache_leaderboard : bool, optional
Whether to save the result leaderboard to `precomputed_leaderboard`. If None we save only if max_instances
not None. A preferred way of adding models to the leaderboard is to set `precomputed_leaderboard` to the
previously saved leaderboard at `<output_path>/leaderboard.csv`.

max_instances : int, optional
The maximum number of instances to annotate. Useful for testing.

annotation_kwargs : dict, optional
Additional arguments to pass to `PairwiseAnnotator.annotate_head2head`.

Annotator : class, optional
The annotator class to use.

annotator_kwargs :
Additional arguments to pass to `PairwiseAnnotator`.
"""
    if (
        isinstance(current_leaderboard_mode, str)
        and current_leaderboard_mode not in constants.ORDERED_LEADERBOARD_MODES
    ):
        raise ValueError(f"current_leaderboard_mode should be one of {constants.ORDERED_LEADERBOARD_MODES}")

    annotation_kwargs = annotation_kwargs or dict()

    leaderboard, precomputed_leaderboard = utils.get_precomputed_leaderboard(
        precomputed_leaderboard, reference_outputs, annotators_config
    )
    annotations = None

    model_outputs = utils.load_or_convert_to_dataframe(model_outputs)
    reference_outputs = utils.load_or_convert_to_dataframe(reference_outputs)
    name = utils.get_generator_name(name, model_outputs)
    leaderboard[name] = {}
    if max_instances is not None:
        # first we shuffle both outputs with a fixed seed => more representative
        if len(model_outputs) != len(reference_outputs):
            logging.warning(
                "model_outputs and reference_outputs have different lengths, so we cannot shuffle before taking the first max_instances."
            )
        else:
            seed = 123
            model_outputs = model_outputs.sample(frac=1, random_state=seed)
            reference_outputs = reference_outputs.sample(frac=1, random_state=seed)

        model_outputs = model_outputs[:max_instances]
        reference_outputs = reference_outputs[:max_instances]

    leaderboard[name]["mode"] = current_leaderboard_mode
    leaderboard[name]["avg_length"] = int(model_outputs["output"].str.len().mean())
    # Load the precomputed AlpacaEval annotations instead of re-annotating.
    annotations = pd.read_json(annotaitons_file)
    if isinstance(fn_metric, str):
        fn_metric_ = getattr(metrics, fn_metric)
    else:
        fn_metric_ = fn_metric

    leaderboard[name].update(fn_metric_(annotations, **(metric_kwargs or {})))

    df_leaderboard = pd.DataFrame.from_dict(leaderboard, orient="index").sort_values(by=sort_by, ascending=False)
    df_leaderboard = df_leaderboard[
        utils.prioritize_elements(list(df_leaderboard.columns), ["win_rate", "standard_error"])
    ]

    if is_cache_leaderboard is None:
        is_cache_leaderboard = max_instances is None

    if is_cache_leaderboard:
        if isinstance(precomputed_leaderboard, AnyPath):
            logging.info(f"Saving result to the precomputed leaderboard at {precomputed_leaderboard}")
            df_leaderboard.to_csv(precomputed_leaderboard)
        else:
            logging.info(
                f"Not saving the result to the cached leaderboard because precomputed_leaderboard is not a "
                f"path but {type(precomputed_leaderboard)}."
            )

    if is_return_instead_of_print:
        return df_leaderboard, annotations
    else:
        utils.print_leaderboard(
            df_leaderboard,
            leaderboard_mode_to_print,
            current_name=name,
            cols_to_print=[sort_by, "win_rate", "standard_error", "n_total", "avg_length"],
        )
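For reference, here is one way `evaluate` can be called with a precomputed annotations file. The paths follow the layout described in the README and the keyword names match the signature above; this is only a sketch, not necessarily how `rm_bon_eval.py` invokes it:

```python
from bon_utils import evaluate

df_leaderboard, annotations = evaluate(
    model_outputs="bon_data/alpaca_eval_n=16/virtual/tulu-2-dpo-13b.0.json",
    reference_outputs="bon_data/gpt-3.5-turbo-0613.ae.json",
    annotaitons_file=(
        "bon_data/alpaca_eval_n=16/virtual/annotations_ref=GPT35t/"
        "tulu-2-dpo-13b.0/weighted_alpaca_eval_gpt4_turbo/annotations.json"
    ),
    output_path=None,                 # not used by this modified evaluate()
    precomputed_leaderboard=None,     # don't merge with an existing leaderboard
    is_return_instead_of_print=True,  # return the DataFrame instead of printing it
)
print(df_leaderboard)
```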


8 changes: 8 additions & 0 deletions bon_eval/download_bon_data.py
@@ -0,0 +1,8 @@
from huggingface_hub import hf_hub_download, snapshot_download
import os

repo_id = "ai2-adapt-dev/HERM_BoN_candidates"
local_dir = "bon_data/"
os.makedirs(local_dir, exist_ok=True)
# Tulu-2-dpo-13b best-of-16 candidate outputs and their AlpacaEval annotations.
snapshot_download(repo_id=repo_id, allow_patterns="alpaca_eval_n=16/*", local_dir=local_dir, repo_type="dataset")
# GPT-3.5-turbo reference outputs used for computing win rates.
hf_hub_download(repo_id=repo_id, filename="gpt-3.5-turbo-0613.ae.json", local_dir=local_dir, repo_type="dataset")
# Per-reward-model best-of-n scores from the RewardBench results dataset.
snapshot_download(repo_id="allenai/reward-bench-results", allow_patterns="best-of-n/alpaca_eval/tulu-13b/*", local_dir=f"{local_dir}/rm_bon_eval_results", repo_type="dataset")
22 changes: 22 additions & 0 deletions bon_eval/make_virual_models.py
@@ -0,0 +1,22 @@
import json
import os

# Split each instruction's N sampled outputs into N "virtual model" files,
# one per sample index, so that each sample can be evaluated as a standalone model.
N = 16
input_file = "bon_data/alpaca_eval_n=16/tulu-2-dpo-13b.json"
output_file = "bon_data/alpaca_eval_n=16/virtual/tulu-2-dpo-13b.{model_id}.json"
os.makedirs(os.path.dirname(output_file), exist_ok=True)  # ensure the virtual/ directory exists

with open(input_file, "r") as f:
    data = json.load(f)

virtual_models = {}
for item in data:
    for i in range(N):
        item_copy = item.copy()
        item_copy["generator"] = f'{item["generator"]}.{i}'
        item_copy["output"] = item["output"][i]
        if i not in virtual_models:
            virtual_models[i] = []
        virtual_models[i].append(item_copy)

for i in range(N):
    with open(output_file.format(model_id=i), "w") as f:
        json.dump(virtual_models[i], f, indent=2)
17 changes: 17 additions & 0 deletions bon_eval/model_mapping_gen.py
@@ -0,0 +1,17 @@
import os
import json

file_paths = {}
# List all filepaths under `reward-bench-results/best-of-n/alpaca_eval/tulu-13b/`
# and map each reward model name to the URL and local path of its BoN scores.
for root, dirs, files in os.walk("bon_data/rm_bon_eval_results/best-of-n/alpaca_eval/tulu-13b/"):
    for file in files:
        filepath = str(os.path.join(root, file))
        if "/eval_results/" in filepath:
            continue
        rm_name = filepath.replace("bon_data/rm_bon_eval_results/best-of-n/alpaca_eval/tulu-13b/", "").replace(".json", "")
        url = f"https://huggingface.co/datasets/allenai/reward-bench-results/raw/main/best-of-n/alpaca_eval/tulu-13b/{rm_name}.json"
        if rm_name == "bon_eval_results":
            continue
        file_paths[rm_name] = {"url": url, "localpath": filepath}

with open("bon_data/rm_mapping.json", "w") as f:
    json.dump(file_paths, f, indent=4)
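The generated `bon_data/rm_mapping.json` maps each reward model's name to the URL and local path of its best-of-n score file. A minimal sketch of loading the mapping downstream (how `rm_bon_eval.py` actually consumes it may differ):

```python
import json

with open("bon_data/rm_mapping.json") as f:
    rm_mapping = json.load(f)

for rm_name, info in rm_mapping.items():
    # Each entry records where that reward model's BoN scores live locally and on the Hub.
    print(rm_name, info["localpath"], info["url"])
```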
