diff --git a/bon_eval/.gitignore b/bon_eval/.gitignore
new file mode 100644
index 00000000..4d554968
--- /dev/null
+++ b/bon_eval/.gitignore
@@ -0,0 +1,2 @@
+bon_data/
+rm_bon_eval_results/
\ No newline at end of file
diff --git a/bon_eval/README.md b/bon_eval/README.md
new file mode 100644
index 00000000..7a3ba7ec
--- /dev/null
+++ b/bon_eval/README.md
@@ -0,0 +1,46 @@
+# Best-of-N Evaluation with RewardBench
+
+This folder contains the scripts for evaluating how well reward models select the best output from a set of N sampled outputs (i.e., best-of-N sampling, a.k.a. rejection sampling).
+
+## Download BoN data
+
+```bash
+cd bon_eval
+pip install alpaca_eval==0.6.2
+python download_bon_data.py
+python model_mapping_gen.py
+python rm_bon_eval.py
+```
+
+The last two commands build the reward-model file mapping (`bon_data/rm_mapping.json`) and run the BoN evaluation described below.
+
+## AlpacaEval-BoN (notes)
+
+- LLM: Tulu-2-dpo-13b
+- N: 16
+- Judge: GPT-4-turbo (same as the standard AlpacaEval 2 with length control)
+- Reference: GPT-3.5-turbo (`bon_data/gpt-3.5-turbo-0613.ae.json`)
+
+We sample N=16 outputs per instruction from Tulu-2-dpo-13b. The outputs are split into 16 "virtual model" files: `bon_data/alpaca_eval_n=16/virtual/tulu-2-dpo-13b.[x].json`, where `[x]` ranges from 0 to 15.
+
+We use AlpacaEval with the GPT-4-turbo judge to evaluate each virtual model.
+Win rates are computed against the GPT-3.5-turbo reference outputs.
+The AlpacaEval annotations are stored in `bon_data/alpaca_eval_n=16/virtual/annotations_ref=GPT35t/tulu-2-dpo-13b.[x]/weighted_alpaca_eval_gpt4_turbo/annotations.json`.
+
+If you'd like to reproduce the evaluation annotations:
+```bash
+for i in {0..15}
+do
+  output_dir="bon_data/alpaca_eval_n=16/virtual/annotations_ref=GPT35t/tulu-2-dpo-13b.$i/"
+  mkdir -p $output_dir
+  alpaca_eval --reference_outputs "bon_data/gpt-3.5-turbo-0613.ae.json" \
+    --model_outputs "bon_data/alpaca_eval_n=16/virtual/tulu-2-dpo-13b.$i.json" \
+    --output_path $output_dir
+done
+```
+
+## Evaluation
+
+Run `python rm_bon_eval.py` to produce `bon_data/bon_eval_results.json`, which is also uploaded to the Hugging Face Hub (`allenai/reward-bench-results`). For intuition, the selection rule applied for each reward model is sketched below.
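+
+The sketch is illustrative rather than the exact code in `rm_bon_eval.py`: it assumes that each record in a reward model's result file carries an `id` of the form `[example_id, candidate_id]` and a `scores` list, and `<some-rm>.json` is a placeholder for any file listed in `bon_data/rm_mapping.json`.
+
+```python
+import json
+
+# Placeholder path; substitute any reward model file from bon_data/rm_mapping.json.
+rm_result_path = "bon_data/rm_bon_eval_results/best-of-n/alpaca_eval/tulu-13b/<some-rm>.json"
+
+# The RM result files are concatenated JSON objects (one per candidate),
+# e.g. {"id": [example_id, candidate_id], "scores": [7.94921875], ...}.
+with open(rm_result_path) as f:
+    records = json.loads("[" + f.read().replace("}\n{", "},\n{") + "]")
+
+# Best-of-N selection: for each of the 805 prompts, keep the candidate with the
+# highest reward-model score among its 16 samples.
+groups = [records[i:i + 16] for i in range(0, len(records), 16)]
+selections = [max(group, key=lambda r: r["scores"][0]) for group in groups]
+```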
diff --git a/analysis/bon_to_alpacaeval.py b/bon_eval/bon_to_alpacaeval.py
similarity index 100%
rename from analysis/bon_to_alpacaeval.py
rename to bon_eval/bon_to_alpacaeval.py
diff --git a/bon_eval/bon_utils.py b/bon_eval/bon_utils.py
new file mode 100644
index 00000000..6b960bc7
--- /dev/null
+++ b/bon_eval/bon_utils.py
@@ -0,0 +1,183 @@
+import logging
+from typing import Any, Optional, Sequence, Union
+
+import pandas as pd
+from alpaca_eval import annotators, constants, metrics, utils
+from alpaca_eval.types import AnyData, AnyLoadableDF, AnyPath
+
+# Source: modified from https://github.com/tatsu-lab/alpaca_eval/blob/c4a4ca716b4cab46701af759c244cb9d05772e15/src/alpaca_eval/main.py#L17C5-L17C5
+
+
+def evaluate(
+    model_outputs: Optional[AnyLoadableDF] = None,
+    reference_outputs: AnyLoadableDF = constants.ALPACAEVAL_REFERENCE_OUTPUTS,
+    annotators_config: AnyPath = constants.DEFAULT_ANNOTATOR_CONFIG,
+    name: Optional[str] = None,
+    output_path: Optional[Union[AnyPath, str]] = "auto",
+    precomputed_leaderboard: Optional[Union[str, AnyPath, AnyData]] = "auto",
+    is_overwrite_leaderboard: bool = False,
+    leaderboard_mode_to_print: Optional[Union[str, Sequence[str]]] = "minimal",
+    current_leaderboard_mode: str = "community",
+    is_return_instead_of_print: bool = False,
+    fn_metric: Union[str, callable] = "get_length_controlled_winrate" if constants.IS_ALPACA_EVAL_2 else "get_winrate",
+    metric_kwargs: Optional[dict[str, Any]] = None,
+    is_recompute_metrics_only: bool = False,
+    sort_by: str = "length_controlled_winrate" if constants.IS_ALPACA_EVAL_2 else "win_rate",
+    is_cache_leaderboard: Optional[bool] = None,
+    max_instances: Optional[int] = None,
+    annotation_kwargs: Optional[dict[str, Any]] = None,
+    Annotator=annotators.PairwiseAnnotator,
+    annotations_file: Optional[AnyPath] = None,
+    **annotator_kwargs,
+):
+    """Evaluate a model based on its outputs. This is the default entrypoint if no command is specified.
+
+    Parameters
+    ----------
+    model_outputs : path or data or dict
+        The outputs of the model to add to the leaderboard. Accepts data (list of dictionary, pd.dataframe,
+        datasets.Dataset) or a path to read those (json, csv, tsv) or a function to generate those. Each dictionary
+        (or row of dataframe) should contain the keys that are formatted in the prompts. E.g. by default `instruction`
+        and `output` with optional `input`. If None, we just print the leaderboard.
+
+    reference_outputs : path or data, optional
+        The outputs of the reference model. Same format as `model_outputs`. If None, the reference outputs are a
+        specific set of Davinci 003 outputs on the AlpacaEval set:
+        https://huggingface.co/datasets/tatsu-lab/alpaca_eval.
+
+    annotators_config : path or list of dict, optional
+        The path to (or list of dicts of) the annotator's config file. For details see the docstring of
+        `PairwiseAnnotator`.
+
+    name : str, optional
+        The name of the model to add to the leaderboard. If None, we check whether `generator` is in `model_outputs`;
+        if not, we use "Current model".
+
+    output_path : path, optional
+        Path to the directory where the new leaderboard and the annotations should be stored. If None we don't save.
+        If `auto` we use `model_outputs` if it is a path, and otherwise use the directory from which we call the
+        script.
+
+    precomputed_leaderboard : path or data, optional
+        The precomputed leaderboard or a path to it (json, csv, or tsv). The leaderboard should contain at least the
+        column `win_rate`. If `auto` we will try to use the corresponding leaderboard for the reference outputs (only
+        if in CORRESPONDING_OUTPUTS_LEADERBOARDS). If `None` we won't add other models from the leaderboard.
+
+    is_overwrite_leaderboard : bool, optional
+        Whether to overwrite the leaderboard if the model is already in it.
+
+    leaderboard_mode_to_print : {"minimal", "verified", "community", None} or list, optional
+        The mode of the leaderboard to use. Only used if the precomputed leaderboard has a column `mode`, in which
+        case it will filter the leaderboard by this mode. If None, keeps all. If a list, prints all the models in the
+        list.
+
+    current_leaderboard_mode : {"minimal", "verified", "community"}, optional
+        The mode of the leaderboard for the current method.
+
+    is_return_instead_of_print : bool, optional
+        Whether to return the metrics instead of printing the results.
+
+    fn_metric : str or callable, optional
+        The function or function name in `metrics` that converts preferences to metrics. The function should take a
+        sequence of dict annotations, each with a preference key (1.5 for a draw, 1 when the baseline wins, 2 when the
+        compared model wins), and return a dictionary of metrics and the key by which to sort the leaderboard. Common
+        choices: `get_winrate`, `get_length_controlled_winrate`, `get_length_controlled_elo`.
+
+    metric_kwargs : dict, optional
+        Additional arguments to pass to `fn_metric`.
+
+    is_recompute_metrics_only : bool, optional
+        Whether to recompute the metrics. Useful if all you want is to recompute the metrics without re-annotating.
+
+    sort_by : str, optional
+        The key by which to sort the leaderboard.
+
+    is_cache_leaderboard : bool, optional
+        Whether to save the resulting leaderboard to `precomputed_leaderboard`. If None, we save only if
+        `max_instances` is not None. A preferred way of adding models to the leaderboard is to set
+        `precomputed_leaderboard` to the previously saved leaderboard at `/leaderboard.csv`.
+
+    max_instances : int, optional
+        The maximum number of instances to annotate. Useful for testing.
+
+    annotation_kwargs : dict, optional
+        Additional arguments to pass to `PairwiseAnnotator.annotate_head2head`.
+
+    Annotator : class, optional
+        The annotator class to use.
+
+    annotations_file : path, optional
+        Path to precomputed AlpacaEval annotations to load instead of re-annotating the outputs.
+
+    annotator_kwargs :
+        Additional arguments to pass to `PairwiseAnnotator`.
+    """
+    if (
+        isinstance(current_leaderboard_mode, str)
+        and current_leaderboard_mode not in constants.ORDERED_LEADERBOARD_MODES
+    ):
+        raise ValueError(f"current_leaderboard_mode should be one of {constants.ORDERED_LEADERBOARD_MODES}")
+
+    annotation_kwargs = annotation_kwargs or dict()
+
+    leaderboard, precomputed_leaderboard = utils.get_precomputed_leaderboard(
+        precomputed_leaderboard, reference_outputs, annotators_config
+    )
+    annotations = None
+
+    model_outputs = utils.load_or_convert_to_dataframe(model_outputs)
+    reference_outputs = utils.load_or_convert_to_dataframe(reference_outputs)
+    name = utils.get_generator_name(name, model_outputs)
+    leaderboard[name] = {}
+    if max_instances is not None:
+        # first we shuffle both outputs with a fixed seed => more representative
+        if len(model_outputs) != len(reference_outputs):
+            logging.warning(
+                "model_outputs and reference_outputs have different lengths, so we cannot shuffle before taking the first max_instances."
+            )
+        else:
+            seed = 123
+            model_outputs = model_outputs.sample(frac=1, random_state=seed)
+            reference_outputs = reference_outputs.sample(frac=1, random_state=seed)
+
+        model_outputs = model_outputs[:max_instances]
+        reference_outputs = reference_outputs[:max_instances]
+
+    leaderboard[name]["mode"] = current_leaderboard_mode
+    leaderboard[name]["avg_length"] = int(model_outputs["output"].str.len().mean())
+    # Load the precomputed annotations instead of running the annotator.
+    annotations = pd.read_json(annotations_file)
+    if isinstance(fn_metric, str):
+        fn_metric_ = getattr(metrics, fn_metric)
+    else:
+        fn_metric_ = fn_metric
+
+    leaderboard[name].update(fn_metric_(annotations, **(metric_kwargs or {})))
+
+    df_leaderboard = pd.DataFrame.from_dict(leaderboard, orient="index").sort_values(by=sort_by, ascending=False)
+    df_leaderboard = df_leaderboard[
+        utils.prioritize_elements(list(df_leaderboard.columns), ["win_rate", "standard_error"])
+    ]
+
+    if is_cache_leaderboard is None:
+        is_cache_leaderboard = max_instances is None
+
+    if is_cache_leaderboard:
+        if isinstance(precomputed_leaderboard, AnyPath):
+            logging.info(f"Saving result to the precomputed leaderboard at {precomputed_leaderboard}")
+            df_leaderboard.to_csv(precomputed_leaderboard)
+        else:
+            logging.info(
+                f"Not saving the result to the cached leaderboard because precomputed_leaderboard is not a "
+                f"path but {type(precomputed_leaderboard)}."
+            )
+
+    if is_return_instead_of_print:
+        return df_leaderboard, annotations
+    else:
+        utils.print_leaderboard(
+            df_leaderboard,
+            leaderboard_mode_to_print,
+            current_name=name,
+            cols_to_print=[sort_by, "win_rate", "standard_error", "n_total", "avg_length"],
+        )
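+
+# Usage sketch (mirrors how rm_bon_eval.py calls this entry point with
+# precomputed annotations instead of re-running the GPT-4-turbo judge;
+# "<rm>" below is a placeholder for a reward model's pretty name):
+#
+#     df_leaderboard, annotations = evaluate(
+#         model_outputs="rm_bon_eval_results/<rm>.model_outputs.json",
+#         annotations_file="rm_bon_eval_results/<rm>.annotations.json",
+#         is_return_instead_of_print=True,
+#     )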
diff --git a/bon_eval/download_bon_data.py b/bon_eval/download_bon_data.py
new file mode 100644
index 00000000..84af51f6
--- /dev/null
+++ b/bon_eval/download_bon_data.py
@@ -0,0 +1,8 @@
+from huggingface_hub import hf_hub_download, snapshot_download
+import os
+
+# Download the BoN candidates, the GPT-3.5-turbo reference outputs, and the precomputed RM scores.
+repo_id = "ai2-adapt-dev/HERM_BoN_candidates"
+local_dir = "bon_data/"
+os.makedirs(local_dir, exist_ok=True)
+snapshot_download(repo_id=repo_id, allow_patterns="alpaca_eval_n=16/*", local_dir=local_dir, repo_type="dataset")
+hf_hub_download(repo_id=repo_id, filename="gpt-3.5-turbo-0613.ae.json", local_dir=local_dir, repo_type="dataset")
+snapshot_download(repo_id="allenai/reward-bench-results", allow_patterns="best-of-n/alpaca_eval/tulu-13b/*", local_dir=f"{local_dir}/rm_bon_eval_results", repo_type="dataset")
\ No newline at end of file
diff --git a/bon_eval/make_virual_models.py b/bon_eval/make_virual_models.py
new file mode 100644
index 00000000..41e5f77d
--- /dev/null
+++ b/bon_eval/make_virual_models.py
@@ -0,0 +1,22 @@
+import json
+
+# Split the file with N=16 sampled outputs per instruction into 16 single-output
+# "virtual model" files so that each candidate set can be judged by AlpacaEval separately.
+N = 16
+input_file = "bon_data/alpaca_eval_n=16/tulu-2-dpo-13b.json"
+output_file = "bon_data/alpaca_eval_n=16/virtual/tulu-2-dpo-13b.{model_id}.json"
+
+with open(input_file, "r") as f:
+    data = json.load(f)
+
+virtual_models = {}
+for item in data:
+    for i in range(N):
+        item_copy = item.copy()
+        item_copy["generator"] = f'{item["generator"]}.{i}'
+        item_copy["output"] = item["output"][i]
+        if i not in virtual_models:
+            virtual_models[i] = []
+        virtual_models[i].append(item_copy)
+
+for i in range(N):
+    with open(output_file.format(model_id=i), "w") as f:
+        json.dump(virtual_models[i], f, indent=2)
diff --git a/bon_eval/model_mapping_gen.py b/bon_eval/model_mapping_gen.py
new file mode 100644
index 00000000..71c82c9b
--- /dev/null
+++ b/bon_eval/model_mapping_gen.py
@@ -0,0 +1,17 @@
+import os
+import json
+
+file_paths = {}
+# List all result files under `bon_data/rm_bon_eval_results/best-of-n/alpaca_eval/tulu-13b/`
+# and map each reward model name to its Hugging Face URL and local path.
+for root, dirs, files in os.walk("bon_data/rm_bon_eval_results/best-of-n/alpaca_eval/tulu-13b/"):
+    for file in files:
+        filepath = str(os.path.join(root, file))
+        if "/eval_results/" in filepath:
+            continue
+        rm_name = filepath.replace("bon_data/rm_bon_eval_results/best-of-n/alpaca_eval/tulu-13b/", "").replace(".json", "")
+        url = f"https://huggingface.co/datasets/allenai/reward-bench-results/raw/main/best-of-n/alpaca_eval/tulu-13b/{rm_name}.json"
+        if rm_name == "bon_eval_results":
+            continue
+        file_paths[rm_name] = {"url": url, "localpath": filepath}
+with open("bon_data/rm_mapping.json", "w") as f:
+    json.dump(file_paths, f, indent=4)
diff --git a/bon_eval/rm_bon_eval.py b/bon_eval/rm_bon_eval.py
new file mode 100644
index 00000000..8ca19d1b
--- /dev/null
+++ b/bon_eval/rm_bon_eval.py
@@ -0,0 +1,179 @@
+import json
+import os
+
+import pandas as pd
+from huggingface_hub import HfApi
+
+from bon_utils import evaluate
+
+# Load the outputs of the 16 virtual models (one file per candidate index).
+model_outputs_all = {}
+for i in range(16):
+    file_outputs = f"bon_data/alpaca_eval_n=16/virtual/tulu-2-dpo-13b.{i}.json"
+    with open(file_outputs) as f:
+        model_outputs = json.load(f)
+    model_outputs_all[i] = model_outputs
+
+# Load the AlpacaEval annotations of each virtual model.
+annotations_all = {}
+for i in range(16):
+    file_annotations = f"bon_data/alpaca_eval_n=16/virtual/annotations_ref=GPT35t/tulu-2-dpo-13b.{i}/weighted_alpaca_eval_gpt4_turbo/annotations.json"
+    with open(file_annotations) as f:
+        annotations = json.load(f)
+    annotations_all[i] = annotations
+
+
+def extract_score(score_item):
+    if isinstance(score_item, list):
+        return score_item[0]
+    elif isinstance(score_item, float):
+        return score_item
+    else:
+        raise ValueError("Invalid score item")
+
+
+def compute_rm_bon_eval(pretty_rm_name, rm_result, model_outputs_all, annotations_all, mode="rm"):
+    if mode == "rm":
+        assert len(rm_result) == 805 * 16, "The length of the result should be 805 * 16 but got {}".format(len(rm_result))
+
+        # Split the results into groups of 16 candidates, one group per example.
+        rm_result_grouped = [rm_result[i:i+16] for i in range(0, len(rm_result), 16)]
+
+        # Rank the candidates in each group by reward score and take the top one.
+        rm_bon_selection = []
+        for group in rm_result_grouped:
+            group = sorted(group, key=lambda x: extract_score(x["scores"]), reverse=True)
+            # select the top one as the submitted one
+            rm_bon_selection.append(group[0])
+    elif mode == "oracle":
+        pass  # not implemented yet
+    elif mode == "longest":
+        pass  # not implemented yet
+    elif mode == "shortest":
+        pass  # not implemented yet
+    # Example item in rm_bon_selection:
+    # {'config': 'top_p=0.9;temp=1.0', 'dataset_details': 'helpful_base', 'id': [0, 9], 'model': 'allenai/tulu-2-dpo-13b', 'scores': [7.94921875]}
+
+    # Build the selected (best-of-N) virtual model outputs and the corresponding
+    # AlpacaEval annotations, relabeling the generator with the reward model name.
+    rm_bon_model_outputs = []
+    rm_bon_annotations = []
+
+    for item in rm_bon_selection:
+        example_id = item["id"][0]
+        virtual_model_id = item["id"][1]
+        output_item = model_outputs_all[virtual_model_id][example_id].copy()
+        original_generator = output_item["generator"]
+        output_item["generator"] = pretty_rm_name + "-BoN"
+        anno_item = annotations_all[virtual_model_id][example_id].copy()
+        if anno_item["generator_1"] == original_generator:
+            anno_item["generator_1"] = pretty_rm_name + "-BoN"
+        elif anno_item["generator_2"] == original_generator:
+            anno_item["generator_2"] = pretty_rm_name + "-BoN"
+
+        rm_bon_model_outputs.append(output_item)
+        rm_bon_annotations.append(anno_item)
+
+    file_model_outputs = f"rm_bon_eval_results/{pretty_rm_name}.model_outputs.json"
+    file_annotations = f"rm_bon_eval_results/{pretty_rm_name}.annotations.json"
+    # create the folders if they do not exist
+    os.makedirs(os.path.dirname(file_model_outputs), exist_ok=True)
+    os.makedirs(os.path.dirname(file_annotations), exist_ok=True)
+    with open(file_model_outputs, "w") as f:
+        json.dump(rm_bon_model_outputs, f, indent=2)
+    with open(file_annotations, "w") as f:
+        json.dump(rm_bon_annotations, f, indent=2)
+
+    df_leaderboard, _ = evaluate(model_outputs=file_model_outputs, annotations_file=file_annotations, is_return_instead_of_print=True)
+    df_leaderboard = df_leaderboard.reset_index()
+
+    # Convert the dataframe to a list of row dicts and find the row of this reward model.
+    rm_row_json = df_leaderboard.to_dict(orient="records")
+    for row in rm_row_json:
+        if row["index"] == pretty_rm_name + "-BoN":
+            target_rm_row_json = row
+            break
+    target_rm_row_json["reward_model"] = pretty_rm_name
+    del target_rm_row_json["index"]
+    file_result = f"rm_bon_eval_results/eval_results/{pretty_rm_name}.json"
+    os.makedirs(os.path.dirname(file_result), exist_ok=True)
+    with open(file_result, "w") as f:
+        json.dump(target_rm_row_json, f, indent=2)
+
+    # TODO: check if uploaded path is correct
+    # `api` is created in the __main__ block below.
+    api.upload_file(
+        path_or_fileobj=file_result,
+        path_in_repo=f"best-of-n/alpaca_eval/tulu-13b/eval_results/{pretty_rm_name}.json",
+        repo_id="allenai/reward-bench-results",
+        repo_type="dataset",
+    )
+    return target_rm_row_json
+
+
+def extract_random(eval_results):
+    table_file = "bon_data/alpaca_eval_n=16/virtual/annotations_ref=GPT35t/merged_leaderboard.csv"
+    # load the merged leaderboard as a dataframe and convert it to a list of dicts
+    df = pd.read_csv(table_file)
+    df_json = df.to_dict(orient="records")
+    # find the rows with the maximum, minimum, and median length_controlled_winrate values
+    eval_results["random_max"] = max(df_json, key=lambda x: x["length_controlled_winrate"])
+    eval_results["random_min"] = min(df_json, key=lambda x: x["length_controlled_winrate"])
+    length_controlled_winrate_values = [x["length_controlled_winrate"] for x in df_json]
+    median_value = sorted(length_controlled_winrate_values)[len(length_controlled_winrate_values)//2]
+    eval_results["random_median"] = [x for x in df_json if x["length_controlled_winrate"] == median_value][0]
+    # give the average values of all columns
+    # eval_results["random_avg"] = {}
+    # for column in df.columns:
+    #     if column == "index":
+    #         continue
+    #     values = [x[column] for x in df_json]
+    #     avg_value = sum(values) / len(values)
+    #     eval_results["random_avg"][column] = avg_value
+    # rename model_name to reward_model
+    eval_results["random_max"]["reward_model"] = eval_results["random_max"]["model_name"]
+    eval_results["random_min"]["reward_model"] = eval_results["random_min"]["model_name"]
+    eval_results["random_median"]["reward_model"] = eval_results["random_median"]["model_name"]
+    del eval_results["random_max"]["model_name"]
+    del eval_results["random_min"]["model_name"]
+    del eval_results["random_median"]["model_name"]
+
+
+if __name__ == "__main__":
+    eval_results = {}
+    # extract_random(eval_results)
+    api = HfApi()
+    with open("bon_data/rm_mapping.json") as f:
+        rm_mapping = json.load(f)
+    for pretty_rm_name in rm_mapping:
+        rm_result_path = rm_mapping[pretty_rm_name]["localpath"]
+        print(f"Running evaluation for {pretty_rm_name} with results file {rm_result_path}")
+        with open(rm_result_path) as f:
+            text_content = f.read()
+        # The RM result files store concatenated JSON objects (one per candidate);
+        # wrap them into a JSON list before parsing.
+        text_content = text_content.replace("}\n{", "},\n{")
+        text_content = "[" + text_content + "]"
+        rm_result = json.loads(text_content)
+        eval_result = compute_rm_bon_eval(pretty_rm_name, rm_result, model_outputs_all, annotations_all)
+        eval_results[pretty_rm_name] = eval_result
+        print(eval_result)
+    with open("bon_data/bon_eval_results.json", "w") as f:
+        json.dump(eval_results, f, indent=2)
+    # save the aggregated results to the RewardBench results repo on the Hugging Face Hub
+    # https://huggingface.co/datasets/allenai/reward-bench-results/tree/main/best-of-n/alpaca_eval/tulu-13b
+    api.upload_file(
+        path_or_fileobj="bon_data/bon_eval_results.json",
+        path_in_repo="best-of-n/alpaca_eval/tulu-13b/bon_eval_results.json",
+        repo_id="allenai/reward-bench-results",
+        repo_type="dataset",
+    )
\ No newline at end of file
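
After `python rm_bon_eval.py` finishes, a quick way to inspect the aggregated results is sketched below. It assumes the row fields produced by AlpacaEval 2's length-controlled leaderboard (e.g., `length_controlled_winrate` and `win_rate`), which are the columns `rm_bon_eval.py` sorts and prints.

```python
import json

# bon_eval_results.json maps each reward model's pretty name to its leaderboard row.
with open("bon_data/bon_eval_results.json") as f:
    results = json.load(f)

# Rank reward models by the length-controlled win rate of their BoN selections.
ranked = sorted(results.items(), key=lambda kv: kv[1]["length_controlled_winrate"], reverse=True)
for rm_name, row in ranked:
    print(f"{rm_name}: LC win rate = {row['length_controlled_winrate']:.2f}, win rate = {row['win_rate']:.2f}")
```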