diff --git a/bon_eval/.gitignore b/bon_eval/.gitignore
new file mode 100644
index 00000000..4d554968
--- /dev/null
+++ b/bon_eval/.gitignore
@@ -0,0 +1,2 @@
+bon_data/
+rm_bon_eval_results/
\ No newline at end of file
diff --git a/bon_eval/README.md b/bon_eval/README.md
new file mode 100644
index 00000000..7a3ba7ec
--- /dev/null
+++ b/bon_eval/README.md
@@ -0,0 +1,46 @@
+# Best-of-N Evaluation with RewardBench
+
+This folder contains the scripts for evaluating how well reward models select the best output from a set of N sampled outputs (i.e., best-of-N sampling, a.k.a. rejection sampling).
+
+## Download BoN data
+
+```bash
+cd bon_eval
+pip install alpaca_eval==0.6.2
+python download_bon_data.py
+python model_mapping_gen.py
+python rm_bon_eval.py
+```
+
+The last two commands build the reward-model file mapping (`bon_data/rm_mapping.json`) and run the BoN evaluation described below.
+
+## AlpacaEval-BoN (notes)
+
+- LLM: Tulu-2-dpo-13b
+- N: 16
+- Judge: GPT-4-turbo (same as the standard AlpacaEval 2 with length control)
+- Reference: GPT-3.5-turbo (`bon_data/gpt-3.5-turbo-0613.ae.json`)
+
+We sample N=16 outputs per instruction from Tulu-2-dpo-13b. The outputs are split into 16 "virtual model" files: `bon_data/alpaca_eval_n=16/virtual/tulu-2-dpo-13b.[x].json`, where `[x]` ranges from 0 to 15.
+
+We use AlpacaEval with the GPT-4-turbo judge to evaluate each virtual model.
+Win rates are computed against the GPT-3.5-turbo reference outputs.
+The AlpacaEval annotations are stored in `bon_data/alpaca_eval_n=16/virtual/annotations_ref=GPT35t/tulu-2-dpo-13b.[x]/weighted_alpaca_eval_gpt4_turbo/annotations.json`.
+
+If you'd like to reproduce the evaluation annotations:
+```bash
+for i in {0..15}
+do
+  output_dir="bon_data/alpaca_eval_n=16/virtual/annotations_ref=GPT35t/tulu-2-dpo-13b.$i/"
+  mkdir -p $output_dir
+  alpaca_eval --reference_outputs "bon_data/gpt-3.5-turbo-0613.ae.json" \
+    --model_outputs "bon_data/alpaca_eval_n=16/virtual/tulu-2-dpo-13b.$i.json" \
+    --output_path $output_dir
+done
+```
+
+## Evaluation
+
+Run `python rm_bon_eval.py` to produce `bon_data/bon_eval_results.json`, which is also uploaded to the Hugging Face Hub (`allenai/reward-bench-results`). For intuition, the selection rule applied for each reward model is sketched below.
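+
+The sketch is illustrative rather than the exact code in `rm_bon_eval.py`: it assumes that each record in a reward model's result file carries an `id` of the form `[example_id, candidate_id]` and a `scores` list, and `<some-rm>.json` is a placeholder for any file listed in `bon_data/rm_mapping.json`.
+
+```python
+import json
+
+# Placeholder path; substitute any reward model file from bon_data/rm_mapping.json.
+rm_result_path = "bon_data/rm_bon_eval_results/best-of-n/alpaca_eval/tulu-13b/<some-rm>.json"
+
+# The RM result files are concatenated JSON objects (one per candidate),
+# e.g. {"id": [example_id, candidate_id], "scores": [7.94921875], ...}.
+with open(rm_result_path) as f:
+    records = json.loads("[" + f.read().replace("}\n{", "},\n{") + "]")
+
+# Best-of-N selection: for each of the 805 prompts, keep the candidate with the
+# highest reward-model score among its 16 samples.
+groups = [records[i:i + 16] for i in range(0, len(records), 16)]
+selections = [max(group, key=lambda r: r["scores"][0]) for group in groups]
+```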
diff --git a/analysis/bon_to_alpacaeval.py b/bon_eval/bon_to_alpacaeval.py
similarity index 100%
rename from analysis/bon_to_alpacaeval.py
rename to bon_eval/bon_to_alpacaeval.py
diff --git a/bon_eval/bon_utils.py b/bon_eval/bon_utils.py
new file mode 100644
index 00000000..6b960bc7
--- /dev/null
+++ b/bon_eval/bon_utils.py
@@ -0,0 +1,183 @@
+import logging
+from typing import Any, Optional, Sequence, Union
+
+import pandas as pd
+from alpaca_eval import annotators, constants, metrics, utils
+from alpaca_eval.types import AnyData, AnyLoadableDF, AnyPath
+
+# Source: modified from https://github.com/tatsu-lab/alpaca_eval/blob/c4a4ca716b4cab46701af759c244cb9d05772e15/src/alpaca_eval/main.py#L17C5-L17C5
+
+
+def evaluate(
+    model_outputs: Optional[AnyLoadableDF] = None,
+    reference_outputs: AnyLoadableDF = constants.ALPACAEVAL_REFERENCE_OUTPUTS,
+    annotators_config: AnyPath = constants.DEFAULT_ANNOTATOR_CONFIG,
+    name: Optional[str] = None,
+    output_path: Optional[Union[AnyPath, str]] = "auto",
+    precomputed_leaderboard: Optional[Union[str, AnyPath, AnyData]] = "auto",
+    is_overwrite_leaderboard: bool = False,
+    leaderboard_mode_to_print: Optional[Union[str, Sequence[str]]] = "minimal",
+    current_leaderboard_mode: str = "community",
+    is_return_instead_of_print: bool = False,
+    fn_metric: Union[str, callable] = "get_length_controlled_winrate" if constants.IS_ALPACA_EVAL_2 else "get_winrate",
+    metric_kwargs: Optional[dict[str, Any]] = None,
+    is_recompute_metrics_only: bool = False,
+    sort_by: str = "length_controlled_winrate" if constants.IS_ALPACA_EVAL_2 else "win_rate",
+    is_cache_leaderboard: Optional[bool] = None,
+    max_instances: Optional[int] = None,
+    annotation_kwargs: Optional[dict[str, Any]] = None,
+    Annotator=annotators.PairwiseAnnotator,
+    annotations_file: Optional[AnyPath] = None,
+    **annotator_kwargs,
+):
+    """Evaluate a model based on its outputs. This is the default entrypoint if no command is specified.
+
+    Parameters
+    ----------
+    model_outputs : path or data or dict
+        The outputs of the model to add to the leaderboard. Accepts data (list of dictionary, pd.dataframe,
+        datasets.Dataset) or a path to read those (json, csv, tsv) or a function to generate those. Each dictionary
+        (or row of dataframe) should contain the keys that are formatted in the prompts. E.g. by default `instruction`
+        and `output` with optional `input`. If None, we just print the leaderboard.
+
+    reference_outputs : path or data, optional
+        The outputs of the reference model. Same format as `model_outputs`. If None, the reference outputs are a
+        specific set of Davinci 003 outputs on the AlpacaEval set:
+        https://huggingface.co/datasets/tatsu-lab/alpaca_eval.
+
+    annotators_config : path or list of dict, optional
+        The path to (or list of dicts of) the annotator's config file. For details see the docstring of
+        `PairwiseAnnotator`.
+
+    name : str, optional
+        The name of the model to add to the leaderboard. If None, we check whether `generator` is in `model_outputs`;
+        if not, we use "Current model".
+
+    output_path : path, optional
+        Path to the directory where the new leaderboard and the annotations should be stored. If None we don't save.
+        If `auto` we use `model_outputs` if it is a path, and otherwise use the directory from which we call the
+        script.
+
+    precomputed_leaderboard : path or data, optional
+        The precomputed leaderboard or a path to it (json, csv, or tsv). The leaderboard should contain at least the
+        column `win_rate`. If `auto` we will try to use the corresponding leaderboard for the reference outputs (only
+        if in CORRESPONDING_OUTPUTS_LEADERBOARDS). If `None` we won't add other models from the leaderboard.
+
+    is_overwrite_leaderboard : bool, optional
+        Whether to overwrite the leaderboard if the model is already in it.
+
+    leaderboard_mode_to_print : {"minimal", "verified", "community", None} or list, optional
+        The mode of the leaderboard to use. Only used if the precomputed leaderboard has a column `mode`, in which
+        case it will filter the leaderboard by this mode. If None, keeps all. If a list, prints all the models in the
+        list.
+
+    current_leaderboard_mode : {"minimal", "verified", "community"}, optional
+        The mode of the leaderboard for the current method.
+
+    is_return_instead_of_print : bool, optional
+        Whether to return the metrics instead of printing the results.
+
+    fn_metric : str or callable, optional
+        The function or function name in `metrics` that converts preferences to metrics. The function should take a
+        sequence of dict annotations, each with a preference key (1.5 for a draw, 1 when the baseline wins, 2 when the
+        compared model wins), and return a dictionary of metrics and the key by which to sort the leaderboard. Common
+        choices: `get_winrate`, `get_length_controlled_winrate`, `get_length_controlled_elo`.
+
+    metric_kwargs : dict, optional
+        Additional arguments to pass to `fn_metric`.
+
+    is_recompute_metrics_only : bool, optional
+        Whether to recompute the metrics. Useful if all you want is to recompute the metrics without re-annotating.
+
+    sort_by : str, optional
+        The key by which to sort the leaderboard.
+
+    is_cache_leaderboard : bool, optional
+        Whether to save the resulting leaderboard to `precomputed_leaderboard`. If None, we save only if
+        `max_instances` is not None. A preferred way of adding models to the leaderboard is to set
+        `precomputed_leaderboard` to the previously saved leaderboard at `/leaderboard.csv`.
+
+    max_instances : int, optional
+        The maximum number of instances to annotate. Useful for testing.
+
+    annotation_kwargs : dict, optional
+        Additional arguments to pass to `PairwiseAnnotator.annotate_head2head`.
+
+    Annotator : class, optional
+        The annotator class to use.
+
+    annotations_file : path, optional
+        Path to precomputed AlpacaEval annotations to load instead of re-annotating the outputs.
+
+    annotator_kwargs :
+        Additional arguments to pass to `PairwiseAnnotator`.
+    """
+    if (
+        isinstance(current_leaderboard_mode, str)
+        and current_leaderboard_mode not in constants.ORDERED_LEADERBOARD_MODES
+    ):
+        raise ValueError(f"current_leaderboard_mode should be one of {constants.ORDERED_LEADERBOARD_MODES}")
+
+    annotation_kwargs = annotation_kwargs or dict()
+
+    leaderboard, precomputed_leaderboard = utils.get_precomputed_leaderboard(
+        precomputed_leaderboard, reference_outputs, annotators_config
+    )
+    annotations = None
+
+    model_outputs = utils.load_or_convert_to_dataframe(model_outputs)
+    reference_outputs = utils.load_or_convert_to_dataframe(reference_outputs)
+    name = utils.get_generator_name(name, model_outputs)
+    leaderboard[name] = {}
+    if max_instances is not None:
+        # first we shuffle both outputs with a fixed seed => more representative
+        if len(model_outputs) != len(reference_outputs):
+            logging.warning(
+                "model_outputs and reference_outputs have different lengths, so we cannot shuffle before taking the first max_instances."
+            )
+        else:
+            seed = 123
+            model_outputs = model_outputs.sample(frac=1, random_state=seed)
+            reference_outputs = reference_outputs.sample(frac=1, random_state=seed)
+
+        model_outputs = model_outputs[:max_instances]
+        reference_outputs = reference_outputs[:max_instances]
+
+    leaderboard[name]["mode"] = current_leaderboard_mode
+    leaderboard[name]["avg_length"] = int(model_outputs["output"].str.len().mean())
+    # Load the precomputed annotations instead of running the annotator.
+    annotations = pd.read_json(annotations_file)
+    if isinstance(fn_metric, str):
+        fn_metric_ = getattr(metrics, fn_metric)
+    else:
+        fn_metric_ = fn_metric
+
+    leaderboard[name].update(fn_metric_(annotations, **(metric_kwargs or {})))
+
+    df_leaderboard = pd.DataFrame.from_dict(leaderboard, orient="index").sort_values(by=sort_by, ascending=False)
+    df_leaderboard = df_leaderboard[
+        utils.prioritize_elements(list(df_leaderboard.columns), ["win_rate", "standard_error"])
+    ]
+
+    if is_cache_leaderboard is None:
+        is_cache_leaderboard = max_instances is None
+
+    if is_cache_leaderboard:
+        if isinstance(precomputed_leaderboard, AnyPath):
+            logging.info(f"Saving result to the precomputed leaderboard at {precomputed_leaderboard}")
+            df_leaderboard.to_csv(precomputed_leaderboard)
+        else:
+            logging.info(
+                f"Not saving the result to the cached leaderboard because precomputed_leaderboard is not a "
+                f"path but {type(precomputed_leaderboard)}."
+            )
+
+    if is_return_instead_of_print:
+        return df_leaderboard, annotations
+    else:
+        utils.print_leaderboard(
+            df_leaderboard,
+            leaderboard_mode_to_print,
+            current_name=name,
+            cols_to_print=[sort_by, "win_rate", "standard_error", "n_total", "avg_length"],
+        )
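+
+# Usage sketch (mirrors how rm_bon_eval.py calls this entry point with
+# precomputed annotations instead of re-running the GPT-4-turbo judge;
+# "<rm>" below is a placeholder for a reward model's pretty name):
+#
+#     df_leaderboard, annotations = evaluate(
+#         model_outputs="rm_bon_eval_results/<rm>.model_outputs.json",
+#         annotations_file="rm_bon_eval_results/<rm>.annotations.json",
+#         is_return_instead_of_print=True,
+#     )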
diff --git a/bon_eval/download_bon_data.py b/bon_eval/download_bon_data.py
new file mode 100644
index 00000000..84af51f6
--- /dev/null
+++ b/bon_eval/download_bon_data.py
@@ -0,0 +1,8 @@
+from huggingface_hub import hf_hub_download, snapshot_download
+import os
+
+# Download the BoN candidates, the GPT-3.5-turbo reference outputs, and the precomputed RM scores.
+repo_id = "ai2-adapt-dev/HERM_BoN_candidates"
+local_dir = "bon_data/"
+os.makedirs(local_dir, exist_ok=True)
+snapshot_download(repo_id=repo_id, allow_patterns="alpaca_eval_n=16/*", local_dir=local_dir, repo_type="dataset")
+hf_hub_download(repo_id=repo_id, filename="gpt-3.5-turbo-0613.ae.json", local_dir=local_dir, repo_type="dataset")
+snapshot_download(repo_id="allenai/reward-bench-results", allow_patterns="best-of-n/alpaca_eval/tulu-13b/*", local_dir=f"{local_dir}/rm_bon_eval_results", repo_type="dataset")
\ No newline at end of file
diff --git a/bon_eval/make_virual_models.py b/bon_eval/make_virual_models.py
new file mode 100644
index 00000000..41e5f77d
--- /dev/null
+++ b/bon_eval/make_virual_models.py
@@ -0,0 +1,22 @@
+import json
+
+# Split the file with N=16 sampled outputs per instruction into 16 single-output
+# "virtual model" files so that each candidate set can be judged by AlpacaEval separately.
+N = 16
+input_file = "bon_data/alpaca_eval_n=16/tulu-2-dpo-13b.json"
+output_file = "bon_data/alpaca_eval_n=16/virtual/tulu-2-dpo-13b.{model_id}.json"
+
+with open(input_file, "r") as f:
+    data = json.load(f)
+
+virtual_models = {}
+for item in data:
+    for i in range(N):
+        item_copy = item.copy()
+        item_copy["generator"] = f'{item["generator"]}.{i}'
+        item_copy["output"] = item["output"][i]
+        if i not in virtual_models:
+            virtual_models[i] = []
+        virtual_models[i].append(item_copy)
+
+for i in range(N):
+    with open(output_file.format(model_id=i), "w") as f:
+        json.dump(virtual_models[i], f, indent=2)
diff --git a/bon_eval/model_mapping_gen.py b/bon_eval/model_mapping_gen.py
new file mode 100644
index 00000000..71c82c9b
--- /dev/null
+++ b/bon_eval/model_mapping_gen.py
@@ -0,0 +1,17 @@
+import os
+import json
+
+file_paths = {}
+# List all result files under `bon_data/rm_bon_eval_results/best-of-n/alpaca_eval/tulu-13b/`
+# and map each reward model name to its Hugging Face URL and local path.
+for root, dirs, files in os.walk("bon_data/rm_bon_eval_results/best-of-n/alpaca_eval/tulu-13b/"):
+    for file in files:
+        filepath = str(os.path.join(root, file))
+        if "/eval_results/" in filepath:
+            continue
+        rm_name = filepath.replace("bon_data/rm_bon_eval_results/best-of-n/alpaca_eval/tulu-13b/", "").replace(".json", "")
+        url = f"https://huggingface.co/datasets/allenai/reward-bench-results/raw/main/best-of-n/alpaca_eval/tulu-13b/{rm_name}.json"
+        if rm_name == "bon_eval_results":
+            continue
+        file_paths[rm_name] = {"url": url, "localpath": filepath}
+with open("bon_data/rm_mapping.json", "w") as f:
+    json.dump(file_paths, f, indent=4)
diff --git a/bon_eval/rm_bon_eval.py b/bon_eval/rm_bon_eval.py
new file mode 100644
index 00000000..8ca19d1b
--- /dev/null
+++ b/bon_eval/rm_bon_eval.py
@@ -0,0 +1,179 @@
+import json
+import os
+
+import pandas as pd
+from huggingface_hub import HfApi
+
+from bon_utils import evaluate
+
+# Load the outputs of the 16 virtual models (one file per candidate index).
+model_outputs_all = {}
+for i in range(16):
+    file_outputs = f"bon_data/alpaca_eval_n=16/virtual/tulu-2-dpo-13b.{i}.json"
+    with open(file_outputs) as f:
+        model_outputs = json.load(f)
+    model_outputs_all[i] = model_outputs
+
+# Load the AlpacaEval annotations of each virtual model.
+annotations_all = {}
+for i in range(16):
+    file_annotations = f"bon_data/alpaca_eval_n=16/virtual/annotations_ref=GPT35t/tulu-2-dpo-13b.{i}/weighted_alpaca_eval_gpt4_turbo/annotations.json"
+    with open(file_annotations) as f:
+        annotations = json.load(f)
+    annotations_all[i] = annotations
+
+
+def extract_score(score_item):
+    if isinstance(score_item, list):
+        return score_item[0]
+    elif isinstance(score_item, float):
+        return score_item
+    else:
+        raise ValueError("Invalid score item")
+
+
+def compute_rm_bon_eval(pretty_rm_name, rm_result, model_outputs_all, annotations_all, mode="rm"):
+    if mode == "rm":
+        assert len(rm_result) == 805 * 16, "The length of the result should be 805 * 16 but got {}".format(len(rm_result))
+
+        # Split the results into groups of 16 candidates, one group per example.
+        rm_result_grouped = [rm_result[i:i+16] for i in range(0, len(rm_result), 16)]
+
+        # Rank the candidates in each group by reward score and take the top one.
+        rm_bon_selection = []
+        for group in rm_result_grouped:
+            group = sorted(group, key=lambda x: extract_score(x["scores"]), reverse=True)
+            # select the top one as the submitted one
+            rm_bon_selection.append(group[0])
+    elif mode == "oracle":
+        pass  # not implemented yet
+    elif mode == "longest":
+        pass  # not implemented yet
+    elif mode == "shortest":
+        pass  # not implemented yet
+    # Example item in rm_bon_selection:
+    # {'config': 'top_p=0.9;temp=1.0', 'dataset_details': 'helpful_base', 'id': [0, 9], 'model': 'allenai/tulu-2-dpo-13b', 'scores': [7.94921875]}
+
+    # Build the selected (best-of-N) virtual model outputs and the corresponding
+    # AlpacaEval annotations, relabeling the generator with the reward model name.
+    rm_bon_model_outputs = []
+    rm_bon_annotations = []
+
+    for item in rm_bon_selection:
+        example_id = item["id"][0]
+        virtual_model_id = item["id"][1]
+        output_item = model_outputs_all[virtual_model_id][example_id].copy()
+        original_generator = output_item["generator"]
+        output_item["generator"] = pretty_rm_name + "-BoN"
+        anno_item = annotations_all[virtual_model_id][example_id].copy()
+        if anno_item["generator_1"] == original_generator:
+            anno_item["generator_1"] = pretty_rm_name + "-BoN"
+        elif anno_item["generator_2"] == original_generator:
+            anno_item["generator_2"] = pretty_rm_name + "-BoN"
+
+        rm_bon_model_outputs.append(output_item)
+        rm_bon_annotations.append(anno_item)
+
+    file_model_outputs = f"rm_bon_eval_results/{pretty_rm_name}.model_outputs.json"
+    file_annotations = f"rm_bon_eval_results/{pretty_rm_name}.annotations.json"
+    # create the folders if they do not exist
+    os.makedirs(os.path.dirname(file_model_outputs), exist_ok=True)
+    os.makedirs(os.path.dirname(file_annotations), exist_ok=True)
+    with open(file_model_outputs, "w") as f:
+        json.dump(rm_bon_model_outputs, f, indent=2)
+    with open(file_annotations, "w") as f:
+        json.dump(rm_bon_annotations, f, indent=2)
+
+    df_leaderboard, _ = evaluate(model_outputs=file_model_outputs, annotations_file=file_annotations, is_return_instead_of_print=True)
+    df_leaderboard = df_leaderboard.reset_index()
+
+    # Convert the dataframe to a list of row dicts and find the row of this reward model.
+    rm_row_json = df_leaderboard.to_dict(orient="records")
+    for row in rm_row_json:
+        if row["index"] == pretty_rm_name + "-BoN":
+            target_rm_row_json = row
+            break
+    target_rm_row_json["reward_model"] = pretty_rm_name
+    del target_rm_row_json["index"]
+    file_result = f"rm_bon_eval_results/eval_results/{pretty_rm_name}.json"
+    os.makedirs(os.path.dirname(file_result), exist_ok=True)
+    with open(file_result, "w") as f:
+        json.dump(target_rm_row_json, f, indent=2)
+
+    # TODO: check if uploaded path is correct
+    # `api` is created in the __main__ block below.
+    api.upload_file(
+        path_or_fileobj=file_result,
+        path_in_repo=f"best-of-n/alpaca_eval/tulu-13b/eval_results/{pretty_rm_name}.json",
+        repo_id="allenai/reward-bench-results",
+        repo_type="dataset",
+    )
+    return target_rm_row_json
+
+
+def extract_random(eval_results):
+    table_file = "bon_data/alpaca_eval_n=16/virtual/annotations_ref=GPT35t/merged_leaderboard.csv"
+    # load the merged leaderboard as a dataframe and convert it to a list of dicts
+    df = pd.read_csv(table_file)
+    df_json = df.to_dict(orient="records")
+    # find the rows with the maximum, minimum, and median length_controlled_winrate values
+    eval_results["random_max"] = max(df_json, key=lambda x: x["length_controlled_winrate"])
+    eval_results["random_min"] = min(df_json, key=lambda x: x["length_controlled_winrate"])
+    length_controlled_winrate_values = [x["length_controlled_winrate"] for x in df_json]
+    median_value = sorted(length_controlled_winrate_values)[len(length_controlled_winrate_values)//2]
+    eval_results["random_median"] = [x for x in df_json if x["length_controlled_winrate"] == median_value][0]
+    # give the average values of all columns
+    # eval_results["random_avg"] = {}
+    # for column in df.columns:
+    #     if column == "index":
+    #         continue
+    #     values = [x[column] for x in df_json]
+    #     avg_value = sum(values) / len(values)
+    #     eval_results["random_avg"][column] = avg_value
+    # rename model_name to reward_model
+    eval_results["random_max"]["reward_model"] = eval_results["random_max"]["model_name"]
+    eval_results["random_min"]["reward_model"] = eval_results["random_min"]["model_name"]
+    eval_results["random_median"]["reward_model"] = eval_results["random_median"]["model_name"]
+    del eval_results["random_max"]["model_name"]
+    del eval_results["random_min"]["model_name"]
+    del eval_results["random_median"]["model_name"]
+
+
+if __name__ == "__main__":
+    eval_results = {}
+    # extract_random(eval_results)
+    api = HfApi()
+    with open("bon_data/rm_mapping.json") as f:
+        rm_mapping = json.load(f)
+    for pretty_rm_name in rm_mapping:
+        rm_result_path = rm_mapping[pretty_rm_name]["localpath"]
+        print(f"Running evaluation for {pretty_rm_name} with results file {rm_result_path}")
+        with open(rm_result_path) as f:
+            text_content = f.read()
+        # The RM result files store concatenated JSON objects (one per candidate);
+        # wrap them into a JSON list before parsing.
+        text_content = text_content.replace("}\n{", "},\n{")
+        text_content = "[" + text_content + "]"
+        rm_result = json.loads(text_content)
+        eval_result = compute_rm_bon_eval(pretty_rm_name, rm_result, model_outputs_all, annotations_all)
+        eval_results[pretty_rm_name] = eval_result
+        print(eval_result)
+    with open("bon_data/bon_eval_results.json", "w") as f:
+        json.dump(eval_results, f, indent=2)
+    # save the aggregated results to the RewardBench results repo on the Hugging Face Hub
+    # https://huggingface.co/datasets/allenai/reward-bench-results/tree/main/best-of-n/alpaca_eval/tulu-13b
+    api.upload_file(
+        path_or_fileobj="bon_data/bon_eval_results.json",
+        path_in_repo="best-of-n/alpaca_eval/tulu-13b/bon_eval_results.json",
+        repo_id="allenai/reward-bench-results",
+        repo_type="dataset",
+    )
\ No newline at end of file
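
After `python rm_bon_eval.py` finishes, a quick way to inspect the aggregated results is sketched below. It assumes the row fields produced by AlpacaEval 2's length-controlled leaderboard (e.g., `length_controlled_winrate` and `win_rate`), which are the columns `rm_bon_eval.py` sorts and prints.

```python
import json

# bon_eval_results.json maps each reward model's pretty name to its leaderboard row.
with open("bon_data/bon_eval_results.json") as f:
    results = json.load(f)

# Rank reward models by the length-controlled win rate of their BoN selections.
ranked = sorted(results.items(), key=lambda kv: kv[1]["length_controlled_winrate"], reverse=True)
for rm_name, row in ranked:
    print(f"{rm_name}: LC win rate = {row['length_controlled_winrate']:.2f}, win rate = {row['win_rate']:.2f}")
```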