From 465bb893e3b4f773b842215e9483fa2714e32ba6 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Mon, 12 Feb 2024 23:07:37 +0000 Subject: [PATCH 1/4] init --- README.md | 18 ++--- scripts/run_dpo.py | 21 +++--- scripts/run_rm.py | 141 ++++++++++++++++++++++-------------- scripts/submit_eval_jobs.py | 14 ++++ 4 files changed, 121 insertions(+), 73 deletions(-) diff --git a/README.md b/README.md index 2cf372d5..fc31cd02 100644 --- a/README.md +++ b/README.md @@ -37,17 +37,17 @@ For reference on Chat Templates, many models follow the base / sft model termino I was debugging with default gpt2, but the random head may be causing numerical stability issues. Next: ``` -python scripts/run_rm.py --model=openbmb/UltraRM-13b --chat_template=billa --batch_size=8 --direct_load -python scripts/run_rm.py --model=OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5 --chat_template=oasst_pythia --direct_load -python scripts/run_rm.py --model=OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1 --chat_template=oasst_pythia --direct_load +python scripts/run_rm.py --model=openbmb/UltraRM-13b --chat_template=billa --batch_size=8 +python scripts/run_rm.py --model=OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5 --chat_template=oasst_pythia +python scripts/run_rm.py --model=OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1 --chat_template=oasst_pythia python scripts/run_rm.py --model=OpenAssistant/reward-model-deberta-v3-large-v2 --chat_template=raw python scripts/run_rm.py --model=weqweasdas/hh_rlhf_rm_open_llama_3b --chat_template=Robin -python scripts/run_rm.py --model=llm-blender/PairRM-hf --direct_load -python scripts/run_rm.py --model=berkeley-nest/Starling-RM-7B-alpha --tokenizer=meta-llama/Llama-2-7b-chat-hf --chat_template=llama-2 --direct_load --batch_size=16 -python scripts/run_rm.py --model=stanfordnlp/SteamSHP-flan-t5-xl --direct_load --batch_size=32 -python scripts/run_rm.py --model=PKU-Alignment/beaver-7b-v1.0-reward --chat_template=pku-align --direct_load --batch_size=16 -python scripts/run_rm.py --model=PKU-Alignment/beaver-7b-v1.0-cost --chat_template=pku-align --direct_load --batch_size=16 -python scripts/run_rm.py --model=IDEA-CCNL/Ziya-LLaMA-7B-Reward --batch_size=32 --direct_load --trust_remote_code --chat_template=Ziya # custom code causing cuda issues +python scripts/run_rm.py --model=llm-blender/PairRM-hf +python scripts/run_rm.py --model=berkeley-nest/Starling-RM-7B-alpha --tokenizer=meta-llama/Llama-2-7b-chat-hf --chat_template=llama-2 --batch_size=16 +python scripts/run_rm.py --model=stanfordnlp/SteamSHP-flan-t5-xl --batch_size=32 +python scripts/run_rm.py --model=PKU-Alignment/beaver-7b-v1.0-reward --chat_template=pku-align --batch_size=16 +python scripts/run_rm.py --model=PKU-Alignment/beaver-7b-v1.0-cost --chat_template=pku-align --batch_size=16 +python scripts/run_rm.py --model=IDEA-CCNL/Ziya-LLaMA-7B-Reward --batch_size=32 --trust_remote_code --chat_template=Ziya # custom code causing cuda issues ``` And for DPO: diff --git a/scripts/run_dpo.py b/scripts/run_dpo.py index 5a7f07ca..ef5543f4 100644 --- a/scripts/run_dpo.py +++ b/scripts/run_dpo.py @@ -46,11 +46,8 @@ def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--model", type=str, default="natolambert/gpt2-dummy-rm", help="path to model") parser.add_argument("--ref_model", type=str, default="natolambert/gpt2-dummy-rm", help="path to model") - parser.add_argument( - "--tokenizer", type=str, default=None, help="path to non-matching tokenizer, requires --direct_load" - ) + parser.add_argument("--tokenizer", type=str, 
default=None, help="path to non-matching tokenizer")
     parser.add_argument("--chat_template", type=str, default="tulu", help="path to chat template")
-    parser.add_argument("--direct_load", action="store_true", help="directly load model instead of pipeline")
     parser.add_argument("--do_not_save", action="store_true", help="do not save results to hub (for debugging)")
     parser.add_argument("--batch_size", type=int, default=64, help="batch size for inference")
+    parser.add_argument("--debug", action="store_true", help="run on only 10 examples for debugging")
     parser.add_argument(
@@ -171,26 +168,28 @@ def main():
     ############################
     # add column for results for easy printing
     out_dataset = dataset.add_column("results", results)
+    # add subsets back (removed so it's not handled by cuda)
+    out_dataset = out_dataset.add_column("subset", subsets)
 
-    results = {}
-    results["model"] = args.model
-    results["chat_template"] = args.chat_template
-    # print per subset and log into results file
+    results_grouped = {}
+    results_grouped["model"] = args.model
+    results_grouped["chat_template"] = args.chat_template
+    # print per subset and log into results_grouped file
     present_subsets = np.unique(subsets)
     for subset in present_subsets:
         subset_dataset = out_dataset.filter(lambda example: example["subset"] == subset)
         num_correct = sum(subset_dataset["results"])
         num_total = len(subset_dataset["results"])
         print(f"{subset}: {num_correct}/{num_total} ({num_correct/num_total})")
-        results[subset] = num_correct / num_total
+        results_grouped[subset] = num_correct / num_total
 
     ############################
     # Upload results to hub
     ############################
     # Save results locally (results/results.json)\
-    dumped = json.dumps(results, indent=4, sort_keys=True, default=str)
+    dumped = json.dumps(results_grouped, indent=4, sort_keys=True, default=str)
     logger.info(f"Stored local JSON data {dumped}.")
-    path = f"results/{args.model}.json"
+    path = "results/metrics.json"
     dirname = os.path.dirname(path)
 
     if dirname != "":
diff --git a/scripts/run_rm.py b/scripts/run_rm.py
index e84148fd..c6ca6856 100644
--- a/scripts/run_rm.py
+++ b/scripts/run_rm.py
@@ -55,13 +55,8 @@ def get_args():
     """
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", type=str, default="natolambert/gpt2-dummy-rm", help="path to model")
-    parser.add_argument(
-        "--tokenizer", type=str, default=None, help="path to non-matching tokenizer, requires --direct_load"
-    )
+    parser.add_argument("--tokenizer", type=str, default=None, help="path to non-matching tokenizer for the model")
     parser.add_argument("--chat_template", type=str, default="tulu", help="path to chat template")
-    parser.add_argument(
-        "--direct_load", action="store_true", default=False, help="directly load model instead of pipeline"
-    )
     parser.add_argument(
         "--trust_remote_code", action="store_true", default=False, help="trust remote code when loading the model"
     )
@@ -70,6 +65,9 @@ def get_args():
     parser.add_argument(
         "--pref_sets", action="store_true", help="run on common preference sets instead of our custom eval set"
     )
+    parser.add_argument(
+        "--debug", action="store_true", help="run on only 10 examples for debugging"
+    )
     args = parser.parse_args()
     return args
 
@@ -101,7 +99,7 @@ def main():
         custom_dialogue = True
         model_builder = DebertaV2PairRM.from_pretrained
         pipeline_builder = PairRMPipeline
-    elif "SHP" in args.model or "SHP" in args.chat_template:
+    elif "SteamSHP" in args.model or "SteamSHP" in args.chat_template:
         from herm.models.shp import SHPPipeline
 
         custom_dialogue = True
@@ -160,9 +158,19 @@ def main():
         custom_dialogue_formatting=custom_dialogue,
tokenizer=tokenizer, logger=logger, - keep_columns=["text_chosen", "text_rejected"], + keep_columns=["text_chosen", "text_rejected", "id"], ) + # copy id for saving, then remove + ids = dataset["id"] + dataset = dataset.remove_columns("id") + + # debug: use only 10 examples + if args.debug: + dataset = dataset.select(range(10)) + subsets = subsets[:10] + ids = ids[:10] + ############################ # Load reward model pipeline ############################ @@ -184,25 +192,13 @@ def main(): } else: model_kwargs = {"device_map": {"": current_device}} - # TODO remove direct load logic - # if pipeline_builder is pipeline, use built in pipeline, else custom - if args.direct_load or not pipeline_builder == pipeline: - model = model_builder(args.model, **model_kwargs, trust_remote_code=trust_remote_code) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - reward_pipe = pipeline_builder( - "text-classification", - model=model, - tokenizer=tokenizer, - ) - else: - reward_pipe = pipeline( - "text-classification", - model=args.model, - tokenizer=tokenizer, - revision="main", - model_kwargs=model_kwargs, - trust_remote_code=trust_remote_code, - ) + + model = model_builder(args.model, **model_kwargs, trust_remote_code=trust_remote_code) + reward_pipe = pipeline_builder( + "text-classification", + model=model, + tokenizer=tokenizer, + ) ############################ # Tokenization settings & dataset preparation @@ -217,7 +213,7 @@ def main(): ############################ # if using HF pipeline, can pass entire dataset and get results # first, handle custom pipelines that we must batch normally - if not args.direct_load or pipeline_builder == pipeline: + if pipeline_builder == pipeline: logger.info("*** Running forward pass via built in pipeline abstraction ***") # this setup can be optimized slightly with one pipeline call # prepare for inference @@ -227,11 +223,11 @@ def main(): results_cho = reward_pipe(dataset["text_chosen"], **reward_pipeline_kwargs) # extract scores from results which is list of dicts, e.g. [{'label': 'LABEL_1', 'score': 0.6826171875},... 
]
-        score_chosen = [result["score"] for result in results_cho]
-        score_rejected = [result["score"] for result in results_rej]
+        scores_chosen = [result["score"] for result in results_cho]
+        scores_rejected = [result["score"] for result in results_rej]
 
         # pairwise comparison list comprehension
-        results = [1 if chosen > rejected else 0 for chosen, rejected in zip(score_chosen, score_rejected)]
+        results = [1 if chosen > rejected else 0 for chosen, rejected in zip(scores_chosen, scores_rejected)]
 
     ############################
     # Run inference [2/2] custom pipelines
@@ -262,19 +258,18 @@ def custom_collate_fn(batch):
         reward_pipe.model = model
 
     results = []
+    scores_chosen = []
+    scores_rejected = []
     for step, batch in enumerate(tqdm(dataloader, desc="RM batch steps")):
         logger.info(f"RM inference step {step}/{len(dataloader)}")
 
-        if (
-            "PairRM" in args.model
-            or "PairRM" in args.chat_template
-            or "SHP" in args.model
-            or "SHP" in args.chat_template
-        ):
+        if "PairRM" in args.model or "SteamSHP" in args.model:
             text_rejected = [b["text_rejected"] for b in batch]
             text_chosen = [b["text_chosen"] for b in batch]
             results_sub = reward_pipe(text_chosen, text_rejected, **reward_pipeline_kwargs)
             [results.append(1) if result else results.append(0) for result in results_sub.cpu().numpy().tolist()]
+            scores_chosen.extend([None] * len(results_sub))
+            scores_rejected.extend([None] * len(results_sub))
         else:
             rewards_chosen = reward_pipe(batch["text_chosen"], **reward_pipeline_kwargs)
             rewards_rejected = reward_pipe(batch["text_rejected"], **reward_pipeline_kwargs)
 
@@ -283,49 +278,58 @@ def custom_collate_fn(batch):
             # extract score from dict within batched results (e.g. logits)
             # [{'label': 'LABEL_1', 'score': 0.6826171875},... ]
             if isinstance(rewards_chosen[0], dict):
-                score_chosen = [result["score"] for result in rewards_chosen]
-                score_rejected = [result["score"] for result in rewards_rejected]
+                score_chosen_batch = [result["score"] for result in rewards_chosen]
+                score_rejected_batch = [result["score"] for result in rewards_rejected]
             # for classes that directly output scores (custom code)
             else:
-                score_chosen = rewards_chosen.cpu().numpy().tolist()
-                score_rejected = rewards_rejected.cpu().numpy().tolist()
+                score_chosen_batch = rewards_chosen.cpu().numpy().tolist()
+                score_rejected_batch = rewards_rejected.cpu().numpy().tolist()
 
+            # log results
             [
                 results.append(1) if chosen > rejected else results.append(0)
-                for chosen, rejected in zip(score_chosen, score_rejected)
+                for chosen, rejected in zip(score_chosen_batch, score_rejected_batch)
             ]
+            scores_chosen.extend(score_chosen_batch)
+            scores_rejected.extend(score_rejected_batch)
 
     ############################
     # Print & process results
     ############################
     # add column for results for easy printing
     out_dataset = dataset.add_column("results", results)
+    # add subsets back (removed so it's not handled by cuda)
     out_dataset = out_dataset.add_column("subset", subsets)
+    out_dataset = out_dataset.add_column("id", ids)
 
-    results = {}
-    results["model"] = args.model
-    results["chat_template"] = args.chat_template
-    # print per subset and log into results file
+    # add scores_chosen and scores_rejected to the dataset
+    out_dataset = out_dataset.add_column("scores_chosen", scores_chosen)
+    out_dataset = out_dataset.add_column("scores_rejected", scores_rejected)
+
+    # get core dataset
+    results_grouped = {}
+    results_grouped["model"] = args.model
+    results_grouped["chat_template"] = args.chat_template
+
+    # print per subset and log into results_grouped file
present_subsets = np.unique(subsets) for subset in present_subsets: subset_dataset = out_dataset.filter(lambda example: example["subset"] == subset) num_correct = sum(subset_dataset["results"]) num_total = len(subset_dataset["results"]) print(f"{subset}: {num_correct}/{num_total} ({num_correct/num_total})") - results[subset] = num_correct / num_total + results_grouped[subset] = num_correct / num_total ############################ # Upload results to hub ############################ # Save results locally (results/results.json)\ - dumped = json.dumps(results, indent=4, sort_keys=True, default=str) + dumped = json.dumps(results_grouped, indent=4, sort_keys=True, default=str) logger.info(f"Stored local JSON data {dumped}.") - path = "results/metrics.json" + path = f"results/metrics/{args.model}.json" dirname = os.path.dirname(path) - - if dirname != "": - os.makedirs(dirname, exist_ok=True) + os.makedirs(dirname, exist_ok=True) # remove old data if os.path.isfile(path): @@ -336,16 +340,47 @@ def custom_collate_fn(batch): # Upload results as json if not args.do_not_save: + # upload core results sub_path = "eval-set/" if not args.pref_sets else "pref-sets/" scores_url = api.upload_file( path_or_fileobj=path, path_in_repo=sub_path + f"{args.model}.json", - repo_id=EVAL_REPO, # push to correct results repo + repo_id=EVAL_REPO if not args.debug else "ai2-adapt-dev/herm-debug", # push to correct results repo repo_type="dataset", commit_message=f"Add reward model scores for model {args.model}", ) logger.info(f"Uploaded reward model scores to {scores_url}") + # upload chosen-rejected with scores + if not ("PairRM" in args.model or "SteamSHP" in args.model): + # create new json with scores and upload + scores_dict = out_dataset.to_dict() + dumped = json.dumps(scores_dict, indent=4, sort_keys=True, default=str) + scores_path = f"results/scores/{args.model}.json" + dirname = os.path.dirname(scores_path) + os.makedirs(dirname, exist_ok=True) + + # remove old data + if os.path.isfile(scores_path): + os.remove(scores_path) + + with open(scores_path, "w") as f: + f.write(dumped) + + sub_path_scores = "eval-set-scores/" if not args.pref_sets else "pref-sets-scores/" + + scores_url = api.upload_file( + path_or_fileobj=scores_path, + path_in_repo=sub_path_scores + f"{args.model}.json", + repo_id=EVAL_REPO if not args.debug else "ai2-adapt-dev/herm-debug", # push to correct results repo + repo_type="dataset", + commit_message=f"Add chosen-rejected text with scores for model {args.model}", + ) + + logger.info("Uploading chosen-rejected text with scores") + else: + logger.info("Not uploading chosen-rejected text with scores due to model compatibility") + if __name__ == "__main__": main() diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py index 20da20f4..d6d66791 100644 --- a/scripts/submit_eval_jobs.py +++ b/scripts/submit_eval_jobs.py @@ -1,3 +1,17 @@ +# Copyright 2023 AllenAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
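
After this first patch, both run scripts end with the same per-subset aggregation loop. Because that logic is easy to lose in the diff noise, here is a minimal standalone sketch of what it computes; the subset names, results, and model name below are illustrative placeholders, not real benchmark output:

```python
# Sketch of the per-subset accuracy grouping in run_rm.py / run_dpo.py above.
# Subset names and the results list are invented for illustration.
import numpy as np

subsets = ["subset-a", "subset-a", "subset-b"]  # one entry per eval example
results = [1, 0, 1]  # 1 when the chosen completion out-scored the rejected one

results_grouped = {"model": "example/reward-model", "chat_template": "tulu"}
for subset in np.unique(subsets):
    idxs = [i for i, s in enumerate(subsets) if s == subset]
    num_correct = sum(results[i] for i in idxs)
    num_total = len(idxs)
    print(f"{subset}: {num_correct}/{num_total} ({num_correct / num_total})")
    results_grouped[subset] = num_correct / num_total  # accuracy per subset
```

The scripts do the same thing with `datasets.Dataset.filter`, which keeps the tally aligned with the `subset` column added back after inference.
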
+ import copy import os import subprocess From 315926193ed9a20080087adb87147f043754c611 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Mon, 12 Feb 2024 23:23:09 +0000 Subject: [PATCH 2/4] refactor saving --- herm/__init__.py | 9 ++++-- herm/utils.py | 40 +++++++++++++++++++++++++ scripts/run_rm.py | 76 +++++++++-------------------------------------- 3 files changed, 61 insertions(+), 64 deletions(-) diff --git a/herm/__init__.py b/herm/__init__.py index 2485a807..2ebbd270 100644 --- a/herm/__init__.py +++ b/herm/__init__.py @@ -14,6 +14,11 @@ __version__ = "0.1.0.dev" from .dpo import DPOInference -from .utils import load_eval_dataset, prepare_dialogue, prepare_dialogue_from_tokenizer +from .utils import ( + load_eval_dataset, + prepare_dialogue, + prepare_dialogue_from_tokenizer, + save_to_hub, +) -__all__ = [DPOInference, prepare_dialogue, prepare_dialogue_from_tokenizer, load_eval_dataset] +__all__ = [DPOInference, prepare_dialogue, prepare_dialogue_from_tokenizer, load_eval_dataset, save_to_hub] diff --git a/herm/utils.py b/herm/utils.py index 29469273..d377f0ef 100644 --- a/herm/utils.py +++ b/herm/utils.py @@ -12,16 +12,56 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import logging +import os from typing import Any, Dict, List from datasets import Dataset, concatenate_datasets, load_dataset from fastchat.conversation import Conversation +from huggingface_hub import HfApi from transformers import PreTrainedTokenizer CORE_EVAL_SET = "ai2-adapt-dev/rm-benchmark-dev" EXTRA_PREF_SETS = "allenai/pref-test-sets" +# data repo to upload results +EVAL_REPO = "ai2-adapt-dev/HERM-Results" + +# get token from HF_TOKEN env variable, but if it doesn't exist pass none +HF_TOKEN = os.getenv("HF_TOKEN", None) +api = HfApi(token=HF_TOKEN) + + +def save_to_hub(results_dict: Dict, model_name: str, target_path: str, debug: bool = False, local_only: bool = False): + dumped = json.dumps(results_dict, indent=4, sort_keys=True, default=str) + if "scores" in target_path: + scores_path = f"results/scores/{model_name}.json" + else: + scores_path = f"results/metrics/{model_name}.json" + + dirname = os.path.dirname(scores_path) + os.makedirs(dirname, exist_ok=True) + + # remove old data + if os.path.isfile(scores_path): + os.remove(scores_path) + + with open(scores_path, "w") as f: + f.write(dumped) + + if not local_only: + scores_url = api.upload_file( + path_or_fileobj=scores_path, + path_in_repo=target_path + f"{model_name}.json", + repo_id=EVAL_REPO if not debug else "ai2-adapt-dev/herm-debug", # push to correct results repo + repo_type="dataset", + commit_message=f"Add chosen-rejected text with scores for model {model_name}", + ) + return scores_url + else: + return None + def map_conversations_testsets(example): prompt = example["prompt"] diff --git a/scripts/run_rm.py b/scripts/run_rm.py index c6ca6856..8c959414 100644 --- a/scripts/run_rm.py +++ b/scripts/run_rm.py @@ -13,7 +13,6 @@ # limitations under the License. 
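
Patch 2 above replaces the duplicated save-and-upload blocks with the single `herm.save_to_hub` helper. A minimal usage sketch, assuming the `herm` package from this repo is installed; the model name is the scripts' dummy default and the metric values are placeholders:

```python
from herm import save_to_hub

# Placeholder metrics dict; a real run passes the computed results_grouped.
results_grouped = {"model": "natolambert/gpt2-dummy-rm", "chat_template": "tulu", "subset-a": 0.75}

# Writes results/metrics/natolambert/gpt2-dummy-rm.json locally. With
# local_only=True the hub upload is skipped and the function returns None;
# otherwise it returns the URL of the uploaded file.
url = save_to_hub(
    results_grouped,
    "natolambert/gpt2-dummy-rm",
    "eval-set/",  # or "pref-sets/" when evaluating the common preference sets
    debug=False,  # debug=True pushes to ai2-adapt-dev/herm-debug instead of EVAL_REPO
    local_only=True,
)
```

Because `target_path` is checked for the substring `"scores"`, the same helper routes grouped metrics to `results/metrics/` and per-example score dumps (passed with `eval-set-scores/` or `pref-sets-scores/`) to `results/scores/`.
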
 import argparse
-import json
 import logging
 import os
 import sys
@@ -24,7 +23,6 @@
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from fastchat.conversation import get_conv_template
-from huggingface_hub import HfApi
 from tqdm import tqdm
 from transformers import (
@@ -33,21 +31,16 @@
     pipeline,
 )
 
-from herm import load_eval_dataset
+from herm import load_eval_dataset, save_to_hub
 
 # get token from HF_TOKEN env variable, but if it doesn't exist pass none
 HF_TOKEN = os.getenv("HF_TOKEN", None)
-api = HfApi(token=HF_TOKEN)
-
 # this is necessary to automatically log in when running this script in docker/batch beaker jobs
 if HF_TOKEN is not None:
     from huggingface_hub._login import _login
 
     _login(token=HF_TOKEN, add_to_git_credential=False)
 
-# data repo to upload results
-EVAL_REPO = "ai2-adapt-dev/HERM-Results"
-
 
 def get_args():
     """
@@ -324,62 +317,21 @@ def custom_collate_fn(batch):
     ############################
     # Upload results to hub
     ############################
-    # Save results locally (results/results.json)\
-    dumped = json.dumps(results_grouped, indent=4, sort_keys=True, default=str)
-    logger.info(f"Stored local JSON data {dumped}.")
-    path = f"results/metrics/{args.model}.json"
-    dirname = os.path.dirname(path)
-    os.makedirs(dirname, exist_ok=True)
-
-    # remove old data
-    if os.path.isfile(path):
-        os.remove(path)
+    sub_path = "eval-set/" if not args.pref_sets else "pref-sets/"
+    results_url = save_to_hub(results_grouped, args.model, sub_path, args.debug, local_only=args.do_not_save)
+    if not args.do_not_save:
+        logger.info(f"Uploaded reward model results to {results_url}")
 
-    with open(path, "w") as f:
-        f.write(dumped)
+    # upload chosen-rejected with scores
+    if not ("PairRM" in args.model or "SteamSHP" in args.model):
+        # create new json with scores and upload
+        scores_dict = out_dataset.to_dict()
+        sub_path_scores = "eval-set-scores/" if not args.pref_sets else "pref-sets-scores/"
 
-    # Upload results as json
-    if not args.do_not_save:
-        # upload core results
-        sub_path = "eval-set/" if not args.pref_sets else "pref-sets/"
-        scores_url = api.upload_file(
-            path_or_fileobj=path,
-            path_in_repo=sub_path + f"{args.model}.json",
-            repo_id=EVAL_REPO if not args.debug else "ai2-adapt-dev/herm-debug",  # push to correct results repo
-            repo_type="dataset",
-            commit_message=f"Add reward model scores for model {args.model}",
-        )
-        logger.info(f"Uploaded reward model scores to {scores_url}")
-
-        # upload chosen-rejected with scores
-        if not ("PairRM" in args.model or "SteamSHP" in args.model):
-            # create new json with scores and upload
-            scores_dict = out_dataset.to_dict()
-            dumped = json.dumps(scores_dict, indent=4, sort_keys=True, default=str)
-            scores_path = f"results/scores/{args.model}.json"
-            dirname = os.path.dirname(scores_path)
-            os.makedirs(dirname, exist_ok=True)
-
-            # remove old data
-            if os.path.isfile(scores_path):
-                os.remove(scores_path)
-
-            with open(scores_path, "w") as f:
-                f.write(dumped)
-
-            sub_path_scores = "eval-set-scores/" if not args.pref_sets else "pref-sets-scores/"
-
-            scores_url = api.upload_file(
-                path_or_fileobj=scores_path,
-                path_in_repo=sub_path_scores + f"{args.model}.json",
-                repo_id=EVAL_REPO if not args.debug else "ai2-adapt-dev/herm-debug",  # push to correct results repo
-                repo_type="dataset",
-                commit_message=f"Add chosen-rejected text with scores for model {args.model}",
-            )
-
-            logger.info("Uploading chosen-rejected text with scores")
-        else:
-            logger.info("Not uploading chosen-rejected text with scores due to model compatibility")
+        scores_url = save_to_hub(scores_dict, args.model, sub_path_scores, args.debug, local_only=args.do_not_save)
+        if not args.do_not_save:
+            logger.info(f"Uploaded chosen-rejected text with scores to {scores_url}")
+    else:
+        logger.info("Not uploading chosen-rejected text with scores due to model compatibility")
 
 
 if __name__ == "__main__":
     main()

From 52a6b52402919697e7feadae217ca92b3ea078de Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Mon, 12 Feb 2024 23:27:02 +0000
Subject: [PATCH 3/4] update DPO

---
 herm/__init__.py   |  2 +-
 herm/utils.py      |  3 +++
 scripts/run_dpo.py | 67 +++++++++++++++++++++++-----------------------
 3 files changed, 38 insertions(+), 34 deletions(-)

diff --git a/herm/__init__.py b/herm/__init__.py
index 2ebbd270..c4c0534f 100644
--- a/herm/__init__.py
+++ b/herm/__init__.py
@@ -21,4 +21,4 @@
     save_to_hub,
 )
 
-__all__ = [DPOInference, prepare_dialogue, prepare_dialogue_from_tokenizer, load_eval_dataset, save_to_hub]
+__all__ = [DPOInference, load_eval_dataset, prepare_dialogue, prepare_dialogue_from_tokenizer, save_to_hub]
diff --git a/herm/utils.py b/herm/utils.py
index d377f0ef..d134085c 100644
--- a/herm/utils.py
+++ b/herm/utils.py
@@ -34,6 +34,9 @@
 
 
 def save_to_hub(results_dict: Dict, model_name: str, target_path: str, debug: bool = False, local_only: bool = False):
+    """
+    Utility for saving results in dict to the hub in programmatic organization.
+    """
     dumped = json.dumps(results_dict, indent=4, sort_keys=True, default=str)
     if "scores" in target_path:
         scores_path = f"results/scores/{model_name}.json"
diff --git a/scripts/run_dpo.py b/scripts/run_dpo.py
index ef5543f4..1281d8be 100644
--- a/scripts/run_dpo.py
+++ b/scripts/run_dpo.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import argparse
-import json
 import logging
 import os
 import sys
@@ -29,7 +28,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from trl.trainer.utils import DPODataCollatorWithPadding
 
-from herm import DPOInference, load_eval_dataset
+from herm import DPOInference, load_eval_dataset, save_to_hub
 
 # get token from HF_TOKEN env variable, but if it doesn't exist pass none
 HF_TOKEN = os.getenv("HF_TOKEN", None)
@@ -95,9 +94,19 @@ def main():
         conv=conv,
         tokenizer=tokenizer,
         logger=logger,
-        keep_columns=["text_chosen", "text_rejected"],
+        keep_columns=["text_chosen", "text_rejected", "id"],
     )
 
+    # copy id for saving, then remove
+    ids = dataset["id"]
+    dataset = dataset.remove_columns("id")
+
+    # debug: use only 10 examples
+    if args.debug:
+        dataset = dataset.select(range(10))
+        subsets = subsets[:10]
+        ids = ids[:10]
+
     ############################
     # Load reward model pipeline
     ############################
@@ -143,6 +152,8 @@ def main():
     )
 
     results = []
+    scores_chosen = []
+    scores_rejected = []
     for step, batch in enumerate(tqdm(dataloader, desc="RM batch steps")):
         logger.info(f"RM inference step {step}/{len(dataloader)}")
 
@@ -151,16 +162,16 @@ def main():
         # extract score from dict within batched results (e.g. logits)
         # [{'label': 'LABEL_1', 'score': 0.6826171875},... ]
         if isinstance(rewards_chosen[0], dict):
-            score_chosen = [result["score"] for result in rewards_chosen]
-            score_rejected = [result["score"] for result in rewards_rejected]
+            scores_chosen_batch = [result["score"] for result in rewards_chosen]
+            scores_rejected_batch = [result["score"] for result in rewards_rejected]
         # for classes that directly output scores (custom code)
         else:
-            score_chosen = rewards_chosen.cpu().numpy().tolist()
-            score_rejected = rewards_rejected.cpu().numpy().tolist()
+            scores_chosen_batch = rewards_chosen.cpu().numpy().tolist()
+            scores_rejected_batch = rewards_rejected.cpu().numpy().tolist()
 
         [
             results.append(1) if chosen > rejected else results.append(0)
-            for chosen, rejected in zip(score_chosen, score_rejected)
+            for chosen, rejected in zip(scores_chosen_batch, scores_rejected_batch)
         ]
+        scores_chosen.extend(scores_chosen_batch)
+        scores_rejected.extend(scores_rejected_batch)
 
     ############################
@@ -168,9 +179,14 @@ def main():
     ############################
     # add column for results for easy printing
     out_dataset = dataset.add_column("results", results)
+
+    # add subsets back (removed so it's not handled by cuda)
     out_dataset = out_dataset.add_column("subset", subsets)
 
+    # add scores_chosen and scores_rejected to the dataset
+    out_dataset = out_dataset.add_column("scores_chosen", scores_chosen)
+    out_dataset = out_dataset.add_column("scores_rejected", scores_rejected)
+
     results_grouped = {}
     results_grouped["model"] = args.model
     results_grouped["chat_template"] = args.chat_template
@@ -186,33 +202,18 @@ def main():
     ############################
     # Upload results to hub
    ############################
-    # Save results locally (results/results.json)\
-    dumped = json.dumps(results_grouped, indent=4, sort_keys=True, default=str)
-    logger.info(f"Stored local JSON data {dumped}.")
-    path = "results/metrics.json"
-    dirname = os.path.dirname(path)
-
-    if dirname != "":
-        os.makedirs(dirname, exist_ok=True)
-
-    # remove old data
-    if os.path.isfile(path):
-        os.remove(path)
+    sub_path = "eval-set/" if not args.pref_sets else "pref-sets/"
+    results_url = save_to_hub(results_grouped, args.model, sub_path, args.debug, local_only=args.do_not_save)
+    if not args.do_not_save:
+        logger.info(f"Uploaded reward model results to {results_url}")
 
-    with open(path, "w") as f:
-        f.write(dumped)
+    # upload chosen-rejected with scores
+    # create new json with scores and upload
+    scores_dict = out_dataset.to_dict()
+    sub_path_scores = "eval-set-scores/" if not args.pref_sets else "pref-sets-scores/"
 
-    # Upload results as json
-    if not args.do_not_save:
-        sub_path = "eval-set/" if not args.pref_sets else "pref-sets/"
-        scores_url = api.upload_file(
-            path_or_fileobj=path,
-            path_in_repo=sub_path + f"{args.model}.json",
-            repo_id=EVAL_REPO,  # push to correct results repo
-            repo_type="dataset",
-            commit_message=f"Add reward model scores for model {args.model}",
-        )
-        logger.info(f"Uploaded reward model scores to {scores_url}")
+    scores_url = save_to_hub(scores_dict, args.model, sub_path_scores, args.debug, local_only=args.do_not_save)
+    if not args.do_not_save:
+        logger.info(f"Uploaded chosen-rejected text with scores to {scores_url}")
 
 
 if __name__ == "__main__":

From d35874f6d9f32ab32a7b312e3750d01caa9f5519 Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Mon, 12 Feb 2024 23:46:42 +0000
Subject: [PATCH 4/4] nit

---
 herm/utils.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/herm/utils.py b/herm/utils.py
index d134085c..81599c03 100644
--- a/herm/utils.py
+++ b/herm/utils.py
@@ -40,8 +40,10 @@ def save_to_hub(results_dict: Dict, model_name: str, target_path: str, debug: bo
     dumped = json.dumps(results_dict, indent=4, sort_keys=True, default=str)
     if "scores" in target_path:
         scores_path = f"results/scores/{model_name}.json"
+        beaker_path = None
     else:
         scores_path = f"results/metrics/{model_name}.json"
+        beaker_path = "results/metrics.json"  # save format for AI2 beaker to show results
 
     dirname = os.path.dirname(scores_path)
     os.makedirs(dirname, exist_ok=True)
@@ -53,6 +55,11 @@ def save_to_hub(results_dict: Dict, model_name: str, target_path: str, debug: bo
     with open(scores_path, "w") as f:
         f.write(dumped)
 
+    # ai2 internal visualization, not needed externally
+    if beaker_path:
+        with open(beaker_path, "w") as f:
+            f.write(dumped)
+
     if not local_only:
         scores_url = api.upload_file(
             path_or_fileobj=scores_path,