Commit
Merge pull request #21 from allenai/save_scores
Save scores per prompt
natolambert authored Feb 13, 2024
2 parents a4eec4a + d35874f commit f441f9c
Showing 6 changed files with 189 additions and 133 deletions.
README.md (18 changes: 9 additions & 9 deletions)
@@ -37,17 +37,17 @@ For reference on Chat Templates, many models follow the base / sft model termino
I was debugging with default gpt2, but the random head may be causing numerical stability issues.
Next:
```
python scripts/run_rm.py --model=openbmb/UltraRM-13b --chat_template=billa --batch_size=8 --direct_load
python scripts/run_rm.py --model=OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5 --chat_template=oasst_pythia --direct_load
python scripts/run_rm.py --model=OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1 --chat_template=oasst_pythia --direct_load
python scripts/run_rm.py --model=openbmb/UltraRM-13b --chat_template=billa --batch_size=8
python scripts/run_rm.py --model=OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5 --chat_template=oasst_pythia
python scripts/run_rm.py --model=OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1 --chat_template=oasst_pythia
python scripts/run_rm.py --model=OpenAssistant/reward-model-deberta-v3-large-v2 --chat_template=raw
python scripts/run_rm.py --model=weqweasdas/hh_rlhf_rm_open_llama_3b --chat_template=Robin
python scripts/run_rm.py --model=llm-blender/PairRM-hf --direct_load
python scripts/run_rm.py --model=berkeley-nest/Starling-RM-7B-alpha --tokenizer=meta-llama/Llama-2-7b-chat-hf --chat_template=llama-2 --direct_load --batch_size=16
python scripts/run_rm.py --model=stanfordnlp/SteamSHP-flan-t5-xl --direct_load --batch_size=32
python scripts/run_rm.py --model=PKU-Alignment/beaver-7b-v1.0-reward --chat_template=pku-align --direct_load --batch_size=16
python scripts/run_rm.py --model=PKU-Alignment/beaver-7b-v1.0-cost --chat_template=pku-align --direct_load --batch_size=16
python scripts/run_rm.py --model=IDEA-CCNL/Ziya-LLaMA-7B-Reward --batch_size=32 --direct_load --trust_remote_code --chat_template=Ziya # custom code causing cuda issues
python scripts/run_rm.py --model=llm-blender/PairRM-hf
python scripts/run_rm.py --model=berkeley-nest/Starling-RM-7B-alpha --tokenizer=meta-llama/Llama-2-7b-chat-hf --chat_template=llama-2 --batch_size=16
python scripts/run_rm.py --model=stanfordnlp/SteamSHP-flan-t5-xl --batch_size=32
python scripts/run_rm.py --model=PKU-Alignment/beaver-7b-v1.0-reward --chat_template=pku-align --batch_size=16
python scripts/run_rm.py --model=PKU-Alignment/beaver-7b-v1.0-cost --chat_template=pku-align --batch_size=16
python scripts/run_rm.py --model=IDEA-CCNL/Ziya-LLaMA-7B-Reward --batch_size=32 --trust_remote_code --chat_template=Ziya # custom code causing cuda issues
```

And for DPO:
herm/__init__.py (9 changes: 7 additions & 2 deletions)
@@ -14,6 +14,11 @@

__version__ = "0.1.0.dev"
from .dpo import DPOInference
from .utils import load_eval_dataset, prepare_dialogue, prepare_dialogue_from_tokenizer
from .utils import (
load_eval_dataset,
prepare_dialogue,
prepare_dialogue_from_tokenizer,
save_to_hub,
)

__all__ = [DPOInference, prepare_dialogue, prepare_dialogue_from_tokenizer, load_eval_dataset]
__all__ = [DPOInference, load_eval_dataset, prepare_dialogue, prepare_dialogue_from_tokenizer, save_to_hub]
herm/utils.py (50 changes: 50 additions & 0 deletions)
@@ -12,16 +12,66 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import os
from typing import Any, Dict, List

from datasets import Dataset, concatenate_datasets, load_dataset
from fastchat.conversation import Conversation
from huggingface_hub import HfApi
from transformers import PreTrainedTokenizer

CORE_EVAL_SET = "ai2-adapt-dev/rm-benchmark-dev"
EXTRA_PREF_SETS = "allenai/pref-test-sets"

# data repo to upload results
EVAL_REPO = "ai2-adapt-dev/HERM-Results"

# get token from HF_TOKEN env variable, but if it doesn't exist pass none
HF_TOKEN = os.getenv("HF_TOKEN", None)
api = HfApi(token=HF_TOKEN)


def save_to_hub(results_dict: Dict, model_name: str, target_path: str, debug: bool = False, local_only: bool = False):
"""
    Utility for saving a results dict to the hub in a programmatic organization.
"""
dumped = json.dumps(results_dict, indent=4, sort_keys=True, default=str)
if "scores" in target_path:
scores_path = f"results/scores/{model_name}.json"
beaker_path = None
else:
scores_path = f"results/metrics/{model_name}.json"
beaker_path = "results/metrics.json" # save format for AI2 beaker to show results

dirname = os.path.dirname(scores_path)
os.makedirs(dirname, exist_ok=True)

# remove old data
if os.path.isfile(scores_path):
os.remove(scores_path)

with open(scores_path, "w") as f:
f.write(dumped)

    # ai2 internal visualization, not needed externally
if beaker_path:
with open(beaker_path, "w") as f:
f.write(dumped)

if not local_only:
scores_url = api.upload_file(
path_or_fileobj=scores_path,
path_in_repo=target_path + f"{model_name}.json",
repo_id=EVAL_REPO if not debug else "ai2-adapt-dev/herm-debug", # push to correct results repo
repo_type="dataset",
commit_message=f"Add chosen-rejected text with scores for model {model_name}",
)
return scores_url
else:
return None


def map_conversations_testsets(example):
prompt = example["prompt"]
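For context, here is a minimal usage sketch of the new `save_to_hub` helper defined above. The model name, chat template, and subset accuracy below are illustrative placeholders (the model/template strings are simply the run_dpo.py defaults), not values produced by this commit.

```python
from herm import save_to_hub  # exported from the package root as of this PR

# Hypothetical per-subset metrics dict; the subset key and value are invented.
results_grouped = {
    "model": "natolambert/gpt2-dummy-rm",
    "chat_template": "tulu",
    "alpacaeval-easy": 0.85,
}

# A target_path without "scores" routes the file to results/metrics/<model>.json
# locally; local_only=True skips the Hub upload and returns None, while
# debug=True would push to ai2-adapt-dev/herm-debug instead of HERM-Results.
url = save_to_hub(
    results_grouped,
    model_name="natolambert/gpt2-dummy-rm",
    target_path="eval-set/",
    debug=True,
    local_only=True,
)
```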
scripts/run_dpo.py (84 changes: 42 additions & 42 deletions)
@@ -13,7 +13,6 @@
# limitations under the License.

import argparse
import json
import logging
import os
import sys
@@ -29,7 +28,7 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl.trainer.utils import DPODataCollatorWithPadding

from herm import DPOInference, load_eval_dataset
from herm import DPOInference, load_eval_dataset, save_to_hub

# get token from HF_TOKEN env variable, but if it doesn't exist pass none
HF_TOKEN = os.getenv("HF_TOKEN", None)
@@ -46,11 +45,8 @@ def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="natolambert/gpt2-dummy-rm", help="path to model")
parser.add_argument("--ref_model", type=str, default="natolambert/gpt2-dummy-rm", help="path to model")
parser.add_argument(
"--tokenizer", type=str, default=None, help="path to non-matching tokenizer, requires --direct_load"
)
parser.add_argument("--tokenizer", type=str, default=None, help="path to non-matching tokenizer")
parser.add_argument("--chat_template", type=str, default="tulu", help="path to chat template")
parser.add_argument("--direct_load", action="store_true", help="directly load model instead of pipeline")
parser.add_argument("--do_not_save", action="store_true", help="do not save results to hub (for debugging)")
parser.add_argument("--batch_size", type=int, default=64, help="batch size for inference")
parser.add_argument(
@@ -98,9 +94,19 @@ def main():
conv=conv,
tokenizer=tokenizer,
logger=logger,
keep_columns=["text_chosen", "text_rejected"],
keep_columns=["text_chosen", "text_rejected", "id"],
)

# copy id for saving, then remove
ids = dataset["id"]
dataset = dataset.remove_columns("id")

# debug: use only 10 examples
if args.debug:
dataset = dataset.select(range(10))
subsets = subsets[:10]
ids = ids[:10]

############################
# Load reward model pipeline
############################
@@ -146,6 +152,8 @@
)

results = []
scores_chosen = []
scores_rejected = []
for step, batch in enumerate(tqdm(dataloader, desc="RM batch steps")):
logger.info(f"RM inference step {step}/{len(dataloader)}")

@@ -154,16 +162,16 @@
# extract score from dict within batched results (e.g. logits)
# [{'label': 'LABEL_1', 'score': 0.6826171875},... ]
if isinstance(rewards_chosen[0], dict):
score_chosen = [result["score"] for result in rewards_chosen]
score_rejected = [result["score"] for result in rewards_rejected]
scores_chosen_batch = [result["score"] for result in rewards_chosen]
scores_rejected_batch = [result["score"] for result in rewards_rejected]
# for classes that directly output scores (custom code)
else:
score_chosen = rewards_chosen.cpu().numpy().tolist()
score_rejected = rewards_rejected.cpu().numpy().tolist()
scores_chosen_batch = rewards_chosen.cpu().numpy().tolist()
scores_rejected_batch = rewards_rejected.cpu().numpy().tolist()

[
results.append(1) if chosen > rejected else results.append(0)
for chosen, rejected in zip(score_chosen, score_rejected)
for chosen, rejected in zip(scores_chosen_batch, scores_rejected_batch)
]
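As a toy illustration of the comparison above (scores are made up), each prompt contributes a 1 to `results` when the chosen completion's score beats the rejected one, and a 0 otherwise:

```python
# Invented batch scores; only the relative order within each pair matters.
scores_chosen_batch = [1.2, -0.3]
scores_rejected_batch = [0.4, 0.1]

results = [1 if c > r else 0 for c, r in zip(scores_chosen_batch, scores_rejected_batch)]
assert results == [1, 0]  # chosen wins the first pair, loses the second
```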

############################
@@ -172,48 +180,40 @@
# add column for results for easy printing
out_dataset = dataset.add_column("results", results)

results = {}
results["model"] = args.model
results["chat_template"] = args.chat_template
# print per subset and log into results file
# add subsets back (removed so it's not handled by cuda)
out_dataset = out_dataset.add_column("subset", subsets)

# add scores_chosen and scores_rejected to the dataset
out_dataset = out_dataset.add_column("scores_chosen", scores_chosen)
out_dataset = out_dataset.add_column("scores_rejected", scores_rejected)

results_grouped = {}
results_grouped["model"] = args.model
results_grouped["chat_template"] = args.chat_template
# print per subset and log into results_grouped file
present_subsets = np.unique(subsets)
for subset in present_subsets:
subset_dataset = out_dataset.filter(lambda example: example["subset"] == subset)
num_correct = sum(subset_dataset["results"])
num_total = len(subset_dataset["results"])
print(f"{subset}: {num_correct}/{num_total} ({num_correct/num_total})")
results[subset] = num_correct / num_total
results_grouped[subset] = num_correct / num_total

############################
# Upload results to hub
############################
# Save results locally (results/results.json)\
dumped = json.dumps(results, indent=4, sort_keys=True, default=str)
logger.info(f"Stored local JSON data {dumped}.")
path = f"results/{args.model}.json"
dirname = os.path.dirname(path)

if dirname != "":
os.makedirs(dirname, exist_ok=True)

# remove old data
if os.path.isfile(path):
os.remove(path)
sub_path = "eval-set/" if not args.pref_sets else "pref-sets/"
results_url = save_to_hub(results_grouped, args.model, sub_path, args.debug, local_only=args.do_not_save)
if not args.do_not_save:
logger.info(f"Uploaded reward model results to {results_url}")

with open(path, "w") as f:
f.write(dumped)
# upload chosen-rejected with scores
# create new json with scores and upload
scores_dict = out_dataset.to_dict()
sub_path_scores = "eval-set-scores/" if not args.pref_sets else "pref-sets-scores/"

# Upload results as json
if not args.do_not_save:
sub_path = "eval-set/" if not args.pref_sets else "pref-sets/"
scores_url = api.upload_file(
path_or_fileobj=path,
path_in_repo=sub_path + f"{args.model}.json",
repo_id=EVAL_REPO, # push to correct results repo
repo_type="dataset",
commit_message=f"Add reward model scores for model {args.model}",
)
logger.info(f"Uploaded reward model scores to {scores_url}")
scores_url = save_to_hub(scores_dict, args.model, sub_path_scores, args.debug)
logger.info(f"Uploading chosen-rejected text with scores to {scores_url}")


if __name__ == "__main__":
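To make the new per-prompt artifact concrete, here is a rough sketch of the dict that run_dpo.py now pushes to the `*-scores/` path. The column names mirror `out_dataset` as built above; every field value is invented for illustration:

```python
# Invented example of out_dataset.to_dict(): parallel lists, one entry per prompt.
scores_dict = {
    "text_chosen": ["<chosen conversation rendered with the chat template>"],
    "text_rejected": ["<rejected conversation rendered with the chat template>"],
    "results": [1],                 # 1 because the chosen score beat the rejected score
    "subset": ["alpacaeval-easy"],  # illustrative subset name
    "scores_chosen": [1.23],
    "scores_rejected": [-0.45],
}

# Mirrors the call above: per-prompt scores land under eval-set-scores/
# (or pref-sets-scores/ with --pref_sets); uploading requires an HF_TOKEN.
scores_url = save_to_hub(scores_dict, "natolambert/gpt2-dummy-rm", "eval-set-scores/", debug=True)
```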
