Save scores per prompt #21

Merged
merged 4 commits on Feb 13, 2024
Changes from 3 commits
18 changes: 9 additions & 9 deletions README.md
@@ -37,17 +37,17 @@ For reference on Chat Templates, many models follow the base / sft model terminology
I was debugging with default gpt2, but the random head may be causing numerical stability issues.
Next:
```
-python scripts/run_rm.py --model=openbmb/UltraRM-13b --chat_template=billa --batch_size=8 --direct_load
-python scripts/run_rm.py --model=OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5 --chat_template=oasst_pythia --direct_load
-python scripts/run_rm.py --model=OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1 --chat_template=oasst_pythia --direct_load
+python scripts/run_rm.py --model=openbmb/UltraRM-13b --chat_template=billa --batch_size=8
+python scripts/run_rm.py --model=OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5 --chat_template=oasst_pythia
+python scripts/run_rm.py --model=OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1 --chat_template=oasst_pythia
python scripts/run_rm.py --model=OpenAssistant/reward-model-deberta-v3-large-v2 --chat_template=raw
python scripts/run_rm.py --model=weqweasdas/hh_rlhf_rm_open_llama_3b --chat_template=Robin
-python scripts/run_rm.py --model=llm-blender/PairRM-hf --direct_load
-python scripts/run_rm.py --model=berkeley-nest/Starling-RM-7B-alpha --tokenizer=meta-llama/Llama-2-7b-chat-hf --chat_template=llama-2 --direct_load --batch_size=16
-python scripts/run_rm.py --model=stanfordnlp/SteamSHP-flan-t5-xl --direct_load --batch_size=32
-python scripts/run_rm.py --model=PKU-Alignment/beaver-7b-v1.0-reward --chat_template=pku-align --direct_load --batch_size=16
-python scripts/run_rm.py --model=PKU-Alignment/beaver-7b-v1.0-cost --chat_template=pku-align --direct_load --batch_size=16
-python scripts/run_rm.py --model=IDEA-CCNL/Ziya-LLaMA-7B-Reward --batch_size=32 --direct_load --trust_remote_code --chat_template=Ziya # custom code causing cuda issues
+python scripts/run_rm.py --model=llm-blender/PairRM-hf
+python scripts/run_rm.py --model=berkeley-nest/Starling-RM-7B-alpha --tokenizer=meta-llama/Llama-2-7b-chat-hf --chat_template=llama-2 --batch_size=16
+python scripts/run_rm.py --model=stanfordnlp/SteamSHP-flan-t5-xl --batch_size=32
+python scripts/run_rm.py --model=PKU-Alignment/beaver-7b-v1.0-reward --chat_template=pku-align --batch_size=16
+python scripts/run_rm.py --model=PKU-Alignment/beaver-7b-v1.0-cost --chat_template=pku-align --batch_size=16
+python scripts/run_rm.py --model=IDEA-CCNL/Ziya-LLaMA-7B-Reward --batch_size=32 --trust_remote_code --chat_template=Ziya # custom code causing cuda issues
```

And for DPO:
9 changes: 7 additions & 2 deletions herm/__init__.py
@@ -14,6 +14,11 @@

__version__ = "0.1.0.dev"
from .dpo import DPOInference
-from .utils import load_eval_dataset, prepare_dialogue, prepare_dialogue_from_tokenizer
+from .utils import (
+    load_eval_dataset,
+    prepare_dialogue,
+    prepare_dialogue_from_tokenizer,
+    save_to_hub,
+)

-__all__ = [DPOInference, prepare_dialogue, prepare_dialogue_from_tokenizer, load_eval_dataset]
+__all__ = [DPOInference, load_eval_dataset, prepare_dialogue, prepare_dialogue_from_tokenizer, save_to_hub]
43 changes: 43 additions & 0 deletions herm/utils.py
@@ -12,16 +12,59 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import os
from typing import Any, Dict, List

from datasets import Dataset, concatenate_datasets, load_dataset
from fastchat.conversation import Conversation
from huggingface_hub import HfApi
from transformers import PreTrainedTokenizer

CORE_EVAL_SET = "ai2-adapt-dev/rm-benchmark-dev"
EXTRA_PREF_SETS = "allenai/pref-test-sets"

# data repo to upload results
EVAL_REPO = "ai2-adapt-dev/HERM-Results"

# get token from HF_TOKEN env variable, but if it doesn't exist pass none
HF_TOKEN = os.getenv("HF_TOKEN", None)
api = HfApi(token=HF_TOKEN)


def save_to_hub(results_dict: Dict, model_name: str, target_path: str, debug: bool = False, local_only: bool = False):
    """
    Utility for saving results in dict to the hub in programmatic organization.
    """
    dumped = json.dumps(results_dict, indent=4, sort_keys=True, default=str)
    if "scores" in target_path:
        scores_path = f"results/scores/{model_name}.json"
    else:
        scores_path = f"results/metrics/{model_name}.json"

    dirname = os.path.dirname(scores_path)
    os.makedirs(dirname, exist_ok=True)

    # remove old data
    if os.path.isfile(scores_path):
        os.remove(scores_path)

    with open(scores_path, "w") as f:
        f.write(dumped)

    if not local_only:
        scores_url = api.upload_file(
            path_or_fileobj=scores_path,
            path_in_repo=target_path + f"{model_name}.json",
            repo_id=EVAL_REPO if not debug else "ai2-adapt-dev/herm-debug",  # push to correct results repo
            repo_type="dataset",
            commit_message=f"Add chosen-rejected text with scores for model {model_name}",
        )
        return scores_url
    else:
        return None


def map_conversations_testsets(example):
    prompt = example["prompt"]
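For orientation, here is a minimal sketch of how the new `save_to_hub` helper is meant to be called; the model name and score values below are made up for illustration, and `local_only=True` keeps everything on disk instead of pushing to the hub:

```python
from herm.utils import save_to_hub

model_name = "org/my-reward-model"  # hypothetical model name

# Per-subset accuracies: no "scores" in the target path, so the file lands in results/metrics/.
results_grouped = {"model": model_name, "chat_template": "tulu", "some-subset": 0.875}
save_to_hub(results_grouped, model_name, "eval-set/", debug=True, local_only=True)

# Per-prompt scores: "scores" is in the target path, so the file lands in results/scores/.
scores_dict = {"scores_chosen": [1.2, -0.3], "scores_rejected": [0.4, -1.1], "results": [1, 0]}
save_to_hub(scores_dict, model_name, "eval-set-scores/", debug=True, local_only=True)
```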
84 changes: 42 additions & 42 deletions scripts/run_dpo.py
@@ -13,7 +13,6 @@
# limitations under the License.

import argparse
-import json
import logging
import os
import sys
@@ -29,7 +28,7 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl.trainer.utils import DPODataCollatorWithPadding

-from herm import DPOInference, load_eval_dataset
+from herm import DPOInference, load_eval_dataset, save_to_hub

# get token from HF_TOKEN env variable, but if it doesn't exist pass none
HF_TOKEN = os.getenv("HF_TOKEN", None)
@@ -46,11 +45,8 @@ def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="natolambert/gpt2-dummy-rm", help="path to model")
    parser.add_argument("--ref_model", type=str, default="natolambert/gpt2-dummy-rm", help="path to model")
-    parser.add_argument(
-        "--tokenizer", type=str, default=None, help="path to non-matching tokenizer, requires --direct_load"
-    )
+    parser.add_argument("--tokenizer", type=str, default=None, help="path to non-matching tokenizer")
    parser.add_argument("--chat_template", type=str, default="tulu", help="path to chat template")
-    parser.add_argument("--direct_load", action="store_true", help="directly load model instead of pipeline")
    parser.add_argument("--do_not_save", action="store_true", help="do not save results to hub (for debugging)")
    parser.add_argument("--batch_size", type=int, default=64, help="batch size for inference")
    parser.add_argument(
@@ -98,9 +94,19 @@ def main():
        conv=conv,
        tokenizer=tokenizer,
        logger=logger,
-        keep_columns=["text_chosen", "text_rejected"],
+        keep_columns=["text_chosen", "text_rejected", "id"],
    )

+    # copy id for saving, then remove
+    ids = dataset["id"]
+    dataset = dataset.remove_columns("id")
+
+    # debug: use only 10 examples
+    if args.debug:
+        dataset = dataset.select(range(10))
+        subsets = subsets[:10]
+        ids = ids[:10]

    ############################
    # Load reward model pipeline
    ############################
@@ -146,6 +152,8 @@
    )

    results = []
+    scores_chosen = []
+    scores_rejected = []
    for step, batch in enumerate(tqdm(dataloader, desc="RM batch steps")):
        logger.info(f"RM inference step {step}/{len(dataloader)}")

@@ -154,16 +162,16 @@
        # extra score from dict within batched results (e.g. logits)
        # [{'label': 'LABEL_1', 'score': 0.6826171875},... ]
        if isinstance(rewards_chosen[0], dict):
-            score_chosen = [result["score"] for result in rewards_chosen]
-            score_rejected = [result["score"] for result in rewards_rejected]
+            scores_chosen_batch = [result["score"] for result in rewards_chosen]
+            scores_rejected_batch = [result["score"] for result in rewards_rejected]
        # for classes that directly output scores (custom code)
        else:
-            score_chosen = rewards_chosen.cpu().numpy().tolist()
-            score_rejected = rewards_rejected.cpu().numpy().tolist()
+            scores_chosen_batch = rewards_chosen.cpu().numpy().tolist()
+            scores_rejected_batch = rewards_rejected.cpu().numpy().tolist()

        [
            results.append(1) if chosen > rejected else results.append(0)
-            for chosen, rejected in zip(score_chosen, score_rejected)
+            for chosen, rejected in zip(scores_chosen_batch, scores_rejected_batch)
        ]

    ############################
@@ -172,48 +180,40 @@
    # add column for results for easy printing
    out_dataset = dataset.add_column("results", results)

-    results = {}
-    results["model"] = args.model
-    results["chat_template"] = args.chat_template
-    # print per subset and log into results file
    # add subsets back (removed so it's not handled by cuda)
    out_dataset = out_dataset.add_column("subset", subsets)

+    # add scores_chosen and scores_rejected to the dataset
+    out_dataset = out_dataset.add_column("scores_chosen", scores_chosen)
+    out_dataset = out_dataset.add_column("scores_rejected", scores_rejected)
+
+    results_grouped = {}
+    results_grouped["model"] = args.model
+    results_grouped["chat_template"] = args.chat_template
+    # print per subset and log into results_grouped file
    present_subsets = np.unique(subsets)
    for subset in present_subsets:
        subset_dataset = out_dataset.filter(lambda example: example["subset"] == subset)
        num_correct = sum(subset_dataset["results"])
        num_total = len(subset_dataset["results"])
        print(f"{subset}: {num_correct}/{num_total} ({num_correct/num_total})")
-        results[subset] = num_correct / num_total
+        results_grouped[subset] = num_correct / num_total

    ############################
    # Upload results to hub
    ############################
-    # Save results locally (results/results.json)\
-    dumped = json.dumps(results, indent=4, sort_keys=True, default=str)
-    logger.info(f"Stored local JSON data {dumped}.")
-    path = f"results/{args.model}.json"
-    dirname = os.path.dirname(path)
-
-    if dirname != "":
-        os.makedirs(dirname, exist_ok=True)
-
-    # remove old data
-    if os.path.isfile(path):
-        os.remove(path)
+    sub_path = "eval-set/" if not args.pref_sets else "pref-sets/"
+    results_url = save_to_hub(results_grouped, args.model, sub_path, args.debug, local_only=args.do_not_save)
+    if not args.do_not_save:
+        logger.info(f"Uploaded reward model results to {results_url}")

-    with open(path, "w") as f:
-        f.write(dumped)
+    # upload chosen-rejected with scores
+    # create new json with scores and upload
+    scores_dict = out_dataset.to_dict()
+    sub_path_scores = "eval-set-scores/" if not args.pref_sets else "pref-sets-scores/"

-    # Upload results as json
-    if not args.do_not_save:
-        sub_path = "eval-set/" if not args.pref_sets else "pref-sets/"
-        scores_url = api.upload_file(
-            path_or_fileobj=path,
-            path_in_repo=sub_path + f"{args.model}.json",
-            repo_id=EVAL_REPO,  # push to correct results repo
-            repo_type="dataset",
-            commit_message=f"Add reward model scores for model {args.model}",
-        )
-        logger.info(f"Uploaded reward model scores to {scores_url}")
+    scores_url = save_to_hub(scores_dict, args.model, sub_path_scores, args.debug)
+    logger.info(f"Uploading chosen-rejected text with scores to {scores_url}")


if __name__ == "__main__":
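For reference, a rough sketch of how the per-prompt scores uploaded by this change could be read back; the model name is hypothetical, and the file only exists once that model has been evaluated and pushed to the results repo:

```python
import json

from huggingface_hub import hf_hub_download

# Mirrors path_in_repo = sub_path_scores + f"{args.model}.json" from run_dpo.py above.
local_path = hf_hub_download(
    repo_id="ai2-adapt-dev/HERM-Results",
    filename="eval-set-scores/org/my-reward-model.json",  # hypothetical model name
    repo_type="dataset",
)

with open(local_path) as f:
    scores = json.load(f)

# Columns are per-prompt lists, e.g. scores_chosen, scores_rejected, results, subset.
print(len(scores["scores_chosen"]), len(scores["scores_rejected"]))
```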