diff --git a/rewardbench/rewardbench.py b/rewardbench/rewardbench.py
index 0599dff..130baf5 100644
--- a/rewardbench/rewardbench.py
+++ b/rewardbench/rewardbench.py
@@ -27,6 +27,7 @@ import wandb
 from accelerate import Accelerator
 from accelerate.logging import get_logger
+from huggingface_hub import EvalResult, ModelCard, ModelCardData
 from tqdm import tqdm
 from transformers import AutoTokenizer, HfArgumentParser
@@ -424,6 +425,54 @@ def actual_main(args: Args):
             for chosen, rejected in zip(scores_chosen, scores_rejected):
                 f.write(json.dumps({"chosen": chosen, "rejected": rejected}) + "\n")
 
+    ############################
+    # Upload metadata to Hugging Face Hub
+    ############################
+    if args.upload_metadata_to_hf:
+        logger.info("*** Uploading metadata to Hugging Face Hub ***")
+        try:
+            # Initialize ModelCardData with basic metadata
+            card_data = ModelCardData(
+                language="en",
+                model_name=args.model,
+                eval_results=[
+                    EvalResult(
+                        task_type="preference_evaluation",
+                        dataset_type=args.dataset,
+                        dataset_name=args.dataset.split("/")[-1],  # Assuming dataset ID is like 'owner/dataset'
+                        metric_type="accuracy",
+                        metric_value=accuracy,
+                    )
+                ],
+            )
+
+            # If there are extra results (per subset), add them as separate EvalResults
+            if args.dataset == "allenai/reward-bench" and results_grouped:
+                for subset, subset_accuracy in results_grouped.items():
+                    print(f"Adding subset {subset} with accuracy {subset_accuracy}")
+                    subset_eval = EvalResult(
+                        task_type="preference_evaluation",
+                        dataset_type=subset,
+                        dataset_name=subset,
+                        metric_type="accuracy",
+                        metric_value=subset_accuracy,
+                    )
+                    card_data.eval_results.append(subset_eval)
+
+            # Create a ModelCard
+            card = ModelCard.from_template(
+                card_data,
+                model_id=args.model,
+            )
+
+            # Push the updated ModelCard to the Hugging Face Hub
+            card.push_to_hub(
+                args.model, revision=args.revision, commit_message="Update evaluation results via RewardBench"
+            )
+            logger.info(f"Successfully pushed updated ModelCard to Hugging Face Hub for {args.model}")
+        except Exception as e:
+            logger.error(f"Failed to upload metadata to Hugging Face Hub: {e}")
+
 
 if __name__ == "__main__":
     main()
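
For anyone reviewing this patch, a quick way to sanity-check what the new `upload_metadata_to_hf` path writes is to re-load the pushed model card and inspect its `eval_results`. The sketch below is not part of the diff; it uses `huggingface_hub.ModelCard.load`, and the repo id shown is a placeholder.

```python
# Sketch only (not part of the patch): verify the metadata pushed by the
# upload_metadata_to_hf branch by re-downloading the model card.
from huggingface_hub import ModelCard

card = ModelCard.load("my-org/my-reward-model")  # placeholder repo id
for result in card.data.eval_results or []:  # eval_results is None if nothing was pushed
    print(result.dataset_name, result.metric_type, result.metric_value)
```

Each printed row should correspond to one EvalResult added above: the overall accuracy entry, plus one entry per subset when evaluating on allenai/reward-bench.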