Commit
Merge branch 'main' into bump_cuda
natolambert committed Sep 26, 2024
2 parents 11b0af8 + 011dfbd commit b96085a
Showing 4 changed files with 79 additions and 2 deletions.
Dockerfile (2 additions, 1 deletion)
@@ -48,12 +48,13 @@ RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.d
 RUN apt-get -y install git-lfs

 WORKDIR /stage/
+ENV HF_HUB_ENABLE_HF_TRANSFER=1

 RUN pip install --upgrade pip setuptools wheel
 # designed for cuda 12.1
 RUN pip3 install torch torchvision torchaudio
 # If you need to use cuda 11.8, use this and the below vllm code for installing with cuda 11.8
-# RUN pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu118
+# RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu118
 # Install vLLM with CUDA 11.8.
 # RUN export VLLM_VERSION=0.6.1.post1
 # RUN export PYTHON_VERSION=310
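The new ENV line pairs with the hf_transfer dependency added to setup.py at the bottom of this diff: HF_HUB_ENABLE_HF_TRANSFER=1 only takes effect when the hf_transfer package is installed, in which case huggingface_hub downloads through its Rust-based accelerated backend. A minimal sketch of the same setup outside Docker (the target repo is illustrative):

import os

# Must be set before huggingface_hub is imported; the Dockerfile's ENV line
# does this for every process in the container.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import snapshot_download

# With hf_transfer installed, large files download via the accelerated backend.
snapshot_download(repo_id="allenai/reward-bench", repo_type="dataset")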
rewardbench/rewardbench.py (62 additions, 0 deletions)
@@ -27,6 +27,7 @@
 import wandb
 from accelerate import Accelerator
 from accelerate.logging import get_logger
+from huggingface_hub import EvalResult, ModelCard, ModelCardData
 from tqdm import tqdm
 from transformers import AutoTokenizer, HfArgumentParser

@@ -61,6 +62,8 @@ class Args:
     # wandb args
     wandb_run: Optional[str] = None
     """The wandb run to extract model and revision from."""
+    upload_metadata_to_hf: bool = False
+    """Upload metadata to Hugging Face Hub."""

     # inference args
     batch_size: int = 8
@@ -424,6 +427,65 @@ def actual_main(args: Args):
         for chosen, rejected in zip(scores_chosen, scores_rejected):
             f.write(json.dumps({"chosen": chosen, "rejected": rejected}) + "\n")

+    ############################
+    # Upload metadata to Hugging Face Hub
+    ############################
+    if args.upload_metadata_to_hf:
+        logger.info("*** Uploading metadata to Hugging Face Hub ***")
+        try:
+            # Initialize ModelCardData with basic metadata
+            card_data = ModelCardData(
+                language="en",
+                model_name=args.model,
+                eval_results=[
+                    EvalResult(
+                        task_type="preference_evaluation",
+                        dataset_type=args.dataset,
+                        dataset_name=args.dataset.split("/")[-1],  # Assuming dataset ID is like 'owner/dataset'
+                        metric_type="accuracy",
+                        metric_value=accuracy,
+                    )
+                ],
+            )
+
+            # If there are extra results (per subset), add them as separate EvalResults
+            if args.dataset == "allenai/reward-bench" and results_grouped:
+                for section, section_accuracy in results_section.items():
+                    print(f"Adding section {section} with accuracy {section_accuracy}")
+                    section_eval = EvalResult(
+                        task_type="preference_evaluation",
+                        dataset_type=section.replace(" ", "_"),
+                        dataset_name=section,
+                        metric_type="accuracy",
+                        metric_value=section_accuracy,
+                    )
+                    card_data.eval_results.append(section_eval)
+
+                for subset, subset_accuracy in results_grouped.items():
+                    print(f"Adding subset {subset} with accuracy {subset_accuracy}")
+                    subset_eval = EvalResult(
+                        task_type="preference_evaluation",
+                        dataset_type=subset,
+                        dataset_name=subset,
+                        metric_type="accuracy",
+                        metric_value=subset_accuracy,
+                    )
+                    card_data.eval_results.append(subset_eval)
+
+            # Create a ModelCard
+            card = ModelCard.from_template(
+                card_data,
+                model_id=args.model,
+            )
+
+            # Push the updated ModelCard to the Hugging Face Hub
+            card.push_to_hub(
+                args.model, revision=args.revision, commit_message="Update evaluation results via RewardBench"
+            )
+            logger.info(f"Successfully pushed updated ModelCard to Hugging Face Hub for {args.model}")
+        except Exception as e:
+            logger.error(f"Failed to upload metadata to Hugging Face Hub: {e}")
+

 if __name__ == "__main__":
     main()
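For context on the block above: huggingface_hub serializes each EvalResult into the model-index section of the model card's YAML front matter, which drives the evaluation-results widget on the Hub. A minimal standalone sketch of that flow (model id and accuracy are illustrative, not results from this commit):

from huggingface_hub import EvalResult, ModelCard, ModelCardData

card_data = ModelCardData(
    model_name="my-org/my-reward-model",  # hypothetical model id
    eval_results=[
        EvalResult(
            task_type="preference_evaluation",
            dataset_type="allenai/reward-bench",
            dataset_name="reward-bench",
            metric_type="accuracy",
            metric_value=0.87,  # illustrative value
        )
    ],
)
card = ModelCard.from_template(card_data, model_id="my-org/my-reward-model")
print(card.data.to_dict())  # the model-index structure written into the card

Note the try/except in the committed code makes the upload best-effort: a failed push is logged rather than aborting an otherwise successful evaluation run.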
scripts/run_generative.py (14 additions, 1 deletion)
@@ -70,6 +70,8 @@ def get_args():
         "--trust_remote_code", action="store_true", default=False, help="directly load model instead of pipeline"
     )
     parser.add_argument("--num_gpus", type=int, default=1, help="number of gpus to use, for multi-node vllm")
+    parser.add_argument("--vllm_gpu_util", type=float, default=0.9, help="gpu utilization for vllm")
+    parser.add_argument("--vllm_max_seq_length", type=int, default=None, help="max sequence length for vllm")
     parser.add_argument("--do_not_save", action="store_true", help="do not save results to hub (for debugging)")
     parser.add_argument(
         "--pref_sets", action="store_true", help="run on common preference sets instead of our custom eval set"
@@ -124,8 +126,19 @@ def main():

     # if model isn't API, load via vllm
     if not is_api_models:
+        # if multi gpu, set multiproc method to spawn
+        if args.num_gpus > 1:
+            # Set the environment variable
+            os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
         # load model
-        model = LLM(args.model, trust_remote_code=args.trust_remote_code, tensor_parallel_size=args.num_gpus)
+        model = LLM(
+            args.model,
+            trust_remote_code=args.trust_remote_code,
+            tensor_parallel_size=args.num_gpus,
+            gpu_memory_utilization=args.vllm_gpu_util,
+            max_model_len=args.vllm_max_seq_length,
+        )
         tokenizer = AutoTokenizer.from_pretrained(args.model)
         if ("Llama-3" in args.model or "llama3-8b" in args.model) and "3.1" not in args.model:
             stop_token_ids = [128009]
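Taken together, these changes let multi-GPU vLLM runs tune memory headroom and context length from the CLI. A hypothetical invocation (model id and values are illustrative):

python scripts/run_generative.py \
    --model my-org/my-generative-judge \
    --num_gpus 2 \
    --vllm_gpu_util 0.85 \
    --vllm_max_seq_length 4096

With --num_gpus 2, the script now exports VLLM_WORKER_MULTIPROC_METHOD=spawn before constructing the LLM, sidestepping CUDA initialization problems in forked vLLM workers.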
setup.py (1 addition, 0 deletions)
@@ -49,6 +49,7 @@
     "flake8>=6.0",
     "fschat",
     "huggingface_hub",
+    "hf_transfer",
     "isort>=5.12.0",
     "pandas",
     "peft",
