Log beaker experiment URL (#274)
vwxyzjn authored Aug 19, 2024
1 parent 1cf5eba · commit 2627b69
Showing 4 changed files with 47 additions and 12 deletions.
open_instruct/dpo_tune.py (7 additions, 1 deletion)
@@ -62,6 +62,7 @@
clean_last_n_checkpoints,
get_last_checkpoint_path,
get_wandb_tags,
+maybe_get_beaker_config,
maybe_use_ai2_wandb_entity,
)

@@ -523,7 +524,7 @@ def load_model():
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

# Figure out how many steps we should save the Accelerator states
-checkpointing_steps = args.checkpointing_steps
+checkpointing_steps = str(args.checkpointing_steps)
if checkpointing_steps is not None and checkpointing_steps.lower() != "epoch":
    checkpointing_steps = int(checkpointing_steps)
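The `str()` coercion above exists because the checkpointing interval can be given either as a number of steps or as the literal `"epoch"`; normalizing to a string lets a single comparison handle both. A minimal sketch of the pattern, with an illustrative value rather than anything read from the repo:

```python
# Sketch of the normalization now used in dpo_tune.py and finetune.py.
checkpointing_steps = str(500)  # stand-in for args.checkpointing_steps, which may be an int or "epoch"
if checkpointing_steps is not None and checkpointing_steps.lower() != "epoch":
    checkpointing_steps = int(checkpointing_steps)  # "500" -> 500 for step-based saving
# A value of "epoch" skips the int() conversion and is handled by the
# per-epoch checkpoint branch later in each script.
```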

@@ -533,8 +534,13 @@ def load_model():
experiment_config = vars(args)
# TensorBoard cannot log Enums, need the raw value
experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"]

+# (Optional) Ai2 internal tracking
if args.wandb_entity is None:
    args.wandb_entity = maybe_use_ai2_wandb_entity()
+beaker_config = maybe_get_beaker_config()
+if beaker_config is not None:
+    experiment_config.update(vars(beaker_config))
exp_name = os.path.basename(__file__)[: -len(".py")]
accelerator.init_trackers(
"open_instruct_internal",
open_instruct/finetune.py (8 additions, 2 deletions)
@@ -53,6 +53,7 @@
get_datasets,
get_last_checkpoint_path,
get_wandb_tags,
+maybe_get_beaker_config,
maybe_use_ai2_wandb_entity,
)

@@ -551,7 +552,7 @@ def main(args: FlatArguments):
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

# Figure out how many steps we should save the Accelerator states
-checkpointing_steps = args.checkpointing_steps
+checkpointing_steps = str(args.checkpointing_steps)
if checkpointing_steps is not None and checkpointing_steps.lower() != "epoch":
    checkpointing_steps = int(checkpointing_steps)

@@ -561,8 +562,13 @@ def main(args: FlatArguments):
experiment_config = vars(args)
# TensorBoard cannot log Enums, need the raw value
experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"]

+# (Optional) Ai2 internal tracking
if args.wandb_entity is None:
    args.wandb_entity = maybe_use_ai2_wandb_entity()
+beaker_config = maybe_get_beaker_config()
+if beaker_config is not None:
+    experiment_config.update(vars(beaker_config))
exp_name = os.path.basename(__file__)[: -len(".py")]
accelerator.init_trackers(
"open_instruct_internal",
@@ -692,7 +698,7 @@ def main(args: FlatArguments):
if completed_steps >= args.max_train_steps:
    break

-if args.checkpointing_steps == "epoch":
+if checkpointing_steps == "epoch":
    output_dir = f"epoch_{epoch}"
    if args.output_dir is not None:
        output_dir = os.path.join(args.output_dir, output_dir)
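Net effect of the tracking change in both training scripts: when a Beaker job is detected, the dict passed to the experiment trackers gains three extra fields alongside the usual training arguments. A small sketch of that merge, using the field names from `BeakerRuntimeConfig` in utils.py below and made-up values:

```python
# Illustrative stand-ins; real runs use vars(args) and vars(beaker_config).
experiment_config = {"learning_rate": 2e-5, "num_train_epochs": 2}
beaker_fields = {
    "beaker_workload_id": "01ABCDEFGHJKMNPQRSTVWXYZ",
    "beaker_node_hostname": "node-01.example",
    "beaker_experiment_url": "https://beaker.org/ex/01ABCDEFGHJKMNPQRSTVWXYZ/",
}
experiment_config.update(beaker_fields)  # mirrors experiment_config.update(vars(beaker_config))
# The merged dict is then handed to accelerator.init_trackers(...), so the
# Beaker experiment URL is logged next to the other hyperparameters.
```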
open_instruct/utils.py (29 additions, 7 deletions)
@@ -15,9 +15,9 @@
import dataclasses
import logging
import os
+import shutil
import subprocess
import sys
-import shutil
from dataclasses import dataclass, field
from typing import Any, List, NewType, Optional, Tuple, Union

@@ -630,10 +630,14 @@ class FlatArguments:
},
)
overwrite_output_dir: bool = field(
-    default=False, metadata={"help": "Overwrite the content of the output directory. Means that resumption will always start from scratch."},
+    default=False,
+    metadata={
+        "help": "Overwrite the content of the output directory. Means that resumption will always start from scratch."
+    },
)
keep_last_n_checkpoints: int = field(
-    default=3, metadata={"help": "How many checkpoints to keep in the output directory. -1 for all."},
+    default=3,
+    metadata={"help": "How many checkpoints to keep in the output directory. -1 for all."},
)

def __post_init__(self):
@@ -661,6 +665,24 @@ def __post_init__(self):
raise ValueError("Cannot provide two dataset selection mechanisms.")


+@dataclass
+class BeakerRuntimeConfig:
+    beaker_workload_id: str
+    beaker_node_hostname: str
+    beaker_experiment_url: str
+
+
+def maybe_get_beaker_config():
+    beaker_runtime_config = None
+    if "BEAKER_JOB_ID" in os.environ:
+        beaker_runtime_config = BeakerRuntimeConfig(
+            beaker_workload_id=os.environ["BEAKER_WORKLOAD_ID"],
+            beaker_node_hostname=os.environ["BEAKER_NODE_HOSTNAME"],
+            beaker_experiment_url=f"https://beaker.org/ex/{os.environ['BEAKER_WORKLOAD_ID']}/",
+        )
+    return beaker_runtime_config


def maybe_use_ai2_wandb_entity() -> Optional[str]:
"""Ai2 internal logic: try use the ai2-llm team if possible. Should not affect external users."""
import wandb
@@ -814,8 +836,8 @@ def parse(self) -> Union[DataClassType, Tuple[DataClassType]]:

def get_last_checkpoint(folder: str, incomplete: bool = False) -> Optional[str]:
content = os.listdir(folder)
-checkpoint_steps = [path for path in content if path.startswith('step_')]
-checkpoint_epochs = [path for path in content if path.startswith('epoch_')]
+checkpoint_steps = [path for path in content if path.startswith("step_")]
+checkpoint_epochs = [path for path in content if path.startswith("epoch_")]
if len(checkpoint_steps) > 0 and len(checkpoint_epochs) > 0:
logger.info("Mixed step and epoch checkpoints found. Using step checkpoints.")
checkpoints = checkpoint_steps
@@ -824,10 +846,10 @@ def get_last_checkpoint(folder: str, incomplete: bool = False) -> Optional[str]:
else:
checkpoints = checkpoint_steps
if not incomplete:
-    checkpoints = [path for path in checkpoints if os.path.exists(os.path.join(folder, path, 'COMPLETED'))]
+    checkpoints = [path for path in checkpoints if os.path.exists(os.path.join(folder, path, "COMPLETED"))]
if len(checkpoints) == 0:
    return
-return os.path.join(folder, max(checkpoints, key=lambda x: x.split('_')[-1]))
+return os.path.join(folder, max(checkpoints, key=lambda x: x.split("_")[-1]))


def get_last_checkpoint_path(args: FlatArguments, incomplete: bool = False) -> str:
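A short usage sketch for the new helper, outside any training script. It assumes `open_instruct` is importable; the environment variable values are made up to simulate a Beaker job, since off-Beaker (no `BEAKER_JOB_ID`) the function simply returns `None`:

```python
import os

from open_instruct.utils import maybe_get_beaker_config

# Simulate the Beaker runtime; on a real Beaker job these variables are set for you.
os.environ.setdefault("BEAKER_JOB_ID", "job-0001")
os.environ.setdefault("BEAKER_WORKLOAD_ID", "01ABCDEFGHJKMNPQRSTVWXYZ")
os.environ.setdefault("BEAKER_NODE_HOSTNAME", "node-01.example")

beaker_config = maybe_get_beaker_config()
if beaker_config is not None:
    print(vars(beaker_config)["beaker_experiment_url"])
    # -> https://beaker.org/ex/01ABCDEFGHJKMNPQRSTVWXYZ/
```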
scripts/README.md (3 additions, 2 deletions)
@@ -48,6 +48,7 @@ python scripts/submit_eval_jobs.py --model_name llama_31_tulu_2_8b --location 01

# submit evals on a model in huggingface; note you need to 1) prepend the model name with `hf-` and 2) replace `--location` with the hf repo id
python scripts/submit_eval_jobs.py --model_name hf-llama_31_tulu_2_8b --location allenai/llama-3-tulu-2-8b --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_olmo_auto --upload_to_hf allenai/tulu-3-evals
+python scripts/submit_eval_jobs.py --model_name hf-llama_31_tulu_2_8b --location vwxyzjn/online_dpo_tulu_2 --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_olmo_auto --upload_to_hf allenai/tulu-3-evals
```
2. `submit_finetune_jobs.py`: **Core script** for submitting multiple and configurable instruction tuning jobs. This script works for both single- and multi-node configurations. It by default reads configs in `configs/train_configs`, but also can take in CLI arguments matching those in `open_instruct/utils.py` `FlatArguments` class.
Example of running this is in `scripts/submit_finetune_jobs.sh`.
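For orientation, a hypothetical invocation is sketched below; the `--config_file` flag and the config path are illustrative guesses, not verified against the script, so treat `scripts/submit_finetune_jobs.sh` as the authoritative example:

```bash
# Hypothetical sketch only: the flag name and config path are assumptions.
python scripts/submit_finetune_jobs.py \
    --config_file configs/train_configs/sft/default.yaml \
    --keep_last_n_checkpoints 3  # a FlatArguments field (see utils.py above)
```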
@@ -93,10 +94,10 @@ After setting it up successfully, say you are running `sh scripts/dpo_train_with

```bash
python mason.py \
-    --cluster ai2/allennlp-cirrascale ai2/general-cirrascale-a5000 ai2/general-cirrascale-a5000 ai2/general-cirrascale-a100-80g-ib \
+    --cluster ai2/allennlp-cirrascale ai2/general-cirrascale-a5000 ai2/general-cirrascale-a5000 \
    --priority low \
    --budget ai2/allennlp \
-    --gpus 1 -- sh scripts/dpo_train_with_accelerate_config.sh 8 configs/train_configs/dpo/default.yaml
+    --gpus 8 -- sh scripts/dpo_train_with_accelerate_config.sh 8 configs/train_configs/dpo/default.yaml
```

