Log beaker experiment URL (#274)
vwxyzjn authored Aug 19, 2024
1 parent 1cf5eba · commit 2627b69
Showing 4 changed files with 47 additions and 12 deletions.
open_instruct/dpo_tune.py (7 additions, 1 deletion)
@@ -62,6 +62,7 @@
clean_last_n_checkpoints,
get_last_checkpoint_path,
get_wandb_tags,
+maybe_get_beaker_config,
maybe_use_ai2_wandb_entity,
)

@@ -523,7 +524,7 @@ def load_model():
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

# Figure out how many steps we should save the Accelerator states
-checkpointing_steps = args.checkpointing_steps
+checkpointing_steps = str(args.checkpointing_steps)
if checkpointing_steps is not None and checkpointing_steps.lower() != "epoch":
    checkpointing_steps = int(checkpointing_steps)
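The `str()` coercion above exists because the checkpointing interval can be given either as a number of steps or as the literal `"epoch"`; normalizing to a string lets a single comparison handle both. A minimal sketch of the pattern, with an illustrative value rather than anything read from the repo:

```python
# Sketch of the normalization now used in dpo_tune.py and finetune.py.
checkpointing_steps = str(500)  # stand-in for args.checkpointing_steps, which may be an int or "epoch"
if checkpointing_steps is not None and checkpointing_steps.lower() != "epoch":
    checkpointing_steps = int(checkpointing_steps)  # "500" -> 500 for step-based saving
# A value of "epoch" skips the int() conversion and is handled by the
# per-epoch checkpoint branch later in each script.
```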

@@ -533,8 +534,13 @@ def load_model():
experiment_config = vars(args)
# TensorBoard cannot log Enums, need the raw value
experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"]

+# (Optional) Ai2 internal tracking
if args.wandb_entity is None:
    args.wandb_entity = maybe_use_ai2_wandb_entity()
+beaker_config = maybe_get_beaker_config()
+if beaker_config is not None:
+    experiment_config.update(vars(beaker_config))
exp_name = os.path.basename(__file__)[: -len(".py")]
accelerator.init_trackers(
"open_instruct_internal",
open_instruct/finetune.py (8 additions, 2 deletions)
@@ -53,6 +53,7 @@
get_datasets,
get_last_checkpoint_path,
get_wandb_tags,
+maybe_get_beaker_config,
maybe_use_ai2_wandb_entity,
)

@@ -551,7 +552,7 @@ def main(args: FlatArguments):
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

# Figure out how many steps we should save the Accelerator states
-checkpointing_steps = args.checkpointing_steps
+checkpointing_steps = str(args.checkpointing_steps)
if checkpointing_steps is not None and checkpointing_steps.lower() != "epoch":
    checkpointing_steps = int(checkpointing_steps)

@@ -561,8 +562,13 @@ def main(args: FlatArguments):
experiment_config = vars(args)
# TensorBoard cannot log Enums, need the raw value
experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"]

+# (Optional) Ai2 internal tracking
if args.wandb_entity is None:
    args.wandb_entity = maybe_use_ai2_wandb_entity()
+beaker_config = maybe_get_beaker_config()
+if beaker_config is not None:
+    experiment_config.update(vars(beaker_config))
exp_name = os.path.basename(__file__)[: -len(".py")]
accelerator.init_trackers(
"open_instruct_internal",
@@ -692,7 +698,7 @@ def main(args: FlatArguments):
if completed_steps >= args.max_train_steps:
    break

-if args.checkpointing_steps == "epoch":
+if checkpointing_steps == "epoch":
    output_dir = f"epoch_{epoch}"
    if args.output_dir is not None:
        output_dir = os.path.join(args.output_dir, output_dir)
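Net effect of the tracking change in both training scripts: when a Beaker job is detected, the dict passed to the experiment trackers gains three extra fields alongside the usual training arguments. A small sketch of that merge, using the field names from `BeakerRuntimeConfig` in utils.py below and made-up values:

```python
# Illustrative stand-ins; real runs use vars(args) and vars(beaker_config).
experiment_config = {"learning_rate": 2e-5, "num_train_epochs": 2}
beaker_fields = {
    "beaker_workload_id": "01ABCDEFGHJKMNPQRSTVWXYZ",
    "beaker_node_hostname": "node-01.example",
    "beaker_experiment_url": "https://beaker.org/ex/01ABCDEFGHJKMNPQRSTVWXYZ/",
}
experiment_config.update(beaker_fields)  # mirrors experiment_config.update(vars(beaker_config))
# The merged dict is then handed to accelerator.init_trackers(...), so the
# Beaker experiment URL is logged next to the other hyperparameters.
```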
open_instruct/utils.py (29 additions, 7 deletions)
@@ -15,9 +15,9 @@
import dataclasses
import logging
import os
+import shutil
import subprocess
import sys
-import shutil
from dataclasses import dataclass, field
from typing import Any, List, NewType, Optional, Tuple, Union

@@ -630,10 +630,14 @@ class FlatArguments:
},
)
overwrite_output_dir: bool = field(
-    default=False, metadata={"help": "Overwrite the content of the output directory. Means that resumption will always start from scratch."},
+    default=False,
+    metadata={
+        "help": "Overwrite the content of the output directory. Means that resumption will always start from scratch."
+    },
)
keep_last_n_checkpoints: int = field(
-    default=3, metadata={"help": "How many checkpoints to keep in the output directory. -1 for all."},
+    default=3,
+    metadata={"help": "How many checkpoints to keep in the output directory. -1 for all."},
)

def __post_init__(self):
@@ -661,6 +665,24 @@ def __post_init__(self):
raise ValueError("Cannot provide two dataset selection mechanisms.")


+@dataclass
+class BeakerRuntimeConfig:
+    beaker_workload_id: str
+    beaker_node_hostname: str
+    beaker_experiment_url: str
+
+
+def maybe_get_beaker_config():
+    beaker_runtime_config = None
+    if "BEAKER_JOB_ID" in os.environ:
+        beaker_runtime_config = BeakerRuntimeConfig(
+            beaker_workload_id=os.environ["BEAKER_WORKLOAD_ID"],
+            beaker_node_hostname=os.environ["BEAKER_NODE_HOSTNAME"],
+            beaker_experiment_url=f"https://beaker.org/ex/{os.environ['BEAKER_WORKLOAD_ID']}/",
+        )
+    return beaker_runtime_config


def maybe_use_ai2_wandb_entity() -> Optional[str]:
"""Ai2 internal logic: try use the ai2-llm team if possible. Should not affect external users."""
import wandb
@@ -814,8 +836,8 @@ def parse(self) -> Union[DataClassType, Tuple[DataClassType]]:

def get_last_checkpoint(folder: str, incomplete: bool = False) -> Optional[str]:
content = os.listdir(folder)
-checkpoint_steps = [path for path in content if path.startswith('step_')]
-checkpoint_epochs = [path for path in content if path.startswith('epoch_')]
+checkpoint_steps = [path for path in content if path.startswith("step_")]
+checkpoint_epochs = [path for path in content if path.startswith("epoch_")]
if len(checkpoint_steps) > 0 and len(checkpoint_epochs) > 0:
logger.info("Mixed step and epoch checkpoints found. Using step checkpoints.")
checkpoints = checkpoint_steps
@@ -824,10 +846,10 @@ def get_last_checkpoint(folder: str, incomplete: bool = False) -> Optional[str]:
else:
checkpoints = checkpoint_steps
if not incomplete:
-    checkpoints = [path for path in checkpoints if os.path.exists(os.path.join(folder, path, 'COMPLETED'))]
+    checkpoints = [path for path in checkpoints if os.path.exists(os.path.join(folder, path, "COMPLETED"))]
if len(checkpoints) == 0:
    return
-return os.path.join(folder, max(checkpoints, key=lambda x: x.split('_')[-1]))
+return os.path.join(folder, max(checkpoints, key=lambda x: x.split("_")[-1]))


def get_last_checkpoint_path(args: FlatArguments, incomplete: bool = False) -> str:
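A short usage sketch for the new helper, outside any training script. It assumes `open_instruct` is importable; the environment variable values are made up to simulate a Beaker job, since off-Beaker (no `BEAKER_JOB_ID`) the function simply returns `None`:

```python
import os

from open_instruct.utils import maybe_get_beaker_config

# Simulate the Beaker runtime; on a real Beaker job these variables are set for you.
os.environ.setdefault("BEAKER_JOB_ID", "job-0001")
os.environ.setdefault("BEAKER_WORKLOAD_ID", "01ABCDEFGHJKMNPQRSTVWXYZ")
os.environ.setdefault("BEAKER_NODE_HOSTNAME", "node-01.example")

beaker_config = maybe_get_beaker_config()
if beaker_config is not None:
    print(vars(beaker_config)["beaker_experiment_url"])
    # -> https://beaker.org/ex/01ABCDEFGHJKMNPQRSTVWXYZ/
```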
scripts/README.md (3 additions, 2 deletions)
@@ -48,6 +48,7 @@ python scripts/submit_eval_jobs.py --model_name llama_31_tulu_2_8b --location 01

# submit evals on a model in huggingface; note you need to 1) prepend the model name with `hf-` and 2) replace `--location` with the hf repo id
python scripts/submit_eval_jobs.py --model_name hf-llama_31_tulu_2_8b --location allenai/llama-3-tulu-2-8b --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_olmo_auto --upload_to_hf allenai/tulu-3-evals
+python scripts/submit_eval_jobs.py --model_name hf-llama_31_tulu_2_8b --location vwxyzjn/online_dpo_tulu_2 --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_olmo_auto --upload_to_hf allenai/tulu-3-evals
```
2. `submit_finetune_jobs.py`: **Core script** for submitting multiple and configurable instruction tuning jobs. This script works for both single- and multi-node configurations. It by default reads configs in `configs/train_configs`, but also can take in CLI arguments matching those in `open_instruct/utils.py` `FlatArguments` class.
Example of running this is in `scripts/submit_finetune_jobs.sh`.
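For orientation, a hypothetical invocation is sketched below; the `--config_file` flag and the config path are illustrative guesses, not verified against the script, so treat `scripts/submit_finetune_jobs.sh` as the authoritative example:

```bash
# Hypothetical sketch only: the flag name and config path are assumptions.
python scripts/submit_finetune_jobs.py \
    --config_file configs/train_configs/sft/default.yaml \
    --keep_last_n_checkpoints 3  # a FlatArguments field (see utils.py above)
```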
@@ -93,10 +94,10 @@ After setting it up successfully, say you are running `sh scripts/dpo_train_with

```bash
python mason.py \
-    --cluster ai2/allennlp-cirrascale ai2/general-cirrascale-a5000 ai2/general-cirrascale-a5000 ai2/general-cirrascale-a100-80g-ib \
+    --cluster ai2/allennlp-cirrascale ai2/general-cirrascale-a5000 ai2/general-cirrascale-a5000 \
    --priority low \
    --budget ai2/allennlp \
-    --gpus 1 -- sh scripts/dpo_train_with_accelerate_config.sh 8 configs/train_configs/dpo/default.yaml
+    --gpus 8 -- sh scripts/dpo_train_with_accelerate_config.sh 8 configs/train_configs/dpo/default.yaml
```

