Skip to content

Commit

Permalink
added script to run on gantry
Browse files Browse the repository at this point in the history
new scripts and config

flags torchrun

trying delayed expansion

wandb key

reducing microbatch

trying micro batch size 8

new image, trying 4/16

back to 8 microbatch

debugging

removed id

run with blocking

added req nodes

removed blocking flag
  • Loading branch information
soldni committed Jul 13, 2023
1 parent 952819b commit baee669
Showing 1 changed file with 59 additions and 0 deletions.
59 changes: 59 additions & 0 deletions scripts/olmo-small-ablation-on-gantry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/bin/bash

set -ex

# If the caller exported LOAD_PATH, forward it to the training script
# as a --load_path flag; otherwise pass nothing. ${LOAD_PATH+x} expands
# to "x" only when the variable is set (even if empty), so this is a
# pure set/unset test.
if [ -n "${LOAD_PATH+x}" ]; then
  LOAD_PATH_ARG="--load_path=${LOAD_PATH}"
else
  LOAD_PATH_ARG=""
fi


# Use CONFIG_PATH from the environment when provided; otherwise fall back
# to the small-ablation default. ${VAR-word} (no colon) substitutes only
# when the variable is UNSET, matching the original ${CONFIG_PATH+x}
# check, and the single export replaces the old else-branch, which was a
# redundant self-assignment (export CONFIG_PATH="${CONFIG_PATH}").
export CONFIG_PATH="${CONFIG_PATH-configs/olmo-small-ablation.yaml}"

# Derive the run name from the config file: the value of the top-level
# `run_name:` key. Reads the file directly (no useless `cat`), quotes
# the path so it survives spaces, uses \K to emit only the value (so no
# sed post-processing is needed), and takes the first match so a
# repeated key cannot produce a multi-line RUN_NAME.
RUN_NAME=$(grep -ohP '^run_name:\s*\K.+' -- "$CONFIG_PATH" | head -n 1)

# Hash the (load-path arg, config path) pair so relaunches with
# different inputs get distinct task names; keep the first 8 hex chars.
# `echo` (with its trailing newline) is kept deliberately so hashes
# match those produced by earlier runs of this script.
RUN_HASH=$(echo "${LOAD_PATH_ARG}-${CONFIG_PATH}" | md5sum | cut -c 1-8)

# Task name shown in gantry/Beaker: human-readable name + stable hash.
FULL_RUN_NAME="${RUN_NAME}-${RUN_HASH}"

# Decide how gantry should receive the Weights & Biases credential:
# when the caller already has WANDB_API_KEY in its environment, forward
# the value directly with --env; otherwise tell gantry to pull it from
# the Beaker secret of the same name with --env-secret.
if [ -n "${WANDB_API_KEY+x}" ]; then
  WANDB_API_KEY_ARG="--env WANDB_API_KEY=${WANDB_API_KEY}"
else
  WANDB_API_KEY_ARG="--env-secret WANDB_API_KEY=WANDB_API_KEY"
fi

# Number of Beaker replicas (nodes) to request; each replica gets
# --gpus 8 below, and the same value sizes torchrun's --nnodes.
NUM_NODES=4

# Launch the distributed training job on Beaker via gantry:
# - --leader-selection + --host-networking let the replicas rendezvous;
#   \$BEAKER_LEADER_REPLICA_HOSTNAME is escaped so it expands on the
#   remote node, not here.
# - ${WANDB_API_KEY_ARG} is deliberately unquoted so it word-splits
#   into the flag ("--env" or "--env-secret") and its value.
# - NOTE(review): ${@} is expanded inside a double-quoted -c string;
#   with more than one extra script argument only the first is joined
#   into the command string (the rest become $0, $1, ... of the inner
#   bash) — confirm callers pass at most one trailing argument.
gantry run \
--workspace ai2/llm-testing \
--task-name "${FULL_RUN_NAME}" \
--description "${FULL_RUN_NAME}" \
--priority "normal" \
--beaker-image olmo-torch2-gantry \
--cluster ai2/general-cirrascale-a100-80g-ib \
--gpus 8 \
--replicas ${NUM_NODES} \
--leader-selection \
--host-networking \
--nfs \
${WANDB_API_KEY_ARG} \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--shared-memory 10GiB \
--venv base \
--yes \
-- /bin/bash -c "torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id=101 --rdzv_backend=c10d --rdzv_endpoint=\$BEAKER_LEADER_REPLICA_HOSTNAME:29400 scripts/train.py ${CONFIG_PATH} --run_name=${FULL_RUN_NAME} ${LOAD_PATH_ARG} --device_train_microbatch_size=8 ${@}"

0 comments on commit baee669

Please sign in to comment.