diff --git a/configs/mcli/mitchish7.yaml b/configs/mcli/mitchish7.yaml
new file mode 100644
index 000000000..c099da378
--- /dev/null
+++ b/configs/mcli/mitchish7.yaml
@@ -0,0 +1,61 @@
+name: olmo-7b
+image: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
+compute:
+  cluster: r15z1p1
+  gpus: 32
+  gpu_type: h100_80gb
+integrations:
+  - integration_type: git_repo
+    git_repo: allenai/OLMo
+    git_branch: train-olmo-large
+    pip_install: -e .[train]
+    ssh_clone: true
+env_variables:
+  PIP_DISABLE_PIP_VERSION_CHECK: "1"
+  OMP_NUM_THREADS: "8"
+  LOG_FILTER_TYPE: local_rank0_only
+command: |-
+  # Install the AWS CLI (for downloading unsharded checkpoints).
+  #apt-get update
+  #apt-get install zip unzip
+  #curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
+  #unzip awscliv2.zip
+  #sudo ./aws/install
+
+  # Make sure we have a recent flash-attn.
+  # NOTE: only pinning flash-attn here to future-proof it.
+  pip install flash-attn==2.5.3 --no-build-isolation
+
+  # Show packages for debugging.
+  pip freeze
+
+  # Prepare environment.
+  mkdir -p /root/.cache/torch
+  # Warm up the Hugging Face cache.
+  pushd /root/.cache
+  curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v2.tar.gz" | tar -xzf -
+  popd
+  export HF_DATASETS_OFFLINE=1
+
+  cd OLMo
+
+  torchrun \
+    --master_addr "$MASTER_ADDR" \
+    --master_port "$MASTER_PORT" \
+    --nnodes "$NUM_NODES" \
+    --node_rank "$NODE_RANK" \
+    --nproc_per_node 8 \
+    scripts/train.py configs/mitchish7-s3.yaml \
+      --run_name=mitchish7 \
+      --wandb.group=mitchish7 \
+      --model.flash_attention=true \
+      --fsdp.wrapping_strategy=by_block_and_size \
+      --fsdp.sharding_strategy=SHARD_GRAD_OP \
+      --save_folder=runs/ \
+      --activation_checkpointing=fine_grained \
+      --fused_loss=true \
+      --device_train_microbatch_size=2 \
+      --global_train_batch_size=1024 \
+      '--load_path=${path.last_checkpoint:${remote_save_folder}}' \
+      --save_overwrite \
+      --load_path=s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step51450-unsharded
diff --git a/scripts/beaker/mitchish70-launch.sh b/scripts/beaker/mitchish70-launch.sh
new file mode 100755
index 000000000..0b2c88872
--- /dev/null
+++ b/scripts/beaker/mitchish70-launch.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+set -ex
+
+NUM_NODES=4
+
+gantry run \
+  --workspace ai2/dirkg \
+  --task-name mitchish70 \
+  --description "OLMo large - 70B" \
+  --priority high \
+  --stop-preemptible \
+  --beaker-image petew/olmo-torch2-gantry \
+  --cluster ai2/pluto-cirrascale \
+  --gpus 8 \
+  --replicas "${NUM_NODES}" \
+  --leader-selection \
+  --host-networking \
+  --budget ai2/oe-training \
+  --nfs \
+  --mount /net/nfs.cirrascale/allennlp/petew/cache:/root/.cache \
+  --env LOG_FILTER_TYPE=local_rank0_only \
+  --env OMP_NUM_THREADS=8 \
+  --env OLMO_TASK=model \
+  --env-secret WANDB_API_KEY=WANDB_API_KEY \
+  --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \
+  --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \
+  --env-secret R2_ACCESS_KEY_ID=R2_ACCESS_KEY_ID \
+  --env-secret R2_SECRET_ACCESS_KEY=R2_SECRET_ACCESS_KEY \
+  --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \
+  --shared-memory 10GiB \
+  --venv base \
+  --yes \
+  --timeout=-1 \
+  -- /bin/bash -c "scripts/beaker/mitchish70.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES}"
diff --git a/scripts/beaker/mitchish70-loadtest-launch.sh b/scripts/beaker/mitchish70-loadtest-launch.sh
new file mode 100755
index 000000000..8718f9446
--- /dev/null
+++ b/scripts/beaker/mitchish70-loadtest-launch.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+set -ex
+
+NUM_NODES=4
+
+gantry run \
+  --workspace ai2/dirkg \
+  --task-name mitchish70-loadtest \
+  --description "OLMo large - 70B - loadtest" \
+  --priority high \
+  --stop-preemptible \
+  --beaker-image petew/olmo-torch2-gantry \
+  --cluster ai2/pluto-cirrascale \
+  --gpus 8 \
+  --replicas "${NUM_NODES}" \
+  --leader-selection \
+  --host-networking \
+  --budget ai2/oe-training \
+  --nfs \
+  --mount /net/nfs.cirrascale/allennlp/petew/cache:/root/.cache \
+  --env LOG_FILTER_TYPE=local_rank0_only \
+  --env OMP_NUM_THREADS=8 \
+  --env OLMO_TASK=model \
+  --env-secret WANDB_API_KEY=WANDB_API_KEY \
+  --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \
+  --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \
+  --env-secret R2_ACCESS_KEY_ID=R2_ACCESS_KEY_ID \
+  --env-secret R2_SECRET_ACCESS_KEY=R2_SECRET_ACCESS_KEY \
+  --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \
+  --shared-memory 10GiB \
+  --venv base \
+  --yes \
+  --timeout=-1 \
+  -- /bin/bash -c "scripts/beaker/mitchish70-loadtest.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES}"
diff --git a/scripts/beaker/mitchish70-loadtest.sh b/scripts/beaker/mitchish70-loadtest.sh
new file mode 100755
index 000000000..22468d343
--- /dev/null
+++ b/scripts/beaker/mitchish70-loadtest.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+set -exuo pipefail
+IFS=$'\n\t'
+
+BEAKER_LEADER_REPLICA_HOSTNAME=$1
+shift
+
+NUM_NODES=$1
+shift
+
+# Warm HF cache
+mkdir -p /root/.cache
+pushd /root/.cache
+curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v2.tar.gz" | tar --keep-newer-files -xzf -
+popd
+export HF_DATASETS_OFFLINE=1
+
+torchrun \
+  --nnodes ${NUM_NODES}:${NUM_NODES} \
+  --nproc-per-node 8 \
+  --rdzv_id=101 \
+  --rdzv_backend=c10d \
+  --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \
+  scripts/train.py \
+    configs/mitchish70-s3.yaml \
+    --run_name=mitchish70-loadtest \
+    --wandb.name=mitchish70-loadtest \
+    --model.flash_attention=true \
+    --fsdp.wrapping_strategy=by_block_and_size \
+    --save_folder=runs/ \
+    --fused_loss=true \
+    --device_train_microbatch_size=2 \
+    --global_train_batch_size=512 \
+    --save_overwrite \
+    --remote_save_folder=null \
+    --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-002/step32300-unsharded
\ No newline at end of file
diff --git a/scripts/beaker/mitchish70.sh b/scripts/beaker/mitchish70.sh
index a5b279ebf..06f837877 100755
--- a/scripts/beaker/mitchish70.sh
+++ b/scripts/beaker/mitchish70.sh
@@ -1,36 +1,31 @@
 #!/usr/bin/env bash
+set -exuo pipefail
+IFS=$'\n\t'
 
-set -ex
+BEAKER_LEADER_REPLICA_HOSTNAME=$1
+shift
 
-CONFIG_PATH=configs/mitchish70-s3.yaml
-NUM_NODES=4
-RUN_NAME="mitchish70-001"
-ARGS="--run_name=${RUN_NAME} --device_train_microbatch_size=4"
+NUM_NODES=$1
+shift
 
-gantry run \
-  --allow-dirty \
-  --workspace ai2/llm-testing \
-  --task-name mitchish70 \
-  --description "OLMo large - 70B" \
-  --priority high \
-  --stop-preemptible \
-  --beaker-image olmo-torch2-gantry \
-  --cluster ai2/general-cirrascale-a100-80g-ib \
-  --gpus 8 \
-  --replicas "${NUM_NODES}" \
-  --leader-selection \
-  --host-networking \
-  --budget ai2/oe-training \
-  --nfs \
-  --mount /net/nfs.cirrascale/allennlp/petew/cache:/root/.cache \
-  --env LOG_FILTER_TYPE=local_rank0_only \
-  --env OMP_NUM_THREADS=8 \
-  --env OLMO_TASK=model \
-  --env-secret WANDB_API_KEY=WANDB_API_KEY \
-  --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \
-  --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \
-  --shared-memory 10GiB \
-  --venv base \
-  --yes \
-  --timeout=-1 \
-  -- /bin/bash -c "torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id=101 --rdzv_backend=c10d --rdzv_endpoint=\$BEAKER_LEADER_REPLICA_HOSTNAME:29400 scripts/train.py ${CONFIG_PATH} ${ARGS}"
+# Warm HF cache
+mkdir -p /root/.cache
+pushd /root/.cache
+curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v2.tar.gz" | tar --keep-newer-files -xzf -
+popd
+export HF_DATASETS_OFFLINE=1
+
+torchrun \
+  --nnodes ${NUM_NODES}:${NUM_NODES} \
+  --nproc-per-node 8 \
+  --rdzv_id=101 \
+  --rdzv_backend=c10d \
+  --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \
+  scripts/train.py \
+    configs/mitchish70-s3.yaml \
+    --run_name=mitchish70-002 \
+    --wandb.name=mitchish70-official \
+    --device_train_microbatch_size=3 \
+    --global_train_batch_size=1536 \
+    '--load_path=${path.last_checkpoint:${remote_save_folder}}' \
+    --save_overwrite
\ No newline at end of file