From 2cb8b36974f82286c029420cd04ef4a4db536346 Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Tue, 26 Mar 2024 14:11:22 -0700 Subject: [PATCH 01/12] Load test on beaker --- ...ish70.sh => mitchish70-loadtest-launch.sh} | 16 ++++----- scripts/beaker/mitchish70-loadtest.sh | 36 +++++++++++++++++++ 2 files changed, 42 insertions(+), 10 deletions(-) rename scripts/beaker/{mitchish70.sh => mitchish70-loadtest-launch.sh} (53%) create mode 100755 scripts/beaker/mitchish70-loadtest.sh diff --git a/scripts/beaker/mitchish70.sh b/scripts/beaker/mitchish70-loadtest-launch.sh similarity index 53% rename from scripts/beaker/mitchish70.sh rename to scripts/beaker/mitchish70-loadtest-launch.sh index a5b279ebf..618cd48aa 100755 --- a/scripts/beaker/mitchish70.sh +++ b/scripts/beaker/mitchish70-loadtest-launch.sh @@ -2,20 +2,16 @@ set -ex -CONFIG_PATH=configs/mitchish70-s3.yaml NUM_NODES=4 -RUN_NAME="mitchish70-001" -ARGS="--run_name=${RUN_NAME} --device_train_microbatch_size=4" gantry run \ - --allow-dirty \ - --workspace ai2/llm-testing \ - --task-name mitchish70 \ - --description "OLMo large - 70B" \ + --workspace ai2/dirkg \ + --task-name mitchish70-loadtest \ + --description "OLMo large - 70B - loadtest" \ --priority high \ --stop-preemptible \ - --beaker-image olmo-torch2-gantry \ - --cluster ai2/general-cirrascale-a100-80g-ib \ + --beaker-image petew/olmo-torch2-gantry \ + --cluster ai2/pluto-cirrascale \ --gpus 8 \ --replicas "${NUM_NODES}" \ --leader-selection \ @@ -33,4 +29,4 @@ gantry run \ --venv base \ --yes \ --timeout=-1 \ - -- /bin/bash -c "torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id=101 --rdzv_backend=c10d --rdzv_endpoint=\$BEAKER_LEADER_REPLICA_HOSTNAME:29400 scripts/train.py ${CONFIG_PATH} ${ARGS}" + -- /bin/bash -c "scripts/beaker/mitchish70-loadtest.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES}" diff --git a/scripts/beaker/mitchish70-loadtest.sh b/scripts/beaker/mitchish70-loadtest.sh new file mode 100755 index 
000000000..22468d343 --- /dev/null +++ b/scripts/beaker/mitchish70-loadtest.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v2.tar.gz" | tar --keep-newer-files -xzf - +popd +export HF_DATASETS_OFFLINE=1 + +torchrun \ + --nnodes ${NUM_NODES}:${NUM_NODES} \ + --nproc-per-node 8 \ + --rdzv_id=101 \ + --rdzv_backend=c10d \ + --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ + scripts/train.py \ + configs/mitchish70-s3.yaml \ + --run_name=mitchish70-loadtest \ + --wandb.name=mitchish70-loadtest \ + --model.flash_attention=true \ + --fsdp.wrapping_strategy=by_block_and_size \ + --save_folder=runs/ \ + --fused_loss=true \ + --device_train_microbatch_size=2 \ + --global_train_batch_size=512 \ + --save_overwrite \ + --remote_save_folder=null \ + --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-002/step32300-unsharded \ No newline at end of file From 22cca0cd178c62b5be215d16decdc1a065ec6d80 Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Tue, 26 Mar 2024 14:13:42 -0700 Subject: [PATCH 02/12] Needs more secrets --- scripts/beaker/mitchish70-loadtest-launch.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/beaker/mitchish70-loadtest-launch.sh b/scripts/beaker/mitchish70-loadtest-launch.sh index 618cd48aa..8718f9446 100755 --- a/scripts/beaker/mitchish70-loadtest-launch.sh +++ b/scripts/beaker/mitchish70-loadtest-launch.sh @@ -25,6 +25,9 @@ gantry run \ --env-secret WANDB_API_KEY=WANDB_API_KEY \ --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ + --env-secret R2_ACCESS_KEY_ID=R2_ACCESS_KEY_ID \ + --env-secret R2_SECRET_ACCESS_KEY=R2_SECRET_ACCESS_KEY \ + --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ --shared-memory 10GiB \ --venv base \ --yes \ From 
bfedb821bed68eb2ba53e24b09a557f95720405f Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Tue, 26 Mar 2024 15:05:25 -0700 Subject: [PATCH 03/12] Config for running the 7B on MosaicML --- configs/mcli/mitchish7.yaml | 60 +++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 configs/mcli/mitchish7.yaml diff --git a/configs/mcli/mitchish7.yaml b/configs/mcli/mitchish7.yaml new file mode 100644 index 000000000..8d432259f --- /dev/null +++ b/configs/mcli/mitchish7.yaml @@ -0,0 +1,60 @@ +name: olmo-7b +image: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 +compute: + cluster: r15z1p1 + gpus: 64 + gpu_type: h100_80gb +integrations: + - integration_type: git_repo + git_repo: allenai/OLMo + git_branch: train-olmo-large + pip_install: -e .[train] + ssh_clone: true +env_variables: + PIP_DISABLE_PIP_VERSION_CHECK: "1" + OMP_NUM_THREADS: "8" + LOG_FILTER_TYPE: local_rank0_only +command: |- + # Install AWS CLI (for downloading unsharded checkpoints). + #apt-get update + #apt-get install zip unzip + #curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" + #unzip awscliv2.zip + #sudo ./aws/install + + # Make sure we have a recent flash-attn. + # NOTE: only pinning flash-attn here to future-proof it. + pip install flash-attn==2.5.3 --no-build-isolation + + # Show packages for debugging. + pip freeze + + # Prepare environment. 
+ mkdir -p /root/.cache/torch + # warm up huggingface cache + pushd /root/.cache + curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v2.tar.gz" | tar -xzf - + popd + export HF_DATASETS_OFFLINE=1 + + cd OLMo + + torchrun \ + --master_addr "$MASTER_ADDR" \ + --master_port "$MASTER_PORT" \ + --nnodes "$NUM_NODES" \ + --node_rank "$NODE_RANK" \ + --nproc_per_node 8 \ + scripts/train.py configs/mitchish7-s3.yaml \ + --run_name=mitchish7 \ + --wandb.group=mitchish7 \ + --model.flash_attention=true \ + --fsdp.wrapping_strategy=by_block_and_size \ + --fsdp.sharding_strategy=SHARD_GRAD_OP \ + --save_folder=runs/ \ + --activation_checkpointing=fine_grained \ + --fused_loss=true \ + --device_train_microbatch_size=2 \ + --global_train_batch_size=1024 \ + '--load_path=${path.last_checkpoint:${remote_save_folder}}' \ + --save_overwrite From 37b5f2020c560e88a8795b3cf348579d300b7ad8 Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Tue, 26 Mar 2024 15:31:16 -0700 Subject: [PATCH 04/12] This is running on an older version of torch. 
--- configs/mcli/mitchish7.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/mcli/mitchish7.yaml b/configs/mcli/mitchish7.yaml index 8d432259f..1d49c6482 100644 --- a/configs/mcli/mitchish7.yaml +++ b/configs/mcli/mitchish7.yaml @@ -1,5 +1,5 @@ name: olmo-7b -image: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 +image: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 compute: cluster: r15z1p1 gpus: 64 From b2a9c74901bbb36f6f406e743442073b71facf9a Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Tue, 26 Mar 2024 20:47:43 -0700 Subject: [PATCH 05/12] More nodes --- configs/mcli/mitchish7.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/mcli/mitchish7.yaml b/configs/mcli/mitchish7.yaml index 1d49c6482..db002f007 100644 --- a/configs/mcli/mitchish7.yaml +++ b/configs/mcli/mitchish7.yaml @@ -2,7 +2,7 @@ name: olmo-7b image: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 compute: cluster: r15z1p1 - gpus: 64 + gpus: 128 gpu_type: h100_80gb integrations: - integration_type: git_repo From 3235dbfd953c544bb47340eaa86a79791cced403 Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Tue, 26 Mar 2024 21:23:11 -0700 Subject: [PATCH 06/12] Start from unsharded checkpoint --- configs/mcli/mitchish7.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/mcli/mitchish7.yaml b/configs/mcli/mitchish7.yaml index db002f007..a95551a8f 100644 --- a/configs/mcli/mitchish7.yaml +++ b/configs/mcli/mitchish7.yaml @@ -57,4 +57,5 @@ command: |- --device_train_microbatch_size=2 \ --global_train_batch_size=1024 \ '--load_path=${path.last_checkpoint:${remote_save_folder}}' \ - --save_overwrite + --save_overwrite \ + --load_path=s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step41400-unsharded From ecc0fd37782ec8eed88ba21891ada836d12aca0d Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Tue, 26 Mar 2024 21:37:37 -0700 Subject: [PATCH 07/12] 128 GPUs terminates for some reason --- 
configs/mcli/mitchish7.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/mcli/mitchish7.yaml b/configs/mcli/mitchish7.yaml index a95551a8f..e9d412517 100644 --- a/configs/mcli/mitchish7.yaml +++ b/configs/mcli/mitchish7.yaml @@ -2,7 +2,7 @@ name: olmo-7b image: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 compute: cluster: r15z1p1 - gpus: 128 + gpus: 64 gpu_type: h100_80gb integrations: - integration_type: git_repo From 2b6fe88d5eadaa2d8c15858eaa897a5ddc57604f Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Tue, 26 Mar 2024 23:42:38 -0700 Subject: [PATCH 08/12] Try 128 GPUs again --- configs/mcli/mitchish7.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/mcli/mitchish7.yaml b/configs/mcli/mitchish7.yaml index e9d412517..a95551a8f 100644 --- a/configs/mcli/mitchish7.yaml +++ b/configs/mcli/mitchish7.yaml @@ -2,7 +2,7 @@ name: olmo-7b image: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 compute: cluster: r15z1p1 - gpus: 64 + gpus: 128 gpu_type: h100_80gb integrations: - integration_type: git_repo From 806d6ef01eea52ed70d51e79a1ad1b22da3177ad Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Tue, 26 Mar 2024 23:59:48 -0700 Subject: [PATCH 09/12] New load path --- configs/mcli/mitchish7.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/mcli/mitchish7.yaml b/configs/mcli/mitchish7.yaml index a95551a8f..dde6a7355 100644 --- a/configs/mcli/mitchish7.yaml +++ b/configs/mcli/mitchish7.yaml @@ -58,4 +58,4 @@ command: |- --global_train_batch_size=1024 \ '--load_path=${path.last_checkpoint:${remote_save_folder}}' \ --save_overwrite \ - --load_path=s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step41400-unsharded + --load_path=s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step42200-unsharded From 60b92801721c660888c301493367f7c8fe9ac9ad Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Wed, 27 Mar 2024 12:17:53 -0700 Subject: [PATCH 10/12] Have to reshard again --- 
configs/mcli/mitchish7.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/mcli/mitchish7.yaml b/configs/mcli/mitchish7.yaml index dde6a7355..0f3cb9399 100644 --- a/configs/mcli/mitchish7.yaml +++ b/configs/mcli/mitchish7.yaml @@ -58,4 +58,4 @@ command: |- --global_train_batch_size=1024 \ '--load_path=${path.last_checkpoint:${remote_save_folder}}' \ --save_overwrite \ - --load_path=s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step42200-unsharded + --load_path=s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step51450-unsharded From 836289300e3c36685553220126db57d6eb795868 Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Wed, 27 Mar 2024 12:27:22 -0700 Subject: [PATCH 11/12] 32 GPUs :-( --- configs/mcli/mitchish7.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/mcli/mitchish7.yaml b/configs/mcli/mitchish7.yaml index 0f3cb9399..c099da378 100644 --- a/configs/mcli/mitchish7.yaml +++ b/configs/mcli/mitchish7.yaml @@ -2,7 +2,7 @@ name: olmo-7b image: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 compute: cluster: r15z1p1 - gpus: 128 + gpus: 32 gpu_type: h100_80gb integrations: - integration_type: git_repo From 0236cebdd49b138c70d1e46fc052516706cc9037 Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Wed, 27 Mar 2024 16:03:52 -0700 Subject: [PATCH 12/12] Fix the mitchish70 launch scripts for Beaker --- scripts/beaker/mitchish70-launch.sh | 35 +++++++++++++++++++++++++++++ scripts/beaker/mitchish70.sh | 31 +++++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100755 scripts/beaker/mitchish70-launch.sh create mode 100755 scripts/beaker/mitchish70.sh diff --git a/scripts/beaker/mitchish70-launch.sh b/scripts/beaker/mitchish70-launch.sh new file mode 100755 index 000000000..0b2c88872 --- /dev/null +++ b/scripts/beaker/mitchish70-launch.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +set -ex + +NUM_NODES=4 + +gantry run \ + --workspace ai2/dirkg \ + --task-name mitchish70 \ + --description 
"OLMo large - 70B" \ + --priority high \ + --stop-preemptible \ + --beaker-image petew/olmo-torch2-gantry \ + --cluster ai2/pluto-cirrascale \ + --gpus 8 \ + --replicas "${NUM_NODES}" \ + --leader-selection \ + --host-networking \ + --budget ai2/oe-training \ + --nfs \ + --mount /net/nfs.cirrascale/allennlp/petew/cache:/root/.cache \ + --env LOG_FILTER_TYPE=local_rank0_only \ + --env OMP_NUM_THREADS=8 \ + --env OLMO_TASK=model \ + --env-secret WANDB_API_KEY=WANDB_API_KEY \ + --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \ + --env-secret R2_ACCESS_KEY_ID=R2_ACCESS_KEY_ID \ + --env-secret R2_SECRET_ACCESS_KEY=R2_SECRET_ACCESS_KEY \ + --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \ + --shared-memory 10GiB \ + --venv base \ + --yes \ + --timeout=-1 \ + -- /bin/bash -c "scripts/beaker/mitchish70.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES}" diff --git a/scripts/beaker/mitchish70.sh b/scripts/beaker/mitchish70.sh new file mode 100755 index 000000000..06f837877 --- /dev/null +++ b/scripts/beaker/mitchish70.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +set -exuo pipefail +IFS=$'\n\t' + +BEAKER_LEADER_REPLICA_HOSTNAME=$1 +shift + +NUM_NODES=$1 +shift + +# Warm HF cache +mkdir -p /root/.cache +pushd /root/.cache +curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v2.tar.gz" | tar --keep-newer-files -xzf - +popd +export HF_DATASETS_OFFLINE=1 + +torchrun \ + --nnodes ${NUM_NODES}:${NUM_NODES} \ + --nproc-per-node 8 \ + --rdzv_id=101 \ + --rdzv_backend=c10d \ + --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \ + scripts/train.py \ + configs/mitchish70-s3.yaml \ + --run_name=mitchish70-002 \ + --wandb.name=mitchish70-official \ + --device_train_microbatch_size=3 \ + --global_train_batch_size=1536 \ + '--load_path=${path.last_checkpoint:${remote_save_folder}}' \ + --save_overwrite \ No newline at end of file