Commit 0f96e2f

Merge pull request #529 from allenai/70b-load-test

70b load test

dirkgr authored Mar 27, 2024
2 parents b8d2297 + 0236ceb

Showing 5 changed files with 194 additions and 32 deletions.
61 changes: 61 additions & 0 deletions configs/mcli/mitchish7.yaml
@@ -0,0 +1,61 @@
name: olmo-7b
image: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
compute:
  cluster: r15z1p1
  gpus: 32
  gpu_type: h100_80gb
integrations:
  - integration_type: git_repo
    git_repo: allenai/OLMo
    git_branch: train-olmo-large
    pip_install: -e .[train]
    ssh_clone: true
env_variables:
  PIP_DISABLE_PIP_VERSION_CHECK: "1"
  OMP_NUM_THREADS: "8"
  LOG_FILTER_TYPE: local_rank0_only
command: |-
  # Install the AWS CLI (for downloading unsharded checkpoints).
  #apt-get update
  #apt-get install zip unzip
  #curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
  #unzip awscliv2.zip
  #sudo ./aws/install
  # Make sure we have a recent flash-attn.
  # NOTE: only pinning flash-attn here to future-proof it.
  pip install flash-attn==2.5.3 --no-build-isolation
  # Show packages for debugging.
  pip freeze
  # Prepare environment.
  mkdir -p /root/.cache/torch
  # Warm up the HuggingFace cache.
  pushd /root/.cache
  curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v2.tar.gz" | tar -xzf -
  popd
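  # With the cache warmed, run fully offline so the datasets library never hits the HuggingFace Hub.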
  export HF_DATASETS_OFFLINE=1
  cd OLMo
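  # NOTE: --load_path appears twice below; the later occurrence (the unsharded S3
  # checkpoint) presumably overrides the last-checkpoint interpolation.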
  torchrun \
    --master_addr "$MASTER_ADDR" \
    --master_port "$MASTER_PORT" \
    --nnodes "$NUM_NODES" \
    --node_rank "$NODE_RANK" \
    --nproc_per_node 8 \
    scripts/train.py configs/mitchish7-s3.yaml \
      --run_name=mitchish7 \
      --wandb.group=mitchish7 \
      --model.flash_attention=true \
      --fsdp.wrapping_strategy=by_block_and_size \
      --fsdp.sharding_strategy=SHARD_GRAD_OP \
      --save_folder=runs/ \
      --activation_checkpointing=fine_grained \
      --fused_loss=true \
      --device_train_microbatch_size=2 \
      --global_train_batch_size=1024 \
      '--load_path=${path.last_checkpoint:${remote_save_folder}}' \
      --save_overwrite \
      --load_path=s3://ai2-llm/checkpoints/OLMo-medium/mitchish7/step51450-unsharded
35 changes: 35 additions & 0 deletions scripts/beaker/mitchish70-launch.sh
@@ -0,0 +1,35 @@
#!/usr/bin/env bash

set -ex

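# Number of Beaker replicas to request; each replica is one node running 8 GPUs.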
NUM_NODES=4

gantry run \
  --workspace ai2/dirkg \
  --task-name mitchish70 \
  --description "OLMo large - 70B" \
  --priority high \
  --stop-preemptible \
  --beaker-image petew/olmo-torch2-gantry \
  --cluster ai2/pluto-cirrascale \
  --gpus 8 \
  --replicas "${NUM_NODES}" \
  --leader-selection \
  --host-networking \
  --budget ai2/oe-training \
  --nfs \
  --mount /net/nfs.cirrascale/allennlp/petew/cache:/root/.cache \
  --env LOG_FILTER_TYPE=local_rank0_only \
  --env OMP_NUM_THREADS=8 \
  --env OLMO_TASK=model \
  --env-secret WANDB_API_KEY=WANDB_API_KEY \
  --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \
  --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \
  --env-secret R2_ACCESS_KEY_ID=R2_ACCESS_KEY_ID \
  --env-secret R2_SECRET_ACCESS_KEY=R2_SECRET_ACCESS_KEY \
  --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \
  --shared-memory 10GiB \
  --venv base \
  --yes \
  --timeout=-1 \
  -- /bin/bash -c "scripts/beaker/mitchish70.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES}"
35 changes: 35 additions & 0 deletions scripts/beaker/mitchish70-loadtest-launch.sh
@@ -0,0 +1,35 @@
#!/usr/bin/env bash

set -ex

NUM_NODES=4

gantry run \
  --workspace ai2/dirkg \
  --task-name mitchish70-loadtest \
  --description "OLMo large - 70B - loadtest" \
  --priority high \
  --stop-preemptible \
  --beaker-image petew/olmo-torch2-gantry \
  --cluster ai2/pluto-cirrascale \
  --gpus 8 \
  --replicas "${NUM_NODES}" \
  --leader-selection \
  --host-networking \
  --budget ai2/oe-training \
  --nfs \
  --mount /net/nfs.cirrascale/allennlp/petew/cache:/root/.cache \
  --env LOG_FILTER_TYPE=local_rank0_only \
  --env OMP_NUM_THREADS=8 \
  --env OLMO_TASK=model \
  --env-secret WANDB_API_KEY=WANDB_API_KEY \
  --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \
  --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \
  --env-secret R2_ACCESS_KEY_ID=R2_ACCESS_KEY_ID \
  --env-secret R2_SECRET_ACCESS_KEY=R2_SECRET_ACCESS_KEY \
  --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \
  --shared-memory 10GiB \
  --venv base \
  --yes \
  --timeout=-1 \
  -- /bin/bash -c "scripts/beaker/mitchish70-loadtest.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES}"
36 changes: 36 additions & 0 deletions scripts/beaker/mitchish70-loadtest.sh
@@ -0,0 +1,36 @@
#!/usr/bin/env bash
set -exuo pipefail
IFS=$'\n\t'

BEAKER_LEADER_REPLICA_HOSTNAME=$1
shift

NUM_NODES=$1
shift

# Warm HF cache
mkdir -p /root/.cache
pushd /root/.cache
curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v2.tar.gz" | tar --keep-newer-files -xzf -
popd
export HF_DATASETS_OFFLINE=1

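# All ranks rendezvous (c10d backend) at the leader replica on port 29400;
# "--nnodes MIN:MAX" pins both bounds to NUM_NODES, so exactly that many nodes must join.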
torchrun \
  --nnodes ${NUM_NODES}:${NUM_NODES} \
  --nproc-per-node 8 \
  --rdzv_id=101 \
  --rdzv_backend=c10d \
  --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \
  scripts/train.py \
    configs/mitchish70-s3.yaml \
    --run_name=mitchish70-loadtest \
    --wandb.name=mitchish70-loadtest \
    --model.flash_attention=true \
    --fsdp.wrapping_strategy=by_block_and_size \
    --save_folder=runs/ \
    --fused_loss=true \
    --device_train_microbatch_size=2 \
    --global_train_batch_size=512 \
    --save_overwrite \
    --remote_save_folder=null \
    --load_path=s3://ai2-llm/checkpoints/OLMo-large/mitchish70-002/step32300-unsharded
59 changes: 27 additions & 32 deletions scripts/beaker/mitchish70.sh
@@ -1,36 +1,31 @@
 #!/usr/bin/env bash
+set -exuo pipefail
+IFS=$'\n\t'
 
-set -ex
+BEAKER_LEADER_REPLICA_HOSTNAME=$1
+shift
 
-CONFIG_PATH=configs/mitchish70-s3.yaml
-NUM_NODES=4
-RUN_NAME="mitchish70-001"
-ARGS="--run_name=${RUN_NAME} --device_train_microbatch_size=4"
+NUM_NODES=$1
+shift
 
-gantry run \
-  --allow-dirty \
-  --workspace ai2/llm-testing \
-  --task-name mitchish70 \
-  --description "OLMo large - 70B" \
-  --priority high \
-  --stop-preemptible \
-  --beaker-image olmo-torch2-gantry \
-  --cluster ai2/general-cirrascale-a100-80g-ib \
-  --gpus 8 \
-  --replicas "${NUM_NODES}" \
-  --leader-selection \
-  --host-networking \
-  --budget ai2/oe-training \
-  --nfs \
-  --mount /net/nfs.cirrascale/allennlp/petew/cache:/root/.cache \
-  --env LOG_FILTER_TYPE=local_rank0_only \
-  --env OMP_NUM_THREADS=8 \
-  --env OLMO_TASK=model \
-  --env-secret WANDB_API_KEY=WANDB_API_KEY \
-  --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \
-  --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \
-  --shared-memory 10GiB \
-  --venv base \
-  --yes \
-  --timeout=-1 \
-  -- /bin/bash -c "torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id=101 --rdzv_backend=c10d --rdzv_endpoint=\$BEAKER_LEADER_REPLICA_HOSTNAME:29400 scripts/train.py ${CONFIG_PATH} ${ARGS}"
+# Warm HF cache
+mkdir -p /root/.cache
+pushd /root/.cache
+curl "https://storage.googleapis.com/dirkgr-public/huggingface_cache_v2.tar.gz" | tar --keep-newer-files -xzf -
+popd
+export HF_DATASETS_OFFLINE=1
+
+torchrun \
+  --nnodes ${NUM_NODES}:${NUM_NODES} \
+  --nproc-per-node 8 \
+  --rdzv_id=101 \
+  --rdzv_backend=c10d \
+  --rdzv_endpoint=$BEAKER_LEADER_REPLICA_HOSTNAME:29400 \
+  scripts/train.py \
+    configs/mitchish70-s3.yaml \
+    --run_name=mitchish70-002 \
+    --wandb.name=mitchish70-official \
+    --device_train_microbatch_size=3 \
+    --global_train_batch_size=1536 \
+    '--load_path=${path.last_checkpoint:${remote_save_folder}}' \
+    --save_overwrite
