Merge pull request #350 from allenai/mitchish
Mitchish mosaic run on its own branch
Showing 6 changed files with 212 additions and 26 deletions.
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

set -ex

CONFIG_PATH=configs/v1_5-mix-medium-mitch-ish-s3.yaml
NUM_NODES=4
ARGS='--activation_checkpointing=fine_grained wandb.name=v1_5-mix-mitch-ish-mcli-final --epoch=1 --optimizer.learning_rate=0.000023 --scheduler.t_warmup=556000 --scheduler.t_max=557000 --scheduler.alpha_f=0.001 --stop_at=557000'

gantry run \
  --allow-dirty \
  --workspace ai2/llm-testing \
  --task-name mitchish-mcli-final \
  --description mitchish-mcli-final \
  --priority high \
  --beaker-image olmo-torch2-gantry \
  --cluster ai2/general-cirrascale-a100-80g-ib \
  --gpus 8 \
  --replicas "${NUM_NODES}" \
  --nfs \
  --mount /net/nfs.cirrascale/allennlp/petew/cache:/root/.cache \
  --env LOG_FILTER_TYPE=local_rank0_only \
  --env OMP_NUM_THREADS=8 \
  --env OLMO_TASK=model \
  --env-secret WANDB_API_KEY=WANDB_API_KEY \
  --env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \
  --env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \
  --shared-memory 10GiB \
  --venv base \
  --yes \
  -- /bin/bash -c "torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id=101 --rdzv_backend=c10d --rdzv_endpoint=\$BEAKER_LEADER_REPLICA_HOSTNAME:29400 scripts/train.py ${CONFIG_PATH} ${ARGS}"
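For context (not part of this diff): the torchrun flags above make all gantry replicas rendezvous over c10d at the Beaker leader replica on port 29400. A minimal single-node sketch of the same entrypoint, which can serve as a smoke test before a multi-node gantry launch; reusing CONFIG_PATH and ARGS as set above is an assumption, not something this commit ships:

# Hypothetical single-node smoke test (assumed workflow, not in this commit).
# Reuses the config and overrides from above, but rendezvous locally via
# torchrun's --standalone mode instead of the Beaker leader replica.
torchrun --nnodes 1 --nproc-per-node 8 --standalone \
  scripts/train.py ${CONFIG_PATH} ${ARGS}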
@@ -0,0 +1,52 @@
#!/bin/bash
#SBATCH --job-name=v1.5-mix-medium-mitch-ish
#SBATCH --account=kempner_lab
#SBATCH --output=/n/holyscratch01/kempner_lab/Lab/logs-petew/%j.log
#SBATCH --nodes=8              # Total number of nodes
#SBATCH --ntasks-per-node=4
#SBATCH --gpus-per-node=4      # Allocate one gpu per MPI rank
#SBATCH --cpus-per-task=16
#SBATCH --time=167:00:00
#SBATCH --mem=0                # All memory on the node
#SBATCH --partition=kempner_project

export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
export MPICH_GPU_SUPPORT_ENABLED=1
export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}

export PYTHONPATH=.:${PYTHONPATH}

# Try playing with max_split_size_mb if you run into OOM errors.
# export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:512

export DATA_PATH=/n/home06/dgroeneveld/data/preprocessed/olmo-mix
export EVAL_DATA_PATH=/n/home06/dgroeneveld/data/eval-data
export CHECKPOINTS_PATH=/n/home06/dgroeneveld/checkpoints

export PYTORCH_KERNEL_CACHE_PATH=/tmp/pytorch_kernel_cache/
mkdir -p $PYTORCH_KERNEL_CACHE_PATH

LOAD_PATH=s3://ai2-llm/checkpoints/7b/v1_5-mix-mitch-ish/step556000-unsharded
# SAVE_PATH=s3://ai2-llm/checkpoints/7b/v1_5-mix-mitch-ish-final-tulu

srun \
  "--cpus-per-task=$SLURM_CPUS_PER_TASK" \
  --distribution=block:block \
  --kill-on-bad-exit \
  scripts/run_with_environment.sh \
  $HOME/miniconda3/envs/LLM/bin/python -u scripts/train.py configs/v1_5-mix-medium-mitch-ish-s3.yaml \
    "--run_name=kempner_${SLURM_JOB_ID}" \
    --wandb.name=v1_5-mix-mitch-ish-final-tulu \
    '--data.paths=[s3://ai2-llm/preprocessed/tulu-v2-sft-mixture/gpt-neox-20b-pii-special/data.npy,s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample-9B/gpt-neox-20b-pii-special/data.npy]' \
    --eval_interval=100 \
    --save_interval=500 \
    "--load_path=${LOAD_PATH}" \
    --restore_dataloader=false \
    --optimizer.learning_rate=0.000023 \
    --scheduler.t_warmup=556000 \
    --scheduler.alpha_f=0.001 \
    --scheduler.t_max=558223 \
    --stop_at=558223 \
    --time_limit=$((167 * 60 * 60)) \
    "--save_folder=/n/holyscratch01/kempner_lab/Lab/checkpoints/${SLURM_JOB_ID}/"
@@ -0,0 +1,60 @@
#!/bin/bash
#SBATCH --job-name=v1.5-mix-medium-mitch-ish
#SBATCH --account=project_462000229
#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log
#SBATCH --nodes=256            # Total number of nodes
#SBATCH --ntasks-per-node=8
#SBATCH --gpus-per-node=8      # Allocate one gpu per MPI rank
#SBATCH --cpus-per-task=6
#SBATCH --time=48:00:00
#SBATCH --time-min=24:00:00
#SBATCH --mem=0                # All memory on the node
#SBATCH --partition=standard-g

module load LUMI/22.08 partition/G

# export OLMO_CONTAINER=llm-lumi_latest.sif
export OLMO_CONTAINER=llm-lumi-torch21_latest.sif

export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
export MPICH_GPU_SUPPORT_ENABLED=1
export NCCL_SOCKET_IFNAME=hsn
export NCCL_NET_GDR_LEVEL=3
export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
export CXI_FORK_SAFE=1
export CXI_FORK_SAFE_HP=1
export FI_CXI_DISABLE_CQ_HUGETLB=1

# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
export FI_CXI_DEFAULT_CQ_SIZE=131072

# export NCCL_DEBUG=INFO
export PYTHONPATH=.:${PYTHONPATH}
export ROCM_PATH=/opt/rocm
export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64

# Try playing with max_split_size_mb if you run into OOM errors.
# export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128

export DATA_PATH=$FLASH_DIR/preprocessed/olmo-mix
export CHECKPOINTS_PATH=$FLASH_DIR/checkpoints
export EVAL_DATA_PATH=$SCRATCH_DIR/eval-data

srun \
  --cpus-per-task=$SLURM_CPUS_PER_TASK \
  --distribution=block:block \
  --kill-on-bad-exit \
  scripts/run_with_environment.sh \
  singularity exec \
    -B"$PROJECT_DIR:$PROJECT_DIR" \
    -B"$FLASH_DIR:$FLASH_DIR" \
    -B"$SCRATCH_DIR:$SCRATCH_DIR" \
    -B /opt/cray:/opt/cray \
    -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
    -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
    $PROJECT_DIR/containers/$OLMO_CONTAINER \
    python scripts/train.py configs/v1_5-mix-medium-mitch-ish.yaml ${@} \
      --run_name=${SLURM_JOB_ID} \
      --global_train_batch_size=4096 \
      --max_duration=238418
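Unlike the Kempner script, this one forwards ${@} into the scripts/train.py invocation, so extra config overrides can be appended at submission time and they take effect before the trailing --run_name/--global_train_batch_size/--max_duration flags. A sketch of that usage; the filename and the specific override value are illustrative assumptions:

# Assumed filename; any trailing arguments land in ${@} above and are
# passed straight through to scripts/train.py as config overrides.
sbatch scripts/lumi-mitchish.sh \
  --device_train_microbatch_size=2   # illustrative override value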