Skip to content

Commit

Permalink
Script for running ablations on LUMI
Browse files Browse the repository at this point in the history
  • Loading branch information
dirkgr committed Feb 27, 2024
1 parent ae538ce commit 404ea30
Showing 1 changed file with 65 additions and 0 deletions.
65 changes: 65 additions & 0 deletions scripts/lumi/olmo7-ablations.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/bin/bash
#SBATCH --job-name=olmo7-ablation
#SBATCH --account=project_462000229
#SBATCH --output=/pfs/lustref1/flash/project_462000229/logs/%j.log
#SBATCH --nodes=128 # Total number of nodes
#SBATCH --ntasks-per-node=8
#SBATCH --gpus-per-node=8 # Allocate one gpu per MPI rank
#SBATCH --cpus-per-task=6
#SBATCH --time=39:15:00
#SBATCH --mem=0 # All memory on the node
#SBATCH --partition=standard-g

WANDB_GROUP=$1
shift

export OLMO_CONTAINER=llm-lumi-torch21_latest.sif

export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
export MPICH_GPU_SUPPORT_ENABLED=1
export NCCL_SOCKET_IFNAME=hsn
export NCCL_NET_GDR_LEVEL=3
export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
export CXI_FORK_SAFE=1
export CXI_FORK_SAFE_HP=1
export FI_CXI_DISABLE_CQ_HUGETLB=1

# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
export FI_CXI_DEFAULT_CQ_SIZE=131072

#export NCCL_DEBUG=INFO
export PYTHONPATH=.:${PYTHONPATH}
export ROCM_PATH=/opt/rocm
export SINGULARITYENV_LD_LIBRARY_PATH=/usr/local/lib:/opt/cray/libfabric/1.15.2.0/lib64

# Try playing with max_split_size_mb if you run into OOM errors.
#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128

export HF_DATASETS_OFFLINE=1

export DATA_PATH=$FLASH_DIR/preprocessed/olmo-mix
export CHECKPOINTS_PATH=$FLASH_DIR/checkpoints
export EVAL_DATA_PATH=$SCRATCH_DIR/eval-data

srun \
--cpus-per-task=$SLURM_CPUS_PER_TASK \
--distribution=block:block \
--kill-on-bad-exit \
scripts/run_with_environment.sh \
singularity exec \
-B"$PROJECT_DIR:$PROJECT_DIR" \
-B"$FLASH_DIR:$FLASH_DIR" \
-B"$SCRATCH_DIR:$SCRATCH_DIR" \
-B /opt/cray:/opt/cray \
-B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \
-B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \
$PROJECT_DIR/containers/$OLMO_CONTAINER \
python scripts/train.py configs/olmo7-ablation.yaml ${@} \
--run_name=${SLURM_JOB_ID} \
--activation_checkpointing=fine_grained \
--fsdp.wrapping_strategy=one_in_four \
--fsdp.sharding_strategy=FULL_SHARD \
--sharded_checkpointer=local \
--time_limit=$((39 * 60 * 60)) \
--wandb.group=$WANDB_GROUP

0 comments on commit 404ea30

Please sign in to comment.