From 565d9f49cc6f994c25568efcc6c9d112b2545119 Mon Sep 17 00:00:00 2001
From: Shane A
Date: Wed, 2 Oct 2024 15:44:41 -0700
Subject: [PATCH 1/3] Add demo script for running on LUMI

---
 scripts/lumi/demo.sh             | 64 ++++++++++++++++++++++++++++++++
 scripts/lumi/run-in-container.sh |  7 ++++
 2 files changed, 71 insertions(+)
 create mode 100755 scripts/lumi/demo.sh
 create mode 100644 scripts/lumi/run-in-container.sh

diff --git a/scripts/lumi/demo.sh b/scripts/lumi/demo.sh
new file mode 100755
index 000000000..4b203484e
--- /dev/null
+++ b/scripts/lumi/demo.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+#
+# Demo script for running multinode jobs on LUMI. You can run this as a batch job using
+# sbatch or as part of an interactive session by running this script as an executable.
+#
+#SBATCH --job-name=peteish13-highlr
+#SBATCH --account=project_462000229
+#SBATCH --output=/scratch/project_462000229/logs/%j.log
+#SBATCH --nodes=128              # Total number of nodes
+#SBATCH --ntasks-per-node=8
+#SBATCH --gpus-per-node=8        # Allocate one gpu per MPI rank
+#SBATCH --cpus-per-task=6
+#SBATCH --time=48:00:00
+#SBATCH --time-min=12:00:00
+#SBATCH --mem=0                  # All memory on the node
+#SBATCH --partition=standard-g
+
+module load LUMI/24.03 partition/G
+
+## Container-dependent settings
+export OLMO_CONTAINER=$PROJECT_DIR/containers/lumi-torch25rc-rocm62-py312.sif
+export ROCM_PATH=/opt/rocm
+export CONDA_ENV=pytorch
+export PYTHONPATH=.:${PYTHONPATH}
+
+## General LUMI settings (these rarely change)
+export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
+export MPICH_GPU_SUPPORT_ENABLED=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET_GDR_LEVEL=3
+export MIOPEN_USER_DB_PATH=/tmp/${USER}-miopen-cache-${SLURM_JOB_ID}
+export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
+export CXI_FORK_SAFE=1
+export CXI_FORK_SAFE_HP=1
+export FI_CXI_DISABLE_CQ_HUGETLB=1
+export GPU_MAX_HW_QUEUES=8
+# We need to set this to avoid "Cassini Event Queue overflow detected." errors.
+export FI_CXI_DEFAULT_CQ_SIZE=131072
+
+## Job settings
+export CHECKPOINTS_PATH=$SCRATCH_DIR/checkpoints
+export HF_DATASETS_OFFLINE=1
+export SINGULARITYENV_TORCH_DIST_INIT_BARRIER=1
+# Try playing with max_split_size_mb if you run into OOM errors.
+#export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128
+
+## Debug settings
+#export NCCL_DEBUG=INFO
+#export FI_LOG_LEVEL=INFO
+
+srun \
+  --cpus-per-task=$SLURM_CPUS_PER_TASK \
+  --distribution=block:block \
+  --kill-on-bad-exit \
+  scripts/run_with_environment.sh \
+  singularity exec \
+    -B"$PROJECT_DIR:$PROJECT_DIR" \
+    -B"$FLASH_DIR:$FLASH_DIR" \
+    -B"$SCRATCH_DIR:$SCRATCH_DIR" \
+    -B /var/spool/slurmd,/opt/cray/,/usr/lib64/libcxi.so.1,/usr/lib64/libjansson.so.4,/usr/lib64/libjson-c.so.3 \
+    $OLMO_CONTAINER \
+    scripts/lumi/run-in-container.sh \
+      python scripts/train.py configs/mitchish1-s3.yaml \
+        "${@}"
diff --git a/scripts/lumi/run-in-container.sh b/scripts/lumi/run-in-container.sh
new file mode 100644
index 000000000..57654031d
--- /dev/null
+++ b/scripts/lumi/run-in-container.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+# Put setup of conda in an env variable if conda is needed
+if [[ ! -z "${CONDA_ENV}" ]]; then
-z "${CONDA_ENV}" ]]; then + source /opt/miniconda3/bin/activate ${CONDA_ENV} +fi + +${@} From a80782dcbf85ea9c19197a491e5b75bdd7840daf Mon Sep 17 00:00:00 2001 From: Shane A Date: Wed, 2 Oct 2024 15:48:24 -0700 Subject: [PATCH 2/3] Update LUMI scripts to use new container --- scripts/lumi/log_into_node.sh | 6 ++---- scripts/lumi/lumi-interactive.sh | 6 ++---- scripts/pyspy_all_nodes.sh | 6 ++---- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/scripts/lumi/log_into_node.sh b/scripts/lumi/log_into_node.sh index c432418ba..2d4e8c8cc 100755 --- a/scripts/lumi/log_into_node.sh +++ b/scripts/lumi/log_into_node.sh @@ -7,8 +7,6 @@ srun --interactive --pty --jobid=$1 \ -B"$PROJECT_DIR:$PROJECT_DIR" \ -B"$SCRATCH_DIR:$SCRATCH_DIR" \ -B"$FLASH_DIR:$FLASH_DIR" \ - -B /opt/cray:/opt/cray \ - -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \ - -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \ - $PROJECT_DIR/containers/llm-lumi-torch23_latest.sif \ + -B /var/spool/slurmd,/opt/cray/,/usr/lib64/libcxi.so.1,/usr/lib64/libjansson.so.4,/usr/lib64/libjson-c.so.3 \ + $OLMO_CONTAINER \ fish diff --git a/scripts/lumi/lumi-interactive.sh b/scripts/lumi/lumi-interactive.sh index 78c28ae4c..856a1958c 100755 --- a/scripts/lumi/lumi-interactive.sh +++ b/scripts/lumi/lumi-interactive.sh @@ -5,7 +5,5 @@ singularity shell \ -B"$PROJECT_DIR:$PROJECT_DIR" \ -B"$SCRATCH_DIR:$SCRATCH_DIR" \ -B"$FLASH_DIR:$FLASH_DIR" \ - -B /opt/cray:/opt/cray \ - -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \ - -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \ - $PROJECT_DIR/containers/llm-lumi-torch23_latest.sif + -B /var/spool/slurmd,/opt/cray/,/usr/lib64/libcxi.so.1,/usr/lib64/libjansson.so.4,/usr/lib64/libjson-c.so.3 \ + $OLMO_CONTAINER diff --git a/scripts/pyspy_all_nodes.sh b/scripts/pyspy_all_nodes.sh index 4bb13ad5d..8efca7798 100755 --- a/scripts/pyspy_all_nodes.sh +++ b/scripts/pyspy_all_nodes.sh @@ -7,8 +7,6 @@ srun --overlap --jobid $1 \ -B"$PROJECT_DIR:$PROJECT_DIR" \ -B"$SCRATCH_DIR:$SCRATCH_DIR" \ -B"$FLASH_DIR:$FLASH_DIR" \ - -B /opt/cray:/opt/cray \ - -B /usr/lib64/libcxi.so.1:/usr/lib64/libcxi.so.1 \ - -B /usr/lib64/libjson-c.so.3:/usr/lib64/libjson-c.so.3 \ - $PROJECT_DIR/containers/llm-lumi_latest.sif \ + -B /var/spool/slurmd,/opt/cray/,/usr/lib64/libcxi.so.1,/usr/lib64/libjansson.so.4,/usr/lib64/libjson-c.so.3 \ + $OLMO_CONTAINER \ bash scripts/pyspy_all_processes.sh | sort -s -t: -k1,1 From d5ca5a6d0c092a2fd620aeb9bac24a94e72b108b Mon Sep 17 00:00:00 2001 From: Shane A Date: Wed, 2 Oct 2024 15:49:42 -0700 Subject: [PATCH 3/3] Update demo script job name --- scripts/lumi/demo.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lumi/demo.sh b/scripts/lumi/demo.sh index 4b203484e..3a876e2ec 100755 --- a/scripts/lumi/demo.sh +++ b/scripts/lumi/demo.sh @@ -3,7 +3,7 @@ # Demo script for running multinode jobs on LUMI. You can run this as a batch job using # sbatch or as part of an interactive session by running this script as an executable. # -#SBATCH --job-name=peteish13-highlr +#SBATCH --job-name=demo #SBATCH --account=project_462000229 #SBATCH --output=/scratch/project_462000229/logs/%j.log #SBATCH --nodes=128 # Total number of nodes