diff --git a/env-files/torch/horovod-deepspeed.slurm b/env-files/torch/horovod-deepspeed.slurm
new file mode 100644
index 00000000..be126f7e
--- /dev/null
+++ b/env-files/torch/horovod-deepspeed.slurm
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Job configuration
+#SBATCH --job-name=setup_venv
+#SBATCH --account=intertwin
+#SBATCH --output=horovod_ds_installation.out
+#SBATCH --error=horovod_ds_installation.err
+#SBATCH --time=00:30:00
+
+# Resources allocation
+#SBATCH --partition=develbooster
+#SBATCH --nodes=1
+#SBATCH --gres=gpu
+
+ml --force purge
+ml Stages/2024 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA
+ml Python/3.11.3 CMake HDF5 PnetCDF libaio mpi4py git Clang
+
+source .venv/bin/activate
+bash install-horovod-deepspeed-cuda.sh
diff --git a/env-files/torch/install-horovod-deepspeed-cuda.sh b/env-files/torch/install-horovod-deepspeed-cuda.sh
index 4709763d..e0fb2bed 100755
--- a/env-files/torch/install-horovod-deepspeed-cuda.sh
+++ b/env-files/torch/install-horovod-deepspeed-cuda.sh
@@ -1,23 +1,3 @@
-#!/bin/bash
-
-# Job configuration
-#SBATCH --job-name=setup_venv
-#SBATCH --account=intertwin
-#SBATCH --output=horovod_ds_installation.out
-#SBATCH --error=horovod_ds_installation.err
-#SBATCH --time=00:30:00
-
-# Resources allocation
-#SBATCH --partition=develbooster
-#SBATCH --nodes=1
-#SBATCH --gres=gpu
-
-ml --force purge
-ml Stages/2024 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA
-ml Python/3.11.3 CMake HDF5 PnetCDF libaio mpi4py git Clang
-
-source .venv/bin/activate
-
 # DeepSpeed variables
 export DS_BUILD_CCL_COMM=1
 export DS_BUILD_UTILS=1
@@ -28,6 +8,7 @@ export DS_BUILD_TRANSFORMER=1
 export DS_BUILD_STOCHASTIC_TRANSFORMER=1
 export DS_BUILD_TRANSFORMER_INFERENCE=1
 
+# We do --no-cache-dir because the .cache dir eats our HPC quota :(
 pip install --no-cache-dir --no-build-isolation "deepspeed==0.15.*"
 
 # Horovod variables