[Manual] Devito on TURSA [A100 GPUs]
Registration and account management are handled through the DIRAC SAFE system; see the DIRAC login page.
# After completing the registration,
# `ssh` to your login node (password only; no keys are used)
ssh <USERNAME>@tursa.dirac.ed.ac.uk
# To quickly see the available versions of any software, remember that you can do:
module avail -t 2>&1 | grep -i <keyword>
# e.g.
module avail -t 2>&1 | grep -i nvidia
# We need to build our own Python on Tursa, since the system default is 3.6.
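A minimal build sketch (assuming an in-tree build of CPython from the python.org tarball; the configure flags are just a suggestion):
wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz
tar xzf Python-3.12.6.tgz
cd Python-3.12.6
./configure --enable-optimizations
make -j 16
cd ..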
# Then add to PATH
cd Python-3.12.6/
export PATH=${PWD}:$PATH
cd ../devito/
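If Devito itself still needs installing, a sketch (assuming a local clone of the Devito repository in this directory):
python -m pip install --upgrade pip
python -m pip install -e .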
# To build mpi4py
module load gcc/9.3.0
module load nvhpc/23.5-nompi
module load openmpi/4.1.5-cuda12.3
module list
# Install mpi4py. It was compiled with this mpicc:
bash-4.4$ which mpicc
/mnt/lustre/tursafs1/apps/basestack/cuda-12.3/openmpi/4.1.5-cuda12.3-slurm/bin/mpicc
CXX=$(which nvc++) CC=$(which nvc) python -m pip install --force-reinstall --no-cache-dir mpi4py
# with the following modules loaded:
bash-4.4$ module list
Currently Loaded Modulefiles:
 1) /mnt/lustre/tursafs1/home/y07/shared/tursa-modules/setup-env   4) ucx/1.15.0-cuda12.3
 2) gcc/9.3.0                                                      5) openmpi/4.1.5-cuda12.3
 3) nvhpc/23.5-nompi
# An earlier attempt, kept for reference:
# MPICC=/mnt/lustre/tursafs1/apps/basestack/cuda-12.3/openmpi/4.1.5-cuda12.3-slurm/bin/mpicc CC=nvc python -m pip install --force-reinstall --no-cache-dir mpi4py
# The CXX=$(which nvc++) CC=$(which nvc) invocation above is what worked!
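A quick sanity check that mpi4py picked up the intended MPI (Get_library_version is a standard mpi4py call):
python -c "from mpi4py import MPI; print(MPI.Get_library_version())"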
# To RUN, we have to remove openmpi and gcc and use the MPI bundled with NVHPC:
module unload openmpi/4.1.5-cuda12.3 gcc/9.3.0
export PATH=/home/y07/shared/utils/core/nvhpc/23.5/Linux_x86_64/23.5/comm_libs/mpi/bin:$PATH
bash-4.4$ module list
Currently Loaded Modulefiles:
 1) /mnt/lustre/tursafs1/home/y07/shared/tursa-modules/setup-env   2) nvhpc/23.5-nompi
# Quick test run with 2 MPI ranks:
srun --nodes=1 --ntasks-per-node=2 --cpus-per-task=16 python examples/seismic/acoustic/acoustic_example.py -d 124 124 124 --tn 1024 -so 8
# Confirm that mpicxx now resolves to the NVHPC compiler:
bash-4.4$ mpicxx --version
nvc++ 23.5-0 64-bit target on x86-64 Linux -tp zen2
NVIDIA Compilers and Tools
Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Requesting an interactive job
salloc --nodes=1 --ntasks-per-node=32 --cpus-per-task=1 --time=01:00:00 --partition=gpu-a100-80 --gres=gpu:2 --qos=dev --account=<budget code> --gpu-freq=1410
salloc --nodes=2 --cpus-per-task=1 --time=01:00:00 --partition=gpu-a100-80 --gres=gpu:4 --qos=dev --account=<budget code> --job-name=dev_job --gpu-freq=1410
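Once the allocation is granted, a quick check that the GPUs are visible (nvidia-smi -L lists the devices on the node the step lands on):
srun --ntasks=1 nvidia-smi -L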
module load gcc/9.3.0
module load nvhpc/23.5-nompi
module load openmpi/4.1.5-cuda12.3
module list
# At runtime, switch to the NVHPC-bundled MPI (see above):
export PATH=/home/y07/shared/utils/core/nvhpc/23.5/Linux_x86_64/23.5/comm_libs/mpi/bin:$PATH
# Example Slurm batch script:
#!/bin/bash
# Slurm job options
#SBATCH --job-name=GPU-1-job
#SBATCH --time=01:00:00
#SBATCH --partition=gpu-a100-80
#SBATCH --qos=standard
# Replace [budget code] below with your budget code (e.g. t01)
#SBATCH --account=dp346
# Request the right number of full nodes (48 cores per node for the A100-80 GPU nodes)
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=48
#SBATCH --cpus-per-task=1
#SBATCH --gres=gpu:1
#SBATCH -o /home/dp346/dp346/dc-bisb2/gpu-jobs/output-1-gpu.%j.out # STDOUT
# Add our Python to PATH
cd /home/dp346/dp346/dc-bisb2/Python-3.12.6/
export PATH=${PWD}:$PATH
cd /home/dp346/dp346/dc-bisb2/devito
# Load the needed modules. WARNING: BUILDING mpi4py requires additional modules (see above)
module load nvhpc/23.5-nompi
export PATH=/home/y07/shared/utils/core/nvhpc/23.5/Linux_x86_64/23.5/comm_libs/mpi/bin:$PATH
mpicxx --version
module list
# Use a custom TMPDIR
export TMPDIR=/home/dp346/dp346/dc-bisb2/devito_temp
# Devito environment
export DEVITO_MPI=1                # enable MPI-parallel execution
export DEVITO_LANGUAGE=openacc     # generate OpenACC code for the GPUs
export DEVITO_LOGGING=DEBUG        # verbose logging
export DEVITO_PROFILING=advanced2  # detailed performance profiling
export DEVITO_PLATFORM=nvidiaX     # target NVIDIA GPUs
export DEVITO_COMPILER=nvc         # use the NVHPC compiler
# We have reserved the full nodes; now distribute the processes as
# required: 4 MPI processes per node, with a stride of 12 cores between
# MPI processes.
# Note: gpu_launch.sh is the wrapper script used for GPU and NIC pinning.
# However, it seemed to cause trouble, at least for openacc.
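For reference, a minimal sketch of what such a pinning wrapper typically does (an assumption for illustration, not the actual contents of Tursa's gpu_launch.sh):
#!/bin/bash
# hypothetical my_gpu_launch.sh: bind each local MPI rank to one GPU
export CUDA_VISIBLE_DEVICES=${SLURM_LOCALID}
exec "$@"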
export DEVITO_SAFE_HALO=1
srun --nodes=1 --ntasks-per-node=4 --cpus-per-task=12 \
--hint=nomultithread --distribution=block:block \
gpu_launch.sh python examples/seismic/acoustic/acoustic_example.py -d 1158 1158 1158 --tn 1024 -so 8
srun --nodes=1 --ntasks-per-node=4 --cpus-per-task=12 \
--hint=nomultithread --distribution=block:block \
gpu_launch.sh python examples/seismic/acoustic/acoustic_example.py -d 1158 1158 1158 --tn 1024 -so 12
export DEVITO_SAFE_HALO=2
srun --nodes=1 --ntasks-per-node=4 --cpus-per-task=12 \
--hint=nomultithread --distribution=block:block \
gpu_launch.sh python examples/seismic/elastic/elastic_example.py -d 832 832 832 --tn 1024 -so 8
srun --nodes=1 --ntasks-per-node=4 --cpus-per-task=12 \
--hint=nomultithread --distribution=block:block \
gpu_launch.sh python examples/seismic/elastic/elastic_example.py -d 832 832 832 --tn 1024 -so 12
export DEVITO_SAFE_HALO=1
srun --nodes=1 --ntasks-per-node=4 --cpus-per-task=12 \
--hint=nomultithread --distribution=block:block \
gpu_launch.sh python examples/seismic/tti/tti_example.py -d 896 896 896 --tn 1024 -so 8
srun --nodes=1 --ntasks-per-node=4 --cpus-per-task=12 \
--hint=nomultithread --distribution=block:block \
gpu_launch.sh python examples/seismic/tti/tti_example.py -d 896 896 896 --tn 1024 -so 12
export DEVITO_SAFE_HALO=2
srun --nodes=1 --ntasks-per-node=4 --cpus-per-task=12 \
--hint=nomultithread --distribution=block:block \
gpu_launch.sh python examples/seismic/viscoelastic/viscoelastic_example.py -d 704 704 704 --tn 1024 -so 8
srun --nodes=1 --ntasks-per-node=4 --cpus-per-task=12 \
--hint=nomultithread --distribution=block:block \
gpu_launch.sh python examples/seismic/viscoelastic/viscoelastic_example.py -d 704 704 704 --tn 1024 -so 12
# Monitor the queue and the GPUs
watch -n 10 'squeue --me'
watch -n 10 'squeue | grep gpu-a100'
watch -n 0.1 'nvidia-smi'
ncu --version
# NVIDIA (R) Nsight Compute Command Line Profiler
# Copyright (c) 2018-2023 NVIDIA Corporation
# Version 2023.1.1.0 (build 32678585) (public-release)
# Profile the acoustic example with Nsight Compute:
srun --nodes=1 --ntasks-per-node=2 --cpus-per-task=8 --hint=nomultithread --distribution=block:block gpu_launch.sh \
  ncu --section "SpeedOfLight" python examples/seismic/acoustic/acoustic_example.py -d 280 158 158 --tn 4 -so 8
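With two ranks, ncu profiles every process; a minimal sketch of a hypothetical helper (profile_rank0.sh, not an existing Tursa script) that restricts profiling to MPI rank 0:
#!/bin/bash
# hypothetical profile_rank0.sh: run ncu on rank 0 only, run the other ranks plain
if [ "${SLURM_PROCID:-0}" -eq 0 ]; then
    exec ncu --section "SpeedOfLight" "$@"
else
    exec "$@"
fi
# usage: srun ... gpu_launch.sh ./profile_rank0.sh python examples/...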