Allreduce perf #3

Open: wants to merge 10 commits into main
94 changes: 94 additions & 0 deletions allreduce-perf/common/launch_wrapper
@@ -0,0 +1,94 @@
#!/bin/bash

# get the current process ID
pid=$$

# retrieve the CPU affinity mask in hexadecimal format
affinity_mask=$(taskset -p $pid | awk -F': ' '{print $2}')

# get numerical lists (decimal) of the NUMA nodes and cores in the affinity mask
numa_list=$(hwloc-calc --physical --intersect NUMAnode 0x$affinity_mask)
cpu_list=$(hwloc-calc --physical --intersect core 0x$affinity_mask)
cpu_list_short=$(taskset -pc $pid | awk -F': ' '{print $2}')

# count the number of NUMA nodes / CPUs
IFS=',' read -r -a numa_array <<< "$numa_list"
numa_count=${#numa_array[@]}
IFS=',' read -r -a cpu_array <<< "$cpu_list"
cpu_count=${#cpu_array[@]}

# each GPU is associated with a NUMA node:
gpu_list=$numa_list
gpu_count=$numa_count

# make devices visible: only export devices for which the current process has affinity
export CUDA_VISIBLE_DEVICES=$gpu_list
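# example (illustrative, based on the sample output further down): rank 0 bound
# to cores 0-71 intersects NUMA node 0 only, so it gets CUDA_VISIBLE_DEVICES=0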

# export the NIC - doesn't work with PyTorch
#IFS=',' read -r first_node other_nodes <<< "$numa_list"
#first_nic="cxi${first_node}"
#export FI_CXI_DEVICE_NAME=$first_nic

# get local and global ranks from either Slurm or OMPI environment variables
lrank=0
grank=0
if [ -z "${OMPI_COMM_WORLD_LOCAL_RANK+x}" ]
then
    let lrank=$SLURM_LOCALID
    let grank=$SLURM_PROCID

    # MPICH options
    # required for CUDA-aware MPI to work
    export MPICH_GPU_SUPPORT_ENABLED=1
    # MPICH_GPU_SUPPORT_ENABLED=1 and MPICH_SMP_SINGLE_COPY_MODE=XPMEM are
    # mutually exclusive, and MPICH will fall back to CMA if GPU support is
    # enabled.
    #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
    export MPICH_SMP_SINGLE_COPY_MODE=CMA
else
    let lrank=$OMPI_COMM_WORLD_LOCAL_RANK
    let grank=$OMPI_COMM_WORLD_RANK

    # Open MPI options
    #export OMPI_MCA_btl_ofi_mode=2
    #export OMPI_MCA_pml_ob1_max_rdma_per_request=1
fi

# print info about distribution of jobs
if [[ $grank == 0 ]]
then
    echo "Slurm Job Hostlist: $SLURM_JOB_NODELIST"
fi
echo "Hostname: $(hostname) Rank: $grank, Local $lrank, GPUs $gpu_list (count=$gpu_count), CPUs $cpu_list_short (count=$cpu_count)"

# export variables for pytorch
export LOCAL_RANK=${lrank}
export RANK=${grank}
export MASTER_ADDR=$(scontrol show hostname $SLURM_NODELIST | head -n1)
export MASTER_PORT=29500 # default from torch launcher
export WORLD_SIZE=$SLURM_NTASKS
export TORCH_CUDA_ARCH_LIST=9.0
export MAX_JOBS=$cpu_count
export CXX=$(which g++)
export CC=$(which gcc)

# would make the code print a stack trace in case of a crash - only works when compiled with the -traceback compiler option
#export NVCOMPILER_TERM=trace

## OpenACC options
## makes small H2D copies faster
#export NVCOMPILER_ACC_DEFER_UPLOADS=1
#export NVCOMPILER_ACC_SYNCHRONOUS=1
#export NVCOMPILER_ACC_USE_GRAPH=1
#export NV_ACC_CUDA_MEMALLOCASYNC=1
#export NV_ACC_CUDA_MEMALLOCASYNC_POOLSIZE=500000000000

## is required to avoid hangs at scale
#export FI_MR_CACHE_MONITOR=disabled
#export FI_CXI_SAFE_DEVMEM_COPY_THRESHOLD=0
#
## speeds up (or rather recovers good performance of) GPU-direct communication over the network. Without it, MPI takes 3-5x longer when G2G is enabled
#export FI_CXI_RX_MATCH_MODE=software

# run the command
"$@"
14 changes: 14 additions & 0 deletions allreduce-perf/mpi-cpp/env/README
@@ -0,0 +1,14 @@


Build instructions for uenv

Run on a compute node:
======================

/path/to/stackinator/bin/stack-config -c /path/to/build-cache-config.yaml -r ./recipe -b /dev/shm/<USERNAME>/allreduce -s /path/to/alps-cluster-config/santis -m /user-environment --develop

cd /dev/shm/<USERNAME>/allreduce

env --ignore-environment PATH=/usr/bin:/bin:$(pwd)/spack/bin HOME=$HOME https_proxy=$https_proxy http_proxy=$http_proxy no_proxy="$no_proxy" make store.squashfs -j288

cp store.squashfs /this/path/
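
To use the resulting image, mount it at /user-environment and activate the default view (a sketch assuming the uenv CLI is available; the activation step matches the sample job output further down):

uenv start ./store.squashfs
source /user-environment/env/default/activate.sh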
5 changes: 5 additions & 0 deletions allreduce-perf/mpi-cpp/env/recipe/compilers.yaml
@@ -0,0 +1,5 @@
bootstrap:
  spec: gcc@<version>
gcc:
  specs:
  - gcc@<version>
7 changes: 7 additions & 0 deletions allreduce-perf/mpi-cpp/env/recipe/config.yaml
@@ -0,0 +1,7 @@
name: test_mpi
spack:
  commit: origin/develop
  repo: https://github.com/spack/spack.git
store: /user-environment
description: Test MPI in CMake

26 changes: 26 additions & 0 deletions allreduce-perf/mpi-cpp/env/recipe/environments.yaml
@@ -0,0 +1,26 @@
test_mpi-env:
  compiler:
  - toolchain: gcc
    spec: gcc
  mpi:
    spec: cray-mpich@<version>
    gpu: cuda
  unify: true
  specs:
  - gcc
  - cray-mpich
  - aws-ofi-nccl@master
  - xpmem
  - cmake
  - cuda@<version>
  - ninja
  - fmt
  - osu-micro-benchmarks
  variants:
  - +mpi
  - +cuda
  - cuda_arch=90
  views:
    default:
      link: roots
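
Since the recipe builds osu-micro-benchmarks with +cuda, an independent allreduce cross-check can be run through the same launch wrapper (a sketch; -d cuda assumes a CUDA-enabled OSU build):

srun -N 1 --ntasks-per-node=4 ./launch_wrapper osu_allreduce -d cuda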

157 changes: 157 additions & 0 deletions allreduce-perf/mpi-cpp/run/data/job_n_00004_N_0001_TPN_4.out
@@ -0,0 +1,157 @@
All nodes and tasks are healthy.
All ranks passed the checks.
loading the view test_mpi:default with activation script /user-environment/env/default/activate.sh
Hostname: nid005066 Rank: 3, Local 3, GPUs 3 (count=1), CPUs 216-287 (count=72)
Hostname: nid005066 Rank: 1, Local 1, GPUs 1 (count=1), CPUs 72-143 (count=72)
Hostname: nid005066 Rank: 2, Local 2, GPUs 2 (count=1), CPUs 144-215 (count=72)
Slurm Job Hostlist: nid005066
Hostname: nid005066 Rank: 0, Local 0, GPUs 0 (count=1), CPUs 0-71 (count=72)
Found 1 devices for local rank 0 (4) -> setting device = 0
Found 1 devices for local rank 1 (4) -> setting device = 0
Found 1 devices for local rank 2 (4) -> setting device = 0
Found 1 devices for local rank 3 (4) -> setting device = 0
mem_type: device
size: 1073741824
padding: 0
naive algo: 0

[-1] time: 0.349024
[0:0] time: 0.104134 bw: 61.8672
[0:1] time: 0.0984388 bw: 65.4462
[0:2] time: 0.0983045 bw: 65.5357
[0:3] time: 0.0981081 bw: 65.6669
[0:4] time: 0.0982657 bw: 65.5616
[1:0] time: 0.104553 bw: 61.6189
[1:1] time: 0.0985256 bw: 65.3886
[1:2] time: 0.0986397 bw: 65.313
[1:3] time: 0.0987565 bw: 65.2357
[1:4] time: 0.098472 bw: 65.4242
[2:0] time: 0.106711 bw: 60.3728
[2:1] time: 0.0985837 bw: 65.3501
[2:2] time: 0.0984828 bw: 65.417
[2:3] time: 0.0985135 bw: 65.3966
[2:4] time: 0.0985541 bw: 65.3697
[3:0] time: 0.104247 bw: 61.8001
[3:1] time: 0.0986414 bw: 65.3118
[3:2] time: 0.0987943 bw: 65.2108
[3:3] time: 0.0985138 bw: 65.3964
[3:4] time: 0.0983782 bw: 65.4866
[4:0] time: 0.104339 bw: 61.7454
[4:1] time: 0.0984836 bw: 65.4165
[4:2] time: 0.0982274 bw: 65.5871
[4:3] time: 0.0984484 bw: 65.4399
[4:4] time: 0.0982334 bw: 65.5831
[5:0] time: 0.104839 bw: 61.4511
[5:1] time: 0.0989071 bw: 65.1364
[5:2] time: 0.0991509 bw: 64.9762
[5:3] time: 0.0988247 bw: 65.1907
[5:4] time: 0.0984352 bw: 65.4487
[6:0] time: 0.10503 bw: 61.3394
[6:1] time: 0.0986091 bw: 65.3332
[6:2] time: 0.098447 bw: 65.4408
[6:3] time: 0.0987215 bw: 65.2588
[6:4] time: 0.09866 bw: 65.2995
[7:0] time: 0.104397 bw: 61.7113
[7:1] time: 0.0985353 bw: 65.3821
[7:2] time: 0.0983509 bw: 65.5048
[7:3] time: 0.0984036 bw: 65.4697
[7:4] time: 0.0985731 bw: 65.3571
[8:0] time: 0.104087 bw: 61.8951
[8:1] time: 0.0982648 bw: 65.5621
[8:2] time: 0.0981557 bw: 65.635
[8:3] time: 0.0981288 bw: 65.653
[8:4] time: 0.0985666 bw: 65.3614
[9:0] time: 0.104625 bw: 61.5768
[9:1] time: 0.0988162 bw: 65.1963
[9:2] time: 0.098727 bw: 65.2552
[9:3] time: 0.0985673 bw: 65.3609
[9:4] time: 0.0987365 bw: 65.2489

=======================================
type: device
size: 4294967296
pad: 0
naive: 0
time: 0.0985237
time0: 0.104696
bw: 65.3902
bw0: 61.5378
=======================================

Hostname: nid005066 Rank: 3, Local 3, GPUs 3 (count=1), CPUs 216-287 (count=72)
Hostname: nid005066 Rank: 1, Local 1, GPUs 1 (count=1), CPUs 72-143 (count=72)
Hostname: nid005066 Rank: 2, Local 2, GPUs 2 (count=1), CPUs 144-215 (count=72)
Slurm Job Hostlist: nid005066
Hostname: nid005066 Rank: 0, Local 0, GPUs 0 (count=1), CPUs 0-71 (count=72)
Found 1 devices for local rank 0 (4) -> setting device = 0
Found 1 devices for local rank 1 (4) -> setting device = 0
Found 1 devices for local rank 2 (4) -> setting device = 0
Found 1 devices for local rank 3 (4) -> setting device = 0
mem_type: device
size: 1073741824
padding: 1
naive algo: 0

[-1] time: 14.8462
[0:0] time: 14.8938 bw: 0.43256
[0:1] time: 0.0980822 bw: 65.6842
[0:2] time: 0.0982305 bw: 65.5851
[0:3] time: 0.0982166 bw: 65.5943
[0:4] time: 0.0980554 bw: 65.7022
[1:0] time: 14.8596 bw: 0.433556
[1:1] time: 0.0981696 bw: 65.6257
[1:2] time: 0.0982016 bw: 65.6043
[1:3] time: 0.0982873 bw: 65.5471
[1:4] time: 0.0981629 bw: 65.6302
[2:0] time: 14.8851 bw: 0.432811
[2:1] time: 0.0981743 bw: 65.6226
[2:2] time: 0.0982659 bw: 65.5614
[2:3] time: 0.0981308 bw: 65.6516
[2:4] time: 0.0982912 bw: 65.5445
[3:0] time: 14.8734 bw: 0.433154
[3:1] time: 0.0982415 bw: 65.5777
[3:2] time: 0.0983114 bw: 65.5311
[3:3] time: 0.0983579 bw: 65.5001
[3:4] time: 0.0984724 bw: 65.4239
[4:0] time: 14.8678 bw: 0.433314
[4:1] time: 0.0982261 bw: 65.588
[4:2] time: 0.0981781 bw: 65.62
[4:3] time: 0.0981592 bw: 65.6327
[4:4] time: 0.0981481 bw: 65.6401
[5:0] time: 14.8587 bw: 0.433582
[5:1] time: 0.0982778 bw: 65.5535
[5:2] time: 0.0982901 bw: 65.5453
[5:3] time: 0.0982992 bw: 65.5392
[5:4] time: 0.0983163 bw: 65.5278
[6:0] time: 14.8971 bw: 0.432465
[6:1] time: 0.0984986 bw: 65.4065
[6:2] time: 0.0984132 bw: 65.4633
[6:3] time: 0.0984688 bw: 65.4263
[6:4] time: 0.0982974 bw: 65.5404
[7:0] time: 14.8828 bw: 0.432879
[7:1] time: 0.0983158 bw: 65.5282
[7:2] time: 0.0984325 bw: 65.4505
[7:3] time: 0.0986286 bw: 65.3203
[7:4] time: 0.0985999 bw: 65.3394
[8:0] time: 14.8631 bw: 0.433452
[8:1] time: 0.098213 bw: 65.5968
[8:2] time: 0.0982337 bw: 65.5829
[8:3] time: 0.0983646 bw: 65.4956
[8:4] time: 0.0982486 bw: 65.573
[9:0] time: 14.9171 bw: 0.431884
[9:1] time: 0.0984047 bw: 65.4689
[9:2] time: 0.0981122 bw: 65.6641
[9:3] time: 0.0980827 bw: 65.6839
[9:4] time: 0.0983252 bw: 65.5219

=======================================
type: device
size: 4294967296
pad: 4
naive: 0
time: 0.0982796
time0: 14.8798
bw: 65.5524
bw0: 0.432966
=======================================
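
The bw column is consistent with the standard allreduce bus-bandwidth convention, bw = 2*(n-1)/n * size / time in GB/s (an inference from the numbers above, with n = 4 ranks). A quick check against the padded-run summary:

awk 'BEGIN { n = 4; size = 4294967296; t = 0.0982796; print 2*(n-1)/n * size/t / 1e9 }'
# prints ~65.55, matching "bw: 65.5524" above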
