From f0089839b68a0794338505cf92273ce7217f0e2b Mon Sep 17 00:00:00 2001 From: Shreyas Khandekar <60454060+ShreyasKhandekar@users.noreply.github.com> Date: Tue, 8 Oct 2024 18:21:43 -0500 Subject: [PATCH] Add new configs for gpu on hpe cray ex This adds the following new configuration scripts for testing GPUs on HPE Cray EX systems: - `test-gpu-ex-cpu` (analogous to `test-gpu-cpu` for Cray CS) - `test-gpu-ex-cuda-12.interop` (analogous to `test-gpu-cuda.interop` for Cray CS) - `test-gpu-ex-cuda-12.specialization` (analogous to `test-gpu-cuda.specialization` for Cray CS) - `test-perf.gpu-ex-cuda-12.um` (analogous to `test-perf.gpu-cuda.um` for Cray CS) Signed-off-by: Shreyas Khandekar <60454060+ShreyasKhandekar@users.noreply.github.com> --- util/cron/test-gpu-ex-cpu.bash | 14 +++++++++ util/cron/test-gpu-ex-cuda-12.bash | 1 + util/cron/test-gpu-ex-cuda-12.interop.bash | 27 +++++++++++++++++ .../test-gpu-ex-cuda-12.specialization.bash | 21 +++++++++++++ util/cron/test-perf.gpu-ex-cuda-12.um.bash | 30 +++++++++++++++++++ 5 files changed, 93 insertions(+) create mode 100755 util/cron/test-gpu-ex-cpu.bash create mode 100755 util/cron/test-gpu-ex-cuda-12.interop.bash create mode 100755 util/cron/test-gpu-ex-cuda-12.specialization.bash create mode 100755 util/cron/test-perf.gpu-ex-cuda-12.um.bash diff --git a/util/cron/test-gpu-ex-cpu.bash b/util/cron/test-gpu-ex-cpu.bash new file mode 100755 index 000000000000..d5956f5d09af --- /dev/null +++ b/util/cron/test-gpu-ex-cpu.bash @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +# +# GPU native testing on a Cray EX (using none for CHPL_COMM) + +CWD=$(cd $(dirname ${BASH_SOURCE[0]}) ; pwd) +source $CWD/common-native-gpu.bash +source $CWD/common-hpe-cray-ex.bash + +export CHPL_GPU=cpu +export CHPL_COMM=none +export CHPL_GPU_NO_CPU_MODE_WARNING=y + +export CHPL_NIGHTLY_TEST_CONFIG_NAME="gpu-ex-cpu" +$CWD/nightly -cron ${nightly_args} diff --git a/util/cron/test-gpu-ex-cuda-12.bash b/util/cron/test-gpu-ex-cuda-12.bash index 
69a18fe76b4a..ff13152932df 100755 --- a/util/cron/test-gpu-ex-cuda-12.bash +++ b/util/cron/test-gpu-ex-cuda-12.bash @@ -12,6 +12,7 @@ export CHPL_LLVM=bundled # CUDA 12 is only supported with bundled LLVM export CHPL_COMM=none export CHPL_LOCALE_MODEL=gpu export CHPL_LAUNCHER_PARTITION=allgriz +export CHPL_TEST_GPU=true export CHPL_GPU=nvidia # amd is also detected automatically export CHPL_NIGHTLY_TEST_CONFIG_NAME="gpu-ex-cuda-12" diff --git a/util/cron/test-gpu-ex-cuda-12.interop.bash b/util/cron/test-gpu-ex-cuda-12.interop.bash new file mode 100755 index 000000000000..974132a6c3bd --- /dev/null +++ b/util/cron/test-gpu-ex-cuda-12.interop.bash @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# +# GPU native testing on a Cray EX (using none for CHPL_COMM) + +CWD=$(cd $(dirname ${BASH_SOURCE[0]}) ; pwd) +source $CWD/common.bash +source $CWD/common-hpe-cray-ex.bash + + +# We need 12.4 for the stream test because the CUDA driver on pinoak +# only supports PTX for 12.4. Until the driver is updated, we need to +# stick with 12.4 instead of 12.5. +module load cuda/12.4 # default is CUDA 12 + +# We need cublas for the cublas interop test, but we are using 12.4 above and +# pinoak doesn't have the cublas library for 12.4, so we need to use the cublas +# from 12.5 (which is compatible across minor versions). +# This can be removed once we use CUDA 12.5. +export CHPL_LIB_PATH="/opt/nvidia/hpc_sdk/Linux_x86_64/24.7/math_libs/lib64" + +export CHPL_LLVM=bundled # CUDA 12 is only supported with bundled LLVM +export CHPL_TEST_GPU=true +export CHPL_LAUNCHER_PARTITION=allgriz +export CHPL_NIGHTLY_TEST_DIRS="gpu/interop/" + +export CHPL_NIGHTLY_TEST_CONFIG_NAME="gpu-ex-cuda-12.interop" +$CWD/nightly -cron ${nightly_args} diff --git a/util/cron/test-gpu-ex-cuda-12.specialization.bash b/util/cron/test-gpu-ex-cuda-12.specialization.bash new file mode 100755 index 000000000000..5228ca3f7a5b --- /dev/null +++ b/util/cron/test-gpu-ex-cuda-12.specialization.bash @@ -0,0 +1,21 @@ +#!/usr/bin/env 
bash +# +# GPU native testing on a Cray EX (using none for CHPL_COMM) + +CWD=$(cd $(dirname ${BASH_SOURCE[0]}) ; pwd) +source $CWD/common-native-gpu.bash +source $CWD/common-hpe-cray-ex.bash + +module load cudatoolkit # default is CUDA 12 + +export CHPL_LLVM=bundled # CUDA 12 is only supported with bundled LLVM +export CHPL_COMM=none +export CHPL_LOCALE_MODEL=gpu +export CHPL_LAUNCHER_PARTITION=allgriz +export CHPL_TEST_GPU=true +export CHPL_GPU=nvidia # amd is also detected automatically + +export CHPL_GPU_SPECIALIZATION=y + +export CHPL_NIGHTLY_TEST_CONFIG_NAME="gpu-ex-cuda-12.specialization" +$CWD/nightly -cron ${nightly_args} diff --git a/util/cron/test-perf.gpu-ex-cuda-12.um.bash b/util/cron/test-perf.gpu-ex-cuda-12.um.bash new file mode 100755 index 000000000000..75f5c38c52f8 --- /dev/null +++ b/util/cron/test-perf.gpu-ex-cuda-12.um.bash @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# +# Run GPU performance testing on a Cray EX + +CWD=$(cd $(dirname $0) ; pwd) +source $CWD/common-native-gpu.bash +source $CWD/common-hpe-cray-ex.bash + +module load cudatoolkit # default is CUDA 12 + +export CHPL_LLVM=bundled # CUDA 12 is only supported with bundled LLVM +export CHPL_COMM=none +export CHPL_LOCALE_MODEL=gpu +export CHPL_LAUNCHER_PARTITION=allgriz +export CHPL_GPU=nvidia # amd is detected automatically +export CHPL_GPU_MEM_STRATEGY=unified_memory + +export CHPL_NIGHTLY_TEST_CONFIG_NAME="perf.gpu-ex-cuda-12.um" + +export CHPL_TEST_PERF_CONFIG_NAME="1-node-a100" # pinoak has ampere GPUs +source $CWD/common-native-gpu-perf.bash +# make sure this comes after setting SUBDIR (set by native-gpu-perf) and +# CONFIG_NAME +source $CWD/common-perf.bash + +SHORT_NAME=um +nightly_args="${nightly_args} -performance-description $SHORT_NAME -performance-configs default:v,$SHORT_NAME:v -sync-dir-suffix $SHORT_NAME" +nightly_args="${nightly_args} -startdate 10/10/24" + +$CWD/nightly -cron ${nightly_args}