From 2be0e25fb2bdfe264c656fa1b6bb69cff577912b Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 4 Dec 2023 11:56:33 -0800 Subject: [PATCH 1/2] Use single arch and a wrapper for NVIDIA reduction support Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/nvidia/Makefile.share | 5 +-- runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc | 37 ++++++++++++++------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/runtime/src/gpu/nvidia/Makefile.share b/runtime/src/gpu/nvidia/Makefile.share index 2bd1742d7cbc..00b248078877 100644 --- a/runtime/src/gpu/nvidia/Makefile.share +++ b/runtime/src/gpu/nvidia/Makefile.share @@ -28,10 +28,7 @@ GPU_OBJS = $(addprefix $(GPU_OBJDIR)/,$(addsuffix .o,$(basename $(GPU_SRCS)))) # a comma-separated list. RUNTIME_CXXFLAGS += -x cuda -Wno-unknown-cuda-version \ -Xclang -fcuda-allow-variadic-functions \ - --offload-arch=sm_60 \ - --offload-arch=sm_61 \ - --offload-arch=sm_70 \ - --offload-arch=sm_75 + --offload-arch=$(CHPL_MAKE_GPU_ARCH) $(RUNTIME_OBJ_DIR)/gpu-nvidia-reduce.o: gpu-nvidia-reduce.cc \ $(RUNTIME_OBJ_DIR_STAMP) diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc index daae4dce16b6..e52c1cc496c8 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc @@ -31,6 +31,13 @@ // the implementation in the rest of the runtime and the modules more // straightforward. 
#define DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, data_type) \ +static void __cub_val_##cub_kind##_##data_type(void* temp, size_t temp_bytes, \ + data_type* data, data_type* result, \ + int n, CUstream stream) { \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\ + (CUstream)stream); \ +}\ +\ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val, int* idx,\ void* stream) {\ @@ -38,11 +45,10 @@ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ CUDA_CALL(cuMemAlloc(&result, sizeof(data_type))); \ void* temp = NULL; \ size_t temp_bytes = 0; \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\ - (CUstream)stream); \ + __cub_val_##cub_kind##_##data_type(temp, temp_bytes, data, (data_type*)result, n, \ + (CUstream)stream); \ CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\ - (CUstream)stream); \ + __cub_val_##cub_kind##_##data_type(temp, temp_bytes, data, (data_type*)result, n, (CUstream)stream); \ CUDA_CALL(cuMemcpyDtoHAsync(val, result, sizeof(data_type),\ (CUstream)stream)); \ CUDA_CALL(cuMemFree(result)); \ @@ -54,22 +60,29 @@ GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) #undef DEF_ONE_REDUCE_RET_VAL +template <typename T> +using kvp = cub::KeyValuePair<int, T>; + #define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ +\ +static void __cub_val_idx_##cub_kind##_##data_type(void* temp, size_t temp_bytes, data_type* data, \ + kvp<data_type>* result, int n, CUstream stream) { \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp<data_type>*)result, n,\ + (CUstream)stream);\ +} \ +\ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val, int* idx,\ void* stream) {\ - using kvp = cub::KeyValuePair<int, data_type>; \ CUdeviceptr result; \ - CUDA_CALL(cuMemAlloc(&result, sizeof(kvp))); \ + CUDA_CALL(cuMemAlloc(&result, sizeof(kvp<data_type>))); \ void* temp = NULL; \ size_t 
temp_bytes = 0; \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n,\ - (CUstream)stream);\ + __cub_val_idx_##cub_kind##_##data_type(temp, temp_bytes, data, (kvp<data_type>*)result, n, (CUstream)stream);\ CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n,\ - (CUstream)stream);\ - kvp result_host; \ - CUDA_CALL(cuMemcpyDtoHAsync(&result_host, result, sizeof(kvp),\ + __cub_val_idx_##cub_kind##_##data_type(temp, temp_bytes, data, (kvp<data_type>*)result, n, (CUstream)stream);\ + kvp<data_type> result_host; \ + CUDA_CALL(cuMemcpyDtoHAsync(&result_host, result, sizeof(kvp<data_type>),\ (CUstream)stream)); \ *val = result_host.value; \ *idx = result_host.key; \ From 5ddef3e333a2c87a5548286d501435007efe225f Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 4 Dec 2023 12:07:23 -0800 Subject: [PATCH 2/2] Revert reduction implementation changes Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc | 37 +++++++-------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc index e52c1cc496c8..daae4dce16b6 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc @@ -31,13 +31,6 @@ // the implementation in the rest of the runtime and the modules more // straightforward. 
#define DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, data_type) \ -static void __cub_val_##cub_kind##_##data_type(void* temp, size_t temp_bytes, \ - data_type* data, data_type* result, \ - int n, CUstream stream) { \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\ - (CUstream)stream); \ -}\ -\ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val, int* idx,\ void* stream) {\ @@ -45,10 +38,11 @@ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ CUDA_CALL(cuMemAlloc(&result, sizeof(data_type))); \ void* temp = NULL; \ size_t temp_bytes = 0; \ - __cub_val_##cub_kind##_##data_type(temp, temp_bytes, data, (data_type*)result, n, \ - (CUstream)stream); \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\ + (CUstream)stream); \ CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ - __cub_val_##cub_kind##_##data_type(temp, temp_bytes, data, (data_type*)result, n, (CUstream)stream); \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\ + (CUstream)stream); \ CUDA_CALL(cuMemcpyDtoHAsync(val, result, sizeof(data_type),\ (CUstream)stream)); \ CUDA_CALL(cuMemFree(result)); \ @@ -60,29 +54,22 @@ GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) #undef DEF_ONE_REDUCE_RET_VAL -template <typename T> -using kvp = cub::KeyValuePair<int, T>; - #define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ -\ -static void __cub_val_idx_##cub_kind##_##data_type(void* temp, size_t temp_bytes, data_type* data, \ - kvp<data_type>* result, int n, CUstream stream) { \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp<data_type>*)result, n,\ - (CUstream)stream);\ -} \ -\ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val, int* idx,\ void* stream) {\ + using kvp = cub::KeyValuePair<int, data_type>; \ CUdeviceptr result; \ - CUDA_CALL(cuMemAlloc(&result, sizeof(kvp<data_type>))); \ + CUDA_CALL(cuMemAlloc(&result, sizeof(kvp))); \ void* temp = NULL; \ size_t 
temp_bytes = 0; \ - __cub_val_idx_##cub_kind##_##data_type(temp, temp_bytes, data, (kvp<data_type>*)result, n, (CUstream)stream);\ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n,\ + (CUstream)stream);\ CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ - __cub_val_idx_##cub_kind##_##data_type(temp, temp_bytes, data, (kvp<data_type>*)result, n, (CUstream)stream);\ - kvp<data_type> result_host; \ - CUDA_CALL(cuMemcpyDtoHAsync(&result_host, result, sizeof(kvp<data_type>),\ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n,\ + (CUstream)stream);\ + kvp result_host; \ + CUDA_CALL(cuMemcpyDtoHAsync(&result_host, result, sizeof(kvp),\ (CUstream)stream)); \ *val = result_host.value; \ *idx = result_host.key; \