From 2be0e25fb2bdfe264c656fa1b6bb69cff577912b Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 4 Dec 2023 11:56:33 -0800 Subject: [PATCH 1/2] Use single arch and a wrapper for NVIDIA reduction support Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/nvidia/Makefile.share | 5 +-- runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc | 37 ++++++++++++++------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/runtime/src/gpu/nvidia/Makefile.share b/runtime/src/gpu/nvidia/Makefile.share index 2bd1742d7cbc..00b248078877 100644 --- a/runtime/src/gpu/nvidia/Makefile.share +++ b/runtime/src/gpu/nvidia/Makefile.share @@ -28,10 +28,7 @@ GPU_OBJS = $(addprefix $(GPU_OBJDIR)/,$(addsuffix .o,$(basename $(GPU_SRCS)))) # a comma-separated list. RUNTIME_CXXFLAGS += -x cuda -Wno-unknown-cuda-version \ -Xclang -fcuda-allow-variadic-functions \ - --offload-arch=sm_60 \ - --offload-arch=sm_61 \ - --offload-arch=sm_70 \ - --offload-arch=sm_75 + --offload-arch=$(CHPL_MAKE_GPU_ARCH) $(RUNTIME_OBJ_DIR)/gpu-nvidia-reduce.o: gpu-nvidia-reduce.cc \ $(RUNTIME_OBJ_DIR_STAMP) diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc index daae4dce16b6..e52c1cc496c8 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc @@ -31,6 +31,13 @@ // the implementation in the rest of the runtime and the modules more // straightforward. 
#define DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, data_type) \ +static void __cub_val_##cub_kind##_##data_type(void* temp, size_t temp_bytes, \ + data_type* data, data_type* result, \ + int n, CUstream stream) { \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\ + (CUstream)stream); \ +}\ +\ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val, int* idx,\ void* stream) {\ @@ -38,11 +45,10 @@ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ CUDA_CALL(cuMemAlloc(&result, sizeof(data_type))); \ void* temp = NULL; \ size_t temp_bytes = 0; \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\ - (CUstream)stream); \ + __cub_val_##cub_kind##_##data_type(temp, temp_bytes, data, (data_type*)result, n, \ + (CUstream)stream); \ CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\ - (CUstream)stream); \ + __cub_val_##cub_kind##_##data_type(temp, temp_bytes, data, (data_type*)result, n, (CUstream)stream); \ CUDA_CALL(cuMemcpyDtoHAsync(val, result, sizeof(data_type),\ (CUstream)stream)); \ CUDA_CALL(cuMemFree(result)); \ @@ -54,22 +60,29 @@ GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) #undef DEF_ONE_REDUCE_RET_VAL +template <typename T> +using kvp = cub::KeyValuePair<int, T>; + #define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ +\ +static void __cub_val_idx_##cub_kind##_##data_type(void* temp, size_t temp_bytes, data_type* data, \ + kvp<data_type>* result, int n, CUstream stream) { \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp<data_type>*)result, n,\ + (CUstream)stream);\ +} \ +\ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val, int* idx,\ void* stream) {\ - using kvp = cub::KeyValuePair<int, data_type>; \ CUdeviceptr result; \ - CUDA_CALL(cuMemAlloc(&result, sizeof(kvp))); \ + CUDA_CALL(cuMemAlloc(&result, sizeof(kvp<data_type>))); \ void* temp = NULL; \ size_t 
temp_bytes = 0; \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n,\ - (CUstream)stream);\ + __cub_val_idx_##cub_kind##_##data_type(temp, temp_bytes, data, (kvp<data_type>*)result, n, (CUstream)stream);\ CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n,\ - (CUstream)stream);\ - kvp result_host; \ - CUDA_CALL(cuMemcpyDtoHAsync(&result_host, result, sizeof(kvp),\ + __cub_val_idx_##cub_kind##_##data_type(temp, temp_bytes, data, (kvp<data_type>*)result, n, (CUstream)stream);\ + kvp<data_type> result_host; \ + CUDA_CALL(cuMemcpyDtoHAsync(&result_host, result, sizeof(kvp<data_type>),\ (CUstream)stream)); \ *val = result_host.value; \ *idx = result_host.key; \ From 5ddef3e333a2c87a5548286d501435007efe225f Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 4 Dec 2023 12:07:23 -0800 Subject: [PATCH 2/2] Revert reduction implementation changes Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc | 37 +++++++-------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc index e52c1cc496c8..daae4dce16b6 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc @@ -31,13 +31,6 @@ // the implementation in the rest of the runtime and the modules more // straightforward. 
#define DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, data_type) \ -static void __cub_val_##cub_kind##_##data_type(void* temp, size_t temp_bytes, \ - data_type* data, data_type* result, \ - int n, CUstream stream) { \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\ - (CUstream)stream); \ -}\ -\ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val, int* idx,\ void* stream) {\ @@ -45,10 +38,11 @@ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ CUDA_CALL(cuMemAlloc(&result, sizeof(data_type))); \ void* temp = NULL; \ size_t temp_bytes = 0; \ - __cub_val_##cub_kind##_##data_type(temp, temp_bytes, data, (data_type*)result, n, \ - (CUstream)stream); \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\ + (CUstream)stream); \ CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ - __cub_val_##cub_kind##_##data_type(temp, temp_bytes, data, (data_type*)result, n, (CUstream)stream); \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\ + (CUstream)stream); \ CUDA_CALL(cuMemcpyDtoHAsync(val, result, sizeof(data_type),\ (CUstream)stream)); \ CUDA_CALL(cuMemFree(result)); \ @@ -60,29 +54,22 @@ GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) #undef DEF_ONE_REDUCE_RET_VAL -template <typename T> -using kvp = cub::KeyValuePair<int, T>; - #define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ -\ -static void __cub_val_idx_##cub_kind##_##data_type(void* temp, size_t temp_bytes, data_type* data, \ - kvp<data_type>* result, int n, CUstream stream) { \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp<data_type>*)result, n,\ - (CUstream)stream);\ -} \ -\ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val, int* idx,\ void* stream) {\ + using kvp = cub::KeyValuePair<int, data_type>; \ CUdeviceptr result; \ - CUDA_CALL(cuMemAlloc(&result, sizeof(kvp<data_type>))); \ + CUDA_CALL(cuMemAlloc(&result, sizeof(kvp))); \ void* temp = NULL; \ size_t 
temp_bytes = 0; \ - __cub_val_idx_##cub_kind##_##data_type(temp, temp_bytes, data, (kvp<data_type>*)result, n, (CUstream)stream);\ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n,\ + (CUstream)stream);\ CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ - __cub_val_idx_##cub_kind##_##data_type(temp, temp_bytes, data, (kvp<data_type>*)result, n, (CUstream)stream);\ - kvp<data_type> result_host; \ - CUDA_CALL(cuMemcpyDtoHAsync(&result_host, result, sizeof(kvp<data_type>),\ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n,\ + (CUstream)stream);\ + kvp result_host; \ + CUDA_CALL(cuMemcpyDtoHAsync(&result_host, result, sizeof(kvp),\ (CUstream)stream)); \ *val = result_host.value; \ *idx = result_host.key; \