From 2b7c65f9e445ba5415c981a4546378f5486febef Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Tue, 13 Oct 2020 04:49:46 -0400 Subject: [PATCH 01/41] configure: Add the --with-hip option Signed-off-by: Aurelien Bouteiller --- configure | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/configure b/configure index dfeae934..7c61564d 100755 --- a/configure +++ b/configure @@ -132,6 +132,9 @@ cat <&2 "Python is required. Please provide a path to the python executable."; exit 3;; From d9d8a18ddc64dff65cf71902d8ce5cb841ab0466 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Mon, 19 Oct 2020 15:51:12 -0400 Subject: [PATCH 02/41] hip: Configury Signed-off-by: Aurelien Bouteiller --- CMakeLists.txt | 4 ++++ src/include/dplasma/config.h.in | 1 + tests/Testings.cmake | 41 +++++++++++++++++++++++---------- tests/common.c | 3 ++- 4 files changed, 36 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 606c3089..d16829ff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -238,6 +238,10 @@ if(DPLASMA_HAVE_CUDA) find_package(CUDAToolkit REQUIRED) endif(NOT TARGET CUDA::cusolver) endif() +option(DPLASMA_HAVE_HIP "Use HIP to accelerate DPLASMA routines" ${PARSEC_HAVE_HIP}) +if(DPLASMA_HAVE_HIP) + message(STATUS "HIP support for DPLASMA enabled") +endif() ############################################################################ # CTest system diff --git a/src/include/dplasma/config.h.in b/src/include/dplasma/config.h.in index 1b180b84..b72b34de 100644 --- a/src/include/dplasma/config.h.in +++ b/src/include/dplasma/config.h.in @@ -6,6 +6,7 @@ /* GPU Backends */ #cmakedefine DPLASMA_HAVE_CUDA +#cmakedefine DPLASMA_HAVE_HIP /* system feature tests */ #cmakedefine DPLASMA_HAVE_COMPLEX_H diff --git a/tests/Testings.cmake b/tests/Testings.cmake index fd1e8c79..dd0e9b83 100644 --- a/tests/Testings.cmake +++ b/tests/Testings.cmake @@ -143,14 +143,24 @@ foreach(prec ${DPLASMA_PRECISIONS} ) # GPU tests if 
(DPLASMA_HAVE_CUDA) - dplasma_add_test(potrf potrf 1gpu_shm -N 3200 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 4096) - dplasma_add_test(potrf potrf 1gpu_lowmem_shm -N 3200 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 21) - dplasma_add_test(potrf potrf 1gpu_~knb_shm -N 1700 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 4096) - dplasma_add_test(potrf potrf 2gpu_shm -N 4600 -t 320 ${OPTIONS} -g 2 -- --mca device_cuda_memory_number_of_blocks 4096) - dplasma_add_test(gemm gemm 1gpu_shm -N 1280 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 4096) - dplasma_add_test(gemm gemm 1gpu_~knb_shm -N 1000 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 4096) - dplasma_add_test(gemm gemm 2gpu_shm -N 1940 -t 320 ${OPTIONS} -g 2 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(potrf potrf 1gpu_cuda_shm -N 3200 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(potrf potrf 1gpu_cuda_lowmem_shm -N 3200 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 21) + dplasma_add_test(potrf potrf 1gpu_cuda_~knb_shm -N 1700 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(potrf potrf 2gpu_cuda_shm -N 4600 -t 320 ${OPTIONS} -g 2 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 1gpu_cuda_shm -N 1280 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 1gpu_cuda_~knb_shm -N 1000 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 2gpu_cuda_shm -N 1940 -t 320 ${OPTIONS} -g 2 -- --mca device_cuda_memory_number_of_blocks 4096) endif (DPLASMA_HAVE_CUDA) + if (DPLASMA_HAVE_HIP) + dplasma_add_test(potrf potrf 1gpu_hip_shm -N 3200 -t 320 ${OPTIONS} -g 1 -- --mca device_hip_memory_number_of_blocks 4096) + dplasma_add_test(potrf potrf 
1gpu_hip_lowmem_shm -N 3200 -t 320 ${OPTIONS} -g 1 -- --mca device_hip_memory_number_of_blocks 21) + dplasma_add_test(potrf potrf 1gpu_hip_~knb_shm -N 1700 -t 320 ${OPTIONS} -g 1 -- --mca device_hip_memory_number_of_blocks 4096) + dplasma_add_test(potrf potrf 2gpu_hip_shm -N 4600 -t 320 ${OPTIONS} -g 2 -- --mca device_hip_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 1gpu_hip_shm -N 1280 -t 320 ${OPTIONS} -g 1 -- --mca device_hip_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 1gpu_hip_~knb_shm -N 1000 -t 320 ${OPTIONS} -g 1 -- --mca device_hip_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 2gpu_hip_shm -N 1940 -t 320 ${OPTIONS} -g 2 -- --mca device_hip_memory_number_of_blocks 4096) + endif (DPLASMA_HAVE_HIP) + # if ( ${prec} STREQUAL "c" OR ${prec} STREQUAL "z" ) # dplasma_add_test(heev "" ${PTG2DTD}_shm -N 4000 ${OPTIONS}) @@ -250,12 +260,19 @@ if( MPI_C_FOUND ) # GPU Cholesky tests if (DPLASMA_HAVE_CUDA AND MPI_C_FOUND) - dplasma_add_test(potrf potrf 1gpu_mpi:${PROCS} -N 3200 -t 320 ${OPTIONS} -g 1 -P 2 -- --mca device_cuda_memory_number_of_blocks 4096) - dplasma_add_test(potrf potrf 1gpu_~knb_mpi:${PROCS} -N 1700 -t 320 ${OPTIONS} -g 1 -P 2 -- --mca device_cuda_memory_number_of_blocks 4096) - dplasma_add_test(potrf potrf_1gpu 2gpu_mpi:${PROCS} -N 4600 -t 320 ${OPTIONS} -g 2 -P 2 -- --mca device_cuda_memory_number_of_blocks 4096) - dplasma_add_test(gemm gemm 2gpu_mpi:${PROCS} -N 1940 -t 320 ${OPTIONS} -g 2 -P 2 -- --mca device_cuda_memory_number_of_blocks 4096) - dplasma_add_test(gemm gemm 2gpu_lowmem_mpi:${PROCS} -N 1940 -t 320 ${OPTIONS} -g 2 -P 2 -- --mca device_cuda_memory_number_of_blocks 21) + dplasma_add_test(potrf potrf 1gpu_cuda_mpi:${PROCS} -N 3200 -t 320 ${OPTIONS} -g 1 -P 2 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(potrf potrf 1gpu_cuda_~knb_mpi:${PROCS} -N 1700 -t 320 ${OPTIONS} -g 1 -P 2 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(potrf potrf_1gpu 
2gpu_cuda_mpi:${PROCS} -N 4600 -t 320 ${OPTIONS} -g 2 -P 2 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 2gpu_cuda_mpi:${PROCS} -N 1940 -t 320 ${OPTIONS} -g 2 -P 2 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 2gpu_cuda_lowmem_mpi:${PROCS} -N 1940 -t 320 ${OPTIONS} -g 2 -P 2 -- --mca device_cuda_memory_number_of_blocks 21) endif (DPLASMA_HAVE_CUDA AND MPI_C_FOUND) + if (DPLASMA_HAVE_HIP AND MPI_C_FOUND) + dplasma_add_test(potrf potrf 1gpu_hip_mpi:${PROCS} -N 3200 -t 320 ${OPTIONS} -g 1 -P 2 -- --mca device_hip_memory_number_of_blocks 4096) + dplasma_add_test(potrf potrf 1gpu_hip_~knb_mpi:${PROCS} -N 1700 -t 320 ${OPTIONS} -g 1 -P 2 -- --mca device_hip_memory_number_of_blocks 4096) + dplasma_add_test(potrf potrf_1gpu 2gpu_hip_mpi:${PROCS} -N 4600 -t 320 ${OPTIONS} -g 2 -P 2 -- --mca device_hip_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 2gpu_hip_mpi:${PROCS} -N 1940 -t 320 ${OPTIONS} -g 2 -P 2 -- --mca device_hip_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 2gpu_hip_lowmem_mpi:${PROCS} -N 1940 -t 320 ${OPTIONS} -g 2 -P 2 -- --mca device_hip_memory_number_of_blocks 21) + endif (DPLASMA_HAVE_HIP AND MPI_C_FOUND) # dplasma_add_test(potrf_pbq "" mpi:${PROCS} -N 4000 ${OPTIONS} -o PBQ) # dplasma_add_test(geqrf_pbq "" mpi:${PROCS} -N 4000 ${OPTIONS} -o PBQ) diff --git a/tests/common.c b/tests/common.c index ebec936b..aba935f7 100644 --- a/tests/common.c +++ b/tests/common.c @@ -314,7 +314,7 @@ static void read_arguments(int *_argc, char*** _argv, int* iparam) break; case 'g': -#if !defined(DPLASMA_HAVE_CUDA) +#if !defined(DPLASMA_HAVE_CUDA) && !defined(DPLASMA_HAVE_HIP) iparam[IPARAM_NGPUS] = DPLASMA_ERR_NOT_SUPPORTED; /* force an error message */ #endif if(iparam[IPARAM_NGPUS] == DPLASMA_ERR_NOT_SUPPORTED) { @@ -326,6 +326,7 @@ static void read_arguments(int *_argc, char*** _argv, int* iparam) rc = asprintf(&value, "%d", iparam[IPARAM_NGPUS]); parsec_setenv_mca_param( 
"device_cuda_enabled", value, &environ ); + parsec_setenv_mca_param( "device_hip_enabled", value, &environ ); free(value); break; From 828011f84b9fddce7e793e5e869cbe08e8c79cf4 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Mon, 19 Oct 2020 15:51:50 -0400 Subject: [PATCH 03/41] hip: kernel typedefs Signed-off-by: Aurelien Bouteiller --- src/dplasmajdf.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/dplasmajdf.h b/src/dplasmajdf.h index 8006c23f..cbdb2178 100644 --- a/src/dplasmajdf.h +++ b/src/dplasmajdf.h @@ -25,9 +25,12 @@ es->th_id, es->virtual_process->vp_id, __VA_ARGS__) # define printlogcuda(str, ...) fprintf(stderr, "cuda %d " str "\n", \ gpu_device->cuda_index, __VA_ARGS__) +# define printloghip(str, ...) fprintf(stderr, "hip %d " str "\n", \ + gpu_device->hip_index, __VA_ARGS__) #else # define printlog(...) do {} while(0) # define printlogcuda(...) do {} while(0) +# define printloghip(...) do {} while(0) #endif #ifndef PARSEC_HAVE_MPI @@ -40,6 +43,27 @@ #if defined(DPLASMA_HAVE_CUDA) #include #endif /* defined(DPLASMA_HAVE_CUDA) */ +#if defined(DPLASMA_HAVE_HIP) +#include + +typedef hipblasStatus_t (*hipblas_zgemm_t) ( char TRANSA, char TRANSB, int m, int n, int k, + hipblasDoubleComplex alpha, hipblasDoubleComplex *d_A, int lda, + hipblasDoubleComplex *d_B, int ldb, + hipblasDoubleComplex beta, hipblasDoubleComplex *d_C, int ldc ); +typedef hipblasStatus_t (*hipblas_cgemm_t) ( char TRANSA, char TRANSB, int m, int n, int k, + hipblasComplex alpha, hipblasComplex *d_A, int lda, + hipblasComplex *d_B, int ldb, + hipblasComplex beta, hipblasComplex *d_C, int ldc ); +typedef hipblasStatus_t (*hipblas_dgemm_t) ( char TRANSA, char TRANSB, int m, int n, int k, + double alpha, double *d_A, int lda, + double *d_B, int ldb, + double beta, double *d_C, int ldc ); +typedef hipblasStatus_t (*hipblas_sgemm_t) ( char TRANSA, char TRANSB, int m, int n, int k, + float alpha, float *d_A, int lda, + float *d_B, int ldb, + float 
beta, float *d_C, int ldc ); +#endif /* defined(DPLASMA_HAVE_HIPBLAS) */ + #endif /* _DPLASMAJDF_H_ */ From 26635ca2dc5a78cc794541dce3b3f637e9f46c36 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Mon, 19 Oct 2020 15:52:43 -0400 Subject: [PATCH 04/41] hip: update for hip-enabled parsec Signed-off-by: Aurelien Bouteiller --- src/zgeqrf.jdf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zgeqrf.jdf b/src/zgeqrf.jdf index 103362b0..7b3667e0 100644 --- a/src/zgeqrf.jdf +++ b/src/zgeqrf.jdf @@ -483,8 +483,8 @@ BODY [type=CUDA device=%{ return n; %} int ldam_V = descA->mb; int ldam_T = descT->mb; - WORK = parsec_gpu_pop_workspace(gpu_device, gpu_stream, descA->nb * ib * sizeof(dplasma_complex64_t)); - WORKC = parsec_gpu_pop_workspace(gpu_device, gpu_stream, descA->mb * ib * sizeof(dplasma_complex64_t)); + WORK = parsec_cuda_workspace_pop(gpu_device, gpu_stream, descA->nb * ib * sizeof(dplasma_complex64_t)); + WORKC = parsec_cuda_workspace_pop(gpu_device, gpu_stream, descA->mb * ib * sizeof(dplasma_complex64_t)); dplasma_cuda_ztsmqr( dplasmaLeft, dplasmaConjTrans, descA->mb, tempnn, tempmm, tempnn, descA->nb, ib, From ffd7bdd1b86321609e39cc94c0c1a344fd8e4015 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Tue, 20 Oct 2020 11:28:36 -0400 Subject: [PATCH 05/41] hip: detect hipblas and rocsolvers Signed-off-by: Aurelien Bouteiller Conflicts: src/CMakeLists.txt --- CMakeLists.txt | 6 ++++++ src/CMakeLists.txt | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d16829ff..2595d7f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -240,6 +240,12 @@ if(DPLASMA_HAVE_CUDA) endif() option(DPLASMA_HAVE_HIP "Use HIP to accelerate DPLASMA routines" ${PARSEC_HAVE_HIP}) if(DPLASMA_HAVE_HIP) + # This is kinda ugly but the PATH and HINTS don't get transmitted to sub-dependents + set(CMAKE_SYSTEM_PREFIX_PATH_save ${CMAKE_SYSTEM_PREFIX_PATH}) + list(APPEND CMAKE_SYSTEM_PREFIX_PATH /opt/rocm) 
+ find_package(hipblas REQUIRED) + find_package(rocsolver REQUIRED) + set(CMAKE_SYSTEM_PREFIX_PATH ${CMAKE_SYSTEM_PREFIX_PATH_save}) message(STATUS "HIP support for DPLASMA enabled") endif() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 60f7b7b8..9d543639 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -232,7 +232,9 @@ target_link_libraries(dplasma PaRSEC::parsec LAPACKE::LAPACKE $<$:CUDA::cublas> - $<$:CUDA::cusolver>) + $<$:CUDA::cusolver> + $<$:roc::hipblas> + $<$:roc::rocsolver>) set_target_properties(dplasma PROPERTIES VERSION ${DPLASMA_VERSION_MAJOR}.${DPLASMA_VERSION_MINOR} SOVERSION ${DPLASMA_VERSION_MAJOR}) From 7db5636fdbe37ce00b9c24118843ce11ed03083e Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Tue, 20 Oct 2020 11:31:10 -0400 Subject: [PATCH 06/41] hip: precision generator rules Signed-off-by: Aurelien Bouteiller --- tools/PrecisionGenerator/subs.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/PrecisionGenerator/subs.py b/tools/PrecisionGenerator/subs.py index 0d81f61a..e16a64cb 100644 --- a/tools/PrecisionGenerator/subs.py +++ b/tools/PrecisionGenerator/subs.py @@ -69,7 +69,7 @@ ('#undef COMPLEX', '#undef COMPLEX', '#undef REAL', '#undef REAL' ), ('#define SINGLE', '#define DOUBLE', '#define SINGLE', '#define DOUBLE' ), ('#undef DOUBLE', '#undef SINGLE', '#undef DOUBLE', '#undef SINGLE' ), - ('float', 'double', 'dplasma_complex32_t', 'dplasma_complex64_t' ), + ('float', 'double', 'dplasma_complex32_t', 'dplasma_complex64_t'), ('PARSEC_MATRIX_FLOAT', 'PARSEC_MATRIX_DOUBLE', 'PARSEC_MATRIX_COMPLEX_FLOAT', 'PARSEC_MATRIX_COMPLEX_DOUBLE'), ('dplasma_float', 'dplasma_double', 'dplasma_complex32', 'dplasma_complex64' ), ## for doxygen categories ('dplasma_cores_float', 'dplasma_cores_double','dplasma_cores_complex32', 'dplasma_cores_complex64'), ## for doxygen categories @@ -81,11 +81,15 @@ ('smatrix', 'dmatrix', 'cmatrix', 'zmatrix' ), ('stwoDBC', 'dtwoDBC', 'ctwoDBC', 'ztwoDBC' ), 
('float', 'double', 'cuFloatComplex', 'cuDoubleComplex' ), + ('float', 'double', 'hipComplex', 'hipDoubleComplex' ), + ('float', 'double', 'hipblasComplex', 'hipblasDoubleComplex'), ## both needed for make_hipComplex() ('float', 'double', 'cuCdivf', 'cuCdiv' ), + ('float', 'double', 'hipCdivf', 'hipCdiv' ), ('', '', 'crealf', 'creal' ), ('', '', 'cimagf', 'cimag' ), ('', '', 'conjf', 'conj' ), ('', '', 'cuCfmaf', 'cuCfma' ), + ('', '', 'hipCfmaf', 'hipCfma' ), ('cblas_snrm2','cblas_dnrm2','cblas_scnrm2','cblas_dznrm2'), ('cblas_sasum','cblas_dasum','cblas_scasum','cblas_dzasum'), @@ -279,6 +283,10 @@ ('cuda_s', 'cuda_d', 'cuda_c', 'cuda_z' ), ('cublasS', 'cublasD', 'cublasS', 'cublasD' ), ('cublasS', 'cublasD', 'cublasC', 'cublasZ' ), + ('hip_s', 'hip_d', 'hip_s', 'hip_d' ), + ('hip_s', 'hip_d', 'hip_c', 'hip_z' ), + ('hipblasS', 'hipblasD', 'hipblasS', 'hipblasD' ), + ('hipblasS', 'hipblasD', 'hipblasC', 'hipblasZ' ), ('example_s', 'example_d', 'example_c', 'example_z' ), ('FLOPS_SSY', 'FLOPS_DSY', 'FLOPS_CHE', 'FLOPS_ZHE' ), ('FLOPS_S', 'FLOPS_D', 'FLOPS_C', 'FLOPS_Z' ), From d2044bcc6310d2e41724c1fd68f9273e0530e801 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Tue, 20 Oct 2020 14:45:20 -0400 Subject: [PATCH 07/41] hip: cleanup unused dyld hipblas functions Signed-off-by: Aurelien Bouteiller --- src/dplasmajdf.h | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/src/dplasmajdf.h b/src/dplasmajdf.h index cbdb2178..a6a5c752 100644 --- a/src/dplasmajdf.h +++ b/src/dplasmajdf.h @@ -43,26 +43,10 @@ #if defined(DPLASMA_HAVE_CUDA) #include #endif /* defined(DPLASMA_HAVE_CUDA) */ + #if defined(DPLASMA_HAVE_HIP) #include - -typedef hipblasStatus_t (*hipblas_zgemm_t) ( char TRANSA, char TRANSB, int m, int n, int k, - hipblasDoubleComplex alpha, hipblasDoubleComplex *d_A, int lda, - hipblasDoubleComplex *d_B, int ldb, - hipblasDoubleComplex beta, hipblasDoubleComplex *d_C, int ldc ); -typedef hipblasStatus_t (*hipblas_cgemm_t) 
( char TRANSA, char TRANSB, int m, int n, int k, - hipblasComplex alpha, hipblasComplex *d_A, int lda, - hipblasComplex *d_B, int ldb, - hipblasComplex beta, hipblasComplex *d_C, int ldc ); -typedef hipblasStatus_t (*hipblas_dgemm_t) ( char TRANSA, char TRANSB, int m, int n, int k, - double alpha, double *d_A, int lda, - double *d_B, int ldb, - double beta, double *d_C, int ldc ); -typedef hipblasStatus_t (*hipblas_sgemm_t) ( char TRANSA, char TRANSB, int m, int n, int k, - float alpha, float *d_A, int lda, - float *d_B, int ldb, - float beta, float *d_C, int ldc ); -#endif /* defined(DPLASMA_HAVE_HIPBLAS) */ +#endif /* defined(DPLASMA_HAVE_HIP) */ #endif /* _DPLASMAJDF_H_ */ From d283bdeece777ebfc8054ec97d7751a6f6bf4100 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Thu, 27 May 2021 02:51:46 -0400 Subject: [PATCH 08/41] hip: Update lapack stagein Signed-off-by: Aurelien Bouteiller --- src/dplasmajdf_lapack_dtt.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dplasmajdf_lapack_dtt.h b/src/dplasmajdf_lapack_dtt.h index 04b55c14..d7648c3d 100644 --- a/src/dplasmajdf_lapack_dtt.h +++ b/src/dplasmajdf_lapack_dtt.h @@ -127,9 +127,9 @@ void ADTT_INFO_internal(parsec_data_copy_t *cp, const dplasma_data_collection_t * Assuming a full tiled has been allocated on the GPU (mb*nb*size(elem)) */ static int -stage_in_lapack(parsec_gpu_task_t *gtask, +stage_in_lapack(parsec_cuda_task_t *gtask, uint32_t flow_mask, - parsec_gpu_exec_stream_t *gpu_stream) + parsec_cuda_exec_stream_t *gpu_stream) { cudaError_t ret; parsec_data_copy_t * copy_in; @@ -203,9 +203,9 @@ stage_in_lapack(parsec_gpu_task_t *gtask, } static int -stage_out_lapack(parsec_gpu_task_t *gtask, +stage_out_lapack(parsec_cuda_task_t *gtask, uint32_t flow_mask, - parsec_gpu_exec_stream_t *gpu_stream) + parsec_cuda_exec_stream_t *gpu_stream) { cudaError_t ret; parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream; From 
a7de750031d7500ab874f79090d2d291628050d9 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Mon, 26 Jul 2021 11:59:54 -0400 Subject: [PATCH 09/41] Update for feature/common_gpu parsec branch changes Signed-off-by: Aurelien Bouteiller --- src/dplasmajdf_lapack_dtt.h | 24 ++++++++++++------------ src/zgemm_wrapper.c | 20 +++++++++----------- src/zpotrf_wrapper.c | 2 +- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/src/dplasmajdf_lapack_dtt.h b/src/dplasmajdf_lapack_dtt.h index d7648c3d..11706b23 100644 --- a/src/dplasmajdf_lapack_dtt.h +++ b/src/dplasmajdf_lapack_dtt.h @@ -127,15 +127,15 @@ void ADTT_INFO_internal(parsec_data_copy_t *cp, const dplasma_data_collection_t * Assuming a full tiled has been allocated on the GPU (mb*nb*size(elem)) */ static int -stage_in_lapack(parsec_cuda_task_t *gtask, +stage_in_lapack(parsec_gpu_task_t *gtask, uint32_t flow_mask, - parsec_cuda_exec_stream_t *gpu_stream) + parsec_gpu_exec_stream_t *gpu_stream) { cudaError_t ret; parsec_data_copy_t * copy_in; parsec_data_copy_t * copy_out; - parsec_device_cuda_module_t *in_elem_dev; - parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream; + parsec_device_gpu_module_t *in_elem_dev; + parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream; dplasma_data_collection_t * ddc; parsec_task_t *task = gtask->ec; int elem_sz; @@ -147,8 +147,8 @@ stage_in_lapack(parsec_cuda_task_t *gtask, ddc = (dplasma_data_collection_t*)gtask->flow_dc[i]; assert(ddc != NULL); elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype); - in_elem_dev = (parsec_device_cuda_module_t*)parsec_mca_device_get( copy_in->device_index); - if( (in_elem_dev->super.super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){ + in_elem_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get( copy_in->device_index); + if( (in_elem_dev->super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)) { 
ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private, copy_in->device_private, gtask->flow_nb_elts[i], @@ -189,7 +189,7 @@ stage_in_lapack(parsec_cuda_task_t *gtask, src, nrows * elem_sz, cudaMemcpyHostToDevice, - gpu_stream->cuda_stream ); + cuda_stream->cuda_stream ); PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); } @@ -203,15 +203,15 @@ stage_in_lapack(parsec_cuda_task_t *gtask, } static int -stage_out_lapack(parsec_cuda_task_t *gtask, +stage_out_lapack(parsec_gpu_task_t *gtask, uint32_t flow_mask, - parsec_cuda_exec_stream_t *gpu_stream) + parsec_gpu_exec_stream_t *gpu_stream) { cudaError_t ret; - parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream; parsec_data_copy_t * copy_in; parsec_data_copy_t * copy_out; - parsec_device_cuda_module_t *out_elem_dev; + parsec_device_gpu_module_t *out_elem_dev; + parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream; parsec_task_t *task = gtask->ec; dplasma_data_collection_t * ddc; int elem_sz; @@ -223,7 +223,7 @@ stage_out_lapack(parsec_cuda_task_t *gtask, ddc = (dplasma_data_collection_t*)gtask->flow_dc[i]; assert(ddc != NULL); elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype); - out_elem_dev = (parsec_device_cuda_module_t*)parsec_mca_device_get( copy_out->device_index); + out_elem_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get( copy_out->device_index); if( (out_elem_dev->super.super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){ ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private, diff --git a/src/zgemm_wrapper.c b/src/zgemm_wrapper.c index e2d5ebb7..ed02d751 100644 --- a/src/zgemm_wrapper.c +++ b/src/zgemm_wrapper.c @@ -14,9 +14,7 @@ #include "dplasma/types_lapack.h" #include "dplasmaaux.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" -#if defined(DPLASMA_HAVE_CUDA) -#include "parsec/mca/device/cuda/device_cuda.h" -#endif +#include 
"parsec/mca/device/device_gpu.h" #include "utils/dplasma_info.h" #include "zgemm_NN.h" @@ -223,12 +221,12 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, for(dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_module_t *device = parsec_mca_device_get(dev); if( PARSEC_DEV_CUDA == device->type ) { - parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)device; + parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t*)device; nbgpu++; if( 0 == gpu_mem_block_size ) - gpu_mem_block_size = cuda_device->super.mem_block_size; - if( -1 == gpu_mem_nb_blocks || cuda_device->super.mem_nb_blocks < gpu_mem_nb_blocks ) - gpu_mem_nb_blocks = cuda_device->super.mem_nb_blocks; + gpu_mem_block_size = gpu_device->mem_block_size; + if( -1 == gpu_mem_nb_blocks || gpu_device->mem_nb_blocks < gpu_mem_nb_blocks ) + gpu_mem_nb_blocks = gpu_device->mem_nb_blocks; } } if(nbgpu == 0) { @@ -462,12 +460,12 @@ dplasma_zgemm_New_ex( dplasma_enum_t transA, dplasma_enum_t transB, for(devid = 0; devid < (int)parsec_nb_devices; devid++) { parsec_device_module_t *device = parsec_mca_device_get(devid); if( PARSEC_DEV_CUDA == device->type ) { - parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)device; + parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t*)device; nb_gpu_devices++; if( 0 == gpu_mem_block_size ) - gpu_mem_block_size = cuda_device->super.mem_block_size; - if( -1 == gpu_mem_nb_blocks || cuda_device->super.mem_nb_blocks < gpu_mem_nb_blocks ) - gpu_mem_nb_blocks = cuda_device->super.mem_nb_blocks; + gpu_mem_block_size = gpu_device->mem_block_size; + if( -1 == gpu_mem_nb_blocks || gpu_device->mem_nb_blocks < gpu_mem_nb_blocks ) + gpu_mem_nb_blocks = gpu_device->mem_nb_blocks; } } if(0 < nb_gpu_devices) { diff --git a/src/zpotrf_wrapper.c b/src/zpotrf_wrapper.c index 71da5948..c4a3cc1a 100644 --- a/src/zpotrf_wrapper.c +++ b/src/zpotrf_wrapper.c @@ -59,7 +59,7 @@ 
dplasma_zpotrf_setrecursive( parsec_taskpool_t *tp, int hmb ) void *zpotrf_create_workspace(void *obj, void *user) { parsec_device_module_t *mod = (parsec_device_module_t *)obj; - zone_malloc_t *memory = ((parsec_device_cuda_module_t*)mod)->super.memory; + zone_malloc_t *memory = ((parsec_device_gpu_module_t*)mod)->memory; cusolverDnHandle_t cusolverDnHandle; cusolverStatus_t status; parsec_zpotrf_U_taskpool_t *tp = (parsec_zpotrf_U_taskpool_t*)user; From 92e7fe8e34d8694ed4528690e2b5b8338fd52f67 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Fri, 6 Aug 2021 15:22:29 -0400 Subject: [PATCH 10/41] Some conflicting updates between hip and common_gpu need more resolution Signed-off-by: Aurelien Bouteiller --- src/zgeqrf.jdf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zgeqrf.jdf b/src/zgeqrf.jdf index 7b3667e0..103362b0 100644 --- a/src/zgeqrf.jdf +++ b/src/zgeqrf.jdf @@ -483,8 +483,8 @@ BODY [type=CUDA device=%{ return n; %} int ldam_V = descA->mb; int ldam_T = descT->mb; - WORK = parsec_cuda_workspace_pop(gpu_device, gpu_stream, descA->nb * ib * sizeof(dplasma_complex64_t)); - WORKC = parsec_cuda_workspace_pop(gpu_device, gpu_stream, descA->mb * ib * sizeof(dplasma_complex64_t)); + WORK = parsec_gpu_pop_workspace(gpu_device, gpu_stream, descA->nb * ib * sizeof(dplasma_complex64_t)); + WORKC = parsec_gpu_pop_workspace(gpu_device, gpu_stream, descA->mb * ib * sizeof(dplasma_complex64_t)); dplasma_cuda_ztsmqr( dplasmaLeft, dplasmaConjTrans, descA->mb, tempnn, tempmm, tempnn, descA->nb, ib, From c02ebcdafe14606082c0851407232b5702a41c26 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Tue, 20 Oct 2020 12:58:21 -0400 Subject: [PATCH 11/41] hip: stream info registration --- share/help-dplasma.txt | 11 ++++---- src/dplasmaaux.c | 57 ++++++++++++++++++++++++++++++++++++++++-- src/dplasmaaux.h | 8 ++++++ src/zpotrf_L.jdf | 16 ++++++------ src/zpotrf_U.jdf | 15 +++++------ src/zpotrf_wrapper.c | 19 ++++++++++---- tests/common.c | 30 
+++++++++++++++++++++- 7 files changed, 129 insertions(+), 27 deletions(-) diff --git a/share/help-dplasma.txt b/share/help-dplasma.txt index 841d89aa..18ed9187 100644 --- a/share/help-dplasma.txt +++ b/share/help-dplasma.txt @@ -1,8 +1,9 @@ -[cu*_alloc_failed] -There was not enough memory available on a CUDA device +[gpu_alloc_failed] +There was not enough memory available on a GPU device while trying to allocate a %s handle to manage tasks on -this device, or another CUDA device on the node. The +this device, or another GPU device on the node. The PaRSEC runtime system may be configured to reserve too -much memory on CUDA devices. Try reducing the amount of +much memory on GPU devices. Try reducing the amount of reserved memory by setting the PaRSEC MCA parameter -'device_cuda_memory_use' to a lower value. +'device_cuda_memory_use' (or similar for the type of +device) to a lower value. diff --git a/src/dplasmaaux.c b/src/dplasmaaux.c index 8cda3295..3de476b9 100644 --- a/src/dplasmaaux.c +++ b/src/dplasmaaux.c @@ -162,7 +162,7 @@ void *dplasma_create_cuda_handles(void *obj, void *_n) cublas_status = cublasCreate(&cublas_handle); if(CUBLAS_STATUS_SUCCESS != cublas_status) { if( CUBLAS_STATUS_ALLOC_FAILED == cublas_status ) { - parsec_show_help("help-dplasma.txt", "cu*_alloc_failed", 1, "CUBLAS"); + parsec_show_help("help-dplasma.txt", "gpu_alloc_failed", 1, "CUBLAS"); } parsec_fatal("Unable to create CUBLAS Handle: %s", dplasma_cublas_error_to_string(cublas_status)); @@ -177,7 +177,7 @@ void *dplasma_create_cuda_handles(void *obj, void *_n) if(CUSOLVER_STATUS_SUCCESS != cusolver_status) { cublasDestroy(cublas_handle); if( CUSOLVER_STATUS_ALLOC_FAILED == cusolver_status ) { - parsec_show_help("help-dplasma.txt", "cu*_alloc_failed", 1, "cusolver"); + parsec_show_help("help-dplasma.txt", "gpu_alloc_failed", 1, "cusolver"); } parsec_fatal("Unable to create a cuSolver handle: %s", dplasma_cusolver_error_to_string(cusolver_status)); @@ -194,3 +194,56 @@ void 
*dplasma_create_cuda_handles(void *obj, void *_n) } #endif + +#if defined(DPLASMA_HAVE_HIP) +#include +#include "potrf_wrapper.h" +#include "parsec/utils/zone_malloc.h" + +/* Unfortunately, HIPBLAS does not provide a error to string function */ +static char *dplasma_hipblas_error_to_string(hipblasStatus_t hipblas_status) +{ + switch(hipblas_status) + { + case HIPBLAS_STATUS_SUCCESS: return "HIPBLAS_STATUS_SUCCESS"; + case HIPBLAS_STATUS_NOT_INITIALIZED: return "HIPBLAS_STATUS_NOT_INITIALIZED"; + case HIPBLAS_STATUS_ALLOC_FAILED: return "HIPBLAS_STATUS_ALLOC_FAILED"; + case HIPBLAS_STATUS_INVALID_VALUE: return "HIPBLAS_STATUS_INVALID_VALUE"; + case HIPBLAS_STATUS_ARCH_MISMATCH: return "HIPBLAS_STATUS_ARCH_MISMATCH"; + case HIPBLAS_STATUS_MAPPING_ERROR: return "HIPBLAS_STATUS_MAPPING_ERROR"; + case HIPBLAS_STATUS_EXECUTION_FAILED: return "HIPBLAS_STATUS_EXECUTION_FAILED"; + case HIPBLAS_STATUS_INTERNAL_ERROR: return "HIPBLAS_STATUS_INTERNAL_ERROR"; + default: return "unknown HIPBLAS error"; + } +} + + +void *dplasma_create_cuda_handles(void *obj, void *_n) +{ + parsec_hip_exec_stream_t *stream = (parsec_hip_exec_stream_t *)obj; + dplasma_hip_handles_t *new; + hipblasHandle_t hipblas_handle; + hipblasStatus_t cublas_status; + + (void)_n; + + + /* No need to call hipSetDevice, as this has been done by PaRSEC before calling the task body */ + hipblas_status = hipblasCreate(&hipblas_handle); + if(HIPBLAS_STATUS_SUCCESS != hipblas_status) { + if( HIPBLAS_STATUS_ALLOC_FAILED == hipblas_status) { + parsec_show_help("help-dplasma.txt", "gpu_alloc_failed", 1, "HIPBLAS"); + } + parsec_fatal("Unable to create HIPBLAS Handle: %s", dplasma_hipblas_error_to_string(hipblas_status)); + return NULL; + } + hipblas_status = hipblasSetStream(hipblas_handle, stream->hip_stream); + assert(HIPBLAS_STATUS_SUCCESS == hipblas_status); + + new = malloc(sizeof(dplasma_hip_handles_t)); + new->hipblas_handle = hipblas_handle; + + return new; +} +#endif + diff --git a/src/dplasmaaux.h 
b/src/dplasmaaux.h index e27e304a..91a92624 100644 --- a/src/dplasmaaux.h +++ b/src/dplasmaaux.h @@ -117,7 +117,15 @@ typedef struct { void * cusolverDn_handle; } dplasma_cuda_handles_t; void *dplasma_create_cuda_handles(void *obj, void *user); +#endif +#if defined(DPLASMA_HAVE_HIP) +#include +#include "parsec/mca/device/hip/device_hip.h" +typedef struct { + hipblasHandle_t hipblas_handle; +} dplasma_hip_handles_t; +void *dplasma_create_hip_handles(void *obj, void *user); #endif #endif /* _DPLASMAAUX_H_INCLUDED */ diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf index 58f47e33..0b37e978 100644 --- a/src/zpotrf_L.jdf +++ b/src/zpotrf_L.jdf @@ -78,8 +78,10 @@ INFO [type = "int*"] PRI_CHANGE [type = "int" hidden = on default = 0 ] PRI_MAX [type = "int" hidden = on default = "(descA->mt * ( 6 + descA->mt * ( 6 + descA->mt )))" ] smallnb [type = "int" hidden = on default = "descA->mb" ] -CuHandlesID [type = "int" hidden = on default = -1 ] -POWorkspaceID [type = "int" hidden = on default = -1 ] + +cuda_handles_infokey [type = "int" hidden = on default = -1 ] +cuda_workspaces_infokey [type = "int" hidden = on default = -1 ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] /************************************************** * potrf_zpotrf * @@ -153,9 +155,9 @@ BODY [type=CUDA if( PlasmaUpper == uplo ) cublas_uplo = CUBLAS_FILL_MODE_UPPER; - handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); assert(NULL != handles); - wp = parsec_info_get(&gpu_device->super.infos, POWorkspaceID); + wp = parsec_info_get(&gpu_device->super.infos, cuda_workspaces_infokey); assert(NULL != wp); workspace = (cuDoubleComplex*)wp->tmpmem; @@ -226,7 +228,7 @@ BODY [type=CUDA] double zone = 1.; #endif cublasStatus_t status; - handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); assert(NULL != handles); status = 
cublasZtrsm_v2(handles->cublas_handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, @@ -333,7 +335,7 @@ BODY [type=CUDA] double mzone = -1.; cublasStatus_t status; - handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); assert(NULL != handles); status = cublasZherk_v2( handles->cublas_handle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, @@ -452,7 +454,7 @@ BODY [type=CUDA assert( ldan_B <= descA->mb ); assert( ldam_C <= descA->mb ); - handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); assert(NULL != handles); status = cublasZgemm_v2( handles->cublas_handle, diff --git a/src/zpotrf_U.jdf b/src/zpotrf_U.jdf index d64194e7..54b637d4 100644 --- a/src/zpotrf_U.jdf +++ b/src/zpotrf_U.jdf @@ -78,8 +78,9 @@ PRI_CHANGE [type = "int" hidden = on default = 0 ] PRI_MAX [type = "int" hidden = on default = "(descA->mt * ( 6 + descA->mt * ( 6 + descA->mt )))" ] smallnb [type = "int" hidden = on default = "descA->mb" ] -CuHandlesID [type = "int" hidden = on default = -1 ] -POWorkspaceID [type = "int" hidden = on default = -1 ] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] +cuda_workspaces_infokey [type = "int" hidden = on default = -1 ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] /************************************************** * potrf_zpotrf * @@ -152,9 +153,9 @@ BODY [type=CUDA if( PlasmaUpper == uplo ) cublas_uplo = CUBLAS_FILL_MODE_UPPER; - handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); assert(NULL != handles); - wp = parsec_info_get(&gpu_device->super.infos, POWorkspaceID); + wp = parsec_info_get(&gpu_device->super.infos, cuda_workspaces_infokey); assert(NULL != wp); workspace = (cuDoubleComplex*)wp->tmpmem; @@ -228,7 +229,7 @@ BODY [type=CUDA] #endif cublasStatus_t status; - handles = 
parsec_info_get(&gpu_stream->infos, CuHandlesID); + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); assert(NULL != handles); status = cublasZtrsm_v2(handles->cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, @@ -335,7 +336,7 @@ BODY [type=CUDA] double mzone = -1.; cublasStatus_t status; - handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); assert(NULL != handles); status = cublasZherk_v2( handles->cublas_handle, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_C, @@ -457,7 +458,7 @@ BODY [type=CUDA assert( ldak_B <= descA->mb ); assert( ldam_C <= descA->mb ); - handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); assert(NULL != handles); cublasSetKernelStream( parsec_body.stream ); diff --git a/src/zpotrf_wrapper.c b/src/zpotrf_wrapper.c index c4a3cc1a..351986a9 100644 --- a/src/zpotrf_wrapper.c +++ b/src/zpotrf_wrapper.c @@ -175,8 +175,10 @@ dplasma_zpotrf_New( dplasma_enum_t uplo, int *info ) { parsec_zpotrf_L_taskpool_t *parsec_zpotrf = NULL; +#if defined(DPLASMA_HAVE_CUDA) char workspace_info_name[64]; static int uid = 0; +#endif parsec_taskpool_t *tp = NULL; dplasma_data_collection_t * ddc_A = dplasma_wrap_data_collection(A); @@ -201,16 +203,23 @@ dplasma_zpotrf_New( dplasma_enum_t uplo, #if defined(DPLASMA_HAVE_CUDA) /* It doesn't cost anything to define these infos if we have CUDA but * don't have GPUs on the current machine, so we do it non-conditionally */ - parsec_zpotrf->_g_CuHandlesID = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL); + parsec_zpotrf->_g_cuda_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL); snprintf(workspace_info_name, 64, "DPLASMA::ZPOTRF(%d)::WS", uid++); - parsec_zpotrf->_g_POWorkspaceID = parsec_info_register(&parsec_per_device_infos, workspace_info_name, + 
parsec_zpotrf->_g_cuda_workspaces_infokey = parsec_info_register(&parsec_per_device_infos, workspace_info_name, destroy_workspace, NULL, zpotrf_create_workspace, parsec_zpotrf, NULL); #else - parsec_zpotrf->_g_CuHandlesID = PARSEC_INFO_ID_UNDEFINED; - parsec_zpotrf->_g_POWorkspaceID = PARSEC_INFO_ID_UNDEFINED; + parsec_zpotrf->_g_cuda_handles_infokey = PARSEC_INFO_ID_UNDEFINED; + parsec_zpotrf->_g_cuda_workspaces_infokey = PARSEC_INFO_ID_UNDEFINED; (void)uid; (void)workspace_info_name; +#endif +#if defined(PARSEC_HAVE_HIP) + /* It doesn't cost anything to define these infos if we have HIP but + * don't have GPUs on the current machine, so we do it non-conditionally */ + parsec_zpotrf->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); +#else + parsec_zpotrf->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; #endif int shape = 0; dplasma_setup_adtt_all_loc( ddc_A, @@ -250,7 +259,7 @@ dplasma_zpotrf_Destruct( parsec_taskpool_t *tp ) dplasma_data_collection_t * ddc_A = parsec_zpotrf->_g_ddescA; #if defined(DPLASMA_HAVE_CUDA) - parsec_info_unregister(&parsec_per_device_infos, parsec_zpotrf->_g_POWorkspaceID, NULL); + parsec_info_unregister(&parsec_per_device_infos, parsec_zpotrf->_g_cuda_workspaces_infokey, NULL); #endif parsec_taskpool_free(tp); diff --git a/tests/common.c b/tests/common.c index aba935f7..59b342ee 100644 --- a/tests/common.c +++ b/tests/common.c @@ -31,6 +31,10 @@ #include #include #endif +#if defined(DPLASMA_HAVE_HIP) +#include "dplasmaaux.h" +#include +#endif char *PARSEC_SCHED_NAME[] = { "", /* default */ @@ -638,6 +642,16 @@ static void destroy_cuda_handles(void *_h, void *_n) } #endif +#if defined(DPLASMA_HAVE_HIP) +static void destroy_hip_handles(void *_h, void *_n) +{ + dplasma_hip_handles_t *handles = (dplasma_hip_handles_t*)_h; + (void)_n; + hipblasDestroy(handles->hipblas_handle); + free(handles); +} +#endif + parsec_context_t* setup_parsec(int argc, char **argv, int *iparam) { #ifdef 
PARSEC_PROF_TRACE @@ -725,6 +739,14 @@ parsec_context_t* setup_parsec(int argc, char **argv, int *iparam) NULL); } #endif +#if defined(DPLASMA_HAVE_HIP) + /* Unsupported/unecessary + hipblasStatus_t status = hipblasInit(); + assert(HIPBLAS_STATUS_SUCCESS == status); + */ + parsec_info_register(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", + destroy_hip_handles, NULL, NULL); +#endif if(verbose > 2) TIME_PRINT(iparam[IPARAM_RANK], ("PaRSEC initialized\n")); return ctx; @@ -737,7 +759,13 @@ void cleanup_parsec(parsec_context_t* parsec, int *iparam) parsec_info_unregister(&parsec_per_stream_infos, CuHI, NULL); cublasShutdown(); #endif - +#if defined(DPLASMA_HAVE_HIP) + parsec_info_id_t iid = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); + parsec_info_unregister(&parsec_per_stream_infos, iid, NULL); + /* Unsupported/unecessary + hipblasShutdown(); + */ +#endif parsec_fini(&parsec); #ifdef PARSEC_HAVE_MPI From 0d2367a5d9bbcdbc7152b013e5cc0efca6dc3775 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Tue, 20 Oct 2020 14:38:57 -0400 Subject: [PATCH 12/41] hip: potrf on AMD Signed-off-by: Aurelien Bouteiller --- src/zpotrf_L.jdf | 107 +++++++++++++++++++++++++++++++++++++++++++++ src/zpotrf_U.jdf | 110 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 217 insertions(+) diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf index 0b37e978..833ef4d9 100644 --- a/src/zpotrf_L.jdf +++ b/src/zpotrf_L.jdf @@ -20,6 +20,12 @@ static void zpotrf_L_update_INFO(parsec_taskpool_t* _tp, const parsec_recursive_ #include #include "potrf_cublas_utils.h" #endif /* defined(DPLASMA_HAVE_CUDA) */ +#if defined(DPLASMA_HAVE_HIP) +#include +#include +#include +#include "potrf_wrapper.h" +#endif /* defined(DPLASMA_HAVE_HIP) */ /* Define the different shapes this JDF is using */ #define DEFAULT 0 @@ -169,6 +175,29 @@ BODY [type=CUDA } END +BODY [type=HIP_DISABLED + weigth=k] +{ + int tempkm = k == descA->mt-1 ? 
descA->m - k*descA->mb : descA->mb; + int ldak = BLKLDD( descA, k ); + + hipblasStatus_t status; + hipblasFillMode_t hipblas_uplo; + + if( PlasmaLower == uplo ) + hipblas_uplo = HIPBLAS_FILL_MODE_LOWER; + if( PlasmaUpper == uplo ) + hipblas_uplo = HIPBLAS_FILL_MODE_UPPER; + + dplasma_hip_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + + status = rocsolver_zpotrf( handles->hipblas_handle, hipblas_uplo, tempkm, T, ldak, INFO); + assert( HIPBLAS_STATUS_SUCCESS == status ); +} +END + BODY { int tempkm = k == descA->mt-1 ? descA->m - k*descA->mb : descA->mb; @@ -240,6 +269,31 @@ BODY [type=CUDA] } END +BODY [type=HIP + weight=(m+k)] +{ + int tempmm = m == descA->mt - 1 ? descA->m - m * descA->mb : descA->mb; + int ldak = BLKLDD(descA, k); + int ldan = BLKLDD(descA, n); +#if defined(PRECISION_z) || defined(PRECISION_c) + hipDoubleComplex zone = make_hipDoubleComplex( 1., 0.); +#else + double zone = 1.; +#endif + + dplasma_hip_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + + hipblasStatus_t status; + status = hipblasZtrsm(handles->hipblas_handle, + HIPBLAS_SIDE_RIGHT, HIPBLAS_FILL_MODE_LOWER, + HIPBLAS_OP_C, HIPBLAS_DIAG_NON_UNIT, + tempmm, descA->nb, + &zone, T, ldak, C, ldan); + assert(HIPBLAS_STATUS_SUCCESS == status); +} +END BODY [type=RECURSIVE] { @@ -347,6 +401,27 @@ BODY [type=CUDA] } END +BODY [type=HIP + weight=(m+k)] +{ + int tempmm = m == descA->mt-1 ? 
descA->m - m*descA->mb : descA->mb; + int ldam = BLKLDD( descA, m ); + double zone = 1.; + double mzone = -1.; + hipblasStatus_t status; + + dplasma_hip_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZherk( handles->hipblas_handle, + HIPBLAS_FILL_MODE_LOWER, HIPBLAS_OP_N, + tempmm, descA->mb, + &mzone, A, ldam, + &zone, T, ldam); + assert(HIPBLAS_STATUS_SUCCESS == status); +} +END + BODY [type=RECURSIVE] { int tempmm = m == descA->mt-1 ? descA->m - m*descA->mb : descA->mb; @@ -468,6 +543,38 @@ BODY [type=CUDA } END +BODY [type=HIP + weight=(n+1-k)] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipDoubleComplex zone = make_hipDoubleComplex( 1., 0.); + hipDoubleComplex mzone = make_hipDoubleComplex(-1., 0.); +#else + double zone = 1.; + double mzone = -1.; +#endif + int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb; + int ldam = BLKLDD( descA, m ); + int ldan = BLKLDD( descA, n ); + + hipblasStatus_t status; + assert( ldan <= descA->mb ); + assert( ldam <= descA->mb ); + + dplasma_hip_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + + status = hipblasZgemm( handles->hipblas_handle, + HIPBLAS_OP_N, HIPBLAS_OP_C, + tempmm, descA->mb, descA->mb, + &mzone, (hipDoubleComplex*)A, ldam, + (hipDoubleComplex*)B, ldan, + &zone, (hipDoubleComplex*)C, ldam ); + assert(HIPBLAS_STATUS_SUCCESS == status); +} +END + BODY [type=RECURSIVE] { int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb; diff --git a/src/zpotrf_U.jdf b/src/zpotrf_U.jdf index 54b637d4..8916f462 100644 --- a/src/zpotrf_U.jdf +++ b/src/zpotrf_U.jdf @@ -4,6 +4,7 @@ extern "C" %{ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. 
+ * $COPYRIGHT * * @precisions normal z -> s d c * @@ -19,6 +20,12 @@ static void zpotrf_U_update_INFO(parsec_taskpool_t* _tp, const parsec_recursive_ #include #include "potrf_cublas_utils.h" #endif /* defined(DPLASMA_HAVE_CUDA) */ +#if defined(DPLASMA_HAVE_HIP) +#include +#include +#include +#include "potrf_wrapper.h" +#endif /* defined(DPLASMA_HAVE_HIP) */ /* Define the different shapes this JDF is using */ #define DEFAULT 0 @@ -167,6 +174,29 @@ BODY [type=CUDA } END +BODY [type=DISABLED_HIP + weigth=k] +{ + int tempkn = k == descA->nt-1 ? descA->n - k*descA->nb : descA->nb; + int ldak = BLKLDD( descA, k ); + + hipblasStatus_t status; + hipblasFillMode_t hipblas_uplo; + + if( PlasmaLower == uplo ) + hipblas_uplo = HIPBLAS_FILL_MODE_LOWER; + if( PlasmaUpper == uplo ) + hipblas_uplo = HIPBLAS_FILL_MODE_UPPER; + + dplasma_hip_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + + status = rocsolver_zpotrf( handles->hipblas_handle, hipblas_uplo, tempkn, T, ldak, INFO); + assert( HIPBLAS_STATUS_SUCCESS == status ); +} +END + BODY { int tempkn = k == descA->nt-1 ? descA->n - k*descA->nb : descA->nb; @@ -241,6 +271,30 @@ BODY [type=CUDA] } END +BODY [type=HIP + weight=(k+n)] +{ + int tempnn = n == descA->nt - 1 ? descA->n - n * descA->nb : descA->nb; + int ldak = BLKLDD(descA, k); + dplasma_hip_handles_t *handles; +#if defined(PRECISION_z) || defined(PRECISION_c) + hipDoubleComplex zone = make_hipDoubleComplex( 1., 0.); +#else + double zone = 1.; +#endif + + hipblasStatus_t status; + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrsm(handles->hipblas_handle, + HIPBLAS_SIDE_LEFT, HIPBLAS_FILL_MODE_UPPER, + HIPBLAS_OP_C, HIPBLAS_DIAG_NON_UNIT, + tempnn, descA->mb, + &zone, T, ldak, C, ldak); + assert(HIPBLAS_STATUS_SUCCESS == status); +} +END + BODY [type=RECURSIVE] { int tempnn = n == descA->nt-1 ? 
descA->n - n * descA->nb : descA->nb; @@ -349,6 +403,28 @@ BODY [type=CUDA] } END +BODY [type=HIP + weight=(k+n)] +{ + int tempnn = n == descA->nt-1 ? descA->n - n*descA->nb : descA->nb; + int ldan = BLKLDD( descA, n ); + int ldak = BLKLDD( descA, k ); + double zone = 1.; + double mzone = -1.; + hipblasStatus_t status; + + dplasma_hip_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZherk( handles->hipblas_handle, + HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_C, + tempnn, descA->mb, + &mzone, A, ldak, + &zone, T, ldan); + assert(HIPBLAS_STATUS_SUCCESS == status); +} +END + BODY [type=RECURSIVE] { int tempnn = n == descA->nt-1 ? descA->n - n*descA->nb : descA->nb; @@ -481,6 +557,40 @@ BODY [type=CUDA } END +BODY [type=HIP + weight=(m+1-k)] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipDoubleComplex zone = make_hipDoubleComplex( 1., 0.); + hipDoubleComplex mzone = make_hipDoubleComplex(-1., 0.); +#else + double zone = 1.; + double mzone = -1.; +#endif + + int tempnn = n == descA->nt-1 ? descA->n - n * descA->nb : descA->nb; + int ldak = BLKLDD( descA, k ); + int ldam = BLKLDD( descA, m ); + + hipblasStatus_t status; + assert( ldak <= descA->mb ); + assert( ldam <= descA->mb ); + + dplasma_hip_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + + status = hipblasZgemm( handles->hipblas_handle, + HIPBLAS_OP_C, HIPBLAS_OP_N, + descA->mb, tempnn, descA->mb, + &mzone, (hipDoubleComplex*)A, ldak, + (hipDoubleComplex*)B, ldak, + &zone, (hipDoubleComplex*)C, ldam ); + + assert(HIPBLAS_STATUS_SUCCESS == status); +} +END + BODY [type=RECURSIVE] { int tempnn = n == descA->nt-1 ? 
descA->n - n * descA->nb : descA->nb; From 1fc2d1431a76650feef6fc6d9bb05cd4389bf1d3 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Wed, 11 Aug 2021 15:25:33 -0400 Subject: [PATCH 13/41] hip:po: Some errors introduced when merging Signed-off-by: Aurelien Bouteiller --- src/dplasmaaux.c | 5 ++--- src/zpotrf_L.jdf | 1 - src/zpotrf_U.jdf | 1 - tests/common.c | 4 +++- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/dplasmaaux.c b/src/dplasmaaux.c index 3de476b9..a17338bb 100644 --- a/src/dplasmaaux.c +++ b/src/dplasmaaux.c @@ -197,7 +197,6 @@ void *dplasma_create_cuda_handles(void *obj, void *_n) #if defined(DPLASMA_HAVE_HIP) #include -#include "potrf_wrapper.h" #include "parsec/utils/zone_malloc.h" /* Unfortunately, HIPBLAS does not provide a error to string function */ @@ -218,12 +217,12 @@ static char *dplasma_hipblas_error_to_string(hipblasStatus_t hipblas_status) } -void *dplasma_create_cuda_handles(void *obj, void *_n) +void *dplasma_create_hip_handles(void *obj, void *_n) { parsec_hip_exec_stream_t *stream = (parsec_hip_exec_stream_t *)obj; dplasma_hip_handles_t *new; hipblasHandle_t hipblas_handle; - hipblasStatus_t cublas_status; + hipblasStatus_t hipblas_status; (void)_n; diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf index 833ef4d9..da10fd13 100644 --- a/src/zpotrf_L.jdf +++ b/src/zpotrf_L.jdf @@ -24,7 +24,6 @@ static void zpotrf_L_update_INFO(parsec_taskpool_t* _tp, const parsec_recursive_ #include #include #include -#include "potrf_wrapper.h" #endif /* defined(DPLASMA_HAVE_HIP) */ /* Define the different shapes this JDF is using */ diff --git a/src/zpotrf_U.jdf b/src/zpotrf_U.jdf index 8916f462..ba3c6f6f 100644 --- a/src/zpotrf_U.jdf +++ b/src/zpotrf_U.jdf @@ -24,7 +24,6 @@ static void zpotrf_U_update_INFO(parsec_taskpool_t* _tp, const parsec_recursive_ #include #include #include -#include "potrf_wrapper.h" #endif /* defined(DPLASMA_HAVE_HIP) */ /* Define the different shapes this JDF is using */ diff --git a/tests/common.c 
b/tests/common.c index 59b342ee..7c6a8a54 100644 --- a/tests/common.c +++ b/tests/common.c @@ -745,7 +745,9 @@ parsec_context_t* setup_parsec(int argc, char **argv, int *iparam) assert(HIPBLAS_STATUS_SUCCESS == status); */ parsec_info_register(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", - destroy_hip_handles, NULL, NULL); + destroy_hip_handles, NULL, + dplasma_create_hip_handles, NULL, + NULL); #endif if(verbose > 2) TIME_PRINT(iparam[IPARAM_RANK], ("PaRSEC initialized\n")); From 0cbeece9eb208fecce70d7a8d8fd3cb5362083c4 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Thu, 30 Sep 2021 02:36:16 -0400 Subject: [PATCH 14/41] Add HIP to the lookahead gpu gemm Signed-off-by: Aurelien Bouteiller --- src/zgemm_NN_gpu.jdf | 68 ++++++++++++++++++++++++++++++++++++++++++++ src/zgemm_wrapper.c | 23 +++++++++------ 2 files changed, 83 insertions(+), 8 deletions(-) diff --git a/src/zgemm_NN_gpu.jdf b/src/zgemm_NN_gpu.jdf index ca8dc92c..8f668622 100644 --- a/src/zgemm_NN_gpu.jdf +++ b/src/zgemm_NN_gpu.jdf @@ -19,6 +19,11 @@ extern "C" %{ #include #endif /* defined(DPLASMA_HAVE_CUDA) */ +#if defined(DPLASMA_HAVE_HIP) +#include +#include +#endif /* defined(DPLASMA_HAVE_HIP) */ + static void succ(int *x, int *y, int *z, int xMax, int yMax, int zMax, int l) { int xn = *x, yn = *y, zn = *z; @@ -151,6 +156,8 @@ xMax [ type = int default = "-1" hidden=on ] yMax [ type = int default = "-1" hidden=on ] zMax [ type = int default = "-1" hidden=on ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + /********************************************************* * READ_A * * A is broadcast to all target GEMMs from the beginning * @@ -426,6 +433,67 @@ BODY [type=CUDA] } END +BODY [type=HIP + weight=(descA->nt-k)] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipDoubleComplex lalpha = make_hipDoubleComplex(creal(alpha), cimag(alpha)); + hipDoubleComplex lbeta = (k == 0) ? 
make_hipDoubleComplex(creal(beta), cimag(beta)) + : make_hipDoubleComplex(1.0, 0.0); +#else + double lalpha = alpha; + double lbeta = (k == 0) ? beta : 1.0; +#endif + int cAmb = descA->mb; + int cAnb = descA->nb; + int cBmb = descB->nb; + int cBnb = descB->nb; + int cCmb = cAmb; + int cCnb = cBnb; + + int tempmm = cCmb; + int tempnn = cCnb; + int tempkk = cAnb; + int ldam = cAmb; + int ldbk = cBmb; + int ldcm = cCmb; + + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, + "HIP: gemm( %d, %d, %d ) > A(%d,%d) * B(%d,%d) C(%d,%d)\n", + m, n, k, cAmb, cAnb, cBmb, cBnb, cCmb, cCnb); + + hipblasStatus_t status; + hipblasOperation_t opA = dplasmaNoTrans == transA? HIPBLAS_OP_N: HIPBLAS_OP_T; + hipblasOperation_t opB = dplasmaNoTrans == transB? HIPBLAS_OP_N: HIPBLAS_OP_T; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, + opA, opB, + tempmm, tempnn, tempkk, + &lalpha, (hipDoubleComplex*)A, ldam, + (hipDoubleComplex*)B, ldbk, + &lbeta, (hipDoubleComplex*)C, ldcm ); + assert(HIPBLAS_STATUS_SUCCESS == status); + PARSEC_HIP_CHECK_ERROR( "hipblasZgemm ", status, + {return PARSEC_HOOK_RETURN_DONE;} ); + + /* Quick and dirty emulation of the next GEMM */ + if( k == descC->mt -1 ) { + __parsec_zgemm_NN_gpu_GEMM_task_t next_gemm; + memcpy(&next_gemm, this_task, sizeof(__parsec_zgemm_NN_gpu_GEMM_task_t)); + next_gemm.locals.k.value = descC->mt -1; + assert( PARSEC_DEV_CUDA == next_gemm.task_class->incarnations[this_task->chore_id].type ); + if(NULL != next_gemm.task_class->incarnations[this_task->chore_id].evaluate) { + if( next_gemm.task_class->incarnations[this_task->chore_id].evaluate((parsec_task_t*)&next_gemm) == + PARSEC_HOOK_RETURN_NEXT ) { + /* The next GEMM wants to run on the CPUs... */ + gpu_task->pushout |= (1 << 0); + } + } + } +} +END + BODY { dplasma_complex64_t lbeta = (k == 0) ? 
beta : (dplasma_complex64_t)1.0; diff --git a/src/zgemm_wrapper.c b/src/zgemm_wrapper.c index ed02d751..5dd316d1 100644 --- a/src/zgemm_wrapper.c +++ b/src/zgemm_wrapper.c @@ -186,7 +186,7 @@ dplasma_zgemm_default_new(dplasma_enum_t transA, dplasma_enum_t transB, return zgemm_tp; } -#if defined(DPLASMA_HAVE_CUDA) +#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP) static parsec_taskpool_t* dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, dplasma_complex64_t alpha, const parsec_tiled_matrix_t* A, const parsec_tiled_matrix_t* B, @@ -220,7 +220,7 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, nbgpu = 0; for(dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA == device->type || PARSEC_DEV_HIP == device->type ) { parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t*)device; nbgpu++; if( 0 == gpu_mem_block_size ) @@ -237,7 +237,7 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, nbgpu= 0; for(dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA == device->type || PARSEC_DEV_HIP == device->type ) { dev_index[nbgpu++] = device->device_index; } } @@ -356,8 +356,15 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, K = B->mt; tp->_g_zMax = (K + d - 1) / d - 1; - zgemm_tp = (parsec_taskpool_t *) tp; +#if defined(PARSEC_HAVE_HIP) + /* It doesn't cost anything to define these infos if we have HIP but + * don't have GPUs on the current machine, so we do it non-conditionally */ + tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); +#else + tp->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif + zgemm_tp = (parsec_taskpool_t *) tp; return zgemm_tp; } @@ -366,7 +373,7 @@ 
dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, free(dev_index); return NULL; } -#endif /* DPLASMA_HAVE_CUDA */ +#endif /* DPLASMA_HAVE_CUDA || DPLASMA_HAVE_HIP */ /** ******************************************************************************* @@ -451,7 +458,7 @@ dplasma_zgemm_New_ex( dplasma_enum_t transA, dplasma_enum_t transB, } if ( C->dtype & parsec_matrix_block_cyclic_type ) { -#if defined(DPLASMA_HAVE_CUDA) +#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP) int nb_gpu_devices = 0, devid; int p = ((parsec_matrix_block_cyclic_t*)C)->grid.rows; int q = ((parsec_matrix_block_cyclic_t*)C)->grid.cols; @@ -459,7 +466,7 @@ dplasma_zgemm_New_ex( dplasma_enum_t transA, dplasma_enum_t transB, int64_t gpu_mem_nb_blocks = -1; for(devid = 0; devid < (int)parsec_nb_devices; devid++) { parsec_device_module_t *device = parsec_mca_device_get(devid); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA == device->type || PARSEC_DEV_HIP == device->type ) { parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t*)device; nb_gpu_devices++; if( 0 == gpu_mem_block_size ) @@ -482,7 +489,7 @@ dplasma_zgemm_New_ex( dplasma_enum_t transA, dplasma_enum_t transB, return zgemm_tp; } } -#endif /* DPLASMA_HAVE_CUDA */ +#endif /* DPLASMA_HAVE_CUDA || DPLASMA_HAVE_HIP */ zgemm_tp = dplasma_zgemm_summa_new(transA, transB, alpha, A, B, beta, C, opt); return zgemm_tp; } From b27190beb5d514b312a4704def7d36e29dbe58bc Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Thu, 30 Sep 2021 04:28:18 -0400 Subject: [PATCH 15/41] Add HIP to zgemm_summa Signed-off-by: Aurelien Bouteiller --- src/zgemm_NN_summa.jdf | 86 ++++++++++++++++++++++++++++++++---------- src/zgemm_wrapper.c | 28 ++++++++++++++ 2 files changed, 94 insertions(+), 20 deletions(-) diff --git a/src/zgemm_NN_summa.jdf b/src/zgemm_NN_summa.jdf index d1cfa5ea..9fc01913 100644 --- a/src/zgemm_NN_summa.jdf +++ b/src/zgemm_NN_summa.jdf @@ -17,6 +17,9 @@ extern "C" %{ #if 
defined(DPLASMA_HAVE_CUDA) #include #endif /* defined(DPLASMA_HAVE_CUDA) */ +#if defined(DPLASMA_HAVE_HIP) +#include +#endif /* defined(DPLASMA_HAVE_HIP) */ /* Define the different shapes this JDF is using */ #define A_SHAPE 0 @@ -85,6 +88,8 @@ Q [type = "int" hidden=on default="((parsec_matrix_block_cyclic_t*)descC)-> lookP [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] lookQ [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + /************************************************** * READ_A * **************************************************/ @@ -206,6 +211,34 @@ RW C <- (k == 0) ? ddescC(m, n) [ type = %{ return CTL ctla -> (k < (descA->nt-lookQ)) ? ctla RING_A(m, k+lookQ, n%Q) CTL ctlb -> (k < (descA->nt-lookP)) ? ctlb RING_B(k+lookP, n, m%P) +BODY +{ + + dplasma_complex64_t lbeta = (k == 0) ? beta : (dplasma_complex64_t)1.0; + int tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb; + int tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb; + int tempkk = k == descA->nt-1 ? 
descA->n - k * descA->nb : descA->nb; + int ldam = LDA(ddescA, A); + int ldbk = LDA(ddescB, B); + int ldcm = LDA(ddescC, C); + + CORE_zgemm(transA, transB, + tempmm, tempnn, tempkk, + alpha, A /*A(m, k)*/, ldam, + B /*B(k, n)*/, ldbk, + lbeta, C /*C(m, n)*/, ldcm); + + printlog("gemm( %d, %d, %d )\n" + " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", + m, n, k, + &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), + tempmm, tempnn, tempkk, + creal(alpha), m, k, ldam, + k, n, ldbk, + creal(lbeta), m, n, ldcm ); +} +END + BODY [type=CUDA A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} @@ -242,30 +275,43 @@ BODY [type=CUDA } END -BODY +BODY [type=HIP + weight=(descA->nt-k) + A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} + C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} + A.dc=ddescA B.dc=ddescB C.dc=ddescC + stage_in=stage_in_lapack + stage_out=stage_out_lapack] { - - dplasma_complex64_t lbeta = (k == 0) ? beta : (dplasma_complex64_t)1.0; +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha = make_hipDoubleComplex(creal(alpha), cimag(alpha)); + hipblasDoubleComplex lbeta = (k == 0) ? make_hipDoubleComplex(creal(beta), cimag(beta)) + : make_hipDoubleComplex(1.0, 0.0); +#else + double lalpha = alpha; + double lbeta = (k == 0) ? beta : 1.0; +#endif int tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb; int tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb; int tempkk = k == descA->nt-1 ? 
descA->n - k * descA->nb : descA->nb; - int ldam = LDA(ddescA, A); - int ldbk = LDA(ddescB, B); - int ldcm = LDA(ddescC, C); - - CORE_zgemm(transA, transB, - tempmm, tempnn, tempkk, - alpha, A /*A(m, k)*/, ldam, - B /*B(k, n)*/, ldbk, - lbeta, C /*C(m, n)*/, ldcm); + int ldam = descA->mb; + int ldbk = descB->mb; + int ldcm = descC->mb; - printlog("gemm( %d, %d, %d )\n" - " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", - m, n, k, - &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), - tempmm, tempnn, tempkk, - creal(alpha), m, k, ldam, - k, n, ldbk, - creal(lbeta), m, n, ldcm ); + hipblasStatus_t status; + hipblasOperation_t opA = dplasmaNoTrans == transA? HIPBLAS_OP_N: HIPBLAS_OP_T; + hipblasOperation_t opB = dplasmaNoTrans == transB? HIPBLAS_OP_N: HIPBLAS_OP_T; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, + opA, opB, + tempmm, tempnn, tempkk, + &lalpha, (hipblasDoubleComplex*)A, ldam, + (hipblasDoubleComplex*)B, ldbk, + &lbeta, (hipblasDoubleComplex*)C, ldcm ); + assert(HIPBLAS_STATUS_SUCCESS == status); + PARSEC_HIP_CHECK_ERROR( "hipblasZgemm ", status, + {return PARSEC_HOOK_RETURN_DONE;} ); } END diff --git a/src/zgemm_wrapper.c b/src/zgemm_wrapper.c index 5dd316d1..78c92808 100644 --- a/src/zgemm_wrapper.c +++ b/src/zgemm_wrapper.c @@ -78,12 +78,26 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB, parsec_zgemm_NN_summa_taskpool_t* tp; tp = parsec_zgemm_NN_summa_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, (parsec_data_collection_t*)Cdist); +#if defined(PARSEC_HAVE_HIP) + /* It doesn't cost anything to define these infos if we have HIP but + * don't have GPUs on the current machine, so we do it non-conditionally */ + tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); +#else + tp->_g_hip_handles_infokey = 
PARSEC_INFO_ID_UNDEFINED; +#endif zgemm_tp = (parsec_taskpool_t*)tp; } else { PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_NT_summa\n"); parsec_zgemm_NT_summa_taskpool_t* tp; tp = parsec_zgemm_NT_summa_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, (parsec_data_collection_t*)Cdist); +#if defined(PARSEC_HAVE_HIP) + /* It doesn't cost anything to define these infos if we have HIP but + * don't have GPUs on the current machine, so we do it non-conditionally */ + tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); +#else + tp->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif zgemm_tp = (parsec_taskpool_t*)tp; } } else { @@ -92,6 +106,13 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB, parsec_zgemm_TN_summa_taskpool_t* tp; tp = parsec_zgemm_TN_summa_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, (parsec_data_collection_t*)Cdist); +#if defined(PARSEC_HAVE_HIP) + /* It doesn't cost anything to define these infos if we have HIP but + * don't have GPUs on the current machine, so we do it non-conditionally */ + tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); +#else + tp->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif zgemm_tp = (parsec_taskpool_t*)tp; } else { PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_TT_summa\n"); @@ -99,6 +120,13 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB, tp = parsec_zgemm_TT_summa_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, (parsec_data_collection_t*)Cdist); +#if defined(PARSEC_HAVE_HIP) + /* It doesn't cost anything to define these infos if we have HIP but + * don't have GPUs on the current machine, so we do it non-conditionally */ + tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); +#else + tp->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif zgemm_tp = 
(parsec_taskpool_t*)tp; } } From 8d88461d030bdd1dcedbef67ebade227360f3230 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Wed, 24 Nov 2021 12:28:57 -0500 Subject: [PATCH 16/41] hip: rework of PO and workspaces Signed-off-by: Aurelien Bouteiller --- src/dplasmaaux.c | 3 -- src/potrf_cublas_utils.h | 42 -------------------- src/potrf_gpu_workspaces.h | 16 ++++++++ src/zpotrf_L.jdf | 68 +++++++++++++++++--------------- src/zpotrf_U.jdf | 79 +++++++++++++++++++++----------------- src/zpotrf_wrapper.c | 47 ++++++++++++++++++++--- 6 files changed, 137 insertions(+), 118 deletions(-) delete mode 100644 src/potrf_cublas_utils.h create mode 100644 src/potrf_gpu_workspaces.h diff --git a/src/dplasmaaux.c b/src/dplasmaaux.c index a17338bb..d5cc8bee 100644 --- a/src/dplasmaaux.c +++ b/src/dplasmaaux.c @@ -113,8 +113,6 @@ dplasma_aux_getGEMMLookahead( parsec_tiled_matrix_t *A ) #if defined(DPLASMA_HAVE_CUDA) #include #include -#include "potrf_cublas_utils.h" -#include "parsec/utils/zone_malloc.h" /* Unfortunately, CUBLAS does not provide a error to string function */ static char *dplasma_cublas_error_to_string(cublasStatus_t cublas_status) @@ -197,7 +195,6 @@ void *dplasma_create_cuda_handles(void *obj, void *_n) #if defined(DPLASMA_HAVE_HIP) #include -#include "parsec/utils/zone_malloc.h" /* Unfortunately, HIPBLAS does not provide a error to string function */ static char *dplasma_hipblas_error_to_string(hipblasStatus_t hipblas_status) diff --git a/src/potrf_cublas_utils.h b/src/potrf_cublas_utils.h deleted file mode 100644 index 41767f53..00000000 --- a/src/potrf_cublas_utils.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2020-2021 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
- * - */ -#ifndef DPLASMA_POTRF_CUBLAS_UTILS_H -#define DPLASMA_POTRF_CUBLAS_UTILS_H - -#if defined(DPLASMA_HAVE_CUDA) -#include -#include - -typedef struct { - char *tmpmem; - void *memory; - int lwork; -} dplasma_potrf_workspace_t; - -typedef cusolverStatus_t (*cublas_spotrf_v2_t) ( - cusolverDnHandle_t handle, cublasFillMode_t uplo, - int n, float *A, int lda, - float *Workspace, int Lwork, int *devInfo ); - -typedef cusolverStatus_t (*cublas_dpotrf_v2_t) ( - cusolverDnHandle_t handle, cublasFillMode_t uplo, - int n, double *A, int lda, - double *Workspace, int Lwork, int *devInfo ); - -typedef cusolverStatus_t (*cublas_cpotrf_v2_t) ( - cusolverDnHandle_t handle, cublasFillMode_t uplo, - int n, cuComplex *A, int lda, - cuComplex *Workspace, int Lwork, int *devInfo ); - -typedef cusolverStatus_t (*cublas_zpotrf_v2_t) ( - cusolverDnHandle_t handle, cublasFillMode_t uplo, - int n, cuDoubleComplex *A, int lda, - cuDoubleComplex *Workspace, int Lwork, int *devInfo ); - -#endif - -#endif //DPLASMA_POTRF_CUBLAS_UTILS_H diff --git a/src/potrf_gpu_workspaces.h b/src/potrf_gpu_workspaces.h new file mode 100644 index 00000000..f182d04c --- /dev/null +++ b/src/potrf_gpu_workspaces.h @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2020-2021 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * + */ +#ifndef DPLASMA_POTRF_GPU_WORKSPACES_H +#define DPLASMA_POTRF_GPU_WORKSPACES_H + +typedef struct { + char *tmpmem; + void *memory; + int lwork; +} dplasma_potrf_workspace_t; + +#endif //DPLASMA_POTRF_GPU_WORKSPACES_H diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf index da10fd13..08e9e07b 100644 --- a/src/zpotrf_L.jdf +++ b/src/zpotrf_L.jdf @@ -16,13 +16,13 @@ extern "C" %{ #include "parsec/recursive.h" static void zpotrf_L_update_INFO(parsec_taskpool_t* _tp, const parsec_recursive_callback_t* data); +#include "potrf_gpu_workspaces.h" + #if defined(DPLASMA_HAVE_CUDA) #include -#include "potrf_cublas_utils.h" #endif /* defined(DPLASMA_HAVE_CUDA) */ #if defined(DPLASMA_HAVE_HIP) #include -#include #include #endif /* defined(DPLASMA_HAVE_HIP) */ @@ -87,6 +87,7 @@ smallnb [type = "int" hidden = on default = "descA->mb" ] cuda_handles_infokey [type = "int" hidden = on default = -1 ] cuda_workspaces_infokey [type = "int" hidden = on default = -1 ] hip_handles_infokey [type = "int" hidden = on default = -1 ] +hip_workspaces_infokey [type = "int" hidden = on default = -1 ] /************************************************** * potrf_zpotrf * @@ -174,26 +175,32 @@ BODY [type=CUDA } END -BODY [type=HIP_DISABLED +BODY [type=HIP weigth=k] { int tempkm = k == descA->mt-1 ? 
descA->m - k*descA->mb : descA->mb; int ldak = BLKLDD( descA, k ); - hipblasStatus_t status; - hipblasFillMode_t hipblas_uplo; + rocblas_status status; + rocblas_fill rocblas_uplo; + dplasma_potrf_workspace_t *wp; + int *d_iinfo; if( PlasmaLower == uplo ) - hipblas_uplo = HIPBLAS_FILL_MODE_LOWER; + rocblas_uplo = rocblas_fill_lower; if( PlasmaUpper == uplo ) - hipblas_uplo = HIPBLAS_FILL_MODE_UPPER; + rocblas_uplo = rocblas_fill_upper; dplasma_hip_handles_t *handles; handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); assert(NULL != handles); - status = rocsolver_zpotrf( handles->hipblas_handle, hipblas_uplo, tempkm, T, ldak, INFO); - assert( HIPBLAS_STATUS_SUCCESS == status ); + wp = parsec_info_get(&gpu_device->super.infos, hip_workspaces_infokey); + assert(NULL != wp); + d_iinfo = (int*)wp->tmpmem; + + status = rocsolver_zpotrf( handles->hipblas_handle, rocblas_uplo, tempkm, T, ldak, d_iinfo); + assert(rocblas_status_success == status ); } END @@ -271,25 +278,24 @@ END BODY [type=HIP weight=(m+k)] { - int tempmm = m == descA->mt - 1 ? descA->m - m * descA->mb : descA->mb; - int ldak = BLKLDD(descA, k); - int ldan = BLKLDD(descA, n); #if defined(PRECISION_z) || defined(PRECISION_c) - hipDoubleComplex zone = make_hipDoubleComplex( 1., 0.); + hipblasDoubleComplex zone = { 1., 0. }; #else double zone = 1.; #endif + int tempmm = m == descA->mt-1 ? 
descA->m - m * descA->mb : descA->mb; + int ldak = LDA(ddescA, T); + int ldam = LDA(ddescA, C); + hipblasStatus_t status; dplasma_hip_handles_t *handles; handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); assert(NULL != handles); - - hipblasStatus_t status; status = hipblasZtrsm(handles->hipblas_handle, HIPBLAS_SIDE_RIGHT, HIPBLAS_FILL_MODE_LOWER, HIPBLAS_OP_C, HIPBLAS_DIAG_NON_UNIT, tempmm, descA->nb, - &zone, T, ldak, C, ldan); + &zone, T, ldak, C, ldam); assert(HIPBLAS_STATUS_SUCCESS == status); } END @@ -403,20 +409,21 @@ END BODY [type=HIP weight=(m+k)] { - int tempmm = m == descA->mt-1 ? descA->m - m*descA->mb : descA->mb; - int ldam = BLKLDD( descA, m ); double zone = 1.; double mzone = -1.; - hipblasStatus_t status; + int tempmm = m == descA->mt-1 ? descA->m - m*descA->mb : descA->mb; + int ldam_A = LDA(ddescA, A); + int ldam_T = LDA(ddescA, T); + hipblasStatus_t status; dplasma_hip_handles_t *handles; handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); assert(NULL != handles); status = hipblasZherk( handles->hipblas_handle, HIPBLAS_FILL_MODE_LOWER, HIPBLAS_OP_N, tempmm, descA->mb, - &mzone, A, ldam, - &zone, T, ldam); + &mzone, A, ldam_A, + &zone, T, ldam_T); assert(HIPBLAS_STATUS_SUCCESS == status); } END @@ -546,30 +553,29 @@ BODY [type=HIP weight=(n+1-k)] { #if defined(PRECISION_z) || defined(PRECISION_c) - hipDoubleComplex zone = make_hipDoubleComplex( 1., 0.); - hipDoubleComplex mzone = make_hipDoubleComplex(-1., 0.); + hipblasDoubleComplex zone = { 1., 0. }; + hipblasDoubleComplex mzone = { -1., 0. }; #else double zone = 1.; double mzone = -1.; #endif int tempmm = m == descA->mt-1 ? 
descA->m - m * descA->mb : descA->mb; - int ldam = BLKLDD( descA, m ); - int ldan = BLKLDD( descA, n ); - - hipblasStatus_t status; + int ldam_A = LDA(ddescA, A); + int ldan_B = LDA(ddescA, B); + int ldam_C = LDA(ddescA, C); assert( ldan <= descA->mb ); assert( ldam <= descA->mb ); + hipblasStatus_t status; dplasma_hip_handles_t *handles; handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); assert(NULL != handles); - status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_N, HIPBLAS_OP_C, tempmm, descA->mb, descA->mb, - &mzone, (hipDoubleComplex*)A, ldam, - (hipDoubleComplex*)B, ldan, - &zone, (hipDoubleComplex*)C, ldam ); + &mzone, A, ldam_A, + B, ldan_B, + &zone, C, ldam_C ); assert(HIPBLAS_STATUS_SUCCESS == status); } END diff --git a/src/zpotrf_U.jdf b/src/zpotrf_U.jdf index ba3c6f6f..3a2570f9 100644 --- a/src/zpotrf_U.jdf +++ b/src/zpotrf_U.jdf @@ -16,13 +16,13 @@ extern "C" %{ #include "parsec/recursive.h" static void zpotrf_U_update_INFO(parsec_taskpool_t* _tp, const parsec_recursive_callback_t* data); +#include "potrf_gpu_workspaces.h" + #if defined(DPLASMA_HAVE_CUDA) #include -#include "potrf_cublas_utils.h" #endif /* defined(DPLASMA_HAVE_CUDA) */ #if defined(DPLASMA_HAVE_HIP) #include -#include #include #endif /* defined(DPLASMA_HAVE_HIP) */ @@ -87,6 +87,7 @@ smallnb [type = "int" hidden = on default = "descA->mb" ] cuda_handles_infokey [type = "int" hidden = on default = -1 ] cuda_workspaces_infokey [type = "int" hidden = on default = -1 ] hip_handles_infokey [type = "int" hidden = on default = -1 ] +hip_workspaces_infokey [type = "int" hidden = on default = -1 ] /************************************************** * potrf_zpotrf * @@ -173,26 +174,32 @@ BODY [type=CUDA } END -BODY [type=DISABLED_HIP - weigth=k] +BODY [type=HIP + weigth=k] { int tempkn = k == descA->nt-1 ? 
descA->n - k*descA->nb : descA->nb; - int ldak = BLKLDD( descA, k ); + int ldak = LDA(ddescA, T); - hipblasStatus_t status; - hipblasFillMode_t hipblas_uplo; + rocblas_status status; + rocblas_fill rocblas_uplo; + dplasma_potrf_workspace_t *wp; + int *d_iinfo; if( PlasmaLower == uplo ) - hipblas_uplo = HIPBLAS_FILL_MODE_LOWER; + rocblas_uplo = rocblas_fill_lower; if( PlasmaUpper == uplo ) - hipblas_uplo = HIPBLAS_FILL_MODE_UPPER; + rocblas_uplo = rocblas_fill_upper; dplasma_hip_handles_t *handles; handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); assert(NULL != handles); - status = rocsolver_zpotrf( handles->hipblas_handle, hipblas_uplo, tempkn, T, ldak, INFO); - assert( HIPBLAS_STATUS_SUCCESS == status ); + wp = parsec_info_get(&gpu_device->super.infos, hip_workspaces_infokey); + assert(NULL != wp); + d_iinfo = (int*)wp->tmpmem; + + status = rocsolver_zpotrf( handles->hipblas_handle, rocblas_uplo, tempkn, T, ldak, d_iinfo); + assert(rocblas_status_success == status ); } END @@ -273,23 +280,24 @@ END BODY [type=HIP weight=(k+n)] { - int tempnn = n == descA->nt - 1 ? descA->n - n * descA->nb : descA->nb; - int ldak = BLKLDD(descA, k); - dplasma_hip_handles_t *handles; #if defined(PRECISION_z) || defined(PRECISION_c) - hipDoubleComplex zone = make_hipDoubleComplex( 1., 0.); + hipblasDoubleComplex zone = { 1., 0. }; #else double zone = 1.; #endif + int tempnn = n == descA->nt - 1 ? 
descA->n - n * descA->nb : descA->nb; + int ldak_T = LDA(ddescA, T); + int ldak_C = LDA(ddescA, C); hipblasStatus_t status; + dplasma_hip_handles_t *handles; handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); assert(NULL != handles); status = hipblasZtrsm(handles->hipblas_handle, HIPBLAS_SIDE_LEFT, HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_C, HIPBLAS_DIAG_NON_UNIT, - tempnn, descA->mb, - &zone, T, ldak, C, ldak); + descA->mb, tempnn, + &zone, T, ldak_T, C, ldak_C); assert(HIPBLAS_STATUS_SUCCESS == status); } END @@ -405,18 +413,18 @@ END BODY [type=HIP weight=(k+n)] { - int tempnn = n == descA->nt-1 ? descA->n - n*descA->nb : descA->nb; - int ldan = BLKLDD( descA, n ); - int ldak = BLKLDD( descA, k ); double zone = 1.; double mzone = -1.; - hipblasStatus_t status; + int tempnn = n == descA->nt-1 ? descA->n - n*descA->nb : descA->nb; + int ldak = LDA(ddescA, A ); + int ldan = LDA(ddescA, T ); + hipblasStatus_t status; dplasma_hip_handles_t *handles; handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); assert(NULL != handles); - status = hipblasZherk( handles->hipblas_handle, - HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_C, + status = hipblasZherk(handles->hipblas_handle, + HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_T, tempnn, descA->mb, &mzone, A, ldak, &zone, T, ldan); @@ -560,32 +568,31 @@ BODY [type=HIP weight=(m+1-k)] { #if defined(PRECISION_z) || defined(PRECISION_c) - hipDoubleComplex zone = make_hipDoubleComplex( 1., 0.); - hipDoubleComplex mzone = make_hipDoubleComplex(-1., 0.); + hipblasDoubleComplex zone = { 1., 0. }; + hipblasDoubleComplex mzone = { -1., 0. }; #else double zone = 1.; double mzone = -1.; #endif int tempnn = n == descA->nt-1 ? 
descA->n - n * descA->nb : descA->nb; - int ldak = BLKLDD( descA, k ); - int ldam = BLKLDD( descA, m ); + int ldak_A = LDA(ddescA, A); + int ldak_B = LDA(ddescA, B); + int ldam_C = LDA(ddescA, C); + assert( ldak_A <= descA->mb ); + assert( ldak_B <= descA->mb ); + assert( ldam_C <= descA->mb ); hipblasStatus_t status; - assert( ldak <= descA->mb ); - assert( ldam <= descA->mb ); - dplasma_hip_handles_t *handles; handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); assert(NULL != handles); - status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_C, HIPBLAS_OP_N, - descA->mb, tempnn, descA->mb, - &mzone, (hipDoubleComplex*)A, ldak, - (hipDoubleComplex*)B, ldak, - &zone, (hipDoubleComplex*)C, ldam ); - + descA->mb, tempnn, descA->nb, + &mzone, A, ldak_A, + B, ldak_B, + &zone, C, ldam_C); assert(HIPBLAS_STATUS_SUCCESS == status); } END diff --git a/src/zpotrf_wrapper.c b/src/zpotrf_wrapper.c index 351986a9..6fcadf3b 100644 --- a/src/zpotrf_wrapper.c +++ b/src/zpotrf_wrapper.c @@ -13,7 +13,7 @@ #include "dplasma/types.h" #include "dplasma/types_lapack.h" #include "dplasmaaux.h" -#include "potrf_cublas_utils.h" +#include "potrf_gpu_workspaces.h" #include "parsec/utils/zone_malloc.h" #include "zpotrf_U.h" @@ -56,7 +56,7 @@ dplasma_zpotrf_setrecursive( parsec_taskpool_t *tp, int hmb ) } #if defined(DPLASMA_HAVE_CUDA) -void *zpotrf_create_workspace(void *obj, void *user) +static void *zpotrf_create_cuda_workspace(void *obj, void *user) { parsec_device_module_t *mod = (parsec_device_module_t *)obj; zone_malloc_t *memory = ((parsec_device_gpu_module_t*)mod)->memory; @@ -94,7 +94,33 @@ void *zpotrf_create_workspace(void *obj, void *user) return wp; } -static void destroy_workspace(void *_ws, void *_n) +static void zpotrf_destroy_cuda_workspace(void *_ws, void *_n) +{ + dplasma_potrf_workspace_t *ws = (dplasma_potrf_workspace_t*)_ws; + zone_free((zone_malloc_t*)ws->memory, ws->tmpmem); + free(ws); + (void)_n; +} +#endif + +#if defined(DPLASMA_HAVE_HIP) 
+static void *zpotrf_create_hip_workspace(void *obj, void *user) +{ + parsec_device_module_t *mod = (parsec_device_module_t *)obj; + zone_malloc_t *memory = ((parsec_device_gpu_module_t*)mod)->memory; + dplasma_potrf_workspace_t *wp = NULL; + (void)user; + + wp = (dplasma_potrf_workspace_t*)malloc(sizeof(dplasma_potrf_workspace_t)); + wp->tmpmem = zone_malloc(memory, sizeof(int)); + assert(NULL != wp->tmpmem); + wp->lwork = 0; + wp->memory = memory; + + return wp; +} + +static void zpotrf_destroy_hip_workspace(void *_ws, void *_n) { dplasma_potrf_workspace_t *ws = (dplasma_potrf_workspace_t*)_ws; zone_free((zone_malloc_t*)ws->memory, ws->tmpmem); @@ -175,7 +201,7 @@ dplasma_zpotrf_New( dplasma_enum_t uplo, int *info ) { parsec_zpotrf_L_taskpool_t *parsec_zpotrf = NULL; -#if defined(DPLASMA_HAVE_CUDA) +#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP) char workspace_info_name[64]; static int uid = 0; #endif @@ -206,8 +232,8 @@ dplasma_zpotrf_New( dplasma_enum_t uplo, parsec_zpotrf->_g_cuda_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL); snprintf(workspace_info_name, 64, "DPLASMA::ZPOTRF(%d)::WS", uid++); parsec_zpotrf->_g_cuda_workspaces_infokey = parsec_info_register(&parsec_per_device_infos, workspace_info_name, - destroy_workspace, NULL, - zpotrf_create_workspace, parsec_zpotrf, + zpotrf_destroy_cuda_workspace, NULL, + zpotrf_create_cuda_workspace, parsec_zpotrf, NULL); #else parsec_zpotrf->_g_cuda_handles_infokey = PARSEC_INFO_ID_UNDEFINED; @@ -218,8 +244,14 @@ dplasma_zpotrf_New( dplasma_enum_t uplo, /* It doesn't cost anything to define these infos if we have HIP but * don't have GPUs on the current machine, so we do it non-conditionally */ parsec_zpotrf->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); + snprintf(workspace_info_name, 64, "DPLASMA::ZPOTRF(%d)::WS", uid++); + parsec_zpotrf->_g_hip_workspaces_infokey = 
parsec_info_register(&parsec_per_device_infos, workspace_info_name, + zpotrf_destroy_hip_workspace, NULL, + zpotrf_create_hip_workspace, parsec_zpotrf, + NULL); #else parsec_zpotrf->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; + parsec_zpotrf->_g_hip_workspaces_infokey = PARSEC_INFO_ID_UNDEFINED; #endif int shape = 0; dplasma_setup_adtt_all_loc( ddc_A, @@ -261,6 +293,9 @@ dplasma_zpotrf_Destruct( parsec_taskpool_t *tp ) #if defined(DPLASMA_HAVE_CUDA) parsec_info_unregister(&parsec_per_device_infos, parsec_zpotrf->_g_cuda_workspaces_infokey, NULL); #endif +#if defined(DPLASMA_HAVE_HIP) + parsec_info_unregister(&parsec_per_device_infos, parsec_zpotrf->_g_hip_workspaces_infokey, NULL); +#endif parsec_taskpool_free(tp); /* free the dplasma_data_collection_t */ From 3afda75dab1eeb701248ba5263a5f057e6528f70 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Wed, 24 Nov 2021 12:29:43 -0500 Subject: [PATCH 17/41] hip: remove unecessary hiblas init calls Signed-off-by: Aurelien Bouteiller --- tests/common.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/common.c b/tests/common.c index 7c6a8a54..d13ab37f 100644 --- a/tests/common.c +++ b/tests/common.c @@ -740,10 +740,6 @@ parsec_context_t* setup_parsec(int argc, char **argv, int *iparam) } #endif #if defined(DPLASMA_HAVE_HIP) - /* Unsupported/unecessary - hipblasStatus_t status = hipblasInit(); - assert(HIPBLAS_STATUS_SUCCESS == status); - */ parsec_info_register(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", destroy_hip_handles, NULL, dplasma_create_hip_handles, NULL, @@ -764,9 +760,6 @@ void cleanup_parsec(parsec_context_t* parsec, int *iparam) #if defined(DPLASMA_HAVE_HIP) parsec_info_id_t iid = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); parsec_info_unregister(&parsec_per_stream_infos, iid, NULL); - /* Unsupported/unecessary - hipblasShutdown(); - */ #endif parsec_fini(&parsec); From 9b92289e2a996e5c96acc866b9500c357c3fb545 Mon Sep 17 00:00:00 2001 From: 
Aurelien Bouteiller Date: Mon, 29 Nov 2021 10:21:25 -0500 Subject: [PATCH 18/41] hip:po:errors in ldam asserts Signed-off-by: Aurelien Bouteiller --- src/zpotrf_L.jdf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf index 08e9e07b..18bf5a6a 100644 --- a/src/zpotrf_L.jdf +++ b/src/zpotrf_L.jdf @@ -563,8 +563,9 @@ BODY [type=HIP int ldam_A = LDA(ddescA, A); int ldan_B = LDA(ddescA, B); int ldam_C = LDA(ddescA, C); - assert( ldan <= descA->mb ); - assert( ldam <= descA->mb ); + assert( ldam_A <= descA->mb ); + assert( ldan_B <= descA->mb ); + assert( ldam_C <= descA->mb ); hipblasStatus_t status; dplasma_hip_handles_t *handles; From d41c3b62670c29202b036a79a24d239dd19ae825 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Mon, 29 Nov 2021 14:23:29 -0500 Subject: [PATCH 19/41] hip:po: some of the changes had broken cusolver Signed-off-by: Aurelien Bouteiller --- src/zpotrf_L.jdf | 1 + src/zpotrf_U.jdf | 2 +- src/zpotrf_wrapper.c | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf index 18bf5a6a..1de6f3f8 100644 --- a/src/zpotrf_L.jdf +++ b/src/zpotrf_L.jdf @@ -20,6 +20,7 @@ static void zpotrf_L_update_INFO(parsec_taskpool_t* _tp, const parsec_recursive_ #if defined(DPLASMA_HAVE_CUDA) #include +#include #endif /* defined(DPLASMA_HAVE_CUDA) */ #if defined(DPLASMA_HAVE_HIP) #include diff --git a/src/zpotrf_U.jdf b/src/zpotrf_U.jdf index 3a2570f9..ddf55e6e 100644 --- a/src/zpotrf_U.jdf +++ b/src/zpotrf_U.jdf @@ -20,6 +20,7 @@ static void zpotrf_U_update_INFO(parsec_taskpool_t* _tp, const parsec_recursive_ #if defined(DPLASMA_HAVE_CUDA) #include +#include #endif /* defined(DPLASMA_HAVE_CUDA) */ #if defined(DPLASMA_HAVE_HIP) #include @@ -404,7 +405,6 @@ BODY [type=CUDA] tempnn, descA->mb, &mzone, A, ldak, &zone, T, ldan); - PARSEC_CUDA_CHECK_ERROR( "cublasZherk_v2 ", status, {return -1;} ); } diff --git a/src/zpotrf_wrapper.c b/src/zpotrf_wrapper.c index 
6fcadf3b..d002220e 100644 --- a/src/zpotrf_wrapper.c +++ b/src/zpotrf_wrapper.c @@ -56,6 +56,8 @@ dplasma_zpotrf_setrecursive( parsec_taskpool_t *tp, int hmb ) } #if defined(DPLASMA_HAVE_CUDA) +#include + static void *zpotrf_create_cuda_workspace(void *obj, void *user) { parsec_device_module_t *mod = (parsec_device_module_t *)obj; From 0db93e75f04b47be216b6ae0c7d3db5cb8b88da9 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Mon, 29 Nov 2021 15:21:12 -0500 Subject: [PATCH 20/41] fix printlogcuda/hip Signed-off-by: Aurelien Bouteiller --- src/dplasmajdf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dplasmajdf.h b/src/dplasmajdf.h index a6a5c752..9df15ba7 100644 --- a/src/dplasmajdf.h +++ b/src/dplasmajdf.h @@ -24,9 +24,9 @@ # define printlog(str, ...) fprintf(stderr, "thread %d VP %d " str "\n", \ es->th_id, es->virtual_process->vp_id, __VA_ARGS__) # define printlogcuda(str, ...) fprintf(stderr, "cuda %d " str "\n", \ - gpu_device->cuda_index, __VA_ARGS__) + gpu_device->super.device_index, __VA_ARGS__) # define printloghip(str, ...) fprintf(stderr, "hip %d " str "\n", \ - gpu_device->hip_index, __VA_ARGS__) + gpu_device->super.device_index, __VA_ARGS__) #else # define printlog(...) do {} while(0) # define printlogcuda(...) 
do {} while(0) From ecc4e77a9458a5a1739049756cfcb91547d4abe3 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Fri, 3 Dec 2021 11:41:00 -0500 Subject: [PATCH 21/41] Auto-generate hip stage-in/out functions Use proper error checks instead of asserts --- src/CMakeLists.txt | 19 ++++ src/cuda/README.md | 3 + src/cuda/lapack_cuda_stage_in.c | 165 ++++++++++++++++++++++++++++ src/dplasmaaux.h | 24 ++++- src/dplasmajdf_lapack_dtt.h | 184 ++++++-------------------------- src/potrf_gpu_workspaces.h | 2 +- src/zgemm_NN_gpu.jdf | 3 +- src/zgemm_NN_summa.jdf | 97 ++++++++++------- src/zgemm_NT.jdf | 4 +- src/zgemm_NT_summa.jdf | 4 +- src/zgemm_TN.jdf | 4 +- src/zgemm_TN_summa.jdf | 4 +- src/zgemm_TT.jdf | 4 +- src/zgemm_TT_summa.jdf | 4 +- src/zgeqrf.jdf | 4 +- src/zgetrf_nopiv.jdf | 4 +- src/zpotrf_L.jdf | 18 ++-- src/zpotrf_U.jdf | 18 ++-- src/zpotrf_wrapper.c | 12 +-- 19 files changed, 347 insertions(+), 230 deletions(-) create mode 100644 src/cuda/README.md create mode 100644 src/cuda/lapack_cuda_stage_in.c diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9d543639..9cbac5b4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -23,10 +23,29 @@ set(EXTRA_SOURCES utils/dplasma_arena_datatype.c utils/dplasma_lapack_adtt.c utils/dplasma_info.c + cuda/lapack_cuda_stage_in.c ) if( NOT DPLASMA_HAVE_COMPLEX_H ) list(APPEND EXTRA_SOURCES complex.c) endif() +if( DPLASMA_HAVE_HIP ) + FILE(GLOB cuda_sources cuda/[^\\.]*.[ch]) + find_package(Perl REQUIRED) + find_program(HIPIFY_PERL_COMMAND NAMES hipify-perl HINTS ${HIP_BIN_INSTALL_DIR} REQUIRED) + foreach(cuda_file ${cuda_sources}) + file(RELATIVE_PATH cuda_filename ${CMAKE_CURRENT_SOURCE_DIR}/cuda ${cuda_file}) + string(REPLACE cuda hip hip_file ${cuda_filename}) + string(PREPEND hip_file "${CMAKE_CURRENT_BINARY_DIR}/hip/") + add_custom_command(OUTPUT ${hip_file} + DEPENDS ${cuda_file} # do not use MAIN_DEPENDENCY, that overides the default .c.o rule + COMMAND ${CMAKE_COMMAND} -E copy "${cuda_file}" 
"${hip_file}.prehip" + COMMAND ${PERL_EXECUTABLE} ${HIPIFY_PERL_COMMAND} --inplace --print-stats "${hip_file}" + COMMAND ${PERL_EXECUTABLE} -i -pe "s{(cuda)}{ substr uc hip | (uc \$1 ^ \$1), 0, 3 }egi" "${hip_file}" VERBATIM) # Convert all remaining cuda/CUDA + if(${hip_file} MATCHES [^\\.]*.c) # do not add .h to sources + list(APPEND EXTRA_SOURCES ${hip_file}) + endif() + endforeach() +endif( DPLASMA_HAVE_HIP ) ### Generate .c files from .jdf for all required precisions set(JDF diff --git a/src/cuda/README.md b/src/cuda/README.md new file mode 100644 index 00000000..1b5a22c5 --- /dev/null +++ b/src/cuda/README.md @@ -0,0 +1,3 @@ +This directory contains files that are automatically converted from CUDA to HIP using Hipify. +If your file is not automatically convertible, put it somewhere else. + diff --git a/src/cuda/lapack_cuda_stage_in.c b/src/cuda/lapack_cuda_stage_in.c new file mode 100644 index 00000000..4e2c4a4a --- /dev/null +++ b/src/cuda/lapack_cuda_stage_in.c @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2020-2021 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * + * $COPYRIGHT + * + */ + +#include "dplasma.h" +#include "dplasmajdf_lapack_dtt.h" + +#if defined(DPLASMA_HAVE_CUDA) +#include +#include + +/* Use cudaMemcpy2DAsync or loop with cudaMemcpyAsync for data transfers to device */ +#define USE_COPY_2D + +int +dplasma_cuda_lapack_stage_in(parsec_gpu_task_t *gtask, + uint32_t flow_mask, + parsec_gpu_exec_stream_t *gpu_stream) +{ + cudaError_t ret; + parsec_data_copy_t * copy_in; + parsec_data_copy_t * copy_out; + parsec_device_gpu_module_t *in_elem_dev; + parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream; + dplasma_data_collection_t * ddc; + parsec_task_t *task = gtask->ec; + int elem_sz; + int i; + for(i = 0; i < task->task_class->nb_flows; i++){ + if(flow_mask & (1U << i)){ + copy_in = task->data[i].data_in; + copy_out = task->data[i].data_out; + ddc = (dplasma_data_collection_t*)gtask->flow_dc[i]; + assert(ddc != NULL); + elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype); + in_elem_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get( copy_in->device_index); + if( (in_elem_dev->super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){ + ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private, + copy_in->device_private, + gtask->flow_nb_elts[i], + (in_elem_dev->super.type != PARSEC_DEV_CUDA)? 
+ cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice, + cuda_stream->cuda_stream); + PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); + }else{ + +#ifdef USE_COPY_2D + int ldd, nrows, ncols; + ADTT_INFO_internal(copy_in, ddc, &ldd, &nrows, &ncols); + size_t dpitch = ddc->dc_original->mb * elem_sz; + size_t spitch = ldd * elem_sz; + size_t width = nrows * elem_sz; + size_t height = ncols; + /* copy width bytes heigth times, skipping pitch - width bytes every time */ + ret = (cudaError_t)cudaMemcpy2DAsync( copy_out->device_private, + dpitch, /*dst pitch bytes*/ + copy_in->device_private, + spitch, /*src pitch bytes*/ + width, height, + cudaMemcpyHostToDevice, + cuda_stream->cuda_stream ); + PARSEC_CUDA_CHECK_ERROR( "cudaMemcpy2DAsync ", ret, { return PARSEC_ERROR; } ); + + +#else + + int ldd, nrows, ncols; + ADTT_INFO_internal(copy_in, ddc, &ldd, &nrows, &ncols); + + int j; + for(j=0; jdevice_private) + j * ldd * elem_sz; + char*dst = ((char*)copy_out->device_private) + j * ddc->dc_original->mb * elem_sz; + ret = cudaMemcpyAsync(dst, + src, + nrows * elem_sz, + cudaMemcpyHostToDevice, + cuda_stream->cuda_stream ); + PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); + + } +#endif + + + } + } + } + return PARSEC_SUCCESS; +} + +int +dplasma_cuda_lapack_stage_out(parsec_gpu_task_t *gtask, + uint32_t flow_mask, + parsec_gpu_exec_stream_t *gpu_stream) +{ + cudaError_t ret; + parsec_data_copy_t * copy_in; + parsec_data_copy_t * copy_out; + parsec_device_gpu_module_t *out_elem_dev; + parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream; + parsec_task_t *task = gtask->ec; + dplasma_data_collection_t * ddc; + int elem_sz; + int i; + for(i = 0; i < task->task_class->nb_flows; i++){ + if(flow_mask & (1U << i)){ + copy_in = task->data[i].data_out; + copy_out = copy_in->original->device_copies[0]; + ddc = (dplasma_data_collection_t*)gtask->flow_dc[i]; + assert(ddc != NULL); + elem_sz = 
parsec_datadist_getsizeoftype(ddc->dc_original->mtype); + out_elem_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get( copy_out->device_index); + + if( (out_elem_dev->super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){ + ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private, + copy_in->device_private, + gtask->flow_nb_elts[i], + out_elem_dev->super.type != PARSEC_DEV_CUDA ? + cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice, + cuda_stream->cuda_stream); + PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); + }else{ + +#ifdef USE_COPY_2D + int ldd, nrows, ncols; + ADTT_INFO_internal(copy_out, ddc, &ldd, &nrows, &ncols); + size_t dpitch = ldd * elem_sz; + size_t spitch = ddc->dc_original->mb * elem_sz; + size_t width = nrows * elem_sz; + size_t height = ncols; + /* copy width bytes heigth times, skipping pitch - width bytes every time */ + ret = (cudaError_t)cudaMemcpy2DAsync( copy_out->device_private, + dpitch, /*dst pitch bytes*/ + copy_in->device_private, + spitch, /*src pitch bytes*/ + width, height, + cudaMemcpyDeviceToHost, + cuda_stream->cuda_stream); + PARSEC_CUDA_CHECK_ERROR( "cudaMemcpy2DAsync ", ret, { return PARSEC_ERROR; } ); +#else + int ldd, nrows, ncols; + ADTT_INFO_internal(copy_out, ddc, &ldd, &nrows, &ncols); + int j; + for(j=0; jdevice_private) + j * ddc->dc_original->mb * elem_sz; + char*dst = ((char*)copy_out->device_private) + j * ldd * elem_sz; + ret = cudaMemcpyAsync(dst, + src, + nrows * elem_sz, + cudaMemcpyDeviceToHost, + cuda_stream->cuda_stream); + PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); + } +#endif + } + } + } + return PARSEC_SUCCESS; +} +#endif /* defined(DPLASMA_HAVE_CUDA) */ diff --git a/src/dplasmaaux.h b/src/dplasmaaux.h index 91a92624..39055d91 100644 --- a/src/dplasmaaux.h +++ b/src/dplasmaaux.h @@ -122,10 +122,32 @@ void *dplasma_create_cuda_handles(void *obj, void *user); #if defined(DPLASMA_HAVE_HIP) #include 
#include "parsec/mca/device/hip/device_hip.h" + +#define DPLASMA_ROCBLAS_CHECK_ERROR(STR, ERROR, CODE) \ + do { \ + rocblas_status __error = (rocblas_status) (ERROR); \ + if(rocblas_status_success != __error) { \ + parsec_warning( "%s:%d %s%s", __FILE__, __LINE__, \ + (STR), rocblas_status_to_string(__error)); \ + CODE; \ + } \ + } while(0) + +/* For some reason the error values are not the same... */ +#define DPLASMA_HIPBLAS_CHECK_ERROR(STR, ERROR, CODE) \ + do { \ + hipblasStatus_t __error = (hipblasStatus_t) (ERROR); \ + if(HIPBLAS_STATUS_SUCCESS != __error) { \ + parsec_warning( "%s:%d %s%s", __FILE__, __LINE__, \ + (STR), hipblasStatusToString(__error)); \ + CODE; \ + } \ + } while(0) + typedef struct { hipblasHandle_t hipblas_handle; } dplasma_hip_handles_t; void *dplasma_create_hip_handles(void *obj, void *user); -#endif +#endif /* defined(DPLASMA_HAVE_HIP) */ #endif /* _DPLASMAAUX_H_INCLUDED */ diff --git a/src/dplasmajdf_lapack_dtt.h b/src/dplasmajdf_lapack_dtt.h index 11706b23..deab7380 100644 --- a/src/dplasmajdf_lapack_dtt.h +++ b/src/dplasmajdf_lapack_dtt.h @@ -1,11 +1,22 @@ -#ifndef _DPLASMAJDF_LAPACK_DTT_H_ -#define _DPLASMAJDF_LAPACK_DTT_H_ +/* + * Copyright (c) 2020-2021 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * $COPYRIGHT + * + */ +#ifndef INCLUDE_DPLASMA_LAPACK_DTT_H +#define INCLUDE_DPLASMA_LAPACK_DTT_H +#include "dplasma/config.h" +#include +#include #include "dplasma/types.h" #include "dplasma/types_lapack.h" /* DON'T CHANGE SHAPE */ -#define SAME -1 +#define DPLASMA_SHAPE_SAME -1 /* Obtain location on matrix. 
*/ @@ -80,7 +91,7 @@ parsec_arena_datatype_t* ADTT_CP(parsec_data_copy_t *cp, const dplasma_data_coll rc = dplasma_get_info_from_datatype(ddc, cp->dtt, &info, &adt); assert(rc == 0); - if(( info.shape == target_shape )||(target_shape == SAME)){ + if(( info.shape == target_shape )||(target_shape == DPLASMA_SHAPE_SAME)){ PARSEC_DEBUG_VERBOSE(8, parsec_debug_output, "CP %p [type %p] -> target_shape %d target_loc %d dtt %p", cp, cp->dtt, target_shape, target_loc, adt->opaque_dtt); @@ -119,159 +130,32 @@ void ADTT_INFO_internal(parsec_data_copy_t *cp, const dplasma_data_collection_t ADTT_INFO_internal(_f_##FLOW_NAME, ddc, lda, rows, cols) -#if defined(DPLASMA_HAVE_CUDA) -/* Use cudaMemcpy2DAsync or loop with cudaMemcpyAsync for data transfers to device */ -#define CUDA_COPY_2D /* Functions to transfer data in and out of the GPU. * Assuming a full tiled has been allocated on the GPU (mb*nb*size(elem)) */ -static int -stage_in_lapack(parsec_gpu_task_t *gtask, +#if defined(DPLASMA_HAVE_CUDA) +int +dplasma_cuda_lapack_stage_in(parsec_gpu_task_t *gtask, uint32_t flow_mask, - parsec_gpu_exec_stream_t *gpu_stream) -{ - cudaError_t ret; - parsec_data_copy_t * copy_in; - parsec_data_copy_t * copy_out; - parsec_device_gpu_module_t *in_elem_dev; - parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream; - dplasma_data_collection_t * ddc; - parsec_task_t *task = gtask->ec; - int elem_sz; - int i; - for(i = 0; i < task->task_class->nb_flows; i++){ - if(flow_mask & (1U << i)){ - copy_in = task->data[i].data_in; - copy_out = task->data[i].data_out; - ddc = (dplasma_data_collection_t*)gtask->flow_dc[i]; - assert(ddc != NULL); - elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype); - in_elem_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get( copy_in->device_index); - if( (in_elem_dev->super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)) { - ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private, - 
copy_in->device_private, - gtask->flow_nb_elts[i], - (in_elem_dev->super.super.type != PARSEC_DEV_CUDA)? - cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice, - cuda_stream->cuda_stream); - PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); - }else{ - -#ifdef CUDA_COPY_2D - int ldd, nrows, ncols; - ADTT_INFO_internal(copy_in, ddc, &ldd, &nrows, &ncols); - size_t dpitch = ddc->dc_original->mb * elem_sz; - size_t spitch = ldd * elem_sz; - size_t width = nrows * elem_sz; - size_t height = ncols; - /* copy width bytes heigth times, skipping pitch - width bytes every time */ - ret = (cudaError_t)cudaMemcpy2DAsync( copy_out->device_private, - dpitch, /*dst pitch bytes*/ - copy_in->device_private, - spitch, /*src pitch bytes*/ - width, height, - cudaMemcpyHostToDevice, - cuda_stream->cuda_stream ); - PARSEC_CUDA_CHECK_ERROR( "cudaMemcpy2DAsync ", ret, { return PARSEC_ERROR; } ); - - -#else - - int ldd, nrows, ncols; - ADTT_INFO_internal(copy_in, ddc, &ldd, &nrows, &ncols); - - int j; - for(j=0; jdevice_private) + j * ldd * elem_sz; - char*dst = ((char*)copy_out->device_private) + j * ddc->dc_original->mb * elem_sz; - ret = cudaMemcpyAsync(dst, - src, - nrows * elem_sz, - cudaMemcpyHostToDevice, - cuda_stream->cuda_stream ); - PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); - - } -#endif - + parsec_gpu_exec_stream_t *gpu_stream); - } - } - } - return PARSEC_SUCCESS; -} - -static int -stage_out_lapack(parsec_gpu_task_t *gtask, +int +dplasma_cuda_lapack_stage_out(parsec_gpu_task_t *gtask, uint32_t flow_mask, - parsec_gpu_exec_stream_t *gpu_stream) -{ - cudaError_t ret; - parsec_data_copy_t * copy_in; - parsec_data_copy_t * copy_out; - parsec_device_gpu_module_t *out_elem_dev; - parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream; - parsec_task_t *task = gtask->ec; - dplasma_data_collection_t * ddc; - int elem_sz; - int i; - for(i = 0; i < task->task_class->nb_flows; i++){ - if(flow_mask & (1U 
<< i)){ - copy_in = task->data[i].data_out; - copy_out = copy_in->original->device_copies[0]; - ddc = (dplasma_data_collection_t*)gtask->flow_dc[i]; - assert(ddc != NULL); - elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype); - out_elem_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get( copy_out->device_index); - - if( (out_elem_dev->super.super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){ - ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private, - copy_in->device_private, - gtask->flow_nb_elts[i], - out_elem_dev->super.super.type != PARSEC_DEV_CUDA ? - cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice, - cuda_stream->cuda_stream); - PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); - }else{ - -#ifdef CUDA_COPY_2D - int ldd, nrows, ncols; - ADTT_INFO_internal(copy_out, ddc, &ldd, &nrows, &ncols); - size_t dpitch = ldd * elem_sz; - size_t spitch = ddc->dc_original->mb * elem_sz; - size_t width = nrows * elem_sz; - size_t height = ncols; - /* copy width bytes heigth times, skipping pitch - width bytes every time */ - ret = (cudaError_t)cudaMemcpy2DAsync( copy_out->device_private, - dpitch, /*dst pitch bytes*/ - copy_in->device_private, - spitch, /*src pitch bytes*/ - width, height, - cudaMemcpyDeviceToHost, - cuda_stream->cuda_stream); - PARSEC_CUDA_CHECK_ERROR( "cudaMemcpy2DAsync ", ret, { return PARSEC_ERROR; } ); -#else - int ldd, nrows, ncols; - ADTT_INFO_internal(copy_out, ddc, &ldd, &nrows, &ncols); - int j; - for(j=0; jdevice_private) + j * ddc->dc_original->mb * elem_sz; - char*dst = ((char*)copy_out->device_private) + j * ldd * elem_sz; - ret = cudaMemcpyAsync(dst, - src, - nrows * elem_sz, - cudaMemcpyDeviceToHost, - cuda_stream->cuda_stream); - PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); - } -#endif - } - } - } - return PARSEC_SUCCESS; -} + parsec_gpu_exec_stream_t *gpu_stream); #endif /* defined(DPLASMA_HAVE_CUDA) */ +#if 
defined(DPLASMA_HAVE_HIP) +int +dplasma_hip_lapack_stage_in(parsec_gpu_task_t *gtask, + uint32_t flow_mask, + parsec_gpu_exec_stream_t *gpu_stream); + +int +dplasma_hip_lapack_stage_out(parsec_gpu_task_t *gtask, + uint32_t flow_mask, + parsec_gpu_exec_stream_t *gpu_stream); +#endif /* defined(DPLASMA_HAVE_HIP) */ -#endif /* _DPLASMAJDF_LAPACK_DTT_H_ */ +#endif /* INCLUDE_DPLASMA_LAPACK_DTT_H */ diff --git a/src/potrf_gpu_workspaces.h b/src/potrf_gpu_workspaces.h index f182d04c..ab4b1de9 100644 --- a/src/potrf_gpu_workspaces.h +++ b/src/potrf_gpu_workspaces.h @@ -11,6 +11,6 @@ typedef struct { char *tmpmem; void *memory; int lwork; -} dplasma_potrf_workspace_t; +} dplasma_potrf_gpu_workspaces_t; #endif //DPLASMA_POTRF_GPU_WORKSPACES_H diff --git a/src/zgemm_NN_gpu.jdf b/src/zgemm_NN_gpu.jdf index 8f668622..962b88e4 100644 --- a/src/zgemm_NN_gpu.jdf +++ b/src/zgemm_NN_gpu.jdf @@ -473,8 +473,7 @@ BODY [type=HIP &lalpha, (hipDoubleComplex*)A, ldam, (hipDoubleComplex*)B, ldbk, &lbeta, (hipDoubleComplex*)C, ldcm ); - assert(HIPBLAS_STATUS_SUCCESS == status); - PARSEC_HIP_CHECK_ERROR( "hipblasZgemm ", status, + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_DONE;} ); /* Quick and dirty emulation of the next GEMM */ diff --git a/src/zgemm_NN_summa.jdf b/src/zgemm_NN_summa.jdf index 9fc01913..481b38d2 100644 --- a/src/zgemm_NN_summa.jdf +++ b/src/zgemm_NN_summa.jdf @@ -211,41 +211,14 @@ RW C <- (k == 0) ? ddescC(m, n) [ type = %{ return CTL ctla -> (k < (descA->nt-lookQ)) ? ctla RING_A(m, k+lookQ, n%Q) CTL ctlb -> (k < (descA->nt-lookP)) ? ctlb RING_B(k+lookP, n, m%P) -BODY -{ - - dplasma_complex64_t lbeta = (k == 0) ? beta : (dplasma_complex64_t)1.0; - int tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb; - int tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb; - int tempkk = k == descA->nt-1 ? 
descA->n - k * descA->nb : descA->nb; - int ldam = LDA(ddescA, A); - int ldbk = LDA(ddescB, B); - int ldcm = LDA(ddescC, C); - - CORE_zgemm(transA, transB, - tempmm, tempnn, tempkk, - alpha, A /*A(m, k)*/, ldam, - B /*B(k, n)*/, ldbk, - lbeta, C /*C(m, n)*/, ldcm); - - printlog("gemm( %d, %d, %d )\n" - " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", - m, n, k, - &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), - tempmm, tempnn, tempkk, - creal(alpha), m, k, ldam, - k, n, ldbk, - creal(lbeta), m, n, ldcm ); -} -END - BODY [type=CUDA + weight=(descA->nt-k) A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} A.dc=ddescA B.dc=ddescB C.dc=ddescC - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); @@ -262,6 +235,15 @@ BODY [type=CUDA int ldbk = descB->mb; int ldcm = descC->mb; + printlogcuda("gemm( %d, %d, %d )\n" + " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", + m, n, k, + &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), + tempmm, tempnn, tempkk, + creal(alpha), m, k, ldam, + k, n, ldbk, + creal(lbeta), m, n, ldcm ); + cublasStatus_t status; cublasSetKernelStream( parsec_body.stream ); cublasZgemm( dplasma_lapack_const(transA), dplasma_lapack_const(transB), @@ -281,13 +263,16 @@ BODY [type=HIP B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} A.dc=ddescA B.dc=ddescB C.dc=ddescC - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + 
stage_in=dplasma_hip_lapack_stage_in + stage_out=dplasma_hip_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) - hipblasDoubleComplex lalpha = make_hipDoubleComplex(creal(alpha), cimag(alpha)); - hipblasDoubleComplex lbeta = (k == 0) ? make_hipDoubleComplex(creal(beta), cimag(beta)) - : make_hipDoubleComplex(1.0, 0.0); + hipblasDoubleComplex lalpha; + lalpha.x = creal(alpha); lalpha.y = cimag(alpha); + hipblasDoubleComplex lbeta = { 1., 0. }; + if(k == 0) { + lbeta.x = creal(beta); lbeta.y = cimag(beta); + } #else double lalpha = alpha; double lbeta = (k == 0) ? beta : 1.0; @@ -299,6 +284,15 @@ BODY [type=HIP int ldbk = descB->mb; int ldcm = descC->mb; + printloghip("gemm( %d, %d, %d )\n" + " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", + m, n, k, + &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), + tempmm, tempnn, tempkk, + creal(alpha), m, k, ldam, + k, n, ldbk, + creal(lbeta), m, n, ldcm ); + hipblasStatus_t status; hipblasOperation_t opA = dplasmaNoTrans == transA? HIPBLAS_OP_N: HIPBLAS_OP_T; hipblasOperation_t opB = dplasmaNoTrans == transB? HIPBLAS_OP_N: HIPBLAS_OP_T; @@ -310,8 +304,35 @@ BODY [type=HIP &lalpha, (hipblasDoubleComplex*)A, ldam, (hipblasDoubleComplex*)B, ldbk, &lbeta, (hipblasDoubleComplex*)C, ldcm ); - assert(HIPBLAS_STATUS_SUCCESS == status); - PARSEC_HIP_CHECK_ERROR( "hipblasZgemm ", status, - {return PARSEC_HOOK_RETURN_DONE;} ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, + {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + +BODY +{ + + dplasma_complex64_t lbeta = (k == 0) ? beta : (dplasma_complex64_t)1.0; + int tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb; + int tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb; + int tempkk = k == descA->nt-1 ? 
descA->n - k * descA->nb : descA->nb; + int ldam = LDA(ddescA, A); + int ldbk = LDA(ddescB, B); + int ldcm = LDA(ddescC, C); + + CORE_zgemm(transA, transB, + tempmm, tempnn, tempkk, + alpha, A /*A(m, k)*/, ldam, + B /*B(k, n)*/, ldbk, + lbeta, C /*C(m, n)*/, ldcm); + + printlog("gemm( %d, %d, %d )\n" + " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", + m, n, k, + &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), + tempmm, tempnn, tempkk, + creal(alpha), m, k, ldam, + k, n, ldbk, + creal(lbeta), m, n, ldcm ); } END diff --git a/src/zgemm_NT.jdf b/src/zgemm_NT.jdf index 91def4a0..a6b876ef 100644 --- a/src/zgemm_NT.jdf +++ b/src/zgemm_NT.jdf @@ -169,8 +169,8 @@ BODY [type=CUDA B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} A.dc=ddescA B.dc=ddescB C.dc=ddescC - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); diff --git a/src/zgemm_NT_summa.jdf b/src/zgemm_NT_summa.jdf index 08aa5660..1bde8b01 100644 --- a/src/zgemm_NT_summa.jdf +++ b/src/zgemm_NT_summa.jdf @@ -211,8 +211,8 @@ BODY [type=CUDA B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} A.dc=ddescA B.dc=ddescB C.dc=ddescC - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); diff --git a/src/zgemm_TN.jdf b/src/zgemm_TN.jdf index 7232d3d9..5a0b7da4 100644 --- a/src/zgemm_TN.jdf +++ b/src/zgemm_TN.jdf @@ -169,8 +169,8 
@@ BODY [type=CUDA B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} A.dc=ddescA B.dc=ddescB C.dc=ddescC - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); diff --git a/src/zgemm_TN_summa.jdf b/src/zgemm_TN_summa.jdf index f1e86ba8..aa3a6584 100644 --- a/src/zgemm_TN_summa.jdf +++ b/src/zgemm_TN_summa.jdf @@ -210,8 +210,8 @@ BODY [type=CUDA B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} A.dc=ddescA B.dc=ddescB C.dc=ddescC - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); diff --git a/src/zgemm_TT.jdf b/src/zgemm_TT.jdf index 30d82d0c..9f752f49 100644 --- a/src/zgemm_TT.jdf +++ b/src/zgemm_TT.jdf @@ -169,8 +169,8 @@ BODY [type=CUDA B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} A.dc=ddescA B.dc=ddescB C.dc=ddescC - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); diff --git a/src/zgemm_TT_summa.jdf b/src/zgemm_TT_summa.jdf index 385021c1..2bd6f363 100644 --- a/src/zgemm_TT_summa.jdf +++ b/src/zgemm_TT_summa.jdf @@ -210,8 +210,8 @@ BODY [type=CUDA B.size=%{ return 
descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} A.dc=ddescA B.dc=ddescB C.dc=ddescC - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); diff --git a/src/zgeqrf.jdf b/src/zgeqrf.jdf index 103362b0..99181b56 100644 --- a/src/zgeqrf.jdf +++ b/src/zgeqrf.jdf @@ -472,8 +472,8 @@ BODY [type=CUDA device=%{ return n; %} V.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} T.size=%{ return descT->mb*descT->nb*parsec_datadist_getsizeoftype(descT->mtype);%} A1.dc=ddescA A2.dc=ddescA V.dc=ddescA T.dc=ddescT - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { dplasma_complex64_t *WORK, *WORKC; int tempmm = ((m)==(descA->mt-1)) ? 
(descA->m-(m*descA->mb)) : descA->mb; diff --git a/src/zgetrf_nopiv.jdf b/src/zgetrf_nopiv.jdf index 2fd2dfea..c11ace65 100644 --- a/src/zgetrf_nopiv.jdf +++ b/src/zgetrf_nopiv.jdf @@ -200,8 +200,8 @@ BODY [type=CUDA B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} C.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} A.dc=ddescA B.dc=ddescA C.dc=ddescA - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex zone = make_cuDoubleComplex( 1., 0.); diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf index 1de6f3f8..85ef5498 100644 --- a/src/zpotrf_L.jdf +++ b/src/zpotrf_L.jdf @@ -152,7 +152,7 @@ BODY [type=CUDA cusolverStatus_t status; cublasFillMode_t cublas_uplo; - dplasma_potrf_workspace_t *wp; + dplasma_potrf_gpu_workspaces_t *wp; cuDoubleComplex *workspace; int *d_iinfo; dplasma_cuda_handles_t *handles; @@ -184,7 +184,7 @@ BODY [type=HIP rocblas_status status; rocblas_fill rocblas_uplo; - dplasma_potrf_workspace_t *wp; + dplasma_potrf_gpu_workspaces_t *wp; int *d_iinfo; if( PlasmaLower == uplo ) @@ -201,7 +201,7 @@ BODY [type=HIP d_iinfo = (int*)wp->tmpmem; status = rocsolver_zpotrf( handles->hipblas_handle, rocblas_uplo, tempkm, T, ldak, d_iinfo); - assert(rocblas_status_success == status ); + DPLASMA_ROCBLAS_CHECK_ERROR("rocsolver_zpotrf", status, {return -1;}); } END @@ -297,7 +297,7 @@ BODY [type=HIP HIPBLAS_OP_C, HIPBLAS_DIAG_NON_UNIT, tempmm, descA->nb, &zone, T, ldak, C, ldam); - assert(HIPBLAS_STATUS_SUCCESS == status); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZtrsm", status, {return -1;}); } END @@ -425,7 +425,7 @@ BODY [type=HIP tempmm, descA->mb, &mzone, A, ldam_A, &zone, T, ldam_T); - assert(HIPBLAS_STATUS_SUCCESS == status); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZherk", status, {return -1;}); } END @@ -515,8 +515,8 @@ BODY [type=CUDA B.size=%{ 
return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} C.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} A.dc=ddescA B.dc=ddescA C.dc=ddescA - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex zone = make_cuDoubleComplex( 1., 0.); @@ -551,6 +551,8 @@ BODY [type=CUDA END BODY [type=HIP + stage_in=dplasma_hip_lapack_stage_in + stage_out=dplasma_hip_lapack_stage_out weight=(n+1-k)] { #if defined(PRECISION_z) || defined(PRECISION_c) @@ -578,7 +580,7 @@ BODY [type=HIP &mzone, A, ldam_A, B, ldan_B, &zone, C, ldam_C ); - assert(HIPBLAS_STATUS_SUCCESS == status); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm", status, {return -1;}); } END diff --git a/src/zpotrf_U.jdf b/src/zpotrf_U.jdf index ddf55e6e..d260f289 100644 --- a/src/zpotrf_U.jdf +++ b/src/zpotrf_U.jdf @@ -151,7 +151,7 @@ BODY [type=CUDA cusolverStatus_t status; cublasFillMode_t cublas_uplo; - dplasma_potrf_workspace_t *wp; + dplasma_potrf_gpu_workspaces_t *wp; cuDoubleComplex *workspace; int *d_iinfo; dplasma_cuda_handles_t *handles; @@ -183,7 +183,7 @@ BODY [type=HIP rocblas_status status; rocblas_fill rocblas_uplo; - dplasma_potrf_workspace_t *wp; + dplasma_potrf_gpu_workspaces_t *wp; int *d_iinfo; if( PlasmaLower == uplo ) @@ -200,7 +200,7 @@ BODY [type=HIP d_iinfo = (int*)wp->tmpmem; status = rocsolver_zpotrf( handles->hipblas_handle, rocblas_uplo, tempkn, T, ldak, d_iinfo); - assert(rocblas_status_success == status ); + DPLASMA_ROCBLAS_CHECK_ERROR("rocsolver_zpotrf", status, {return -1;}); } END @@ -299,7 +299,7 @@ BODY [type=HIP HIPBLAS_OP_C, HIPBLAS_DIAG_NON_UNIT, descA->mb, tempnn, &zone, T, ldak_T, C, ldak_C); - assert(HIPBLAS_STATUS_SUCCESS == status); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZtrsm", status, {return -1;}); } END @@ -428,7 +428,7 @@ BODY [type=HIP tempnn, descA->mb, &mzone, A, ldak, 
&zone, T, ldan); - assert(HIPBLAS_STATUS_SUCCESS == status); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZherk", status, {return -1;}); } END @@ -518,8 +518,8 @@ BODY [type=CUDA B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} C.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} A.dc=ddescA B.dc=ddescA C.dc=ddescA - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex zone = make_cuDoubleComplex( 1., 0.); @@ -565,6 +565,8 @@ BODY [type=CUDA END BODY [type=HIP + stage_in=dplasma_hip_lapack_stage_in + stage_out=dplasma_hip_lapack_stage_out weight=(m+1-k)] { #if defined(PRECISION_z) || defined(PRECISION_c) @@ -593,7 +595,7 @@ BODY [type=HIP &mzone, A, ldak_A, B, ldak_B, &zone, C, ldam_C); - assert(HIPBLAS_STATUS_SUCCESS == status); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm", status, {return -1;}); } END diff --git a/src/zpotrf_wrapper.c b/src/zpotrf_wrapper.c index d002220e..3eac244a 100644 --- a/src/zpotrf_wrapper.c +++ b/src/zpotrf_wrapper.c @@ -65,7 +65,7 @@ static void *zpotrf_create_cuda_workspace(void *obj, void *user) cusolverDnHandle_t cusolverDnHandle; cusolverStatus_t status; parsec_zpotrf_U_taskpool_t *tp = (parsec_zpotrf_U_taskpool_t*)user; - dplasma_potrf_workspace_t *wp = NULL; + dplasma_potrf_gpu_workspaces_t *wp = NULL; int workspace_size; int mb = tp->_g_descA->mb; int nb = tp->_g_descA->nb; @@ -87,7 +87,7 @@ static void *zpotrf_create_cuda_workspace(void *obj, void *user) cusolverDnDestroy(cusolverDnHandle); - wp = (dplasma_potrf_workspace_t*)malloc(sizeof(dplasma_potrf_workspace_t)); + wp = (dplasma_potrf_gpu_workspaces_t*)malloc(sizeof(dplasma_potrf_gpu_workspaces_t)); wp->tmpmem = zone_malloc(memory, workspace_size * elt_size + sizeof(int)); assert(NULL != wp->tmpmem); wp->lwork = workspace_size; @@ -98,7 +98,7 @@ static void 
*zpotrf_create_cuda_workspace(void *obj, void *user) static void zpotrf_destroy_cuda_workspace(void *_ws, void *_n) { - dplasma_potrf_workspace_t *ws = (dplasma_potrf_workspace_t*)_ws; + dplasma_potrf_gpu_workspaces_t *ws = (dplasma_potrf_gpu_workspaces_t*)_ws; zone_free((zone_malloc_t*)ws->memory, ws->tmpmem); free(ws); (void)_n; @@ -110,10 +110,10 @@ static void *zpotrf_create_hip_workspace(void *obj, void *user) { parsec_device_module_t *mod = (parsec_device_module_t *)obj; zone_malloc_t *memory = ((parsec_device_gpu_module_t*)mod)->memory; - dplasma_potrf_workspace_t *wp = NULL; + dplasma_potrf_gpu_workspaces_t *wp = NULL; (void)user; - wp = (dplasma_potrf_workspace_t*)malloc(sizeof(dplasma_potrf_workspace_t)); + wp = (dplasma_potrf_gpu_workspaces_t*)malloc(sizeof(dplasma_potrf_gpu_workspaces_t)); wp->tmpmem = zone_malloc(memory, sizeof(int)); assert(NULL != wp->tmpmem); wp->lwork = 0; @@ -124,7 +124,7 @@ static void *zpotrf_create_hip_workspace(void *obj, void *user) static void zpotrf_destroy_hip_workspace(void *_ws, void *_n) { - dplasma_potrf_workspace_t *ws = (dplasma_potrf_workspace_t*)_ws; + dplasma_potrf_gpu_workspaces_t *ws = (dplasma_potrf_gpu_workspaces_t*)_ws; zone_free((zone_malloc_t*)ws->memory, ws->tmpmem); free(ws); (void)_n; From b546f672f6f2a7835bf146a1539d8e75f4e71e57 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Fri, 3 Dec 2021 12:05:04 -0500 Subject: [PATCH 22/41] hip:zgemm_gpu: don't use hipComplex --- src/zgemm_NN_gpu.jdf | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/zgemm_NN_gpu.jdf b/src/zgemm_NN_gpu.jdf index 962b88e4..21e8ed01 100644 --- a/src/zgemm_NN_gpu.jdf +++ b/src/zgemm_NN_gpu.jdf @@ -21,7 +21,6 @@ extern "C" %{ #if defined(DPLASMA_HAVE_HIP) #include -#include #endif /* defined(DPLASMA_HAVE_HIP) */ static void succ(int *x, int *y, int *z, int xMax, int yMax, int zMax, int l) @@ -437,9 +436,9 @@ BODY [type=HIP weight=(descA->nt-k)] { #if defined(PRECISION_z) || 
defined(PRECISION_c) - hipDoubleComplex lalpha = make_hipDoubleComplex(creal(alpha), cimag(alpha)); - hipDoubleComplex lbeta = (k == 0) ? make_hipDoubleComplex(creal(beta), cimag(beta)) - : make_hipDoubleComplex(1.0, 0.0); + hipblasDoubleComplex lalpha = { creal(alpha), cimag(alpha) }; + hipblasDoubleComplex lbeta = { 1., 0. }; + if( k == 0 ) { lbeta.x = creal(beta); lbeta.y = cimag(beta); }; #else double lalpha = alpha; double lbeta = (k == 0) ? beta : 1.0; @@ -470,9 +469,9 @@ BODY [type=HIP status = hipblasZgemm( handles->hipblas_handle, opA, opB, tempmm, tempnn, tempkk, - &lalpha, (hipDoubleComplex*)A, ldam, - (hipDoubleComplex*)B, ldbk, - &lbeta, (hipDoubleComplex*)C, ldcm ); + &lalpha, A, ldam, + B, ldbk, + &lbeta, C, ldcm ); DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_DONE;} ); @@ -481,7 +480,7 @@ BODY [type=HIP __parsec_zgemm_NN_gpu_GEMM_task_t next_gemm; memcpy(&next_gemm, this_task, sizeof(__parsec_zgemm_NN_gpu_GEMM_task_t)); next_gemm.locals.k.value = descC->mt -1; - assert( PARSEC_DEV_CUDA == next_gemm.task_class->incarnations[this_task->chore_id].type ); + assert( PARSEC_DEV_HIP == next_gemm.task_class->incarnations[this_task->chore_id].type ); if(NULL != next_gemm.task_class->incarnations[this_task->chore_id].evaluate) { if( next_gemm.task_class->incarnations[this_task->chore_id].evaluate((parsec_task_t*)&next_gemm) == PARSEC_HOOK_RETURN_NEXT ) { From a6c605339c7b9e6abda81e5af9ab355de4193f05 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Fri, 3 Dec 2021 12:28:46 -0500 Subject: [PATCH 23/41] Return the proper PARSEC_HOOK_RETURN_ERROR in GPU error cases Signed-off-by: Aurelien Bouteiller --- src/zgemm_NN.jdf | 2 +- src/zgemm_NN_summa.jdf | 2 +- src/zgemm_NT.jdf | 2 +- src/zgemm_NT_summa.jdf | 2 +- src/zgemm_TN.jdf | 2 +- src/zgemm_TN_summa.jdf | 2 +- src/zgemm_TT.jdf | 2 +- src/zgemm_TT_summa.jdf | 2 +- src/zgetrf_nopiv.jdf | 2 +- src/zpoinv_L.jdf | 6 +++--- src/zpoinv_U.jdf | 6 +++--- src/zpotrf_L.jdf | 16 
++++++++-------- src/zpotrf_U.jdf | 16 ++++++++-------- src/ztrsm_LLN.jdf | 2 +- src/ztrsm_LLT.jdf | 2 +- src/ztrsm_LUN.jdf | 2 +- src/ztrsm_LUT.jdf | 2 +- src/ztrsm_RLN.jdf | 2 +- src/ztrsm_RLT.jdf | 2 +- src/ztrsm_RUN.jdf | 2 +- src/ztrsm_RUT.jdf | 2 +- 21 files changed, 39 insertions(+), 39 deletions(-) diff --git a/src/zgemm_NN.jdf b/src/zgemm_NN.jdf index 9bcb9b2c..edda3316 100644 --- a/src/zgemm_NN.jdf +++ b/src/zgemm_NN.jdf @@ -191,7 +191,7 @@ BODY [type=CUDA] lbeta, (cuDoubleComplex*)C, ldcm ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zgemm_NN_summa.jdf b/src/zgemm_NN_summa.jdf index 481b38d2..213a0c3c 100644 --- a/src/zgemm_NN_summa.jdf +++ b/src/zgemm_NN_summa.jdf @@ -253,7 +253,7 @@ BODY [type=CUDA lbeta, (cuDoubleComplex*)C, ldcm ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zgemm_NT.jdf b/src/zgemm_NT.jdf index a6b876ef..75b87343 100644 --- a/src/zgemm_NT.jdf +++ b/src/zgemm_NT.jdf @@ -196,7 +196,7 @@ BODY [type=CUDA lbeta, (cuDoubleComplex*)C, ldcm ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zgemm_NT_summa.jdf b/src/zgemm_NT_summa.jdf index 1bde8b01..11c09ada 100644 --- a/src/zgemm_NT_summa.jdf +++ b/src/zgemm_NT_summa.jdf @@ -238,7 +238,7 @@ BODY [type=CUDA lbeta, (cuDoubleComplex*)C, ldcm ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zgemm_TN.jdf b/src/zgemm_TN.jdf index 5a0b7da4..f02dd320 100644 --- a/src/zgemm_TN.jdf +++ b/src/zgemm_TN.jdf @@ -196,7 +196,7 @@ BODY [type=CUDA lbeta, (cuDoubleComplex*)C, ldcm ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return 
-1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zgemm_TN_summa.jdf b/src/zgemm_TN_summa.jdf index aa3a6584..c3fc7193 100644 --- a/src/zgemm_TN_summa.jdf +++ b/src/zgemm_TN_summa.jdf @@ -237,7 +237,7 @@ BODY [type=CUDA lbeta, (cuDoubleComplex*)C, ldcm ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zgemm_TT.jdf b/src/zgemm_TT.jdf index 9f752f49..676bf5e3 100644 --- a/src/zgemm_TT.jdf +++ b/src/zgemm_TT.jdf @@ -196,7 +196,7 @@ BODY [type=CUDA lbeta, (cuDoubleComplex*)C, ldcm ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zgemm_TT_summa.jdf b/src/zgemm_TT_summa.jdf index 2bd6f363..0c419bf5 100644 --- a/src/zgemm_TT_summa.jdf +++ b/src/zgemm_TT_summa.jdf @@ -240,7 +240,7 @@ BODY [type=CUDA printf("ISSUE\n"); } PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zgetrf_nopiv.jdf b/src/zgetrf_nopiv.jdf index c11ace65..2ee8ac4f 100644 --- a/src/zgetrf_nopiv.jdf +++ b/src/zgetrf_nopiv.jdf @@ -231,7 +231,7 @@ BODY [type=CUDA zone, (cuDoubleComplex*)C, ldam_C); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zpoinv_L.jdf b/src/zpoinv_L.jdf index 5841af35..0e07c78b 100644 --- a/src/zpoinv_L.jdf +++ b/src/zpoinv_L.jdf @@ -162,7 +162,7 @@ BODY [type=CUDA] zone, (cuDoubleComplex*)C, ldam ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -269,7 +269,7 @@ BODY [type=CUDA] zone, (cuDoubleComplex*)C, ldam ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -438,7 +438,7 
@@ BODY [type=CUDA] zone, (cuDoubleComplex*)C, ldam ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zpoinv_U.jdf b/src/zpoinv_U.jdf index 2eb9a9b7..0c0fc720 100644 --- a/src/zpoinv_U.jdf +++ b/src/zpoinv_U.jdf @@ -161,7 +161,7 @@ BODY [type=CUDA] zone, (cuDoubleComplex*)C, ldan ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -267,7 +267,7 @@ BODY [type=CUDA] zone, (cuDoubleComplex*)C, ldam ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -435,7 +435,7 @@ BODY [type=CUDA] zone, (cuDoubleComplex*)C, ldam ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf index 85ef5498..24b598eb 100644 --- a/src/zpotrf_L.jdf +++ b/src/zpotrf_L.jdf @@ -172,7 +172,7 @@ BODY [type=CUDA status = cusolverDnZpotrf( handles->cusolverDn_handle, cublas_uplo, tempkm, T, ldak, workspace, wp->lwork, d_iinfo); PARSEC_CUDA_CHECK_ERROR( "cublasZpotrf_v2 ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -201,7 +201,7 @@ BODY [type=HIP d_iinfo = (int*)wp->tmpmem; status = rocsolver_zpotrf( handles->hipblas_handle, rocblas_uplo, tempkm, T, ldak, d_iinfo); - DPLASMA_ROCBLAS_CHECK_ERROR("rocsolver_zpotrf", status, {return -1;}); + DPLASMA_ROCBLAS_CHECK_ERROR("rocsolver_zpotrf", status, {return PARSEC_HOOK_RETURN_ERROR;}); } END @@ -272,7 +272,7 @@ BODY [type=CUDA] tempmm, descA->nb, &zone, T, ldak, C, ldam); PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -297,7 +297,7 @@ BODY [type=HIP HIPBLAS_OP_C, HIPBLAS_DIAG_NON_UNIT, tempmm, descA->nb, &zone, T, ldak, C, ldam); 
- DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZtrsm", status, {return -1;}); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZtrsm", status, {return PARSEC_HOOK_RETURN_ERROR;}); } END @@ -403,7 +403,7 @@ BODY [type=CUDA] &mzone, A, ldam_A, &zone, T, ldam_T); PARSEC_CUDA_CHECK_ERROR( "cublasZherk_v2 ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -425,7 +425,7 @@ BODY [type=HIP tempmm, descA->mb, &mzone, A, ldam_A, &zone, T, ldam_T); - DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZherk", status, {return -1;}); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZherk", status, {return PARSEC_HOOK_RETURN_ERROR;}); } END @@ -546,7 +546,7 @@ BODY [type=CUDA (cuDoubleComplex*)B, ldan_B, &zone, (cuDoubleComplex*)C, ldam_C ); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -580,7 +580,7 @@ BODY [type=HIP &mzone, A, ldam_A, B, ldan_B, &zone, C, ldam_C ); - DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm", status, {return -1;}); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm", status, {return PARSEC_HOOK_RETURN_ERROR;}); } END diff --git a/src/zpotrf_U.jdf b/src/zpotrf_U.jdf index d260f289..5f8eb5cf 100644 --- a/src/zpotrf_U.jdf +++ b/src/zpotrf_U.jdf @@ -171,7 +171,7 @@ BODY [type=CUDA status = cusolverDnZpotrf( handles->cusolverDn_handle, cublas_uplo, tempkn, T, ldak, workspace, wp->lwork, d_iinfo); PARSEC_CUDA_CHECK_ERROR( "cublasZpotrf_v2 ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -200,7 +200,7 @@ BODY [type=HIP d_iinfo = (int*)wp->tmpmem; status = rocsolver_zpotrf( handles->hipblas_handle, rocblas_uplo, tempkn, T, ldak, d_iinfo); - DPLASMA_ROCBLAS_CHECK_ERROR("rocsolver_zpotrf", status, {return -1;}); + DPLASMA_ROCBLAS_CHECK_ERROR("rocsolver_zpotrf", status, {return PARSEC_HOOK_RETURN_ERROR;}); } END @@ -274,7 +274,7 @@ BODY [type=CUDA] descA->mb, tempnn, &zone, T, ldak_T, C, ldak_C); PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, - {return -1;} ); + {return 
PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -299,7 +299,7 @@ BODY [type=HIP HIPBLAS_OP_C, HIPBLAS_DIAG_NON_UNIT, descA->mb, tempnn, &zone, T, ldak_T, C, ldak_C); - DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZtrsm", status, {return -1;}); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZtrsm", status, {return PARSEC_HOOK_RETURN_ERROR;}); } END @@ -406,7 +406,7 @@ BODY [type=CUDA] &mzone, A, ldak, &zone, T, ldan); PARSEC_CUDA_CHECK_ERROR( "cublasZherk_v2 ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -428,7 +428,7 @@ BODY [type=HIP tempnn, descA->mb, &mzone, A, ldak, &zone, T, ldan); - DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZherk", status, {return -1;}); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZherk", status, {return PARSEC_HOOK_RETURN_ERROR;}); } END @@ -552,7 +552,7 @@ BODY [type=CUDA (cuDoubleComplex*)B, ldak_B, &zone, (cuDoubleComplex*)C, ldam_C); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); printlogcuda("CUDA_zgemm( %d, %d, %d )\n\t( %s, %s, %d, %d, %d, %f, A(%d,%d)[%p], %d, A(%d,%d)[%p], %d, %f, A(%d,%d)[%p], %d)\n", m, n, k, @@ -595,7 +595,7 @@ BODY [type=HIP &mzone, A, ldak_A, B, ldak_B, &zone, C, ldam_C); - DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm", status, {return -1;}); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm", status, {return PARSEC_HOOK_RETURN_ERROR;}); } END diff --git a/src/ztrsm_LLN.jdf b/src/ztrsm_LLN.jdf index 242fe6e4..dffb9560 100644 --- a/src/ztrsm_LLN.jdf +++ b/src/ztrsm_LLN.jdf @@ -116,7 +116,7 @@ BODY [type=CUDA] lalpha, (cuDoubleComplex*)E, ldb ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/ztrsm_LLT.jdf b/src/ztrsm_LLT.jdf index 666fc78a..ac228a80 100644 --- a/src/ztrsm_LLT.jdf +++ b/src/ztrsm_LLT.jdf @@ -117,7 +117,7 @@ BODY [type=CUDA] lalpha, (cuDoubleComplex*)E, ldbm ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return 
-1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/ztrsm_LUN.jdf b/src/ztrsm_LUN.jdf index de922c9f..840d80e9 100644 --- a/src/ztrsm_LUN.jdf +++ b/src/ztrsm_LUN.jdf @@ -116,7 +116,7 @@ BODY [type=CUDA] lalpha, (cuDoubleComplex*)E, ldbm ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/ztrsm_LUT.jdf b/src/ztrsm_LUT.jdf index a156ab88..56eb920e 100644 --- a/src/ztrsm_LUT.jdf +++ b/src/ztrsm_LUT.jdf @@ -116,7 +116,7 @@ BODY [type=CUDA] lalpha, (cuDoubleComplex*)E, ldb ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/ztrsm_RLN.jdf b/src/ztrsm_RLN.jdf index 9383c66f..9a843f4d 100644 --- a/src/ztrsm_RLN.jdf +++ b/src/ztrsm_RLN.jdf @@ -115,7 +115,7 @@ BODY [type=CUDA] lalpha, (cuDoubleComplex*)E, ldb ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/ztrsm_RLT.jdf b/src/ztrsm_RLT.jdf index 33de63c4..35c5f492 100644 --- a/src/ztrsm_RLT.jdf +++ b/src/ztrsm_RLT.jdf @@ -114,7 +114,7 @@ BODY [type=CUDA] zone, (cuDoubleComplex*)E, ldb ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/ztrsm_RUN.jdf b/src/ztrsm_RUN.jdf index 5bceec3b..9e7a9ef2 100644 --- a/src/ztrsm_RUN.jdf +++ b/src/ztrsm_RUN.jdf @@ -116,7 +116,7 @@ BODY [type=CUDA] lalpha, (cuDoubleComplex*)E, ldb ); status = cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END diff --git a/src/ztrsm_RUT.jdf b/src/ztrsm_RUT.jdf index 1d028794..2b6586d6 100644 --- a/src/ztrsm_RUT.jdf +++ b/src/ztrsm_RUT.jdf @@ -114,7 +114,7 @@ BODY [type=CUDA] zone, (cuDoubleComplex*)E, ldb ); status = 
cublasGetError(); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm ", status, - {return -1;} ); + {return PARSEC_HOOK_RETURN_ERROR;} ); } END From ffe59a8acdf400b76bb63b741524a68b6ed80248 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Thu, 31 Mar 2022 12:32:33 -0400 Subject: [PATCH 24/41] Update for the new device mask for incarnations Signed-off-by: Aurelien Bouteiller --- src/zgemm_NN_gpu.jdf | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/zgemm_NN_gpu.jdf b/src/zgemm_NN_gpu.jdf index 21e8ed01..dcd70741 100644 --- a/src/zgemm_NN_gpu.jdf +++ b/src/zgemm_NN_gpu.jdf @@ -477,12 +477,18 @@ BODY [type=HIP /* Quick and dirty emulation of the next GEMM */ if( k == descC->mt -1 ) { + unsigned int chore_id = 0; + for(chore_id = 0; chore_id < 8*sizeof(this_task->chore_mask); chore_id++) { + if( (this_task->chore_mask & (1<<chore_id)) != 0 ) + break; + } + assert(chore_id < 8*sizeof(this_task->chore_mask)); __parsec_zgemm_NN_gpu_GEMM_task_t next_gemm; memcpy(&next_gemm, this_task, sizeof(__parsec_zgemm_NN_gpu_GEMM_task_t)); next_gemm.locals.k.value = descC->mt -1; - assert( PARSEC_DEV_HIP == next_gemm.task_class->incarnations[this_task->chore_id].type ); - if(NULL != next_gemm.task_class->incarnations[this_task->chore_id].evaluate) { - if( next_gemm.task_class->incarnations[this_task->chore_id].evaluate((parsec_task_t*)&next_gemm) == + assert( PARSEC_DEV_HIP == next_gemm.task_class->incarnations[chore_id].type ); + if(NULL != next_gemm.task_class->incarnations[chore_id].evaluate) { + if( next_gemm.task_class->incarnations[chore_id].evaluate((parsec_task_t*)&next_gemm) == PARSEC_HOOK_RETURN_NEXT ) { /* The next GEMM wants to run on the CPUs...
*/ gpu_task->pushout |= (1 << 0); From ab2668ab97ad84faa2e7b7c9acff343d736fdae0 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Thu, 31 Mar 2022 12:33:08 -0400 Subject: [PATCH 25/41] So far only NN gemm can run with HIP Signed-off-by: Aurelien Bouteiller --- src/zgemm_wrapper.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/zgemm_wrapper.c b/src/zgemm_wrapper.c index 78c92808..227d368f 100644 --- a/src/zgemm_wrapper.c +++ b/src/zgemm_wrapper.c @@ -91,12 +91,14 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB, parsec_zgemm_NT_summa_taskpool_t* tp; tp = parsec_zgemm_NT_summa_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, (parsec_data_collection_t*)Cdist); +#if 0 #if defined(PARSEC_HAVE_HIP) /* It doesn't cost anything to define these infos if we have HIP but * don't have GPUs on the current machine, so we do it non-conditionally */ tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); #else tp->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif #endif zgemm_tp = (parsec_taskpool_t*)tp; } @@ -106,12 +108,14 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB, parsec_zgemm_TN_summa_taskpool_t* tp; tp = parsec_zgemm_TN_summa_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, (parsec_data_collection_t*)Cdist); +#if 0 #if defined(PARSEC_HAVE_HIP) /* It doesn't cost anything to define these infos if we have HIP but * don't have GPUs on the current machine, so we do it non-conditionally */ tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); #else tp->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif #endif zgemm_tp = (parsec_taskpool_t*)tp; } else { @@ -120,12 +124,14 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB, tp = parsec_zgemm_TT_summa_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, (parsec_data_collection_t*)Cdist); +#if 0 #if 
defined(PARSEC_HAVE_HIP) /* It doesn't cost anything to define these infos if we have HIP but * don't have GPUs on the current machine, so we do it non-conditionally */ tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); #else tp->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif #endif zgemm_tp = (parsec_taskpool_t*)tp; } From 87efcd9891b0fcdcf0b7e1f3f52906fc28ce1c6b Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Mon, 12 Jun 2023 10:03:57 -0400 Subject: [PATCH 26/41] Use the correct DPLASMA_HAVE_HIP Signed-off-by: Aurelien Bouteiller --- src/zgemm_wrapper.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/zgemm_wrapper.c b/src/zgemm_wrapper.c index 227d368f..3f9ec96b 100644 --- a/src/zgemm_wrapper.c +++ b/src/zgemm_wrapper.c @@ -78,7 +78,7 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB, parsec_zgemm_NN_summa_taskpool_t* tp; tp = parsec_zgemm_NN_summa_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, (parsec_data_collection_t*)Cdist); -#if defined(PARSEC_HAVE_HIP) +#if defined(DPLASMA_HAVE_HIP) /* It doesn't cost anything to define these infos if we have HIP but * don't have GPUs on the current machine, so we do it non-conditionally */ tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); @@ -92,7 +92,7 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB, tp = parsec_zgemm_NT_summa_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, (parsec_data_collection_t*)Cdist); #if 0 -#if defined(PARSEC_HAVE_HIP) +#if defined(DPLASMA_HAVE_HIP) /* It doesn't cost anything to define these infos if we have HIP but * don't have GPUs on the current machine, so we do it non-conditionally */ tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); @@ -109,7 +109,7 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t 
transB, tp = parsec_zgemm_TN_summa_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, (parsec_data_collection_t*)Cdist); #if 0 -#if defined(PARSEC_HAVE_HIP) +#if defined(DPLASMA_HAVE_HIP) /* It doesn't cost anything to define these infos if we have HIP but * don't have GPUs on the current machine, so we do it non-conditionally */ tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); @@ -125,7 +125,7 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB, ddc_A, ddc_B, ddc_C, (parsec_data_collection_t*)Cdist); #if 0 -#if defined(PARSEC_HAVE_HIP) +#if defined(DPLASMA_HAVE_HIP) /* It doesn't cost anything to define these infos if we have HIP but * don't have GPUs on the current machine, so we do it non-conditionally */ tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); @@ -390,7 +390,7 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, K = B->mt; tp->_g_zMax = (K + d - 1) / d - 1; -#if defined(PARSEC_HAVE_HIP) +#if defined(DPLASMA_HAVE_HIP) /* It doesn't cost anything to define these infos if we have HIP but * don't have GPUs on the current machine, so we do it non-conditionally */ tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); From 2079846fd4ef61d2a4b2d7be80bf6d9eb4c6901b Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Tue, 27 Jun 2023 16:37:35 -0400 Subject: [PATCH 27/41] Remove weight properties from HIP bodies Signed-off-by: Aurelien Bouteiller --- src/zgemm_NN_gpu.jdf | 3 +-- src/zgemm_NN_summa.jdf | 2 -- src/zpotrf_L.jdf | 15 +++++---------- src/zpotrf_U.jdf | 15 +++++---------- 4 files changed, 11 insertions(+), 24 deletions(-) diff --git a/src/zgemm_NN_gpu.jdf b/src/zgemm_NN_gpu.jdf index dcd70741..1e2fa52a 100644 --- a/src/zgemm_NN_gpu.jdf +++ b/src/zgemm_NN_gpu.jdf @@ -432,8 +432,7 @@ BODY [type=CUDA] } END -BODY [type=HIP - weight=(descA->nt-k)] 
+BODY [type=HIP] { #if defined(PRECISION_z) || defined(PRECISION_c) hipblasDoubleComplex lalpha = { creal(alpha), cimag(alpha) }; diff --git a/src/zgemm_NN_summa.jdf b/src/zgemm_NN_summa.jdf index 213a0c3c..8edfae79 100644 --- a/src/zgemm_NN_summa.jdf +++ b/src/zgemm_NN_summa.jdf @@ -212,7 +212,6 @@ CTL ctla -> (k < (descA->nt-lookQ)) ? ctla RING_A(m, k+lookQ, n%Q) CTL ctlb -> (k < (descA->nt-lookP)) ? ctlb RING_B(k+lookP, n, m%P) BODY [type=CUDA - weight=(descA->nt-k) A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} @@ -258,7 +257,6 @@ BODY [type=CUDA END BODY [type=HIP - weight=(descA->nt-k) A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf index 24b598eb..c1c052f0 100644 --- a/src/zpotrf_L.jdf +++ b/src/zpotrf_L.jdf @@ -144,8 +144,7 @@ BODY [type=RECURSIVE] } END -BODY [type=CUDA - weigth=k] +BODY [type=CUDA] { int tempkm = k == descA->mt-1 ? descA->m - k*descA->mb : descA->mb; int ldak = LDA(ddescA, T); @@ -176,8 +175,7 @@ BODY [type=CUDA } END -BODY [type=HIP - weigth=k] +BODY [type=HIP] { int tempkm = k == descA->mt-1 ? descA->m - k*descA->mb : descA->mb; int ldak = BLKLDD( descA, k ); @@ -276,8 +274,7 @@ BODY [type=CUDA] } END -BODY [type=HIP - weight=(m+k)] +BODY [type=HIP] { #if defined(PRECISION_z) || defined(PRECISION_c) hipblasDoubleComplex zone = { 1., 0. 
}; @@ -407,8 +404,7 @@ BODY [type=CUDA] } END -BODY [type=HIP - weight=(m+k)] +BODY [type=HIP] { double zone = 1.; double mzone = -1.; @@ -552,8 +548,7 @@ END BODY [type=HIP stage_in=dplasma_hip_lapack_stage_in - stage_out=dplasma_hip_lapack_stage_out - weight=(n+1-k)] + stage_out=dplasma_hip_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) hipblasDoubleComplex zone = { 1., 0. }; diff --git a/src/zpotrf_U.jdf b/src/zpotrf_U.jdf index 5f8eb5cf..66e3a68e 100644 --- a/src/zpotrf_U.jdf +++ b/src/zpotrf_U.jdf @@ -143,8 +143,7 @@ BODY [type=RECURSIVE] } END -BODY [type=CUDA - weigth=k] +BODY [type=CUDA] { int tempkn = k == descA->nt-1 ? descA->n - k*descA->nb : descA->nb; int ldak = LDA(ddescA, T); @@ -175,8 +174,7 @@ BODY [type=CUDA } END -BODY [type=HIP - weigth=k] +BODY [type=HIP] { int tempkn = k == descA->nt-1 ? descA->n - k*descA->nb : descA->nb; int ldak = LDA(ddescA, T); @@ -278,8 +276,7 @@ BODY [type=CUDA] } END -BODY [type=HIP - weight=(k+n)] +BODY [type=HIP] { #if defined(PRECISION_z) || defined(PRECISION_c) hipblasDoubleComplex zone = { 1., 0. }; @@ -410,8 +407,7 @@ BODY [type=CUDA] } END -BODY [type=HIP - weight=(k+n)] +BODY [type=HIP] { double zone = 1.; double mzone = -1.; @@ -566,8 +562,7 @@ END BODY [type=HIP stage_in=dplasma_hip_lapack_stage_in - stage_out=dplasma_hip_lapack_stage_out - weight=(m+1-k)] + stage_out=dplasma_hip_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) hipblasDoubleComplex zone = { 1., 0. 
}; From 65782c156ec0313b327d2e989421e3c50bd75ce1 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Tue, 27 Jun 2023 17:02:35 -0400 Subject: [PATCH 28/41] Reorder and uniformize cuda and hip bodies Signed-off-by: Aurelien Bouteiller --- src/zpotrf_L.jdf | 190 ++++++++++++++++++++++++----------------------- 1 file changed, 96 insertions(+), 94 deletions(-) diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf index c1c052f0..6d8fd07c 100644 --- a/src/zpotrf_L.jdf +++ b/src/zpotrf_L.jdf @@ -178,24 +178,24 @@ END BODY [type=HIP] { int tempkm = k == descA->mt-1 ? descA->m - k*descA->mb : descA->mb; - int ldak = BLKLDD( descA, k ); + int ldak = LDA(descA, T); rocblas_status status; rocblas_fill rocblas_uplo; dplasma_potrf_gpu_workspaces_t *wp; int *d_iinfo; + dplasma_hip_handles_t *handles; if( PlasmaLower == uplo ) rocblas_uplo = rocblas_fill_lower; if( PlasmaUpper == uplo ) rocblas_uplo = rocblas_fill_upper; - dplasma_hip_handles_t *handles; handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); assert(NULL != handles); - wp = parsec_info_get(&gpu_device->super.infos, hip_workspaces_infokey); assert(NULL != wp); + d_iinfo = (int*)wp->tmpmem; status = rocsolver_zpotrf( handles->hipblas_handle, rocblas_uplo, tempkm, T, ldak, d_iinfo); @@ -250,6 +250,42 @@ RW C <- (k == 0) ? ddescA(m, k) [ type = %{ return ADTT_ ; (m >= (descA->mt - PRI_CHANGE)) ? (descA->mt - m) * (descA->mt - m) * (descA->mt - m) + 3 * ((2 * descA->mt) - k - m - 1) * (m - k) : PRI_MAX +BODY [type=RECURSIVE] +{ + int tempmm = m == descA->mt-1 ? 
descA->m - m * descA->mb : descA->mb; + + if ( (tempmm > smallnb) || (descA->nb > smallnb) ) + { + subtile_desc_t *small_descT; + subtile_desc_t *small_descC; + parsec_taskpool_t* parsec_ztrsm; + + + small_descT = subtile_desc_create( descA, k, k, + smallnb, smallnb, 0, 0, descA->nb, descA->nb ); + small_descT->mat = T; + + small_descC = subtile_desc_create( descA, m, k, + smallnb, smallnb, 0, 0, tempmm, descA->nb ); + small_descC->mat = C; + + parsec_ztrsm = dplasma_ztrsm_New(dplasmaRight, dplasmaLower, + dplasmaConjTrans, dplasmaNonUnit, + (dplasma_complex64_t)1.0, + (parsec_tiled_matrix_t *)small_descT, + (parsec_tiled_matrix_t *)small_descC ); + + parsec_recursivecall((parsec_task_t*)this_task, + parsec_ztrsm, dplasma_ztrsm_Destruct, + 2, small_descT, small_descC ); + + return PARSEC_HOOK_RETURN_ASYNC; + } + /* Go for the sequential CPU version */ + return PARSEC_HOOK_RETURN_NEXT; +} +END + BODY [type=CUDA] { int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb; @@ -269,24 +305,22 @@ BODY [type=CUDA] CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT, tempmm, descA->nb, &zone, T, ldak, C, ldam); - PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, - {return PARSEC_HOOK_RETURN_ERROR;} ); + PARSEC_CUDA_CHECK_ERROR( "cublasZtrsm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END BODY [type=HIP] { + int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb; + int ldak = LDA(ddescA, T); + int ldam = LDA(ddescA, C); + dplasma_hip_handles_t *handles; #if defined(PRECISION_z) || defined(PRECISION_c) hipblasDoubleComplex zone = { 1., 0. }; #else double zone = 1.; #endif - int tempmm = m == descA->mt-1 ? 
descA->m - m * descA->mb : descA->mb; - int ldak = LDA(ddescA, T); - int ldam = LDA(ddescA, C); - hipblasStatus_t status; - dplasma_hip_handles_t *handles; handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); assert(NULL != handles); status = hipblasZtrsm(handles->hipblas_handle, @@ -298,42 +332,6 @@ BODY [type=HIP] } END -BODY [type=RECURSIVE] -{ - int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb; - - if ( (tempmm > smallnb) || (descA->nb > smallnb) ) - { - subtile_desc_t *small_descT; - subtile_desc_t *small_descC; - parsec_taskpool_t* parsec_ztrsm; - - - small_descT = subtile_desc_create( descA, k, k, - smallnb, smallnb, 0, 0, descA->nb, descA->nb ); - small_descT->mat = T; - - small_descC = subtile_desc_create( descA, m, k, - smallnb, smallnb, 0, 0, tempmm, descA->nb ); - small_descC->mat = C; - - parsec_ztrsm = dplasma_ztrsm_New(dplasmaRight, dplasmaLower, - dplasmaConjTrans, dplasmaNonUnit, - (dplasma_complex64_t)1.0, - (parsec_tiled_matrix_t *)small_descT, - (parsec_tiled_matrix_t *)small_descC ); - - parsec_recursivecall((parsec_task_t*)this_task, - parsec_ztrsm, dplasma_ztrsm_Destruct, - 2, small_descT, small_descC ); - - return PARSEC_HOOK_RETURN_ASYNC; - } - /* Go for the sequential CPU version */ - return PARSEC_HOOK_RETURN_NEXT; -} -END - BODY { int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb; @@ -406,14 +404,14 @@ END BODY [type=HIP] { - double zone = 1.; - double mzone = -1.; int tempmm = m == descA->mt-1 ? descA->m - m*descA->mb : descA->mb; int ldam_A = LDA(ddescA, A); int ldam_T = LDA(ddescA, T); - - hipblasStatus_t status; dplasma_hip_handles_t *handles; + double zone = 1.; + double mzone = -1.; + hipblasStatus_t status; + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); assert(NULL != handles); status = hipblasZherk( handles->hipblas_handle, @@ -506,6 +504,47 @@ RW C <- (k == 0) ? ddescA(m, n) [ type = %{ return ADTT_ ; (m >= (descA->mt - PRI_CHANGE)) ? 
(descA->mt - m) * (descA->mt - m) * (descA->mt - m) + 3 * ((2 * descA->mt) - m - n - 3) * (m - n) + 6 * (m - k) : PRI_MAX +BODY [type=RECURSIVE] +{ + int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb; + + if ( (tempmm > smallnb) || (descA->nb > smallnb) ) + { + subtile_desc_t *small_descA; + subtile_desc_t *small_descB; + subtile_desc_t *small_descC; + parsec_taskpool_t *parsec_zgemm; + + small_descA = subtile_desc_create( descA, m, k, + smallnb, smallnb, 0, 0, tempmm, descA->nb ); + small_descA->mat = A; + + small_descB = subtile_desc_create( descA, n, k, + smallnb, smallnb, 0, 0, descA->mb, descA->nb ); + small_descB->mat = B; + + small_descC = subtile_desc_create( descA, m, n, + smallnb, smallnb, 0, 0, tempmm, descA->nb ); + small_descC->mat = C; + + parsec_zgemm = dplasma_zgemm_New(dplasmaNoTrans, dplasmaConjTrans, + (dplasma_complex64_t)-1.0, + (parsec_tiled_matrix_t *)small_descA, + (parsec_tiled_matrix_t *)small_descB, + (dplasma_complex64_t) 1.0, + (parsec_tiled_matrix_t *)small_descC); + + parsec_recursivecall((parsec_task_t*)this_task, + parsec_zgemm, dplasma_zgemm_Destruct, + 3, small_descA, small_descB, small_descC ); + + return PARSEC_HOOK_RETURN_ASYNC; + } + /* Go to CPU sequential kernel */ + return PARSEC_HOOK_RETURN_NEXT; +} +END + BODY [type=CUDA A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} @@ -525,13 +564,12 @@ BODY [type=CUDA int ldam_A = LDA(ddescA, A); int ldan_B = LDA(ddescA, B); int ldam_C = LDA(ddescA, C); - - dplasma_cuda_handles_t *handles; - cublasStatus_t status; assert( ldam_A <= descA->mb ); assert( ldan_B <= descA->mb ); assert( ldam_C <= descA->mb ); + cublasStatus_t status; + dplasma_cuda_handles_t *handles; handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); assert(NULL != handles); @@ -547,6 +585,10 @@ BODY [type=CUDA END BODY [type=HIP + A.size=%{ return 
descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + C.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + A.dc=ddescA B.dc=ddescA C.dc=ddescA stage_in=dplasma_hip_lapack_stage_in stage_out=dplasma_hip_lapack_stage_out] { @@ -569,6 +611,7 @@ BODY [type=HIP dplasma_hip_handles_t *handles; handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_N, HIPBLAS_OP_C, tempmm, descA->mb, descA->mb, @@ -579,47 +622,6 @@ BODY [type=HIP } END -BODY [type=RECURSIVE] -{ - int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb; - - if ( (tempmm > smallnb) || (descA->nb > smallnb) ) - { - subtile_desc_t *small_descA; - subtile_desc_t *small_descB; - subtile_desc_t *small_descC; - parsec_taskpool_t *parsec_zgemm; - - small_descA = subtile_desc_create( descA, m, k, - smallnb, smallnb, 0, 0, tempmm, descA->nb ); - small_descA->mat = A; - - small_descB = subtile_desc_create( descA, n, k, - smallnb, smallnb, 0, 0, descA->mb, descA->nb ); - small_descB->mat = B; - - small_descC = subtile_desc_create( descA, m, n, - smallnb, smallnb, 0, 0, tempmm, descA->nb ); - small_descC->mat = C; - - parsec_zgemm = dplasma_zgemm_New(dplasmaNoTrans, dplasmaConjTrans, - (dplasma_complex64_t)-1.0, - (parsec_tiled_matrix_t *)small_descA, - (parsec_tiled_matrix_t *)small_descB, - (dplasma_complex64_t) 1.0, - (parsec_tiled_matrix_t *)small_descC); - - parsec_recursivecall((parsec_task_t*)this_task, - parsec_zgemm, dplasma_zgemm_Destruct, - 3, small_descA, small_descB, small_descC ); - - return PARSEC_HOOK_RETURN_ASYNC; - } - /* Go to CPU sequential kernel */ - return PARSEC_HOOK_RETURN_NEXT; -} -END - BODY { int tempmm = m == descA->mt-1 ? 
descA->m - m * descA->mb : descA->mb; From 799f0f702fe496b409323caae3cb80ef9cb76cdc Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Tue, 27 Jun 2023 17:42:42 -0400 Subject: [PATCH 29/41] A PARSEC_HAVE_HIP was still present Signed-off-by: Aurelien Bouteiller --- src/zpotrf_wrapper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zpotrf_wrapper.c b/src/zpotrf_wrapper.c index 3eac244a..0721e6ac 100644 --- a/src/zpotrf_wrapper.c +++ b/src/zpotrf_wrapper.c @@ -242,7 +242,7 @@ dplasma_zpotrf_New( dplasma_enum_t uplo, parsec_zpotrf->_g_cuda_workspaces_infokey = PARSEC_INFO_ID_UNDEFINED; (void)uid; (void)workspace_info_name; #endif -#if defined(PARSEC_HAVE_HIP) +#if defined(DPLASMA_HAVE_HIP) /* It doesn't cost anything to define these infos if we have HIP but * don't have GPUs on the current machine, so we do it non-conditionally */ parsec_zpotrf->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); From e90a0e335eeafa322896562a594a83b905888c6b Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Tue, 27 Jun 2023 17:52:47 -0400 Subject: [PATCH 30/41] Rework zpotrf_U Signed-off-by: Aurelien Bouteiller --- src/zpotrf_U.jdf | 235 ++++++++++++++++++++++++----------------------- 1 file changed, 119 insertions(+), 116 deletions(-) diff --git a/src/zpotrf_U.jdf b/src/zpotrf_U.jdf index 66e3a68e..2c322d14 100644 --- a/src/zpotrf_U.jdf +++ b/src/zpotrf_U.jdf @@ -183,16 +183,15 @@ BODY [type=HIP] rocblas_fill rocblas_uplo; dplasma_potrf_gpu_workspaces_t *wp; int *d_iinfo; + dplasma_hip_handles_t *handles; if( PlasmaLower == uplo ) rocblas_uplo = rocblas_fill_lower; if( PlasmaUpper == uplo ) rocblas_uplo = rocblas_fill_upper; - dplasma_hip_handles_t *handles; handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); assert(NULL != handles); - wp = parsec_info_get(&gpu_device->super.infos, hip_workspaces_infokey); assert(NULL != wp); d_iinfo = (int*)wp->tmpmem; @@ -251,6 +250,42 @@ RW C <- 
(k == 0) ? ddescA(k, n) [ type = %{ return ADTT_ ; (n >= (descA->nt - PRI_CHANGE)) ? (descA->nt - n) * (descA->nt - n) * (descA->nt - n) + 3 * ((2 * descA->nt) - k - n - 1) * (n - k) : PRI_MAX +BODY [type=RECURSIVE] +{ + int tempnn = n == descA->nt-1 ? descA->n - n * descA->nb : descA->nb; + + if ( (tempnn > smallnb) || (descA->mb > smallnb) ) + { + subtile_desc_t *small_descT; + subtile_desc_t *small_descC; + parsec_taskpool_t* parsec_ztrsm; + + + small_descT = subtile_desc_create( descA, k, k, + smallnb, smallnb, 0, 0, descA->mb, descA->mb ); + small_descT->mat = T; + + small_descC = subtile_desc_create( descA, k, n, + smallnb, smallnb, 0, 0, descA->mb, tempnn ); + small_descC->mat = C; + + parsec_ztrsm = dplasma_ztrsm_New(dplasmaLeft, dplasmaUpper, + dplasmaConjTrans, dplasmaNonUnit, + (dplasma_complex64_t)1.0, + (parsec_tiled_matrix_t *)small_descT, + (parsec_tiled_matrix_t *)small_descC ); + + parsec_recursivecall((parsec_task_t*)this_task, + parsec_ztrsm, dplasma_ztrsm_Destruct, + 2, small_descT, small_descC ); + + return PARSEC_HOOK_RETURN_ASYNC; + } + /* Go for the sequential CPU version */ + return PARSEC_HOOK_RETURN_NEXT; +} +END + BODY [type=CUDA] { int tempnn = n == descA->nt - 1 ? descA->n - n * descA->nb : descA->nb; @@ -278,14 +313,14 @@ END BODY [type=HIP] { + int tempnn = n == descA->nt - 1 ? descA->n - n * descA->nb : descA->nb; + int ldak_T = LDA(ddescA, T); + int ldak_C = LDA(ddescA, C); #if defined(PRECISION_z) || defined(PRECISION_c) hipblasDoubleComplex zone = { 1., 0. }; #else double zone = 1.; #endif - int tempnn = n == descA->nt - 1 ? descA->n - n * descA->nb : descA->nb; - int ldak_T = LDA(ddescA, T); - int ldak_C = LDA(ddescA, C); hipblasStatus_t status; dplasma_hip_handles_t *handles; @@ -300,42 +335,6 @@ BODY [type=HIP] } END -BODY [type=RECURSIVE] -{ - int tempnn = n == descA->nt-1 ? 
descA->n - n * descA->nb : descA->nb; - - if ( (tempnn > smallnb) || (descA->mb > smallnb) ) - { - subtile_desc_t *small_descT; - subtile_desc_t *small_descC; - parsec_taskpool_t* parsec_ztrsm; - - - small_descT = subtile_desc_create( descA, k, k, - smallnb, smallnb, 0, 0, descA->mb, descA->mb ); - small_descT->mat = T; - - small_descC = subtile_desc_create( descA, k, n, - smallnb, smallnb, 0, 0, descA->mb, tempnn ); - small_descC->mat = C; - - parsec_ztrsm = dplasma_ztrsm_New(dplasmaLeft, dplasmaUpper, - dplasmaConjTrans, dplasmaNonUnit, - (dplasma_complex64_t)1.0, - (parsec_tiled_matrix_t *)small_descT, - (parsec_tiled_matrix_t *)small_descC ); - - parsec_recursivecall((parsec_task_t*)this_task, - parsec_ztrsm, dplasma_ztrsm_Destruct, - 2, small_descT, small_descC ); - - return PARSEC_HOOK_RETURN_ASYNC; - } - /* Go for the sequential CPU version */ - return PARSEC_HOOK_RETURN_NEXT; -} -END - BODY { int tempnn = n == descA->nt-1 ? descA->n - n * descA->nb : descA->nb; @@ -385,6 +384,38 @@ RW T <- (k == 0) ? ddescA(n, n) [ type = %{ return ADTT_REA ; (n >= (descA->nt - PRI_CHANGE)) ? (descA->nt - n) * (descA->nt - n) * (descA->nt - n) + 3 * (n - k) : PRI_MAX +BODY [type=RECURSIVE] +{ + int tempnn = n == descA->nt-1 ? 
descA->n - n*descA->nb : descA->nb; + + if ( (tempnn > smallnb) || (descA->mb > smallnb) ) + { + subtile_desc_t *small_descT; + subtile_desc_t *small_descA; + parsec_taskpool_t* parsec_zherk; + + small_descT = subtile_desc_create( descA, n, n, + smallnb, smallnb, 0, 0, tempnn, tempnn ); + small_descT->mat = T; + + small_descA = subtile_desc_create( descA, k, n, + smallnb, smallnb, 0, 0, descA->mb, tempnn ); + small_descA->mat = A; + + parsec_zherk = dplasma_zherk_New( dplasmaUpper, dplasmaConjTrans, + (double)-1.0, (parsec_tiled_matrix_t*) small_descA, + (double)1.0, (parsec_tiled_matrix_t*) small_descT); + + parsec_recursivecall((parsec_task_t*)this_task, + parsec_zherk, dplasma_zherk_Destruct, + 2, small_descA, small_descT); + return PARSEC_HOOK_RETURN_ASYNC; + } + /* Go for the sequential CPU version */ + return PARSEC_HOOK_RETURN_NEXT; +} +END + BODY [type=CUDA] { int tempnn = n == descA->nt-1 ? descA->n - n*descA->nb : descA->nb; @@ -409,11 +440,11 @@ END BODY [type=HIP] { - double zone = 1.; - double mzone = -1.; int tempnn = n == descA->nt-1 ? descA->n - n*descA->nb : descA->nb; int ldak = LDA(ddescA, A ); int ldan = LDA(ddescA, T ); + double zone = 1.; + double mzone = -1.; hipblasStatus_t status; dplasma_hip_handles_t *handles; @@ -428,38 +459,6 @@ BODY [type=HIP] } END -BODY [type=RECURSIVE] -{ - int tempnn = n == descA->nt-1 ? 
descA->n - n*descA->nb : descA->nb; - - if ( (tempnn > smallnb) || (descA->mb > smallnb) ) - { - subtile_desc_t *small_descT; - subtile_desc_t *small_descA; - parsec_taskpool_t* parsec_zherk; - - small_descT = subtile_desc_create( descA, n, n, - smallnb, smallnb, 0, 0, tempnn, tempnn ); - small_descT->mat = T; - - small_descA = subtile_desc_create( descA, k, n, - smallnb, smallnb, 0, 0, descA->mb, tempnn ); - small_descA->mat = A; - - parsec_zherk = dplasma_zherk_New( dplasmaUpper, dplasmaConjTrans, - (double)-1.0, (parsec_tiled_matrix_t*) small_descA, - (double)1.0, (parsec_tiled_matrix_t*) small_descT); - - parsec_recursivecall((parsec_task_t*)this_task, - parsec_zherk, dplasma_zherk_Destruct, - 2, small_descA, small_descT); - return PARSEC_HOOK_RETURN_ASYNC; - } - /* Go for the sequential CPU version */ - return PARSEC_HOOK_RETURN_NEXT; -} -END - BODY { int tempnn = n == descA->nt-1 ? descA->n - n*descA->nb : descA->nb; @@ -509,6 +508,47 @@ RW C <- (k == 0) ? ddescA(m, n) [ type = %{ return ADTT_ ; (n >= (descA->nt - PRI_CHANGE)) ? (descA->nt - n) * (descA->nt - n) * (descA->nt - n) + 3 * ((2 * descA->nt) - m - n - 3) * (n - m) + 6 * (n - k) : PRI_MAX +BODY [type=RECURSIVE] +{ + int tempnn = n == descA->nt-1 ? 
descA->n - n * descA->nb : descA->nb; + + if ( (tempnn > smallnb) || (descA->mb > smallnb) ) + { + subtile_desc_t *small_descA; + subtile_desc_t *small_descB; + subtile_desc_t *small_descC; + parsec_taskpool_t *parsec_zgemm; + + small_descA = subtile_desc_create( descA, k, m, + smallnb, smallnb, 0, 0, descA->mb, descA->nb ); + small_descA->mat = A; + + small_descB = subtile_desc_create( descA, k, n, + smallnb, smallnb, 0, 0, descA->mb, tempnn ); + small_descB->mat = B; + + small_descC = subtile_desc_create( descA, m, n, + smallnb, smallnb, 0, 0, descA->mb, tempnn ); + small_descC->mat = C; + + parsec_zgemm = dplasma_zgemm_New(dplasmaConjTrans, dplasmaNoTrans, + (dplasma_complex64_t)-1.0, + (parsec_tiled_matrix_t *)small_descA, + (parsec_tiled_matrix_t *)small_descB, + (dplasma_complex64_t) 1.0, + (parsec_tiled_matrix_t *)small_descC); + + parsec_recursivecall((parsec_task_t*)this_task, + parsec_zgemm, dplasma_zgemm_Destruct, + 3, small_descA, small_descB, small_descC ); + + return PARSEC_HOOK_RETURN_ASYNC; + } + /* Go to CPU sequential kernel */ + return PARSEC_HOOK_RETURN_NEXT; +} +END + BODY [type=CUDA A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} @@ -561,6 +601,10 @@ BODY [type=CUDA END BODY [type=HIP + A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + C.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + A.dc=ddescA B.dc=ddescA C.dc=ddescA stage_in=dplasma_hip_lapack_stage_in stage_out=dplasma_hip_lapack_stage_out] { @@ -594,47 +638,6 @@ BODY [type=HIP } END -BODY [type=RECURSIVE] -{ - int tempnn = n == descA->nt-1 ? 
descA->n - n * descA->nb : descA->nb; - - if ( (tempnn > smallnb) || (descA->mb > smallnb) ) - { - subtile_desc_t *small_descA; - subtile_desc_t *small_descB; - subtile_desc_t *small_descC; - parsec_taskpool_t *parsec_zgemm; - - small_descA = subtile_desc_create( descA, k, m, - smallnb, smallnb, 0, 0, descA->mb, descA->nb ); - small_descA->mat = A; - - small_descB = subtile_desc_create( descA, k, n, - smallnb, smallnb, 0, 0, descA->mb, tempnn ); - small_descB->mat = B; - - small_descC = subtile_desc_create( descA, m, n, - smallnb, smallnb, 0, 0, descA->mb, tempnn ); - small_descC->mat = C; - - parsec_zgemm = dplasma_zgemm_New(dplasmaConjTrans, dplasmaNoTrans, - (dplasma_complex64_t)-1.0, - (parsec_tiled_matrix_t *)small_descA, - (parsec_tiled_matrix_t *)small_descB, - (dplasma_complex64_t) 1.0, - (parsec_tiled_matrix_t *)small_descC); - - parsec_recursivecall((parsec_task_t*)this_task, - parsec_zgemm, dplasma_zgemm_Destruct, - 3, small_descA, small_descB, small_descC ); - - return PARSEC_HOOK_RETURN_ASYNC; - } - /* Go to CPU sequential kernel */ - return PARSEC_HOOK_RETURN_NEXT; -} -END - BODY { int tempnn = n == descA->nt-1 ? 
descA->n - n * descA->nb : descA->nb; From 0cef4fc591bdad460686ba6e1ee71b7e63215ef6 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Thu, 19 Oct 2023 14:17:24 -0400 Subject: [PATCH 31/41] hip: add NT/TN/TT cases to gemm_summa Signed-off-by: Aurelien Bouteiller --- src/zgemm_NN_summa.jdf | 2 +- src/zgemm_NT_summa.jdf | 58 +++++++++++++++++++++++++++++++++++++++++- src/zgemm_TN_summa.jdf | 58 +++++++++++++++++++++++++++++++++++++++++- src/zgemm_TT_summa.jdf | 58 +++++++++++++++++++++++++++++++++++++++++- src/zgemm_wrapper.c | 8 +----- 5 files changed, 173 insertions(+), 11 deletions(-) diff --git a/src/zgemm_NN_summa.jdf b/src/zgemm_NN_summa.jdf index 3bef37c2..976becd8 100644 --- a/src/zgemm_NN_summa.jdf +++ b/src/zgemm_NN_summa.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * diff --git a/src/zgemm_NT_summa.jdf b/src/zgemm_NT_summa.jdf index 11c09ada..9e34ed16 100644 --- a/src/zgemm_NT_summa.jdf +++ b/src/zgemm_NT_summa.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* @@ -17,6 +17,9 @@ extern "C" %{ #if defined(DPLASMA_HAVE_CUDA) #include #endif /* defined(DPLASMA_HAVE_CUDA) */ +#if defined(DPLASMA_HAVE_HIP) +#include +#endif /* defined(DPLASMA_HAVE_HIP) */ /* Define the different shapes this JDF is using */ #define A_SHAPE 0 @@ -85,6 +88,8 @@ Q [type = "int" hidden=on default="((parsec_matrix_block_cyclic_t*)descC)-> lookP [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] lookQ [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + /************************************************** * READ_A * **************************************************/ @@ -242,6 +247,57 @@ BODY [type=CUDA } END +BODY [type=HIP + A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} + C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} + A.dc=ddescA B.dc=ddescB C.dc=ddescC + stage_in=dplasma_hip_lapack_stage_in + stage_out=dplasma_hip_lapack_stage_out] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha; + lalpha.x = creal(alpha); lalpha.y = cimag(alpha); + hipblasDoubleComplex lbeta = { 1., 0. }; + if(k == 0) { + lbeta.x = creal(beta); lbeta.y = cimag(beta); + } +#else + double lalpha = alpha; + double lbeta = (k == 0) ? beta : 1.0; +#endif + int tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb; + int tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb; + int tempkk = k == descA->nt-1 ? 
descA->n - k * descA->nb : descA->nb; + int ldam = descA->mb; + int ldbn = descB->mb; + int ldcm = descC->mb; + + printloggpu("HIP_gemm( %d, %d, %d )\n" + " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", + m, n, k, + &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), + tempmm, tempnn, tempkk, + creal(alpha), m, k, ldam, + n, k, ldbn, + creal(lbeta), m, n, ldcm ); + + hipblasStatus_t status; + hipblasOperation_t opA = dplasmaNoTrans == transA? HIPBLAS_OP_N: HIPBLAS_OP_T; + hipblasOperation_t opB = dplasmaNoTrans == transB? HIPBLAS_OP_N: HIPBLAS_OP_T; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, + opA, opB, + tempmm, tempnn, tempkk, + &lalpha, (hipblasDoubleComplex*)A, ldam, + (hipblasDoubleComplex*)B, ldbn, + &lbeta, (hipblasDoubleComplex*)C, ldcm ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, + {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { dplasma_complex64_t lbeta = (k == 0) ? beta : (dplasma_complex64_t)1.0; diff --git a/src/zgemm_TN_summa.jdf b/src/zgemm_TN_summa.jdf index c3fc7193..25076288 100644 --- a/src/zgemm_TN_summa.jdf +++ b/src/zgemm_TN_summa.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* @@ -17,6 +17,9 @@ extern "C" %{ #if defined(DPLASMA_HAVE_CUDA) #include #endif /* defined(DPLASMA_HAVE_CUDA) */ +#if defined(DPLASMA_HAVE_HIP) +#include +#endif /* defined(DPLASMA_HAVE_HIP) */ /* Define the different shapes this JDF is using */ #define A_SHAPE 0 @@ -85,6 +88,8 @@ Q [type = "int" hidden=on default="((parsec_matrix_block_cyclic_t*)descC)-> lookP [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] lookQ [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + /************************************************** * READ_A * **************************************************/ @@ -241,6 +246,57 @@ BODY [type=CUDA } END +BODY [type=HIP + A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} + C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} + A.dc=ddescA B.dc=ddescB C.dc=ddescC + stage_in=dplasma_hip_lapack_stage_in + stage_out=dplasma_hip_lapack_stage_out] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha; + lalpha.x = creal(alpha); lalpha.y = cimag(alpha); + hipblasDoubleComplex lbeta = { 1., 0. }; + if(k == 0) { + lbeta.x = creal(beta); lbeta.y = cimag(beta); + } +#else + double lalpha = alpha; + double lbeta = (k == 0) ? beta : 1.0; +#endif + int tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb; + int tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb; + int tempkk = k == descA->nt-1 ? 
descA->m - k * descA->nb : descA->nb; + int ldak = descA->mb; + int ldbk = descB->mb; + int ldcm = descC->mb; + + printloggpu("HIP_gemm( %d, %d, %d )\n" + " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", + m, n, k, + &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), + tempmm, tempnn, tempkk, + creal(alpha), k, m, ldak, + k, n, ldbk, + creal(lbeta), m, n, ldcm ); + + hipblasStatus_t status; + hipblasOperation_t opA = dplasmaNoTrans == transA? HIPBLAS_OP_N: HIPBLAS_OP_T; + hipblasOperation_t opB = dplasmaNoTrans == transB? HIPBLAS_OP_N: HIPBLAS_OP_T; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, + opA, opB, + tempmm, tempnn, tempkk, + &lalpha, (hipblasDoubleComplex*)A, ldak, + (hipblasDoubleComplex*)B, ldbk, + &lbeta, (hipblasDoubleComplex*)C, ldcm ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, + {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { dplasma_complex64_t lbeta = (k == 0) ? beta : (dplasma_complex64_t)1.0; diff --git a/src/zgemm_TT_summa.jdf b/src/zgemm_TT_summa.jdf index 0c419bf5..e6ddd57b 100644 --- a/src/zgemm_TT_summa.jdf +++ b/src/zgemm_TT_summa.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* @@ -17,6 +17,9 @@ extern "C" %{ #if defined(DPLASMA_HAVE_CUDA) #include #endif /* defined(DPLASMA_HAVE_CUDA) */ +#if defined(DPLASMA_HAVE_HIP) +#include +#endif /* defined(DPLASMA_HAVE_HIP) */ /* Define the different shapes this JDF is using */ #define A_SHAPE 0 @@ -85,6 +88,8 @@ Q [type = "int" hidden=on default="((parsec_matrix_block_cyclic_t*)descC)-> lookP [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] lookQ [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + /************************************************** * READ_A * **************************************************/ @@ -244,6 +249,57 @@ BODY [type=CUDA } END +BODY [type=HIP + A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} + C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} + A.dc=ddescA B.dc=ddescB C.dc=ddescC + stage_in=dplasma_hip_lapack_stage_in + stage_out=dplasma_hip_lapack_stage_out] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha; + lalpha.x = creal(alpha); lalpha.y = cimag(alpha); + hipblasDoubleComplex lbeta = { 1., 0. }; + if(k == 0) { + lbeta.x = creal(beta); lbeta.y = cimag(beta); + } +#else + double lalpha = alpha; + double lbeta = (k == 0) ? beta : 1.0; +#endif + int tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb; + int tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb; + int tempkk = k == descA->mt-1 ? 
descA->m - k * descA->nb : descA->nb; + int ldak = descA->mb; + int ldbn = descB->mb; + int ldcm = descC->mb; + + printloggpu("HIP_gemm( %d, %d, %d )\n" + " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", + m, n, k, + &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), + tempmm, tempnn, tempkk, + creal(alpha), k, m, ldak, + n, k, ldbn, + creal(lbeta), m, n, ldcm ); + + hipblasStatus_t status; + hipblasOperation_t opA = dplasmaNoTrans == transA? HIPBLAS_OP_N: HIPBLAS_OP_T; + hipblasOperation_t opB = dplasmaNoTrans == transB? HIPBLAS_OP_N: HIPBLAS_OP_T; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, + opA, opB, + tempmm, tempnn, tempkk, + &lalpha, (hipblasDoubleComplex*)A, ldak, + (hipblasDoubleComplex*)B, ldbn, + &lbeta, (hipblasDoubleComplex*)C, ldcm ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, + {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { dplasma_complex64_t lbeta = (k == 0) ? beta : (dplasma_complex64_t)1.0; diff --git a/src/zgemm_wrapper.c b/src/zgemm_wrapper.c index 3f9ec96b..8d6d7ada 100644 --- a/src/zgemm_wrapper.c +++ b/src/zgemm_wrapper.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. 
@@ -91,14 +91,12 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB, parsec_zgemm_NT_summa_taskpool_t* tp; tp = parsec_zgemm_NT_summa_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, (parsec_data_collection_t*)Cdist); -#if 0 #if defined(DPLASMA_HAVE_HIP) /* It doesn't cost anything to define these infos if we have HIP but * don't have GPUs on the current machine, so we do it non-conditionally */ tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); #else tp->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; -#endif #endif zgemm_tp = (parsec_taskpool_t*)tp; } @@ -108,14 +106,12 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB, parsec_zgemm_TN_summa_taskpool_t* tp; tp = parsec_zgemm_TN_summa_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, (parsec_data_collection_t*)Cdist); -#if 0 #if defined(DPLASMA_HAVE_HIP) /* It doesn't cost anything to define these infos if we have HIP but * don't have GPUs on the current machine, so we do it non-conditionally */ tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); #else tp->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; -#endif #endif zgemm_tp = (parsec_taskpool_t*)tp; } else { @@ -124,14 +120,12 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB, tp = parsec_zgemm_TT_summa_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, (parsec_data_collection_t*)Cdist); -#if 0 #if defined(DPLASMA_HAVE_HIP) /* It doesn't cost anything to define these infos if we have HIP but * don't have GPUs on the current machine, so we do it non-conditionally */ tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); #else tp->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; -#endif #endif zgemm_tp = (parsec_taskpool_t*)tp; } From 26446de9d85c2c98a8224f23efb05bd7a1825a0e Mon Sep 17 00:00:00 2001 From: Aurelien 
Bouteiller Date: Thu, 19 Oct 2023 14:23:16 -0400 Subject: [PATCH 32/41] Update parsec to a version that works with GPUs Signed-off-by: Aurelien Bouteiller --- parsec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsec b/parsec index 9f82ba39..73b92ddb 160000 --- a/parsec +++ b/parsec @@ -1 +1 @@ -Subproject commit 9f82ba391c093a42ce6a8f768945eed12d7d15e0 +Subproject commit 73b92ddbb895eac5bee048e8ddaf467cc21c231c From a98dc4fd42a4f35d4c915767c39230a03f9ca00b Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Thu, 19 Oct 2023 14:42:02 -0400 Subject: [PATCH 33/41] zpotrf_wrapper: uid and handles don't exist when not using a GPU device Signed-off-by: Aurelien Bouteiller --- src/zpotrf_wrapper.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/zpotrf_wrapper.c b/src/zpotrf_wrapper.c index 0721e6ac..381f3a6f 100644 --- a/src/zpotrf_wrapper.c +++ b/src/zpotrf_wrapper.c @@ -240,7 +240,6 @@ dplasma_zpotrf_New( dplasma_enum_t uplo, #else parsec_zpotrf->_g_cuda_handles_infokey = PARSEC_INFO_ID_UNDEFINED; parsec_zpotrf->_g_cuda_workspaces_infokey = PARSEC_INFO_ID_UNDEFINED; - (void)uid; (void)workspace_info_name; #endif #if defined(DPLASMA_HAVE_HIP) /* It doesn't cost anything to define these infos if we have HIP but From aa6b93361f4d42a09f62e7a4640d7886eaed8e91 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Wed, 31 Jan 2024 16:43:57 -0500 Subject: [PATCH 34/41] Update dtd for hip/cuda specializations for the dtd workspaces Signed-off-by: Aurelien Bouteiller --- src/dplasmaaux_hip.c | 15 +++++++++++++++ src/dplasmaaux_hip.h | 4 ++-- src/dtd_wrappers/zherk.c | 2 +- src/dtd_wrappers/zpotrf.c | 4 ++-- src/dtd_wrappers/ztrsm.c | 2 +- tests/common.c | 13 +++++++++---- 6 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/dplasmaaux_hip.c b/src/dplasmaaux_hip.c index 79ce7ae0..b041ad23 100644 --- a/src/dplasmaaux_hip.c +++ b/src/dplasmaaux_hip.c @@ -14,6 +14,14 @@ #include #include +/* + * Global info ID's for cublas handles 
and workspaces + * Should be initialized in the tests + * with the return of parsec_info_register + * or parsec_info_lookup + */ +parsec_info_id_t dplasma_dtd_hip_infoid = -1; + /* Unfortunately, HIPBLAS does not provide a error to string function */ char *dplasma_hipblas_error_to_string(hipblasStatus_t hipblas_status) { @@ -78,3 +86,10 @@ void *dplasma_create_hip_handles(void *obj, void *_n) return new; } +void dplasma_destroy_hip_handles(void *_h, void *_n) +{ + dplasma_hip_handles_t *handles = (dplasma_hip_handles_t*)_h; + (void)_n; + hipblasDestroy(handles->hipblas_handle); + free(handles); +} diff --git a/src/dplasmaaux_hip.h b/src/dplasmaaux_hip.h index ea582be4..d021bd6b 100644 --- a/src/dplasmaaux_hip.h +++ b/src/dplasmaaux_hip.h @@ -58,14 +58,14 @@ trans = (trans == dplasmaNoTrans) ? HIPBLAS_OP_N : HIPBLAS_OP_T; #endif /* PRECISION_z || PRECISION_c */ -extern parsec_info_id_t CuHI; -extern parsec_info_id_t WoSI; +extern parsec_info_id_t dplasma_dtd_hip_infoid; typedef struct { hipblasHandle_t hipblas_handle; } dplasma_hip_handles_t; void *dplasma_create_hip_handles(void *obj, void *user); +void dplasma_destroy_hip_handles(void *_h, void *_n); #define DPLASMA_ROCBLAS_CHECK_ERROR(STR, ERROR, CODE) \ do { \ diff --git a/src/dtd_wrappers/zherk.c b/src/dtd_wrappers/zherk.c index f0aed388..692e7dbc 100644 --- a/src/dtd_wrappers/zherk.c +++ b/src/dtd_wrappers/zherk.c @@ -60,7 +60,7 @@ parsec_core_zherk_cuda(parsec_device_gpu_module_t* gpu_device, dplasma_cublas_op(trans); dplasma_cublas_fill(uplo); - handles = parsec_info_get(&gpu_stream->infos, CuHI); + handles = parsec_info_get(&gpu_stream->infos, dplasma_dtd_cuda_infoid); #if defined(PARSEC_DEBUG_NOISIER) { diff --git a/src/dtd_wrappers/zpotrf.c b/src/dtd_wrappers/zpotrf.c index 6f026b61..c3db1036 100644 --- a/src/dtd_wrappers/zpotrf.c +++ b/src/dtd_wrappers/zpotrf.c @@ -116,9 +116,9 @@ parsec_core_zpotrf_cuda(parsec_device_gpu_module_t* gpu_device, dplasma_cublas_fill(uplo); - handles = 
parsec_info_get(&gpu_stream->infos, CuHI); + handles = parsec_info_get(&gpu_stream->infos, dplasma_dtd_cuda_infoid); assert(NULL != handles); - wp = parsec_info_get(&gpu_device->super.infos, WoSI); + wp = parsec_info_get(&gpu_device->super.infos, dplasma_dtd_cuda_workspace_infoid); assert(NULL != wp); workspace = (cuDoubleComplex*)wp->tmpmem; diff --git a/src/dtd_wrappers/ztrsm.c b/src/dtd_wrappers/ztrsm.c index 612ec99e..89e77473 100644 --- a/src/dtd_wrappers/ztrsm.c +++ b/src/dtd_wrappers/ztrsm.c @@ -62,7 +62,7 @@ parsec_core_ztrsm_cuda(parsec_device_gpu_module_t* gpu_device, dplasma_cublas_op(trans); dplasma_cublas_diag(diag); - handles = parsec_info_get(&gpu_stream->infos, CuHI); + handles = parsec_info_get(&gpu_stream->infos, dplasma_dtd_cuda_infoid); #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex alphag = make_cuDoubleComplex( creal(alpha), cimag(alpha)); diff --git a/tests/common.c b/tests/common.c index 31e1aba2..2af37fda 100644 --- a/tests/common.c +++ b/tests/common.c @@ -704,29 +704,34 @@ parsec_context_t* setup_parsec(int argc, char **argv, int *iparam) print_arguments(iparam); #if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP) - int dev, nbgpu = 0; + int dev, nb_cuda_gpu = 0, nb_hip_gpu = 0; for(dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_module_t *device = parsec_mca_device_get(dev); if( PARSEC_DEV_CUDA == device->type ) { - nbgpu++; + nb_cuda_gpu++; + } + else if( PARSEC_DEV_HIP == device->type ) { + nb_hip_gpu++; } } - if( nbgpu > 0 ) { #if defined(DPLASMA_HAVE_CUDA) + if( nb_cuda_gpu > 0 ) { dplasma_dtd_cuda_infoid = parsec_info_register(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", dplasma_destroy_cuda_handles, NULL, dplasma_create_cuda_handles, NULL, NULL); assert(-1 != dplasma_dtd_cuda_infoid); + } #endif #if defined(DPLASMA_HAVE_HIP) + if( nb_hip_gpu > 0 ) { dplasma_dtd_hip_infoid = parsec_info_register(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", dplasma_destroy_hip_handles, NULL, 
dplasma_create_hip_handles, NULL, NULL); assert(-1 != dplasma_dtd_hip_infoid); -#endif } +#endif #endif if(verbose > 2) TIME_PRINT(iparam[IPARAM_RANK], ("PaRSEC initialized\n")); From 4d9970cf56828a12f7d05f3ac7de0ff15a1e3bcc Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Fri, 10 May 2024 04:31:10 -0400 Subject: [PATCH 35/41] Make all gemm_summa the same between hip/cuda Signed-off-by: Aurelien Bouteiller --- src/zgemm_NN_summa.jdf | 2 +- src/zgemm_NT_summa.jdf | 11 ++++++++++- src/zgemm_TN_summa.jdf | 11 ++++++++++- src/zgemm_TT_summa.jdf | 9 +++++++++ 4 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/zgemm_NN_summa.jdf b/src/zgemm_NN_summa.jdf index f7909a19..fedbce71 100644 --- a/src/zgemm_NN_summa.jdf +++ b/src/zgemm_NN_summa.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2023 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * diff --git a/src/zgemm_NT_summa.jdf b/src/zgemm_NT_summa.jdf index 8559cdbe..bdd71186 100644 --- a/src/zgemm_NT_summa.jdf +++ b/src/zgemm_NT_summa.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2023 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* @@ -234,6 +234,15 @@ BODY [type=CUDA int ldbn = descB->mb; int ldcm = descC->mb; + printloggpu("CUDA_gemm( %d, %d, %d )\n" + " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", + m, n, k, + &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), + tempmm, tempnn, tempkk, + creal(alpha), m, k, ldam, + n, k, ldbn, + creal(lbeta), m, n, ldcm ); + cublasStatus_t status; cublasSetKernelStream( parsec_body.stream ); cublasZgemm( dplasma_lapack_const(transA), dplasma_lapack_const(transB), diff --git a/src/zgemm_TN_summa.jdf b/src/zgemm_TN_summa.jdf index e341565f..0c6b402e 100644 --- a/src/zgemm_TN_summa.jdf +++ b/src/zgemm_TN_summa.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2023 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -233,6 +233,15 @@ BODY [type=CUDA int ldbk = descB->mb; int ldcm = descC->mb; + printloggpu("CUDA_gemm( %d, %d, %d )\n" + " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", + m, n, k, + &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), + tempmm, tempnn, tempkk, + creal(alpha), k, m, ldak, + k, n, ldbk, + creal(lbeta), m, n, ldcm ); + cublasStatus_t status; cublasSetKernelStream( parsec_body.stream ); cublasZgemm( dplasma_lapack_const(transA), dplasma_lapack_const(transB), diff --git a/src/zgemm_TT_summa.jdf b/src/zgemm_TT_summa.jdf index d1b482f0..d036f412 100644 --- a/src/zgemm_TT_summa.jdf +++ b/src/zgemm_TT_summa.jdf @@ -233,6 +233,15 @@ BODY [type=CUDA int ldbn = descB->mb; int ldcm = descC->mb; + printloggpu("CUDA_gemm( %d, %d, %d )\n" + " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", + m, n, k, + &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), + tempmm, tempnn, tempkk, + creal(alpha), k, m, ldak, + n, k, ldbn, + creal(lbeta), m, n, ldcm ); + cublasStatus_t status; 
cublasSetKernelStream( parsec_body.stream ); cublasZgemm( dplasma_lapack_const(transA), dplasma_lapack_const(transB), From ca06f54a66af8b3910d0c9675e39607bfd4d850d Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Fri, 10 May 2024 23:15:23 -0400 Subject: [PATCH 36/41] Use the same controls as parsec for GPU_WITH_CUDA/HIP --- CMakeLists.txt | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2595d7f9..e4e7122e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,10 @@ set(DPLASMA_VERSION "${DPLASMA_VERSION_MAJOR}.${DPLASMA_VERSION_MINOR}") ############################################################################ # CMake Policies Tuning +if(POLICY CMP0144) + # CMP0144: find_package uses upper-case _ROOT variables in addition to _ROOT + cmake_policy(SET CMP0144 NEW) +endif(POLICY CMP0144) set(CMAKE_NO_SYSTEM_FROM_IMPORTED True) ############################################################################ @@ -231,22 +235,24 @@ endif(NOT TARGET PaRSEC::parsec AND NOT TARGET PaRSEC::parsec_ptgpp) ############################################################################ # Resume configuring dplasma -option(DPLASMA_HAVE_CUDA "Use CUDA to accelerate DPLASMA routines" ${PARSEC_HAVE_CUDA}) -if(DPLASMA_HAVE_CUDA) +option(DPLASMA_GPU_WITH_CUDA "Use CUDA to accelerate DPLASMA routines" ${PARSEC_HAVE_CUDA}) +if(DPLASMA_GPU_WITH_CUDA) message(STATUS "CUDA support for DPLASMA enabled") if(NOT TARGET CUDA::cusolver) find_package(CUDAToolkit REQUIRED) endif(NOT TARGET CUDA::cusolver) + set(DPLASMA_HAVE_CUDA ${PARSEC_HAVE_CUDA} CACHE BOOL "True if DPLASMA provide support for CUDA") endif() -option(DPLASMA_HAVE_HIP "Use HIP to accelerate DPLASMA routines" ${PARSEC_HAVE_HIP}) -if(DPLASMA_HAVE_HIP) +option(DPLASMA_GPU_WITH_HIP "Use HIP to accelerate DPLASMA routines" ${PARSEC_HAVE_HIP}) +if(DPLASMA_GPU_WITH_HIP) + message(STATUS "HIP support for DPLASMA enabled") # This is kinda ugly but the PATH 
and HINTS don't get transmitted to sub-dependents set(CMAKE_SYSTEM_PREFIX_PATH_save ${CMAKE_SYSTEM_PREFIX_PATH}) list(APPEND CMAKE_SYSTEM_PREFIX_PATH /opt/rocm) find_package(hipblas REQUIRED) find_package(rocsolver REQUIRED) set(CMAKE_SYSTEM_PREFIX_PATH ${CMAKE_SYSTEM_PREFIX_PATH_save}) - message(STATUS "HIP support for DPLASMA enabled") + set(DPLASMA_HAVE_HIP ${PARSEC_HAVE_HIP} CACHE BOOL "True if DPLASMA provide support for HIP") endif() ############################################################################ From 6b908167fd830b49c81a3423d3a6856f9db229b2 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Thu, 16 May 2024 17:58:12 -0400 Subject: [PATCH 37/41] hip: merge error: the device count must be updated in both hip and cuda builds --- tests/common.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/common.c b/tests/common.c index e2746cb2..f663feff 100644 --- a/tests/common.c +++ b/tests/common.c @@ -707,6 +707,10 @@ parsec_context_t* setup_parsec(int argc, char **argv, int *iparam) nb_hip_gpu++; } } + iparam[IPARAM_NGPUS] = nb_cuda_gpu + nb_hip_gpu; + if(iparam[IPARAM_NGPUS] > 0 && iparam[IPARAM_VERBOSE] >= 3) { + parsec_setenv_mca_param( "device_show_statistics", "1", &environ ); + } #if defined(DPLASMA_HAVE_CUDA) if( nb_cuda_gpu > 0 ) { dplasma_dtd_cuda_infoid = parsec_info_register(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", @@ -715,10 +719,6 @@ parsec_context_t* setup_parsec(int argc, char **argv, int *iparam) NULL); assert(-1 != dplasma_dtd_cuda_infoid); } - iparam[IPARAM_NGPUS] = nb_cuda_gpu + nb_hip_gpu; - if(iparam[IPARAM_NGPUS] > 0 && iparam[IPARAM_VERBOSE] >= 3) { - parsec_setenv_mca_param( "device_show_statistics", "1", &environ ); - } #endif #if defined(DPLASMA_HAVE_HIP) if( nb_hip_gpu > 0 ) { From b547fce84c32b4b89cf09beccbc1e86573f4e87f Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Thu, 16 May 2024 21:36:08 -0400 Subject: [PATCH 38/41] hip: printlog hipblascomplex not compatible with 
creal --- src/zgemm_NN_summa.jdf | 2 +- src/zgemm_NT_summa.jdf | 2 +- src/zgemm_TN_summa.jdf | 2 +- src/zgemm_TT_summa.jdf | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/zgemm_NN_summa.jdf b/src/zgemm_NN_summa.jdf index fedbce71..af15f4e5 100644 --- a/src/zgemm_NN_summa.jdf +++ b/src/zgemm_NN_summa.jdf @@ -288,7 +288,7 @@ BODY [type=HIP tempmm, tempnn, tempkk, creal(alpha), m, k, ldam, k, n, ldbk, - creal(lbeta), m, n, ldcm ); + (k==0)? creal(beta): 1.0, m, n, ldcm ); hipblasStatus_t status; hipblasOperation_t opA = dplasmaNoTrans == transA? HIPBLAS_OP_N: HIPBLAS_OP_T; diff --git a/src/zgemm_NT_summa.jdf b/src/zgemm_NT_summa.jdf index bdd71186..7c9864a7 100644 --- a/src/zgemm_NT_summa.jdf +++ b/src/zgemm_NT_summa.jdf @@ -288,7 +288,7 @@ BODY [type=HIP tempmm, tempnn, tempkk, creal(alpha), m, k, ldam, n, k, ldbn, - creal(lbeta), m, n, ldcm ); + (k==0)? creal(beta): 1.0, m, n, ldcm ); hipblasStatus_t status; hipblasOperation_t opA = dplasmaNoTrans == transA? HIPBLAS_OP_N: HIPBLAS_OP_T; diff --git a/src/zgemm_TN_summa.jdf b/src/zgemm_TN_summa.jdf index 0c6b402e..1c09415b 100644 --- a/src/zgemm_TN_summa.jdf +++ b/src/zgemm_TN_summa.jdf @@ -287,7 +287,7 @@ BODY [type=HIP tempmm, tempnn, tempkk, creal(alpha), k, m, ldak, k, n, ldbk, - creal(lbeta), m, n, ldcm ); + (k==0)? creal(beta): 1.0, m, n, ldcm ); hipblasStatus_t status; hipblasOperation_t opA = dplasmaNoTrans == transA? HIPBLAS_OP_N: HIPBLAS_OP_T; diff --git a/src/zgemm_TT_summa.jdf b/src/zgemm_TT_summa.jdf index d036f412..62ee6912 100644 --- a/src/zgemm_TT_summa.jdf +++ b/src/zgemm_TT_summa.jdf @@ -290,7 +290,7 @@ BODY [type=HIP tempmm, tempnn, tempkk, creal(alpha), k, m, ldak, n, k, ldbn, - creal(lbeta), m, n, ldcm ); + (k==0)? creal(beta): 1.0, m, n, ldcm ); hipblasStatus_t status; hipblasOperation_t opA = dplasmaNoTrans == transA? 
HIPBLAS_OP_N: HIPBLAS_OP_T; From 83585b18bfee7a5735fea673bb82f49b95854d2c Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Mon, 20 May 2024 23:25:00 -0400 Subject: [PATCH 39/41] hip: final cleanup --- src/cuda/lapack_cuda_stage_in.c | 2 +- src/dplasmaaux.c | 3 ++- src/dplasmaaux.h | 2 +- src/dplasmaaux_cuda.c | 17 ++++++++++------- src/dplasmaaux_cuda.h | 6 +++--- src/dplasmaaux_hip.c | 27 +++++---------------------- src/dplasmaaux_hip.h | 26 +++++++++----------------- src/dplasmajdf.h | 4 ++-- src/dplasmajdf_lapack_dtt.h | 2 +- src/dtd_wrappers/dplasma_z_dtd.h | 2 +- src/dtd_wrappers/zgemm.c | 2 +- src/dtd_wrappers/zherk.c | 2 +- src/dtd_wrappers/zpotrf.c | 2 +- src/dtd_wrappers/ztrsm.c | 2 +- src/potrf_gpu_workspaces.h | 2 +- src/zgemm_NN.jdf | 2 +- src/zgemm_NN_gpu.jdf | 2 +- src/zgemm_NN_summa.jdf | 4 ++-- src/zgemm_NT.jdf | 2 +- src/zgemm_NT_summa.jdf | 4 ++-- src/zgemm_TN.jdf | 2 +- src/zgemm_TN_summa.jdf | 4 ++-- src/zgemm_TT.jdf | 2 +- src/zgemm_TT_summa.jdf | 6 +++--- src/zgemm_wrapper.c | 2 +- src/zgetrf_nopiv.jdf | 4 ++-- src/zpotrf_L.jdf | 7 +++---- src/zpotrf_U.jdf | 6 +++--- tests/common.c | 15 ++++++++++----- tests/testing_zgemm_dtd.c | 2 +- tests/testing_zpotrf_dtd.c | 2 +- 31 files changed, 75 insertions(+), 92 deletions(-) diff --git a/src/cuda/lapack_cuda_stage_in.c b/src/cuda/lapack_cuda_stage_in.c index 4e2c4a4a..7261342a 100644 --- a/src/cuda/lapack_cuda_stage_in.c +++ b/src/cuda/lapack_cuda_stage_in.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 The University of Tennessee and The University + * Copyright (c) 2020-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* diff --git a/src/dplasmaaux.c b/src/dplasmaaux.c index 09eab0d0..86a6b189 100644 --- a/src/dplasmaaux.c +++ b/src/dplasmaaux.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2021 The University of Tennessee and The University + * Copyright (c) 2011-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. @@ -109,3 +109,4 @@ dplasma_aux_getGEMMLookahead( parsec_tiled_matrix_t *A ) return dplasma_imax( ceil( alpha ), 2 ); } } + diff --git a/src/dplasmaaux.h b/src/dplasmaaux.h index 2e145d2f..a849cfb8 100644 --- a/src/dplasmaaux.h +++ b/src/dplasmaaux.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2021 The University of Tennessee and The University + * Copyright (c) 2011-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. diff --git a/src/dplasmaaux_cuda.c b/src/dplasmaaux_cuda.c index 3d24d909..c85242b0 100644 --- a/src/dplasmaaux_cuda.c +++ b/src/dplasmaaux_cuda.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023- The University of Tennessee and The University + * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* $COPYRIGHT @@ -8,11 +8,13 @@ #include "dplasma/config.h" #include "parsec/utils/zone_malloc.h" #include "parsec/utils/show_help.h" +#include "potrf_gpu_workspaces.h" + #include +#include #include "dplasmaaux_cuda.h" -#include "potrf_gpu_workspaces.h" -/* +/* * Global info ID's for cublas handles and workspaces * Should be initialized in the tests * with the return of parsec_info_register @@ -21,8 +23,8 @@ parsec_info_id_t dplasma_dtd_cuda_infoid = -1; parsec_info_id_t dplasma_dtd_cuda_workspace_infoid = -1; -/* Unfortunately, CUBLAS does not provide a error to string function */ -char *dplasma_cublas_error_to_string(cublasStatus_t cublas_status) +/* Unfortunately, CUBLAS < 11.4.2 does not provide a error to string function */ +const char *dplasma_cublas_error_to_string(cublasStatus_t cublas_status) { switch(cublas_status) { @@ -38,8 +40,8 @@ char *dplasma_cublas_error_to_string(cublasStatus_t cublas_status) } } -/* Unfortunately, cuSolver does not provide a error to string function */ -char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status) +/* Unfortunately, cuSolver < 11.4.2 does not provide a error to string function */ +const char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status) { switch(cusolver_status) { case CUSOLVER_STATUS_SUCCESS: return "CUSOLVER_STATUS_SUCCESS"; @@ -106,3 +108,4 @@ void dplasma_destroy_cuda_handles(void *_h, void *_n) cusolverDnDestroy(handles->cusolverDn_handle); free(handles); } + diff --git a/src/dplasmaaux_cuda.h b/src/dplasmaaux_cuda.h index 038ce19d..d7d5a540 100644 --- a/src/dplasmaaux_cuda.h +++ b/src/dplasmaaux_cuda.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023- The University of Tennessee and The University + * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* $COPYRIGHT @@ -70,7 +70,7 @@ typedef struct { void *dplasma_create_cuda_handles(void *obj, void *user); void dplasma_destroy_cuda_handles(void *_h, void *_n); -char *dplasma_cublas_error_to_string(cublasStatus_t cublas_status); +const char *dplasma_cublas_error_to_string(cublasStatus_t cublas_status); #define DPLASMA_CUBLAS_CHECK_STATUS( STR, STATUS, CODE ) \ do { \ @@ -86,7 +86,7 @@ char *dplasma_cublas_error_to_string(cublasStatus_t cublas_status); /* Support for cusolve requires cublas_v2 */ #include -char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status); +const char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status); #define DPLASMA_CUSOLVER_CHECK_STATUS( STR, STATUS, CODE ) \ do { \ diff --git a/src/dplasmaaux_hip.c b/src/dplasmaaux_hip.c index b041ad23..40574ae7 100644 --- a/src/dplasmaaux_hip.c +++ b/src/dplasmaaux_hip.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023- The University of Tennessee and The University + * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* $COPYRIGHT @@ -14,7 +14,7 @@ #include #include -/* +/* * Global info ID's for cublas handles and workspaces * Should be initialized in the tests * with the return of parsec_info_register @@ -22,25 +22,8 @@ */ parsec_info_id_t dplasma_dtd_hip_infoid = -1; -/* Unfortunately, HIPBLAS does not provide a error to string function */ -char *dplasma_hipblas_error_to_string(hipblasStatus_t hipblas_status) -{ - switch(hipblas_status) - { - case HIPBLAS_STATUS_SUCCESS: return "HIPBLAS_STATUS_SUCCESS"; - case HIPBLAS_STATUS_NOT_INITIALIZED: return "HIPBLAS_STATUS_NOT_INITIALIZED"; - case HIPBLAS_STATUS_ALLOC_FAILED: return "HIPBLAS_STATUS_ALLOC_FAILED"; - case HIPBLAS_STATUS_INVALID_VALUE: return "HIPBLAS_STATUS_INVALID_VALUE"; - case HIPBLAS_STATUS_ARCH_MISMATCH: return "HIPBLAS_STATUS_ARCH_MISMATCH"; - case HIPBLAS_STATUS_MAPPING_ERROR: return "HIPBLAS_STATUS_MAPPING_ERROR"; - case HIPBLAS_STATUS_EXECUTION_FAILED: return "HIPBLAS_STATUS_EXECUTION_FAILED"; - case HIPBLAS_STATUS_INTERNAL_ERROR: return "HIPBLAS_STATUS_INTERNAL_ERROR"; - default: return "unknown HIPBLAS error"; - } -} - -/* Unfortunately, cuSolver does not provide a error to string function */ -char *dplasma_hipsolver_error_to_string(hipsolverStatus_t hipsolver_status) +/* Unfortunately, hipSolver does not provide a error to string function */ +const char *dplasma_hipsolver_error_to_string(hipsolverStatus_t hipsolver_status) { switch(hipsolver_status) { case HIPSOLVER_STATUS_SUCCESS: return "HIPSOLVER_STATUS_SUCCESS"; @@ -74,7 +57,7 @@ void *dplasma_create_hip_handles(void *obj, void *_n) parsec_show_help("help-dplasma.txt", "gpu_alloc_failed", 1, "HIPBLAS"); } parsec_fatal("Unable to create HIPBLAS Handle: %s", - dplasma_hipblas_error_to_string(hipblas_status)); + hipblasStatusToString(hipblas_status)); return NULL; } hipblas_status = hipblasSetStream(hipblas_handle, stream->hip_stream); diff --git a/src/dplasmaaux_hip.h b/src/dplasmaaux_hip.h index d021bd6b..4aac87c1 100644 --- a/src/dplasmaaux_hip.h +++ 
b/src/dplasmaaux_hip.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023- The University of Tennessee and The University + * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * $COPYRIGHT @@ -12,26 +12,18 @@ #if defined(DPLASMA_HAVE_HIP) #include "parsec/mca/device/hip/device_hip.h" -/** - * DPLASMA currently supports a mix of hipblas v1 and v2, but not in the same source file. Thus, - * the simplest way to provide common headers is to require the developer to manually specify - * when hipblas_v2 is needed by including the header before dplasmaaux.h. Otherwise, we will include - * hipblas.h (v1) automatically if HIP is enabled. - */ -#if !defined(HIPBLAS_V2_H_) #include -#endif /* !defined(HIPBLAS_V2_H_) */ -#define dplasma_hipblas_side(side) \ +#define dplasma_hipblas_side(side) \ assert( (side == dplasmaRight) || (side == dplasmaLeft) ); \ side = (side == dplasmaRight) ? HIPBLAS_SIDE_RIGHT : HIPBLAS_SIDE_LEFT; -#define dplasma_hipblas_diag(diag) \ +#define dplasma_hipblas_diag(diag) \ assert( (diag == dplasmaNonUnit) || (diag == dplasmaUnit) ); \ diag = (diag == dplasmaNonUnit) ? HIPBLAS_DIAG_NON_UNIT : HIPBLAS_DIAG_UNIT; -#define dplasma_hipblas_fill(fill) \ +#define dplasma_hipblas_fill(fill) \ assert( (fill == dplasmaLower) || (fill == dplasmaUpper) ); \ fill = (fill == dplasmaLower) ? 
HIPBLAS_FILL_MODE_LOWER : HIPBLAS_FILL_MODE_UPPER; @@ -40,20 +32,20 @@ assert( (trans == dplasmaNoTrans) || (trans == dplasmaTrans) || (trans == dplasmaConjTrans) ); \ switch(trans){ \ case dplasmaNoTrans: \ - trans = HIPBLAS_OP_N; \ + trans = HIPBLAS_OP_N; \ break; \ case dplasmaTrans: \ - trans = HIPBLAS_OP_T; \ + trans = HIPBLAS_OP_T; \ break; \ case dplasmaConjTrans: \ - trans = HIPBLAS_OP_C; \ + trans = HIPBLAS_OP_C; \ break; \ default: \ - trans = HIPBLAS_OP_N; \ + trans = HIPBLAS_OP_N; \ break; \ } #else -#define dplasma_hipblas_op(trans) \ +#define dplasma_hipblas_op(trans) \ assert( (trans == dplasmaNoTrans) || (trans == dplasmaTrans) ); \ trans = (trans == dplasmaNoTrans) ? HIPBLAS_OP_N : HIPBLAS_OP_T; #endif /* PRECISION_z || PRECISION_c */ diff --git a/src/dplasmajdf.h b/src/dplasmajdf.h index 8c8daffc..8d09cb5f 100644 --- a/src/dplasmajdf.h +++ b/src/dplasmajdf.h @@ -23,8 +23,8 @@ # include # define printlog(str, ...) fprintf(stderr, "thread %d VP %d " str "\n", \ es->th_id, es->virtual_process->vp_id, __VA_ARGS__) -# define printloggpu(str, ...) fprintf(stderr, "GPU %d " str "\n", \ - gpu_device->super.device_index, __VA_ARGS__) +# define printloggpu(str, ...) fprintf(stderr, "GPU %s " str "\n", \ + gpu_device->super.device_name, __VA_ARGS__) #else # define printlog(...) do {} while(0) # define printloggpu(...) do {} while(0) diff --git a/src/dplasmajdf_lapack_dtt.h b/src/dplasmajdf_lapack_dtt.h index 7324a08e..21019004 100644 --- a/src/dplasmajdf_lapack_dtt.h +++ b/src/dplasmajdf_lapack_dtt.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 The University of Tennessee and The University + * Copyright (c) 2020-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* diff --git a/src/dtd_wrappers/dplasma_z_dtd.h b/src/dtd_wrappers/dplasma_z_dtd.h index c90552d5..e6db6208 100644 --- a/src/dtd_wrappers/dplasma_z_dtd.h +++ b/src/dtd_wrappers/dplasma_z_dtd.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023- The University of Tennessee and The University + * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * diff --git a/src/dtd_wrappers/zgemm.c b/src/dtd_wrappers/zgemm.c index b463126b..6a13bb37 100644 --- a/src/dtd_wrappers/zgemm.c +++ b/src/dtd_wrappers/zgemm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023- The University of Tennessee and The University + * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * diff --git a/src/dtd_wrappers/zherk.c b/src/dtd_wrappers/zherk.c index 692e7dbc..c1943425 100644 --- a/src/dtd_wrappers/zherk.c +++ b/src/dtd_wrappers/zherk.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023- The University of Tennessee and The University + * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * diff --git a/src/dtd_wrappers/zpotrf.c b/src/dtd_wrappers/zpotrf.c index c3db1036..b74e6293 100644 --- a/src/dtd_wrappers/zpotrf.c +++ b/src/dtd_wrappers/zpotrf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023- The University of Tennessee and The University + * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * diff --git a/src/dtd_wrappers/ztrsm.c b/src/dtd_wrappers/ztrsm.c index 89e77473..b54c0ec8 100644 --- a/src/dtd_wrappers/ztrsm.c +++ b/src/dtd_wrappers/ztrsm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023- The University of Tennessee and The University + * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* diff --git a/src/potrf_gpu_workspaces.h b/src/potrf_gpu_workspaces.h index cf31d4ba..5366e80a 100644 --- a/src/potrf_gpu_workspaces.h +++ b/src/potrf_gpu_workspaces.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 The University of Tennessee and The University + * Copyright (c) 2020-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * diff --git a/src/zgemm_NN.jdf b/src/zgemm_NN.jdf index fe04a879..407ebbc8 100644 --- a/src/zgemm_NN.jdf +++ b/src/zgemm_NN.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * diff --git a/src/zgemm_NN_gpu.jdf b/src/zgemm_NN_gpu.jdf index 61995b5c..97467030 100644 --- a/src/zgemm_NN_gpu.jdf +++ b/src/zgemm_NN_gpu.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2017-2023 The University of Tennessee and The University + * Copyright (c) 2017-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* diff --git a/src/zgemm_NN_summa.jdf b/src/zgemm_NN_summa.jdf index af15f4e5..3c38167b 100644 --- a/src/zgemm_NN_summa.jdf +++ b/src/zgemm_NN_summa.jdf @@ -234,7 +234,7 @@ BODY [type=CUDA int ldbk = descB->mb; int ldcm = descC->mb; - printloggpu("CUDA_gemm( %d, %d, %d )\n" + printloggpu("gemm( %d, %d, %d )\n" " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", m, n, k, &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), @@ -281,7 +281,7 @@ BODY [type=HIP int ldbk = descB->mb; int ldcm = descC->mb; - printloggpu("HIP_gemm( %d, %d, %d )\n" + printloggpu("gemm( %d, %d, %d )\n" " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", m, n, k, &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), diff --git a/src/zgemm_NT.jdf b/src/zgemm_NT.jdf index 051802e1..051bb603 100644 --- a/src/zgemm_NT.jdf +++ b/src/zgemm_NT.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* diff --git a/src/zgemm_NT_summa.jdf b/src/zgemm_NT_summa.jdf index 7c9864a7..03fd212d 100644 --- a/src/zgemm_NT_summa.jdf +++ b/src/zgemm_NT_summa.jdf @@ -234,7 +234,7 @@ BODY [type=CUDA int ldbn = descB->mb; int ldcm = descC->mb; - printloggpu("CUDA_gemm( %d, %d, %d )\n" + printloggpu("gemm( %d, %d, %d )\n" " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", m, n, k, &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), @@ -281,7 +281,7 @@ BODY [type=HIP int ldbn = descB->mb; int ldcm = descC->mb; - printloggpu("HIP_gemm( %d, %d, %d )\n" + printloggpu("gemm( %d, %d, %d )\n" " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", m, n, k, &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), diff --git a/src/zgemm_TN.jdf b/src/zgemm_TN.jdf index ca54fd96..6d2eca1e 100644 --- a/src/zgemm_TN.jdf +++ b/src/zgemm_TN.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* diff --git a/src/zgemm_TN_summa.jdf b/src/zgemm_TN_summa.jdf index 1c09415b..af3a113a 100644 --- a/src/zgemm_TN_summa.jdf +++ b/src/zgemm_TN_summa.jdf @@ -233,7 +233,7 @@ BODY [type=CUDA int ldbk = descB->mb; int ldcm = descC->mb; - printloggpu("CUDA_gemm( %d, %d, %d )\n" + printloggpu("gemm( %d, %d, %d )\n" " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", m, n, k, &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), @@ -280,7 +280,7 @@ BODY [type=HIP int ldbk = descB->mb; int ldcm = descC->mb; - printloggpu("HIP_gemm( %d, %d, %d )\n" + printloggpu("gemm( %d, %d, %d )\n" " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", m, n, k, &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), diff --git a/src/zgemm_TT.jdf b/src/zgemm_TT.jdf index 27e20c97..e3595ca3 100644 --- a/src/zgemm_TT.jdf +++ b/src/zgemm_TT.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * diff --git a/src/zgemm_TT_summa.jdf b/src/zgemm_TT_summa.jdf index 62ee6912..99b3da2a 100644 --- a/src/zgemm_TT_summa.jdf +++ b/src/zgemm_TT_summa.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2023 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* @@ -233,7 +233,7 @@ BODY [type=CUDA int ldbn = descB->mb; int ldcm = descC->mb; - printloggpu("CUDA_gemm( %d, %d, %d )\n" + printloggpu("gemm( %d, %d, %d )\n" " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", m, n, k, &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), @@ -283,7 +283,7 @@ BODY [type=HIP int ldbn = descB->mb; int ldcm = descC->mb; - printloggpu("HIP_gemm( %d, %d, %d )\n" + printloggpu("gemm( %d, %d, %d )\n" " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", m, n, k, &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), diff --git a/src/zgemm_wrapper.c b/src/zgemm_wrapper.c index 8d6d7ada..701db625 100644 --- a/src/zgemm_wrapper.c +++ b/src/zgemm_wrapper.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2023 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. diff --git a/src/zgetrf_nopiv.jdf b/src/zgetrf_nopiv.jdf index ba73607a..d77bedda 100644 --- a/src/zgetrf_nopiv.jdf +++ b/src/zgetrf_nopiv.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. 
@@ -219,7 +219,7 @@ BODY [type=CUDA cublasStatus_t status; - printloggpu("CUDA_zgemm(%d, %d, %d)\n" + printloggpu("zgemm(%d, %d, %d)\n" "\t(dplasmaNoTrans, dplasmaNoTrans, tempmm, tempnn, descA->mb, -1, A(%d,%d)[%p], ldam %d, A(%d,%d)[%p], ldak %d, 1.000000, A(%d,%d)[%p], ldam %d)\n", k, n, m, m, k, A, ldam_A, k, n, B, ldak_B, m, n, C, ldam_C); diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf index a3c5968a..6d3ba6d0 100644 --- a/src/zpotrf_L.jdf +++ b/src/zpotrf_L.jdf @@ -1,9 +1,10 @@ extern "C" %{ /* - * Copyright (c) 2010-2023 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. + * * $COPYRIGHT * * @precisions normal z -> s d c @@ -471,8 +472,7 @@ BODY tempmm, descA->mb, (double)-1.0, A /*A(m, k)*/, ldam_A, (double) 1.0, T /*A(m, m)*/, ldam_T); - printlog( - "CORE_zherk( %d, %d )\n\t( %s, %s, %d, %d, %f, A(%d,%d)[%p], %d, %f, A(%d,%d)[%p], %d)\n", + printlog("CORE_zherk( %d, %d )\n\t( %s, %s, %d, %d, %f, A(%d,%d)[%p], %d, %f, A(%d,%d)[%p], %d)\n", k, m, &dplasma_lapack_const( dplasmaLower ), &dplasma_lapack_const( dplasmaNoTrans ), tempmm, descA->mb, @@ -576,7 +576,6 @@ BODY [type=HIP dplasma_hip_handles_t *handles; handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); assert(NULL != handles); - status = hipblasZgemm( handles->hipblas_handle, HIPBLAS_OP_N, HIPBLAS_OP_C, tempmm, descA->mb, descA->mb, diff --git a/src/zpotrf_U.jdf b/src/zpotrf_U.jdf index b0f84e5e..0473ca84 100644 --- a/src/zpotrf_U.jdf +++ b/src/zpotrf_U.jdf @@ -1,9 +1,10 @@ extern "C" %{ /* - * Copyright (c) 2010-2023 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. 
+ * * $COPYRIGHT * * @precisions normal z -> s d c @@ -407,7 +408,6 @@ BODY [type=CUDA] tempnn, descA->mb, &mzone, A, ldak, &zone, T, ldan); - PARSEC_CUDA_CHECK_ERROR( "cublasZherk_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END @@ -554,7 +554,7 @@ BODY [type=CUDA &zone, (cuDoubleComplex*)C, ldam_C); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); - printloggpu("CUDA_zgemm( %d, %d, %d )\n\t( %s, %s, %d, %d, %d, %f, A(%d,%d)[%p], %d, A(%d,%d)[%p], %d, %f, A(%d,%d)[%p], %d)\n", + printloggpu("zgemm( %d, %d, %d )\n\t( %s, %s, %d, %d, %d, %f, A(%d,%d)[%p], %d, A(%d,%d)[%p], %d, %f, A(%d,%d)[%p], %d)\n", m, n, k, &dplasma_lapack_const( dplasmaConjTrans ), &dplasma_lapack_const( dplasmaNoTrans ), descA->mb, tempnn, descA->nb, diff --git a/tests/common.c b/tests/common.c index f663feff..feef5a29 100644 --- a/tests/common.c +++ b/tests/common.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2021 The University of Tennessee and The University + * Copyright (c) 2009-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* @@ -26,13 +26,12 @@ #ifdef PARSEC_HAVE_MPI #include #endif +#include "dplasmaaux.h" #if defined(DPLASMA_HAVE_CUDA) #include -#include "dplasmaaux.h" #include #endif #if defined(DPLASMA_HAVE_HIP) -#include "dplasmaaux.h" #include #endif @@ -696,6 +695,10 @@ parsec_context_t* setup_parsec(int argc, char **argv, int *iparam) iparam[IPARAM_NCORES] = nb_total_comp_threads; } + if(iparam[IPARAM_VERBOSE] >= 4) { + parsec_setenv_mca_param( "device_show_capabilities", "1", &environ ); + } + #if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP) int dev, nb_cuda_gpu = 0, nb_hip_gpu = 0; for(dev = 0; dev < (int)parsec_nb_devices; dev++) { @@ -739,12 +742,14 @@ parsec_context_t* setup_parsec(int argc, char **argv, int *iparam) void cleanup_parsec(parsec_context_t* parsec, int *iparam) { #if defined(DPLASMA_HAVE_CUDA) - parsec_info_id_t iid = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL); + { parsec_info_id_t iid = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL); parsec_info_unregister(&parsec_per_stream_infos, iid, NULL); + } #endif #if defined(DPLASMA_HAVE_HIP) - parsec_info_id_t iid = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); + { parsec_info_id_t iid = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); parsec_info_unregister(&parsec_per_stream_infos, iid, NULL); + } #endif if(NULL != dev_stats) parsec_devices_free_statistics(&dev_stats); diff --git a/tests/testing_zgemm_dtd.c b/tests/testing_zgemm_dtd.c index 6161bba4..a9e5a602 100644 --- a/tests/testing_zgemm_dtd.c +++ b/tests/testing_zgemm_dtd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2023 The University of Tennessee and The University + * Copyright (c) 2015-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* diff --git a/tests/testing_zpotrf_dtd.c b/tests/testing_zpotrf_dtd.c index 709368c3..2b6e1841 100644 --- a/tests/testing_zpotrf_dtd.c +++ b/tests/testing_zpotrf_dtd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2023 The University of Tennessee and The University + * Copyright (c) 2013-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * From 750f3cb800399d66dcd06aa01d19dbb7c75a52c6 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Tue, 25 Jun 2024 12:48:10 -0400 Subject: [PATCH 40/41] hip: rework compile-time guards for cuda and hip inclusion --- CMakeLists.txt | 6 ++++++ configure | 6 +++--- src/dplasmaaux.h | 5 ++++- src/dplasmaaux_cuda.h | 4 +++- src/dplasmaaux_hip.h | 2 ++ 5 files changed, 18 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e4e7122e..56ac451b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -237,6 +237,9 @@ endif(NOT TARGET PaRSEC::parsec AND NOT TARGET PaRSEC::parsec_ptgpp) # Resume configuring dplasma option(DPLASMA_GPU_WITH_CUDA "Use CUDA to accelerate DPLASMA routines" ${PARSEC_HAVE_CUDA}) if(DPLASMA_GPU_WITH_CUDA) + if(NOT PARSEC_HAVE_CUDA) + message(FATAL_ERROR "CUDA support for DPLASMA requested, but detected PaRSEC does not support it") + endif() message(STATUS "CUDA support for DPLASMA enabled") if(NOT TARGET CUDA::cusolver) find_package(CUDAToolkit REQUIRED) @@ -245,6 +248,9 @@ if(DPLASMA_GPU_WITH_CUDA) endif() option(DPLASMA_GPU_WITH_HIP "Use HIP to accelerate DPLASMA routines" ${PARSEC_HAVE_HIP}) if(DPLASMA_GPU_WITH_HIP) + if(NOT PARSEC_HAVE_HIP) + message(FATAL_ERROR "HIP support for DPLASMA requested, but detected PaRSEC does not support it") + endif() message(STATUS "HIP support for DPLASMA enabled") # This is kinda ugly but the PATH and HINTS don't get transmitted to sub-dependents set(CMAKE_SYSTEM_PREFIX_PATH_save ${CMAKE_SYSTEM_PREFIX_PATH}) diff --git a/configure b/configure index 7c61564d..c31a719c 100755 --- a/configure +++ 
b/configure @@ -630,10 +630,10 @@ x) ;; esac case x$with_hip in -xno) CMAKE_DEFINES+=" -DPARSEC_GPU_WITH_HIP=OFF";; -xyes) CMAKE_DEFINES+=" -DPARSEC_GPU_WITH_HIP=ON";; +xno) CMAKE_DEFINES+=" -DPARSEC_GPU_WITH_HIP=OFF -DDPLASMA_GPU_WITH_HIP=OFF";; +xyes) CMAKE_DEFINES+=" -DPARSEC_GPU_WITH_HIP=ON -DDPLASMA_GPU_WITH_HIP=ON";; x) ;; -*) CMAKE_DEFINES+=" -DPARSEC_GPU_WITH_HIP=ON -DHIP_ROOT=$(printf %q "$with_hip")";; +*) CMAKE_DEFINES+=" -DPARSEC_GPU_WITH_HIP=ON -DHIP_ROOT=$(printf %q "$with_hip") -DDPLASMA_GPU_WITH_HIP=ON";; esac case x$with_python in diff --git a/src/dplasmaaux.h b/src/dplasmaaux.h index a849cfb8..28ae2039 100644 --- a/src/dplasmaaux.h +++ b/src/dplasmaaux.h @@ -109,7 +109,10 @@ extern void *dplasma_pcomm; #define dplasma_error(__func, __msg) do { fprintf(stderr, "%s: %s\n", (__func), (__msg)); } while(0) #endif /* defined(DPLASMA_DEBUG) */ +#if defined(DPLASMA_HAVE_CUDA) #include "dplasmaaux_cuda.h" +#endif +#if defined(DPLASMA_HAVE_HIP) #include "dplasmaaux_hip.h" - +#endif #endif /* _DPLASMAAUX_H_INCLUDED */ diff --git a/src/dplasmaaux_cuda.h b/src/dplasmaaux_cuda.h index d7d5a540..5336613a 100644 --- a/src/dplasmaaux_cuda.h +++ b/src/dplasmaaux_cuda.h @@ -98,6 +98,8 @@ const char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status); } \ } while(0) #endif /* defined(CUBLAS_V2_H_) */ -#endif /* defined(DPLASMA_HAVE_CUDA) */ +#else +#warning "DPLASMA_HAVE_CUDA not defined, this file should not be included then." +#endif /* defined(DPLASMA_HAVE_CUDA) */ #endif /* __DPLAMAAUX_CUDA_H__ */ diff --git a/src/dplasmaaux_hip.h b/src/dplasmaaux_hip.h index 4aac87c1..5dc24460 100644 --- a/src/dplasmaaux_hip.h +++ b/src/dplasmaaux_hip.h @@ -80,5 +80,7 @@ void dplasma_destroy_hip_handles(void *_h, void *_n); } \ } while(0) +#else +#warning "DPLASMA_HAVE_HIP not defined, this file should not be included then." 
#endif /* defined(DPLASMA_HAVE_HIP */ #endif /* __DPLAMAAUX_HIP_H__ */ From a985fa6371ac4e25eefbce2d2c36b15e2969bb28 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Fri, 9 Aug 2024 13:23:19 -0400 Subject: [PATCH 41/41] Cublas: sometimes we would include cublas.h before including cusolver.h use cublasv2 by default (req. for cusolver anyway) except if cublas(v1) is included before the dplasmajdf/aux.h include This reverts the logic for cublas.h determination, and is a first steps toward deprecating cublas(v1) usage entirely --- src/dplasmaaux_cuda.h | 8 ++++---- src/dtd_wrappers/zgemm.c | 5 ----- src/dtd_wrappers/zherk.c | 5 ----- src/dtd_wrappers/zpotrf.c | 5 ----- src/dtd_wrappers/ztrsm.c | 5 ----- src/zgemm_NN.jdf | 11 +++++------ src/zgemm_NN_gpu.jdf | 14 +++++--------- src/zgemm_NN_summa.jdf | 12 ++++-------- src/zgemm_NT.jdf | 11 +++++------ src/zgemm_NT_summa.jdf | 12 ++++-------- src/zgemm_TN.jdf | 11 +++++------ src/zgemm_TN_summa.jdf | 12 ++++-------- src/zgemm_TT.jdf | 11 +++++------ src/zgemm_TT_summa.jdf | 12 ++++-------- src/zgetrf_nopiv.jdf | 8 ++++---- src/zpoinv_L.jdf | 9 ++++----- src/zpoinv_U.jdf | 9 ++++----- src/zpotrf_L.jdf | 12 +----------- src/zpotrf_U.jdf | 12 +----------- src/ztrsm_LLN.jdf | 6 +++++- src/ztrsm_LLT.jdf | 6 +++++- src/ztrsm_LUN.jdf | 6 +++++- src/ztrsm_LUT.jdf | 6 +++++- src/ztrsm_RLN.jdf | 6 +++++- src/ztrsm_RLT.jdf | 6 +++++- src/ztrsm_RUN.jdf | 6 +++++- src/ztrsm_RUT.jdf | 6 +++++- 27 files changed, 99 insertions(+), 133 deletions(-) diff --git a/src/dplasmaaux_cuda.h b/src/dplasmaaux_cuda.h index 5336613a..ceebd1cb 100644 --- a/src/dplasmaaux_cuda.h +++ b/src/dplasmaaux_cuda.h @@ -16,11 +16,11 @@ /** * DPLASMA currently supports a mix of cublas v1 and v2, but not in the same source file. Thus, * the simplest way to provide common headers is to require the developer to manually specify - * when cublas_v2 is needed by including the header before dplasmaaux.h. 
Otherwise, we will include - * cublas.h (v1) automatically if CUDA is enabled. + * when legacy cublas is needed by including the header before dplasmaaux.h. Otherwise, we will include + * cublas_v2.h (v2) automatically if CUDA is enabled. */ -#if !defined(CUBLAS_V2_H_) -#include +#if !defined(CUBLAS_H_) +#include #endif /* !defined(CUBLAS_V2_H_) */ #define dplasma_cublas_side(side) \ diff --git a/src/dtd_wrappers/zgemm.c b/src/dtd_wrappers/zgemm.c index 6a13bb37..1650b6f6 100644 --- a/src/dtd_wrappers/zgemm.c +++ b/src/dtd_wrappers/zgemm.c @@ -7,11 +7,6 @@ * */ #include "dplasma/config.h" - -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - #include "dplasma_z_dtd.h" int diff --git a/src/dtd_wrappers/zherk.c b/src/dtd_wrappers/zherk.c index c1943425..3de61529 100644 --- a/src/dtd_wrappers/zherk.c +++ b/src/dtd_wrappers/zherk.c @@ -7,11 +7,6 @@ * */ #include "dplasma/config.h" - -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - #include "dplasma_z_dtd.h" int diff --git a/src/dtd_wrappers/zpotrf.c b/src/dtd_wrappers/zpotrf.c index b74e6293..521845c2 100644 --- a/src/dtd_wrappers/zpotrf.c +++ b/src/dtd_wrappers/zpotrf.c @@ -7,11 +7,6 @@ * */ #include "dplasma/config.h" - -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - #include "dplasma_z_dtd.h" int diff --git a/src/dtd_wrappers/ztrsm.c b/src/dtd_wrappers/ztrsm.c index b54c0ec8..7e5a7ac6 100644 --- a/src/dtd_wrappers/ztrsm.c +++ b/src/dtd_wrappers/ztrsm.c @@ -7,11 +7,6 @@ * */ #include "dplasma/config.h" - -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - #include "dplasma_z_dtd.h" int diff --git a/src/zgemm_NN.jdf b/src/zgemm_NN.jdf index 407ebbc8..fa273d40 100644 --- a/src/zgemm_NN.jdf +++ b/src/zgemm_NN.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2023 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The 
University * of Tennessee Research Foundation. All rights * reserved. * @@ -8,16 +8,15 @@ extern "C" %{ * $COPYRIGHT * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" -#include "dplasmaaux.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - /* Define the different shapes this JDF is using */ #define A_SHAPE 0 #define B_SHAPE 1 diff --git a/src/zgemm_NN_gpu.jdf b/src/zgemm_NN_gpu.jdf index 97467030..bf087dd7 100644 --- a/src/zgemm_NN_gpu.jdf +++ b/src/zgemm_NN_gpu.jdf @@ -9,18 +9,14 @@ extern "C" %{ * */ -#include -#include "dplasmajdf.h" -#include "dplasmaaux.h" -#include "parsec/data_dist/matrix/matrix.h" -#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" - +#include "dplasma/config.h" #if defined(DPLASMA_HAVE_CUDA) #include #endif /* defined(DPLASMA_HAVE_CUDA) */ -#if defined(DPLASMA_HAVE_HIP) -#include -#endif /* defined(DPLASMA_HAVE_HIP) */ +#include "dplasmajdf.h" +#include "parsec/data_dist/matrix/matrix.h" +#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" +#include static void succ(int *x, int *y, int *z, int xMax, int yMax, int zMax, int l) { diff --git a/src/zgemm_NN_summa.jdf b/src/zgemm_NN_summa.jdf index 3c38167b..c27a5846 100644 --- a/src/zgemm_NN_summa.jdf +++ b/src/zgemm_NN_summa.jdf @@ -8,19 +8,15 @@ extern "C" %{ * $COPYRIGHT * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" -#include "dplasmaaux.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ -#if defined(DPLASMA_HAVE_HIP) -#include -#endif /* defined(DPLASMA_HAVE_HIP) */ - /* Define the 
different shapes this JDF is using */ #define A_SHAPE 0 #define B_SHAPE 1 diff --git a/src/zgemm_NT.jdf b/src/zgemm_NT.jdf index 051bb603..b6fa8c89 100644 --- a/src/zgemm_NT.jdf +++ b/src/zgemm_NT.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2023 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -8,16 +8,15 @@ extern "C" %{ * $COPYRIGHT * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" -#include "dplasmaaux.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - /* Define the different shapes this JDF is using */ #define A_SHAPE 0 #define B_SHAPE 1 diff --git a/src/zgemm_NT_summa.jdf b/src/zgemm_NT_summa.jdf index 03fd212d..d66e5171 100644 --- a/src/zgemm_NT_summa.jdf +++ b/src/zgemm_NT_summa.jdf @@ -8,19 +8,15 @@ extern "C" %{ * $COPYRIGHT * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" -#include "dplasmaaux.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ -#if defined(DPLASMA_HAVE_HIP) -#include -#endif /* defined(DPLASMA_HAVE_HIP) */ - /* Define the different shapes this JDF is using */ #define A_SHAPE 0 #define B_SHAPE 1 diff --git a/src/zgemm_TN.jdf b/src/zgemm_TN.jdf index 6d2eca1e..aa529d91 100644 --- a/src/zgemm_TN.jdf +++ b/src/zgemm_TN.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2023 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee 
Research Foundation. All rights * reserved. * @@ -8,16 +8,15 @@ extern "C" %{ * $COPYRIGHT * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" -#include "dplasmaaux.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - /* Define the different shapes this JDF is using */ #define A_SHAPE 0 #define B_SHAPE 1 diff --git a/src/zgemm_TN_summa.jdf b/src/zgemm_TN_summa.jdf index af3a113a..b095ac61 100644 --- a/src/zgemm_TN_summa.jdf +++ b/src/zgemm_TN_summa.jdf @@ -8,19 +8,15 @@ extern "C" %{ * $COPYRIGHT * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" -#include "dplasmaaux.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ -#if defined(DPLASMA_HAVE_HIP) -#include -#endif /* defined(DPLASMA_HAVE_HIP) */ - /* Define the different shapes this JDF is using */ #define A_SHAPE 0 #define B_SHAPE 1 diff --git a/src/zgemm_TT.jdf b/src/zgemm_TT.jdf index e3595ca3..4a4effdc 100644 --- a/src/zgemm_TT.jdf +++ b/src/zgemm_TT.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2023 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* @@ -8,16 +8,15 @@ extern "C" %{ * $COPYRIGHT * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" -#include "dplasmaaux.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - /* Define the different shapes this JDF is using */ #define A_SHAPE 0 #define B_SHAPE 1 diff --git a/src/zgemm_TT_summa.jdf b/src/zgemm_TT_summa.jdf index 99b3da2a..5b3eb373 100644 --- a/src/zgemm_TT_summa.jdf +++ b/src/zgemm_TT_summa.jdf @@ -8,19 +8,15 @@ extern "C" %{ * $COPYRIGHT * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" -#include "dplasmaaux.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ -#if defined(DPLASMA_HAVE_HIP) -#include -#endif /* defined(DPLASMA_HAVE_HIP) */ - /* Define the different shapes this JDF is using */ #define A_SHAPE 0 #define B_SHAPE 1 diff --git a/src/zgetrf_nopiv.jdf b/src/zgetrf_nopiv.jdf index d77bedda..04113181 100644 --- a/src/zgetrf_nopiv.jdf +++ b/src/zgetrf_nopiv.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2023 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. 
@@ -10,12 +10,12 @@ extern "C" %{ * @precisions normal z -> s d c * */ -#include "dplasmajdf.h" -#include "parsec/data_dist/matrix/matrix.h" - +#include "dplasma/config.h" #if defined(DPLASMA_HAVE_CUDA) #include #endif /* defined(DPLASMA_HAVE_CUDA) */ +#include "dplasmajdf.h" +#include "parsec/data_dist/matrix/matrix.h" /* Define the different shapes this JDF is using */ #define DEFAULT 0 diff --git a/src/zpoinv_L.jdf b/src/zpoinv_L.jdf index 309e4b86..4dc3e79c 100644 --- a/src/zpoinv_L.jdf +++ b/src/zpoinv_L.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. @@ -10,13 +10,12 @@ extern "C" %{ * @precisions normal z -> s d c * */ -#include "dplasmajdf.h" -#include "parsec/data_dist/matrix/matrix.h" - +#include "dplasma/config.h" #if defined(DPLASMA_HAVE_CUDA) #include #endif /* defined(DPLASMA_HAVE_CUDA) */ - +#include "dplasmajdf.h" +#include "parsec/data_dist/matrix/matrix.h" %} descA [type = "parsec_tiled_matrix_t*"] diff --git a/src/zpoinv_U.jdf b/src/zpoinv_U.jdf index 3db2a761..bdfc7459 100644 --- a/src/zpoinv_U.jdf +++ b/src/zpoinv_U.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. 
@@ -10,13 +10,12 @@ extern "C" %{ * @precisions normal z -> s d c * */ -#include "dplasmajdf.h" -#include "parsec/data_dist/matrix/matrix.h" - +#include "dplasma/config.h" #if defined(DPLASMA_HAVE_CUDA) #include #endif /* defined(DPLASMA_HAVE_CUDA) */ - +#include "dplasmajdf.h" +#include "parsec/data_dist/matrix/matrix.h" %} descA [type = "parsec_tiled_matrix_t*"] diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf index 6d3ba6d0..5a2e335c 100644 --- a/src/zpotrf_L.jdf +++ b/src/zpotrf_L.jdf @@ -11,9 +11,9 @@ extern "C" %{ * */ #include "dplasma/config.h" - #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" +#include "potrf_gpu_workspaces.h" #if defined(PARSEC_HAVE_DEV_RECURSIVE_SUPPORT) #include "parsec/data_dist/matrix/subtile.h" @@ -24,16 +24,6 @@ static void zherk_recursive_cb(parsec_taskpool_t* tp, const parsec_recursive_cal static void ztrsm_recursive_cb(parsec_taskpool_t* tp, const parsec_recursive_callback_t* data); #endif /* PARSEC_HAVE_DEV_RECURSIVE_SUPPORT */ -#if defined(DPLASMA_HAVE_CUDA) -#include -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ -#if defined(DPLASMA_HAVE_HIP) -#include -#include -#endif /* defined(DPLASMA_HAVE_HIP) */ -#include "potrf_gpu_workspaces.h" - /* Define the different shapes this JDF is using */ #define DEFAULT 0 diff --git a/src/zpotrf_U.jdf b/src/zpotrf_U.jdf index 0473ca84..67846484 100644 --- a/src/zpotrf_U.jdf +++ b/src/zpotrf_U.jdf @@ -11,7 +11,7 @@ extern "C" %{ * */ #include "dplasma/config.h" - +#include "potrf_gpu_workspaces.h" #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -24,16 +24,6 @@ static void zherk_recursive_cb(parsec_taskpool_t* tp, const parsec_recursive_cal static void ztrsm_recursive_cb(parsec_taskpool_t* tp, const parsec_recursive_callback_t* data); #endif /* PARSEC_HAVE_DEV_RECURSIVE_SUPPORT */ -#if defined(DPLASMA_HAVE_CUDA) -#include -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ -#if defined(DPLASMA_HAVE_HIP) -#include -#include -#endif /* 
defined(DPLASMA_HAVE_HIP) */ -#include "potrf_gpu_workspaces.h" - /* Define the different shapes this JDF is using */ #define DEFAULT 0 diff --git a/src/ztrsm_LLN.jdf b/src/ztrsm_LLN.jdf index b814593a..40661a3b 100644 --- a/src/ztrsm_LLN.jdf +++ b/src/ztrsm_LLN.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 + * Copyright (c) 2010-2024 * * The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -9,6 +9,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" diff --git a/src/ztrsm_LLT.jdf b/src/ztrsm_LLT.jdf index 2c0d708e..fec3febc 100644 --- a/src/ztrsm_LLT.jdf +++ b/src/ztrsm_LLT.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 + * Copyright (c) 2010-2024 * * The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -9,6 +9,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" diff --git a/src/ztrsm_LUN.jdf b/src/ztrsm_LUN.jdf index 838cc835..e71a4c0b 100644 --- a/src/ztrsm_LUN.jdf +++ b/src/ztrsm_LUN.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 + * Copyright (c) 2010-2024 * * The University of Tennessee and The University * of Tennessee Research Foundation. 
All rights @@ -9,6 +9,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" diff --git a/src/ztrsm_LUT.jdf b/src/ztrsm_LUT.jdf index d8eb0950..a009a7f2 100644 --- a/src/ztrsm_LUT.jdf +++ b/src/ztrsm_LUT.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 + * Copyright (c) 2010-2024 * * The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -9,6 +9,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" diff --git a/src/ztrsm_RLN.jdf b/src/ztrsm_RLN.jdf index 2cd6cc5e..759e95ad 100644 --- a/src/ztrsm_RLN.jdf +++ b/src/ztrsm_RLN.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 + * Copyright (c) 2010-2024 * * The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -9,6 +9,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" diff --git a/src/ztrsm_RLT.jdf b/src/ztrsm_RLT.jdf index bf799e47..12657a6a 100644 --- a/src/ztrsm_RLT.jdf +++ b/src/ztrsm_RLT.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 + * Copyright (c) 2010-2024 * * The University of Tennessee and The University * of Tennessee Research Foundation. 
All rights @@ -9,6 +9,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" diff --git a/src/ztrsm_RUN.jdf b/src/ztrsm_RUN.jdf index 9d2a59f6..e864bc9c 100644 --- a/src/ztrsm_RUN.jdf +++ b/src/ztrsm_RUN.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 + * Copyright (c) 2010-2024 * * The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -9,6 +9,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" diff --git a/src/ztrsm_RUT.jdf b/src/ztrsm_RUT.jdf index 6e4385f3..7b8c5e54 100644 --- a/src/ztrsm_RUT.jdf +++ b/src/ztrsm_RUT.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 + * Copyright (c) 2010-2024 * * The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -9,6 +9,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h"