From 79ab7a9cec365929875b2e6cf973d200b75956fe Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 12 Jan 2021 16:46:38 -0500 Subject: [PATCH 01/36] Initial hipify from perl script --- src/build_info.hpp.in | 2 +- src/device/kronmult_cuda.cpp | 34 +++++++++---------- src/kronmult.cpp | 4 +-- src/kronmult.hpp | 2 +- src/lib_dispatch.cpp | 64 ++++++++++++++++++------------------ src/lib_dispatch_tests.cpp | 6 ++-- src/program_options.cpp | 2 +- src/tensors.hpp | 18 +++++----- 8 files changed, 65 insertions(+), 67 deletions(-) diff --git a/src/build_info.hpp.in b/src/build_info.hpp.in index 089a874cd..019abe0b7 100644 --- a/src/build_info.hpp.in +++ b/src/build_info.hpp.in @@ -6,7 +6,7 @@ #define BUILD_TIME "@BUILD_TIME@" #cmakedefine ASGARD_IO_HIGHFIVE -#cmakedefine ASGARD_USE_CUDA +#cmakedefine ASGARD_USE_HIP #cmakedefine ASGARD_USE_OPENMP #cmakedefine ASGARD_USE_MPI #cmakedefine ASGARD_USE_MATLAB diff --git a/src/device/kronmult_cuda.cpp b/src/device/kronmult_cuda.cpp index 9cc8a9d85..c543531a0 100644 --- a/src/device/kronmult_cuda.cpp +++ b/src/device/kronmult_cuda.cpp @@ -1,9 +1,8 @@ #include "kronmult_cuda.hpp" #include "build_info.hpp" -#ifdef ASGARD_USE_CUDA -#include -#include +#ifdef ASGARD_USE_HIP +#include #define USE_GPU #define GLOBAL_FUNCTION __global__ #define SYNCTHREADS __syncthreads() @@ -47,7 +46,7 @@ GLOBAL_FUNCTION void stage_inputs_kronmult_kernel(P const *const x, P *const workspace, int const num_elems, int const num_copies) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP expect(blockIdx.y == 0); expect(blockIdx.z == 0); @@ -90,7 +89,7 @@ void stage_inputs_kronmult(P const *const x, P *const workspace, expect(num_elems > 0); expect(num_copies > 0); -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto constexpr warp_size = 32; auto constexpr num_warps = 8; @@ -99,8 +98,7 @@ void stage_inputs_kronmult(P const *const x, P *const workspace, auto const total_copies = static_cast(num_elems) * num_copies; auto const num_blocks = (total_copies + num_threads - 1) / num_threads; - stage_inputs_kronmult_kernel
<P>
-      <<<num_blocks, num_threads>>>(x, workspace, num_elems, num_copies);
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(stage_inputs_kronmult_kernel<P>
), dim3(num_blocks), dim3(num_threads), 0, 0, x, workspace, num_elems, num_copies); auto const stat = cudaDeviceSynchronize(); expect(stat == cudaSuccess); @@ -175,7 +173,7 @@ prepare_kronmult_kernel(int const *const flattened_table, auto const coord_size = num_dims * 2; auto const num_elems = static_cast(num_cols) * num_rows; -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP expect(blockIdx.y == 0); expect(blockIdx.z == 0); @@ -192,7 +190,7 @@ prepare_kronmult_kernel(int const *const flattened_table, auto const increment = 1; #endif -#ifndef ASGARD_USE_CUDA +#ifndef ASGARD_USE_HIP #ifdef ASGARD_USE_OPENMP #pragma omp parallel for #endif @@ -273,7 +271,7 @@ void prepare_kronmult(int const *const flattened_table, expect(input_ptrs); expect(output_ptrs); -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto constexpr warp_size = 32; auto constexpr num_warps = 8; auto constexpr num_threads = num_warps * warp_size; @@ -281,7 +279,7 @@ void prepare_kronmult(int const *const flattened_table, static_cast(elem_col_stop - elem_col_start + 1) * (elem_row_stop - elem_row_start + 1); auto const num_blocks = (num_krons / num_threads) + 1; - prepare_kronmult_kernel
<P>
-      <<<num_blocks, num_threads>>>(
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(prepare_kronmult_kernel<P>
), dim3(num_blocks), dim3(num_threads), 0, 0, flattened_table, operators, operator_lda, element_x, element_work, fx, operator_ptrs, work_ptrs, input_ptrs, output_ptrs, degree, num_terms, num_dims, elem_row_start, elem_row_stop, elem_col_start, elem_col_stop); @@ -304,7 +302,7 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[], P const *const operator_ptrs[], int const lda, int const num_krons, int const num_dims) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP { int constexpr warpsize = 32; int constexpr nwarps = 1; @@ -313,27 +311,27 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[], switch (num_dims) { case 1: - kronmult1_xbatched
<P>
-        <<<num_krons, num_threads>>>(
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_xbatched<P>
), dim3(num_krons), dim3(num_threads), 0, 0, n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); break; case 2: - kronmult2_xbatched
<P>
-        <<<num_krons, num_threads>>>(
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_xbatched<P>
), dim3(num_krons), dim3(num_threads), 0, 0, n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); break; case 3: - kronmult3_xbatched
<P>
-        <<<num_krons, num_threads>>>(
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_xbatched<P>
), dim3(num_krons), dim3(num_threads), 0, 0, n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); break; case 4: - kronmult4_xbatched
<P>
-        <<<num_krons, num_threads>>>(
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_xbatched<P>
), dim3(num_krons), dim3(num_threads), 0, 0, n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); break; case 5: - kronmult5_xbatched
<P>
-        <<<num_krons, num_threads>>>(
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_xbatched<P>
), dim3(num_krons), dim3(num_threads), 0, 0, n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); break; case 6: - kronmult6_xbatched
<P>
-        <<<num_krons, num_threads>>>(
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_xbatched<P>
), dim3(num_krons), dim3(num_threads), 0, 0, n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); break; default: diff --git a/src/kronmult.cpp b/src/kronmult.cpp index 44d8f0098..a040c28f4 100644 --- a/src/kronmult.cpp +++ b/src/kronmult.cpp @@ -3,8 +3,8 @@ #include "lib_dispatch.hpp" #include "tools.hpp" -#ifdef ASGARD_USE_CUDA -#include +#ifdef ASGARD_USE_HIP +#include #endif #ifdef ASGARD_USE_OPENMP diff --git a/src/kronmult.hpp b/src/kronmult.hpp index 47e2f46f3..ad6cdbbf7 100644 --- a/src/kronmult.hpp +++ b/src/kronmult.hpp @@ -1,5 +1,5 @@ #pragma once -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP #define USE_GPU #endif #include "distribution.hpp" diff --git a/src/lib_dispatch.cpp b/src/lib_dispatch.cpp index 15277a3b0..e39e952ba 100644 --- a/src/lib_dispatch.cpp +++ b/src/lib_dispatch.cpp @@ -60,9 +60,9 @@ extern "C" #pragma GCC diagnostic pop #endif -#ifdef ASGARD_USE_CUDA -#include -#include +#ifdef ASGARD_USE_HIP +#include +#include #endif #ifdef ASGARD_USE_SCALAPACK @@ -104,7 +104,7 @@ struct device_handler void set_device(int const local_rank) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP int num_devices; auto success = cudaGetDeviceCount(&num_devices); @@ -133,14 +133,14 @@ struct device_handler #endif ~device_handler() { -#ifdef ASGARD_USE_CUDA - cublasDestroy(handle); +#ifdef ASGARD_USE_HIP + hipblasDestroy(handle); #endif } private: -#ifdef ASGARD_USE_CUDA - cublasHandle_t handle; +#ifdef ASGARD_USE_HIP + hipblasHandle_t handle; #endif }; static device_handler device; @@ -155,16 +155,16 @@ void initialize_libraries(int const local_rank) #endif } -#ifdef ASGARD_USE_CUDA -inline cublasOperation_t cublas_trans(char trans) +#ifdef ASGARD_USE_HIP +inline hipblasOperation_t cublas_trans(char trans) { if (trans == 'N' || trans == 'n') { - return CUBLAS_OP_N; + return HIPBLAS_OP_N; } else { - return CUBLAS_OP_T; + return HIPBLAS_OP_T; } } #endif @@ -182,7 +182,7 @@ void rotg(P *a, P *b, P *c, P *s, resource const resrc) if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // function instantiated for these two fp types if constexpr (std::is_same::value) { @@ -219,7 +219,7 @@ P nrm2(int *n, P *x, int *incx, resource const resrc) if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); P norm; @@ -272,7 +272,7 @@ void copy(int *n, P *x, int *incx, P *y, int *incy, resource const resrc) if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); @@ -323,7 +323,7 @@ P dot(int *n, P *x, int *incx, P *y, int *incy, resource const resrc) if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); @@ -379,7 +379,7 @@ void axpy(int *n, P *alpha, P *x, int *incx, P *y, int *incy, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); @@ -429,7 +429,7 @@ void scal(int *n, P *alpha, P *x, int *incx, resource const resrc) if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); @@ -561,7 +561,7 @@ void gemv(char const *trans, int *m, int *n, P *alpha, P *A, int *lda, P *x, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); @@ -569,14 +569,14 @@ void gemv(char const *trans, int *m, int *n, P *alpha, P *A, int *lda, P *x, if constexpr (std::is_same::value) { auto const success = - cublasDgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha, + hipblasDgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha, A, *lda, x, *incx, beta, y, *incy); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - cublasSgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha, + hipblasSgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha, A, *lda, x, *incx, beta, y, *incy); expect(success == 0); } @@ -627,21 +627,21 @@ void gemm(char const *transa, char const *transb, int *m, int *n, int *k, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); // instantiated for these two fp types if constexpr (std::is_same::value) { - auto const success = cublasDgemm( + auto const success = hipblasDgemm( device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc); expect(success == 0); } else if constexpr (std::is_same::value) { - auto const success = cublasSgemm( + auto const success = hipblasSgemm( device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc); expect(success == 0); @@ -686,7 +686,7 @@ void getrf(int *m, int *n, P *A, int *lda, int *ipiv, int *info, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); @@ -747,7 +747,7 @@ void getri(int *n, P *A, int *lda, int *ipiv, P *work, int *lwork, int *info, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); @@ -824,7 +824,7 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); @@ -849,7 +849,7 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, // instantiated for these two fp types if constexpr (std::is_same::value) { - auto const success = cublasDgemmBatched( + auto const success = hipblasDgemmBatched( device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); auto const cuda_stat = cudaDeviceSynchronize(); @@ -858,7 +858,7 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, } else if constexpr (std::is_same::value) { - auto const success = cublasSgemmBatched( + auto const success = hipblasSgemmBatched( device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); auto const cuda_stat = cudaDeviceSynchronize(); @@ -912,7 +912,7 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); char const transb = 'n'; @@ -945,7 +945,7 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, // instantiated for these two fp types if constexpr (std::is_same::value) { - auto const success = cublasDgemmBatched( + auto const success = hipblasDgemmBatched( device.get_handle(), cublas_trans(*trans), cublas_trans(transb), gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc, *num_batch); @@ -955,7 +955,7 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, } else if constexpr (std::is_same::value) { - auto const success = cublasSgemmBatched( + auto const success = hipblasSgemmBatched( device.get_handle(), cublas_trans(*trans), cublas_trans(transb), gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc, *num_batch); diff --git a/src/lib_dispatch_tests.cpp b/src/lib_dispatch_tests.cpp index 87ea0c9ea..a6095cec0 100644 --- a/src/lib_dispatch_tests.cpp +++ b/src/lib_dispatch_tests.cpp @@ -793,7 +793,7 @@ TEMPLATE_TEST_CASE("dot product (lib_dispatch::dot)", "[lib_dispatch]", float, TEMPLATE_TEST_CASE("device inversion test (lib_dispatch::getrf/getri)", "[lib_dispatch]", float, double) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP fk::matrix const test{{0.767135868133925, -0.641484652834663}, {0.641484652834663, 0.767135868133926}}; @@ -810,7 +810,7 @@ TEMPLATE_TEST_CASE("device inversion test (lib_dispatch::getrf/getri)", lib_dispatch::getrf(&m, &n, test_d.data(), &lda, ipiv_d.data(), info_d.data(), resource::device); - auto stat = cudaDeviceSynchronize(); + auto stat = hipDeviceSynchronize(); REQUIRE(stat == 0); fk::vector const info_check(info_d.clone_onto_host()); REQUIRE(info_check(0) == 0); @@ -822,7 +822,7 @@ TEMPLATE_TEST_CASE("device inversion test (lib_dispatch::getrf/getri)", lib_dispatch::getri(&n, test_d.data(), &lda, ipiv_d.data(), work.data(), &size, info_d.data(), resource::device); - stat = cudaDeviceSynchronize(); + stat = hipDeviceSynchronize(); REQUIRE(stat == 0); fk::vector const info_check_2(info_d.clone_onto_host()); REQUIRE(info_check_2(0) == 0); diff --git a/src/program_options.cpp b/src/program_options.cpp index 548c17f3d..6663686ef 100644 --- a/src/program_options.cpp +++ b/src/program_options.cpp @@ -211,7 +211,7 @@ parser::parser(int argc, char **argv) } } -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP if (use_implicit_stepping) { std::cerr << "GPU acceleration not implemented for implicit stepping\n"; diff --git a/src/tensors.hpp b/src/tensors.hpp index 90eecd8c7..7b3560250 100644 --- a/src/tensors.hpp +++ b/src/tensors.hpp @@ -1,8 +1,8 @@ #pragma once #include "build_info.hpp" -#ifdef ASGARD_USE_CUDA -#include +#ifdef ASGARD_USE_HIP +#include #endif #include "lib_dispatch.hpp" @@ -683,7 +683,7 @@ template inline void copy_on_device(P *const dest, P const *const source, int const num_elems) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto const success = cudaMemcpy(dest, source, num_elems * sizeof(P), cudaMemcpyDeviceToDevice); expect(success == cudaSuccess); @@ -696,7 +696,7 @@ template inline void copy_to_device(P *const dest, P const *const source, int const num_elems) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto const success = cudaMemcpy(dest, source, num_elems * sizeof(P), cudaMemcpyHostToDevice); expect(success == cudaSuccess); @@ -709,7 +709,7 @@ template inline void copy_to_host(P *const dest, P const *const source, int const num_elems) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto const success = cudaMemcpy(dest, source, num_elems * sizeof(P), 
cudaMemcpyDeviceToHost); expect(success == cudaSuccess); @@ -726,9 +726,9 @@ copy_matrix_on_device(fk::matrix &dest, expect(source.nrows() == dest.nrows()); expect(source.ncols() == dest.ncols()); -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto const success = - cudaMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), + hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), source.ncols(), cudaMemcpyDeviceToDevice); expect(success == 0); @@ -747,7 +747,7 @@ copy_matrix_to_device(fk::matrix &dest, expect(source.ncols() == dest.ncols()); #ifdef ASGARD_USE_CUDA auto const success = - cudaMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), + hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), source.ncols(), cudaMemcpyHostToDevice); expect(success == 0); @@ -766,7 +766,7 @@ copy_matrix_to_host(fk::matrix &dest, expect(source.ncols() == dest.ncols()); #ifdef ASGARD_USE_CUDA auto const success = - cudaMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), + hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), source.ncols(), cudaMemcpyDeviceToHost); expect(success == 0); From f70d02bc34920550bf7b0da63094383f92be4fd5 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 26 Jan 2021 11:20:49 -0500 Subject: [PATCH 02/36] WIP HIP CMake configuration --- CMakeLists.txt | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d6ea3fee9..5fc2ccdc5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,7 +78,7 @@ option (ASGARD_PROFILE_PERF "enable profiling support for using linux perf" "") option (ASGARD_PROFILE_VALGRIND "enable profiling support for using valgrind" "") option (ASGARD_GRAPHVIZ_PATH "optional location of bin/ containing dot executable" "") option (ASGARD_IO_HIGHFIVE "Use the HighFive HDF5 header library for I/O" OFF) -option (ASGARD_USE_CUDA "Optional CUDA support for asgard" OFF) +option (ASGARD_USE_HIP "Optional HIP support for asgard" ON) option (ASGARD_USE_OPENMP "Optional openMP support for asgard" ON) option (ASGARD_USE_MPI "Optional distributed computing support for asgard" OFF) include(CMakeDependentOption) @@ -136,12 +136,30 @@ if(ASGARD_USE_OPENMP) endif() endif() -if(ASGARD_USE_CUDA) - find_package(CUDA 9.0 REQUIRED) # eventually want to remove this - how to set min version with enable_language? 
- include_directories(${CUDA_INCLUDE_DIRS}) - enable_language(CUDA) - set (CMAKE_CUDA_STANDARD 14) - set (CMAKE_CUDA_STANDARD_REQUIRED ON) +if(ASGARD_USE_HIP) + # find the HIP directory for the cmake files, taken from HIP samples + if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() + endif() + set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) + + find_package(HIP 4.0 REQUIRED) + # Print some debug info about HIP configuration + message(STATUS "HIP PLATFORM: ${HIP_PLATFORM}") + message(STATUS "HIP COMPILER: ${HIP_COMPILER}") + message(STATUS "HIP RUNTIME: ${HIP_RUNTIME}") + message(STATUS "HIP Include dir: ${HIP_INCLUDE_DIRS}") + message(STATUS "HIP Libraries: ${HIP_LIBRARIES}") + #include_directories(${HIP_INCLUDE_DIRS}) + message(STATUS ${HIP_INCLUDE_DIRS}) + + #enable_language(HIP) + #set (CMAKE_CUDA_STANDARD 14) + #set (CMAKE_CUDA_STANDARD_REQUIRED ON) endif() # build component to interface with Ed's kronmult lib @@ -158,7 +176,7 @@ if(ASGARD_USE_CUDA) endif() if(ASGARD_USE_MKL) - if(ASGARD_USE_CUDA) + if(ASGARD_USE_HIP) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options -fopenmp") else() target_compile_options (kronmult_cuda PRIVATE "-fopenmp") # CMAKE doesn't handle MKL openmp link properly @@ -330,8 +348,8 @@ target_link_libraries (quadrature PRIVATE matlab_utilities tensors) target_link_libraries (solver PRIVATE distribution fast_math lib_dispatch tensors) target_link_libraries (tensors PRIVATE lib_dispatch) -if (ASGARD_USE_CUDA) - target_link_libraries (tensors PRIVATE ${CUDA_LIBRARIES}) +if (ASGARD_USE_HIP) + target_link_libraries (tensors PRIVATE ${HIP_LIBRARIES}) endif () if (ASGARD_USE_SCALAPACK) add_compile_definitions (ASGARD_USE_SCALAPACK) From 0e65742bc27602d6c222cbde1902703b80a13b71 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 2 Feb 2021 11:19:32 -0500 Subject: [PATCH 03/36] Hipifying some missed parts --- src/device/kronmult_cuda.cpp | 13 ++-- src/lib_dispatch.cpp | 112 +++++++++++++++++------------------ src/tensors.hpp | 30 +++++----- 3 files changed, 78 insertions(+), 77 deletions(-) diff --git a/src/device/kronmult_cuda.cpp b/src/device/kronmult_cuda.cpp index c543531a0..9e5fcb139 100644 --- a/src/device/kronmult_cuda.cpp +++ b/src/device/kronmult_cuda.cpp @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" #include "kronmult_cuda.hpp" #include "build_info.hpp" @@ -100,8 +101,8 @@ void stage_inputs_kronmult(P const *const x, P *const workspace, hipLaunchKernelGGL(HIP_KERNEL_NAME(stage_inputs_kronmult_kernel
<P>
), dim3(num_blocks), dim3(num_threads), 0, 0, x, workspace, num_elems, num_copies); - auto const stat = cudaDeviceSynchronize(); - expect(stat == cudaSuccess); + auto const stat = hipDeviceSynchronize(); + expect(stat == hipSuccess); #else stage_inputs_kronmult_kernel(x, workspace, num_elems, num_copies); #endif @@ -283,8 +284,8 @@ void prepare_kronmult(int const *const flattened_table, flattened_table, operators, operator_lda, element_x, element_work, fx, operator_ptrs, work_ptrs, input_ptrs, output_ptrs, degree, num_terms, num_dims, elem_row_start, elem_row_stop, elem_col_start, elem_col_stop); - auto const stat = cudaDeviceSynchronize(); - expect(stat == cudaSuccess); + auto const stat = hipDeviceSynchronize(); + expect(stat == hipSuccess); #else prepare_kronmult_kernel( flattened_table, operators, operator_lda, element_x, element_work, fx, @@ -341,8 +342,8 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[], // ------------------------------------------- // note important to wait for kernel to finish // ------------------------------------------- - auto const stat = cudaDeviceSynchronize(); - expect(stat == cudaSuccess); + auto const stat = hipDeviceSynchronize(); + expect(stat == hipSuccess); } #else diff --git a/src/lib_dispatch.cpp b/src/lib_dispatch.cpp index e39e952ba..d6dbbe1cf 100644 --- a/src/lib_dispatch.cpp +++ b/src/lib_dispatch.cpp @@ -61,8 +61,8 @@ extern "C" #endif #ifdef ASGARD_USE_HIP -#include #include +#include #endif #ifdef ASGARD_USE_SCALAPACK @@ -94,11 +94,11 @@ struct device_handler device_handler() { #ifdef ASGARD_USE_CUDA - auto success = cublasCreate(&handle); - expect(success == CUBLAS_STATUS_SUCCESS); + auto success = hipblasCreate(&handle); + expect(success == HIPBLAS_STATUS_SUCCESS); - success = cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST); - expect(success == CUBLAS_STATUS_SUCCESS); + success = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); + expect(success == HIPBLAS_STATUS_SUCCESS); #endif } @@ -106,22 +106,22 @@ struct device_handler { #ifdef ASGARD_USE_HIP int num_devices; - auto success = cudaGetDeviceCount(&num_devices); + auto success = hipGetDeviceCount(&num_devices); - expect(success == cudaSuccess); + expect(success == hipSuccess); expect(local_rank >= 0); expect(local_rank < num_devices); if (handle) { - auto const cublas_success = cublasDestroy(handle); - expect(cublas_success == CUBLAS_STATUS_SUCCESS); + auto const cublas_success = hipblasDestroy(handle); + expect(cublas_success == HIPBLAS_STATUS_SUCCESS); } - success = cudaSetDevice(local_rank); - expect(success == cudaSuccess); - auto const cublas_success = cublasCreate(&handle); - expect(cublas_success == CUBLAS_STATUS_SUCCESS); + success = hipSetDevice(local_rank); + expect(success == hipSuccess); + auto const cublas_success = hipblasCreate(&handle); + expect(cublas_success == HIPBLAS_STATUS_SUCCESS); #else ignore(local_rank); @@ -129,7 +129,7 @@ struct device_handler } #ifdef ASGARD_USE_CUDA - cublasHandle_t const &get_handle() const { return handle; } + hipblasHandle_t const &get_handle() const { return handle; } #endif ~device_handler() { @@ -186,12 +186,12 @@ void rotg(P *a, P *b, P *c, P *s, resource const resrc) // function instantiated for these two fp types if constexpr (std::is_same::value) { - auto const success = cublasDrotg(device.get_handle(), a, b, c, s); + auto const success = hipblasDrotg(device.get_handle(), a, b, c, s); expect(success == 0); } else if constexpr (std::is_same::value) { - auto const success = 
cublasSrotg(device.get_handle(), a, b, c, s); + auto const success = hipblasSrotg(device.get_handle(), a, b, c, s); expect(success == 0); } return; @@ -227,13 +227,13 @@ P nrm2(int *n, P *x, int *incx, resource const resrc) if constexpr (std::is_same::value) { auto const success = - cublasDnrm2(device.get_handle(), *n, x, *incx, &norm); + hipblasDnrm2(device.get_handle(), *n, x, *incx, &norm); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - cublasSnrm2(device.get_handle(), *n, x, *incx, &norm); + hipblasSnrm2(device.get_handle(), *n, x, *incx, &norm); expect(success == 0); } return norm; @@ -280,13 +280,13 @@ void copy(int *n, P *x, int *incx, P *y, int *incy, resource const resrc) if constexpr (std::is_same::value) { auto const success = - cublasDcopy(device.get_handle(), *n, x, *incx, y, *incy); + hipblasDcopy(device.get_handle(), *n, x, *incx, y, *incy); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - cublasScopy(device.get_handle(), *n, x, *incx, y, *incy); + hipblasScopy(device.get_handle(), *n, x, *incx, y, *incy); expect(success == 0); } return; @@ -332,13 +332,13 @@ P dot(int *n, P *x, int *incx, P *y, int *incy, resource const resrc) if constexpr (std::is_same::value) { auto const success = - cublasDdot(device.get_handle(), *n, x, *incx, y, *incy, &result); + hipblasDdot(device.get_handle(), *n, x, *incx, y, *incy, &result); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - cublasSdot(device.get_handle(), *n, x, *incx, y, *incy, &result); + hipblasSdot(device.get_handle(), *n, x, *incx, y, *incy, &result); expect(success == 0); } return result; @@ -387,13 +387,13 @@ void axpy(int *n, P *alpha, P *x, int *incx, P *y, int *incy, if constexpr (std::is_same::value) { auto const success = - cublasDaxpy(device.get_handle(), *n, alpha, x, *incx, y, *incy); + hipblasDaxpy(device.get_handle(), *n, alpha, x, *incx, y, *incy); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - cublasSaxpy(device.get_handle(), *n, alpha, x, *incx, y, *incy); + hipblasSaxpy(device.get_handle(), *n, alpha, x, *incx, y, *incy); expect(success == 0); } return; @@ -437,13 +437,13 @@ void scal(int *n, P *alpha, P *x, int *incx, resource const resrc) if constexpr (std::is_same::value) { auto const success = - cublasDscal(device.get_handle(), *n, alpha, x, *incx); + hipblasDscal(device.get_handle(), *n, alpha, x, *incx); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - cublasSscal(device.get_handle(), *n, alpha, x, *incx); + hipblasSscal(device.get_handle(), *n, alpha, x, *incx); expect(success == 0); } return; @@ -570,14 +570,14 @@ void gemv(char const *trans, int *m, int *n, P *alpha, P *A, int *lda, P *x, { auto const success = hipblasDgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha, - A, *lda, x, *incx, beta, y, *incy); + A, *lda, x, *incx, beta, y, *incy); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = hipblasSgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha, - A, *lda, x, *incx, beta, y, *incy); + A, *lda, x, *incx, beta, y, *incy); expect(success == 0); } return; @@ -694,9 +694,9 @@ void getrf(int *m, int *n, P *A, int *lda, int *ipiv, int *info, ignore(m); P **A_d; - auto stat = cudaMalloc((void **)&A_d, sizeof(P *)); + auto stat = hipMalloc((void **)&A_d, sizeof(P *)); expect(stat == 0); - stat = cudaMemcpy(A_d, &A, sizeof(P *), 
cudaMemcpyHostToDevice); + stat = hipMemcpy(A_d, &A, sizeof(P *), hipMemcpyHostToDevice); expect(stat == 0); // instantiated for these two fp types @@ -757,14 +757,14 @@ void getri(int *n, P *A, int *lda, int *ipiv, P *work, int *lwork, int *info, P const **A_d; P **work_d; - auto stat = cudaMalloc((void **)&A_d, sizeof(P *)); + auto stat = hipMalloc((void **)&A_d, sizeof(P *)); expect(stat == 0); - stat = cudaMalloc((void **)&work_d, sizeof(P *)); + stat = hipMalloc((void **)&work_d, sizeof(P *)); expect(stat == 0); - stat = cudaMemcpy(A_d, &A, sizeof(P *), cudaMemcpyHostToDevice); + stat = hipMemcpy(A_d, &A, sizeof(P *), hipMemcpyHostToDevice); expect(stat == 0); - stat = cudaMemcpy(work_d, &work, sizeof(P *), cudaMemcpyHostToDevice); + stat = hipMemcpy(work_d, &work, sizeof(P *), hipMemcpyHostToDevice); expect(stat == 0); // instantiated for these two fp types @@ -833,17 +833,17 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, P **c_d; size_t const list_size = *num_batch * sizeof(P *); - auto stat = cudaMalloc((void **)&a_d, list_size); + auto stat = hipMalloc((void **)&a_d, list_size); expect(stat == 0); - stat = cudaMalloc((void **)&b_d, list_size); + stat = hipMalloc((void **)&b_d, list_size); expect(stat == 0); - stat = cudaMalloc((void **)&c_d, list_size); + stat = hipMalloc((void **)&c_d, list_size); expect(stat == 0); - stat = cudaMemcpy(a_d, a, list_size, cudaMemcpyHostToDevice); + stat = hipMemcpy(a_d, a, list_size, hipMemcpyHostToDevice); expect(stat == 0); - stat = cudaMemcpy(b_d, b, list_size, cudaMemcpyHostToDevice); + stat = hipMemcpy(b_d, b, list_size, hipMemcpyHostToDevice); expect(stat == 0); - stat = cudaMemcpy(c_d, c, list_size, cudaMemcpyHostToDevice); + stat = hipMemcpy(c_d, c, list_size, hipMemcpyHostToDevice); expect(stat == 0); // instantiated for these two fp types @@ -852,7 +852,7 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, auto const success = hipblasDgemmBatched( device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); - auto const cuda_stat = cudaDeviceSynchronize(); + auto const cuda_stat = hipDeviceSynchronize(); expect(cuda_stat == 0); expect(success == 0); } @@ -861,16 +861,16 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, auto const success = hipblasSgemmBatched( device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); - auto const cuda_stat = cudaDeviceSynchronize(); + auto const cuda_stat = hipDeviceSynchronize(); expect(cuda_stat == 0); expect(success == 0); } - stat = cudaFree(a_d); + stat = hipFree(a_d); expect(stat == 0); - stat = cudaFree(b_d); + stat = hipFree(b_d); expect(stat == 0); - stat = cudaFree(c_d); + stat = hipFree(c_d); expect(stat == 0); return; @@ -929,17 +929,17 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, P **y_d; size_t const list_size = *num_batch * sizeof(P *); - auto stat = cudaMalloc((void **)&a_d, list_size); + auto stat = hipMalloc((void **)&a_d, list_size); expect(stat == 0); - stat = cudaMalloc((void **)&x_d, list_size); + stat = hipMalloc((void **)&x_d, list_size); expect(stat == 0); - stat = cudaMalloc((void **)&y_d, list_size); + stat = hipMalloc((void **)&y_d, list_size); expect(stat == 0); - stat = cudaMemcpy(a_d, a, list_size, cudaMemcpyHostToDevice); + stat = hipMemcpy(a_d, a, list_size, hipMemcpyHostToDevice); expect(stat == 
0); - stat = cudaMemcpy(x_d, x, list_size, cudaMemcpyHostToDevice); + stat = hipMemcpy(x_d, x, list_size, hipMemcpyHostToDevice); expect(stat == 0); - stat = cudaMemcpy(y_d, y, list_size, cudaMemcpyHostToDevice); + stat = hipMemcpy(y_d, y, list_size, hipMemcpyHostToDevice); expect(stat == 0); // instantiated for these two fp types @@ -949,7 +949,7 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, device.get_handle(), cublas_trans(*trans), cublas_trans(transb), gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc, *num_batch); - auto const cuda_stat = cudaDeviceSynchronize(); + auto const cuda_stat = hipDeviceSynchronize(); expect(cuda_stat == 0); expect(success == 0); } @@ -959,16 +959,16 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, device.get_handle(), cublas_trans(*trans), cublas_trans(transb), gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc, *num_batch); - auto const cuda_stat = cudaDeviceSynchronize(); + auto const cuda_stat = hipDeviceSynchronize(); expect(cuda_stat == 0); expect(success == 0); } - stat = cudaFree(a_d); + stat = hipFree(a_d); expect(stat == 0); - stat = cudaFree(x_d); + stat = hipFree(x_d); expect(stat == 0); - stat = cudaFree(y_d); + stat = hipFree(y_d); expect(stat == 0); return; diff --git a/src/tensors.hpp b/src/tensors.hpp index 7b3560250..1e08065b1 100644 --- a/src/tensors.hpp +++ b/src/tensors.hpp @@ -640,8 +640,8 @@ inline void allocate_device(P *&ptr, int const num_elems, bool const initialize = true) { #ifdef ASGARD_USE_CUDA - auto success = cudaMalloc((void **)&ptr, num_elems * sizeof(P)); - assert(success == cudaSuccess); + auto success = hipMalloc((void **)&ptr, num_elems * sizeof(P)); + assert(success == hipSuccess); if (num_elems > 0) { expect(ptr != nullptr); @@ -649,8 +649,8 @@ allocate_device(P *&ptr, int const num_elems, bool const initialize = true) if (initialize) { - success = cudaMemset((void *)ptr, 0, num_elems * sizeof(P)); - expect(success == cudaSuccess); + success = hipMemset((void *)ptr, 0, num_elems * sizeof(P)); + expect(success == hipSuccess); } #else @@ -669,11 +669,11 @@ template inline void delete_device(P *const ptr) { #ifdef ASGARD_USE_CUDA - auto const success = cudaFree(ptr); + auto const success = hipFree(ptr); // the device runtime may be unloaded at process shut down // (when static storage duration destructors are called) // returning a cudartUnloading error code. 
- expect((success == cudaSuccess) || (success == cudaErrorCudartUnloading)); + expect((success == hipSuccess) || (success == hipErrorDeinitialized)); #else delete[] ptr; #endif @@ -685,8 +685,8 @@ copy_on_device(P *const dest, P const *const source, int const num_elems) { #ifdef ASGARD_USE_HIP auto const success = - cudaMemcpy(dest, source, num_elems * sizeof(P), cudaMemcpyDeviceToDevice); - expect(success == cudaSuccess); + hipMemcpy(dest, source, num_elems * sizeof(P), hipMemcpyDeviceToDevice); + expect(success == hipSuccess); #else std::copy(source, source + num_elems, dest); #endif @@ -698,8 +698,8 @@ copy_to_device(P *const dest, P const *const source, int const num_elems) { #ifdef ASGARD_USE_HIP auto const success = - cudaMemcpy(dest, source, num_elems * sizeof(P), cudaMemcpyHostToDevice); - expect(success == cudaSuccess); + hipMemcpy(dest, source, num_elems * sizeof(P), hipMemcpyHostToDevice); + expect(success == hipSuccess); #else std::copy(source, source + num_elems, dest); #endif @@ -711,8 +711,8 @@ copy_to_host(P *const dest, P const *const source, int const num_elems) { #ifdef ASGARD_USE_HIP auto const success = - cudaMemcpy(dest, source, num_elems * sizeof(P), cudaMemcpyDeviceToHost); - expect(success == cudaSuccess); + hipMemcpy(dest, source, num_elems * sizeof(P), hipMemcpyDeviceToHost); + expect(success == hipSuccess); #else std::copy(source, source + num_elems, dest); #endif @@ -730,7 +730,7 @@ copy_matrix_on_device(fk::matrix &dest, auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), - source.ncols(), cudaMemcpyDeviceToDevice); + source.ncols(), hipMemcpyDeviceToDevice); expect(success == 0); #else std::copy(source.begin(), source.end(), dest.begin()); @@ -749,7 +749,7 @@ copy_matrix_to_device(fk::matrix &dest, auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), - source.ncols(), cudaMemcpyHostToDevice); + source.ncols(), hipMemcpyHostToDevice); expect(success == 0); #else std::copy(source.begin(), source.end(), dest.begin()); @@ -768,7 +768,7 @@ copy_matrix_to_host(fk::matrix &dest, auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), - source.ncols(), cudaMemcpyDeviceToHost); + source.ncols(), hipMemcpyDeviceToHost); expect(success == 0); #else std::copy(source.begin(), source.end(), dest.begin()); From 3daf6459552207c13391db2a71a6416cdc9a672e Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 2 Feb 2021 12:03:51 -0500 Subject: [PATCH 04/36] Updating formatting for hipified files --- src/device/kronmult_cuda.cpp | 52 +++++++++++++-------- src/tensors.hpp | 88 ++++++++++++++++++------------------ 2 files changed, 78 insertions(+), 62 deletions(-) diff --git a/src/device/kronmult_cuda.cpp b/src/device/kronmult_cuda.cpp index 9e5fcb139..52bf40f90 100644 --- a/src/device/kronmult_cuda.cpp +++ b/src/device/kronmult_cuda.cpp @@ -1,6 +1,6 @@ -#include "hip/hip_runtime.h" #include "kronmult_cuda.hpp" #include "build_info.hpp" +#include "hip/hip_runtime.h" #ifdef ASGARD_USE_HIP #include @@ -99,7 +99,9 @@ void stage_inputs_kronmult(P const *const x, P *const workspace, auto const total_copies = static_cast(num_elems) * num_copies; auto const num_blocks = (total_copies + num_threads - 1) / num_threads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(stage_inputs_kronmult_kernel
<P>
), dim3(num_blocks), dim3(num_threads), 0, 0, x, workspace, num_elems, num_copies); + hipLaunchKernelGGL(HIP_KERNEL_NAME(stage_inputs_kronmult_kernel
<P>
), + dim3(num_blocks), dim3(num_threads), 0, 0, x, workspace, + num_elems, num_copies); auto const stat = hipDeviceSynchronize(); expect(stat == hipSuccess); @@ -280,10 +282,12 @@ void prepare_kronmult(int const *const flattened_table, static_cast(elem_col_stop - elem_col_start + 1) * (elem_row_stop - elem_row_start + 1); auto const num_blocks = (num_krons / num_threads) + 1; - hipLaunchKernelGGL(HIP_KERNEL_NAME(prepare_kronmult_kernel
<P>
), dim3(num_blocks), dim3(num_threads), 0, 0, - flattened_table, operators, operator_lda, element_x, element_work, fx, - operator_ptrs, work_ptrs, input_ptrs, output_ptrs, degree, num_terms, - num_dims, elem_row_start, elem_row_stop, elem_col_start, elem_col_stop); + hipLaunchKernelGGL(HIP_KERNEL_NAME(prepare_kronmult_kernel
<P>
), + dim3(num_blocks), dim3(num_threads), 0, 0, flattened_table, + operators, operator_lda, element_x, element_work, fx, + operator_ptrs, work_ptrs, input_ptrs, output_ptrs, degree, + num_terms, num_dims, elem_row_start, elem_row_stop, + elem_col_start, elem_col_stop); auto const stat = hipDeviceSynchronize(); expect(stat == hipSuccess); #else @@ -312,28 +316,40 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[], switch (num_dims) { case 1: - hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_xbatched
<P>
), dim3(num_krons), dim3(num_threads), 0, 0, - n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_xbatched
<P>
), + dim3(num_krons), dim3(num_threads), 0, 0, n, + operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, + num_krons); break; case 2: - hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_xbatched
<P>
), dim3(num_krons), dim3(num_threads), 0, 0, - n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_xbatched
<P>
), + dim3(num_krons), dim3(num_threads), 0, 0, n, + operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, + num_krons); break; case 3: - hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_xbatched
<P>
), dim3(num_krons), dim3(num_threads), 0, 0, - n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_xbatched
<P>
), + dim3(num_krons), dim3(num_threads), 0, 0, n, + operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, + num_krons); break; case 4: - hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_xbatched
<P>
), dim3(num_krons), dim3(num_threads), 0, 0, - n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_xbatched
<P>
), + dim3(num_krons), dim3(num_threads), 0, 0, n, + operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, + num_krons); break; case 5: - hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_xbatched
<P>
), dim3(num_krons), dim3(num_threads), 0, 0, - n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_xbatched
<P>
), + dim3(num_krons), dim3(num_threads), 0, 0, n, + operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, + num_krons); break; case 6: - hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_xbatched
<P>
), dim3(num_krons), dim3(num_threads), 0, 0, - n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_xbatched
<P>
), + dim3(num_krons), dim3(num_threads), 0, 0, n, + operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, + num_krons); break; default: expect(false); diff --git a/src/tensors.hpp b/src/tensors.hpp index 1e08065b1..dd27cbf07 100644 --- a/src/tensors.hpp +++ b/src/tensors.hpp @@ -240,10 +240,10 @@ class vector template> vector
<P>
operator-(vector const &right) const; template> - P operator*(vector const &)const; + P operator*(vector const &) const; template> - vector
<P>
operator*(matrix const &)const; + vector
<P>
operator*(matrix const &) const; template> vector
<P>
operator*(P const) const; @@ -475,9 +475,9 @@ class matrix template> matrix
<P>
operator*(P const) const; template> - vector
<P>
operator*(vector const &)const; + vector
<P>
operator*(vector const &) const; template> - matrix
<P>
operator*(matrix const &)const; + matrix
<P>
operator*(matrix const &) const; template> matrix
<P>
operator+(matrix const &) const; template> @@ -729,8 +729,8 @@ copy_matrix_on_device(fk::matrix &dest, #ifdef ASGARD_USE_HIP auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), - source.stride() * sizeof(P), source.nrows() * sizeof(P), - source.ncols(), hipMemcpyDeviceToDevice); + source.stride() * sizeof(P), source.nrows() * sizeof(P), + source.ncols(), hipMemcpyDeviceToDevice); expect(success == 0); #else std::copy(source.begin(), source.end(), dest.begin()); @@ -748,8 +748,8 @@ copy_matrix_to_device(fk::matrix &dest, #ifdef ASGARD_USE_CUDA auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), - source.stride() * sizeof(P), source.nrows() * sizeof(P), - source.ncols(), hipMemcpyHostToDevice); + source.stride() * sizeof(P), source.nrows() * sizeof(P), + source.ncols(), hipMemcpyHostToDevice); expect(success == 0); #else std::copy(source.begin(), source.end(), dest.begin()); @@ -767,8 +767,8 @@ copy_matrix_to_host(fk::matrix &dest, #ifdef ASGARD_USE_CUDA auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), - source.stride() * sizeof(P), source.nrows() * sizeof(P), - source.ncols(), hipMemcpyDeviceToHost); + source.stride() * sizeof(P), source.nrows() * sizeof(P), + source.ncols(), hipMemcpyDeviceToHost); expect(success == 0); #else std::copy(source.begin(), source.end(), dest.begin()); @@ -996,8 +996,8 @@ fk::vector::vector(vector const &a) // http://stackoverflow.com/questions/3279543/what-is-the-copy-and-swap-idiom // template -fk::vector &fk::vector:: -operator=(vector const &a) +fk::vector & +fk::vector::operator=(vector const &a) { static_assert(mem != mem_type::const_view, "cannot copy assign into const_view!"); @@ -1042,8 +1042,8 @@ fk::vector::vector(vector &&a) // vector move assignment // template -fk::vector &fk::vector:: -operator=(vector &&a) +fk::vector & +fk::vector::operator=(vector &&a) { static_assert(mem != mem_type::const_view, "cannot move assign into const_view!"); @@ -1089,8 +1089,8 @@ fk::vector::vector(vector const &a) // template template -fk::vector &fk::vector:: -operator=(vector const &a) +fk::vector & +fk::vector::operator=(vector const &a) { expect(size() == a.size()); @@ -1124,8 +1124,8 @@ fk::vector::vector(vector const &a) // assignment owner <-> view template template -fk::vector &fk::vector:: -operator=(vector const &a) +fk::vector & +fk::vector::operator=(vector const &a) { expect(size() == a.size()); if constexpr (resrc == resource::host) @@ -1191,8 +1191,8 @@ fk::vector &fk::vector::transfer_from( // template template -fk::vector &fk::vector:: -operator=(std::vector
<P>
const &v) +fk::vector & +fk::vector::operator=(std::vector
<P>
const &v) { expect(size() == static_cast(v.size())); std::memcpy(data_, v.data(), v.size() * sizeof(P)); @@ -1278,8 +1278,8 @@ bool fk::vector::operator<(vector const &other) const // template template -fk::vector
<P>
fk::vector:: -operator+(vector const &right) const +fk::vector
<P>
+fk::vector::operator+(vector const &right) const { expect(size() == right.size()); vector
<P>
ans(size()); @@ -1293,8 +1293,8 @@ operator+(vector const &right) const // template template -fk::vector
<P>
fk::vector:: -operator-(vector const &right) const +fk::vector
<P>
+fk::vector::operator-(vector const &right) const { expect(size() == right.size()); vector
<P>
ans(size()); @@ -1323,8 +1323,8 @@ P fk::vector::operator*(vector const &right) const // template template -fk::vector
<P>
fk::vector:: -operator*(fk::matrix const &A) const +fk::vector
<P>
+fk::vector::operator*(fk::matrix const &A) const { // check dimension compatibility expect(size() == A.nrows()); @@ -1780,8 +1780,8 @@ fk::matrix::matrix(matrix const &a) // http://stackoverflow.com/questions/3279543/what-is-the-copy-and-swap-idiom // template -fk::matrix &fk::matrix:: -operator=(matrix const &a) +fk::matrix & +fk::matrix::operator=(matrix const &a) { static_assert(mem != mem_type::const_view, "cannot copy assign into const_view!"); @@ -1834,8 +1834,8 @@ fk::matrix::matrix(matrix const &a) // assignment owner <-> view template template -fk::matrix &fk::matrix:: -operator=(matrix const &a) +fk::matrix & +fk::matrix::operator=(matrix const &a) { expect(nrows() == a.nrows()); expect(ncols() == a.ncols()); @@ -1874,8 +1874,8 @@ fk::matrix::matrix(matrix const &a) // template template -fk::matrix &fk::matrix:: -operator=(matrix const &a) +fk::matrix & +fk::matrix::operator=(matrix const &a) { expect((nrows() == a.nrows()) && (ncols() == a.ncols())); @@ -1966,8 +1966,8 @@ fk::matrix::matrix(matrix &&a) // matrix move assignment // template -fk::matrix &fk::matrix:: -operator=(matrix &&a) +fk::matrix & +fk::matrix::operator=(matrix &&a) { static_assert(mem != mem_type::const_view, "cannot move assign into const_view!"); @@ -1998,8 +1998,8 @@ operator=(matrix &&a) // template template -fk::matrix &fk::matrix:: -operator=(fk::vector const &v) +fk::matrix & +fk::matrix::operator=(fk::vector const &v) { expect(nrows() * ncols() == v.size()); @@ -2085,8 +2085,8 @@ bool fk::matrix::operator<(matrix const &other) const // template template -fk::matrix
<P>
fk::matrix:: -operator+(matrix const &right) const +fk::matrix
<P>
+fk::matrix::operator+(matrix const &right) const { expect(nrows() == right.nrows() && ncols() == right.ncols()); @@ -2106,8 +2106,8 @@ operator+(matrix const &right) const // template template -fk::matrix
<P>
fk::matrix:: -operator-(matrix const &right) const +fk::matrix
<P>
+fk::matrix::operator-(matrix const &right) const { expect(nrows() == right.nrows() && ncols() == right.ncols()); @@ -2145,8 +2145,8 @@ fk::matrix
<P>
fk::matrix::operator*(P const right) const // template template -fk::vector
<P>
fk::matrix:: -operator*(fk::vector const &right) const +fk::vector
<P>
+fk::matrix::operator*(fk::vector const &right) const { // check dimension compatibility expect(ncols() == right.size()); @@ -2172,8 +2172,8 @@ operator*(fk::vector const &right) const // template template -fk::matrix
<P>
fk::matrix:: -operator*(matrix const &B) const +fk::matrix
<P>
+fk::matrix::operator*(matrix const &B) const { expect(ncols() == B.nrows()); // k == k From e2491dc2debf7479d4d892be18ba4bd81b2956bd Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 15 Feb 2021 14:18:02 -0500 Subject: [PATCH 05/36] Minor cmake adjustments for hip --- CMakeLists.txt | 27 ++++++++++++++++++--------- src/lib_dispatch.cpp | 6 +++--- src/tensors.hpp | 8 ++++---- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5fc2ccdc5..da414be06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -157,18 +157,26 @@ if(ASGARD_USE_HIP) #include_directories(${HIP_INCLUDE_DIRS}) message(STATUS ${HIP_INCLUDE_DIRS}) - #enable_language(HIP) + #enable_language(HIP) # no cmake support yet? #set (CMAKE_CUDA_STANDARD 14) #set (CMAKE_CUDA_STANDARD_REQUIRED ON) + + + #find_package(hipBLAS) + #if (hipBLAS_FOUND) + # message(STATUS "Found rocBLAS version ${rocBLAS_VERSION}") + #endif() + endif() # build component to interface with Ed's kronmult lib ##TODO: link to kronmult as interface library add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) -if(ASGARD_USE_CUDA) - set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension - set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) - set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_70 -g -lineinfo --ptxas-options=-O3") +if(ASGARD_USE_HIP) + # there is no hip language property in cmake yet? + #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension + #set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) + #set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_70 -g -lineinfo --ptxas-options=-O3") set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}") # Turn on GPU support in kronmult. 
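For reference, CMake 3.21 and newer ship first-class HIP language support, which is what the commented-out enable_language(HIP) line earlier in this series anticipates; it can eventually replace the FindHIP module machinery used in these patches. A minimal sketch of that route, assuming ROCm's hip-config.cmake is discoverable via CMAKE_PREFIX_PATH (the standard level and target wiring below are illustrative, not part of the patch series):

    cmake_minimum_required(VERSION 3.21)
    enable_language(HIP)                       # HIP is a first-class language in CMake >= 3.21
    set(CMAKE_HIP_STANDARD 14)

    find_package(hip REQUIRED)                 # ROCm package config; exports hip::host / hip::device

    # compile the kronmult device source as HIP without the FindHIP macros
    set_source_files_properties(src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP)
    target_link_libraries(kronmult_cuda PRIVATE hip::device)
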
@@ -320,9 +328,10 @@ else () target_link_libraries (lib_dispatch PRIVATE LINALG::LINALG) endif () -if (ASGARD_USE_CUDA) - target_link_libraries(lib_dispatch PRIVATE ${CUDA_LIBRARIES} - ${CUDA_CUBLAS_LIBRARIES}) +if (ASGARD_USE_HIP) + target_link_libraries(lib_dispatch PRIVATE ${HIP_LIBRARIES}) + target_include_directories(lib_dispatch PRIVATE ${HIP_INCLUDE_DIRS}) + target_include_directories(kronmult_cuda PRIVATE ${HIP_INCLUDE_DIRS}) endif() if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) @@ -450,7 +459,7 @@ if (ASGARD_BUILD_TESTS) target_link_libraries (${component}-tests PRIVATE ${component} MPI::MPI_CXX) if (${component} IN_LIST mpi_test_components) set(test_ranks "4") - if (ASGARD_USE_CUDA) + if (ASGARD_USE_HIP) set(test_ranks "1") endif () if (${ASGARD_TESTING_RANKS}) diff --git a/src/lib_dispatch.cpp b/src/lib_dispatch.cpp index d6dbbe1cf..0a274a87c 100644 --- a/src/lib_dispatch.cpp +++ b/src/lib_dispatch.cpp @@ -93,7 +93,7 @@ struct device_handler { device_handler() { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto success = hipblasCreate(&handle); expect(success == HIPBLAS_STATUS_SUCCESS); @@ -128,7 +128,7 @@ struct device_handler #endif } -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP hipblasHandle_t const &get_handle() const { return handle; } #endif ~device_handler() @@ -147,7 +147,7 @@ static device_handler device; void initialize_libraries(int const local_rank) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP expect(local_rank >= 0); device.set_device(local_rank); #else diff --git a/src/tensors.hpp b/src/tensors.hpp index dd27cbf07..87234e1a9 100644 --- a/src/tensors.hpp +++ b/src/tensors.hpp @@ -639,7 +639,7 @@ template inline void allocate_device(P *&ptr, int const num_elems, bool const initialize = true) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto success = hipMalloc((void **)&ptr, num_elems * sizeof(P)); assert(success == hipSuccess); if (num_elems > 0) @@ -668,7 +668,7 @@ allocate_device(P *&ptr, int const num_elems, bool const initialize = true) template inline void delete_device(P *const ptr) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto const success = hipFree(ptr); // the device runtime may be unloaded at process shut down // (when static storage duration destructors are called) @@ -745,7 +745,7 @@ copy_matrix_to_device(fk::matrix &dest, { expect(source.nrows() == dest.nrows()); expect(source.ncols() == dest.ncols()); -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), @@ -764,7 +764,7 @@ copy_matrix_to_host(fk::matrix &dest, { expect(source.nrows() == dest.nrows()); expect(source.ncols() == dest.ncols()); -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), From 4b94cc87edd56df3466b4ca9367056d4fbb29498 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Thu, 11 Mar 2021 15:37:21 -0500 Subject: [PATCH 06/36] Add initial hip platform configuration --- CMakeLists.txt | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index da414be06..d06059a14 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -136,6 +136,8 @@ if(ASGARD_USE_OPENMP) endif() endif() +set(ASGARD_PLATFORM_NVCC 0) +set(ASGARD_PLATFORM_HCC 0) if(ASGARD_USE_HIP) # find the HIP directory for the cmake files, taken from HIP samples 
if(NOT DEFINED HIP_PATH) @@ -145,6 +147,34 @@ if(ASGARD_USE_HIP) set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") endif() endif() + + find_program(ASGARD_HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin") + if(ASGARD_HIPCONFIG_PATH) + execute_process(COMMAND ${ASGARD_HIPCONFIG_PATH} --platform OUTPUT_VARIABLE ASGARD_HIP_PLATFORM) + elseif(DEFINED ENV{HIP_PLATFORM}) + set(ASGARD_HIP_PLATFORM "$ENV{HIP_PLATFORM}") + else() + message(FATAL_ERROR "Could not determine HIP platform, make sure HIP_PLATFORM is set") + endif() + + message(STATUS "HIP Platform configured for ${ASGARD_HIP_PLATFORM}") + if(ASGARD_HIP_PLATFORM STREQUAL "hcc") + set(ASGARD_PLATFORM_HCC 1) + elseif(ASGARD_HIP_PLATFORM STREQUAL "nvcc") + set(ASGARD_PLATFORM_NVCC 1) + endif() + + # double check for cuda path since HIP uses it internally + if(ASGARD_PLATFORM_NVCC) + if (NOT DEFINED ENV{CUDA_PATH}) + find_path(ASGARD_HIP_DEFAULT_CUDA_PATH "cuda.h" PATH /usr/local/cuda/include NO_DEFAULT_PATH) + if (NOT ASGARD_HIP_DEFAULT_CUDA_PATH) + message(FATAL_ERROR "Make sure the CUDA_PATH env is set to locate for HIP") + endif() + endif() + message(STATUS "Found CUDA_PATH: $ENV{CUDA_PATH}") + endif() + set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) find_package(HIP 4.0 REQUIRED) @@ -174,7 +204,7 @@ endif() add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) if(ASGARD_USE_HIP) # there is no hip language property in cmake yet? - #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension + #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # no .cu extension #set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) #set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_70 -g -lineinfo --ptxas-options=-O3") set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}") @@ -307,7 +337,11 @@ if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) target_link_libraries(kronmult PRIVATE OpenMP::OpenMP_CXX) endif () -target_link_libraries(kronmult_cuda PUBLIC kron) +if (ASGARD_USE_HIP) + target_link_libraries(kronmult_cuda PUBLIC kron hip::device) +else () + target_link_libraries(kronmult_cuda PUBLIC kron) +endif() if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) target_link_libraries(kronmult_cuda PRIVATE OpenMP::OpenMP_CXX) From e8161031e60c3f6a8c88246d39a1d3a4af290551 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 16 Mar 2021 14:29:56 -0400 Subject: [PATCH 07/36] Update cmake cuda options and add hipblas --- CMakeLists.txt | 49 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d06059a14..141f2dd1e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -148,6 +148,14 @@ if(ASGARD_USE_HIP) endif() endif() + if(NOT DEFINED HIPBLAS_PATH) + if(NOT DEFINED ENV{HIPBLAS_PATH}) + set(HIPBLAS_PATH "${HIP_PATH}/../hipblas" CACHE PATH "Path to which HIPBLAS has been installed") + else() + set(HIPBLAS_PATH $ENV{HIPBLAS_PATH} CACHE PATH "Path to which HIPBLAS has been installed") + endif() +endif() + find_program(ASGARD_HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin") if(ASGARD_HIPCONFIG_PATH) execute_process(COMMAND ${ASGARD_HIPCONFIG_PATH} --platform OUTPUT_VARIABLE ASGARD_HIP_PLATFORM) @@ -176,8 +184,10 @@ if(ASGARD_USE_HIP) endif() set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) + list(APPEND CMAKE_PREFIX_PATH "${HIPBLAS_PATH}/lib/cmake") 
find_package(HIP 4.0 REQUIRED) + find_package(hipblas REQUIRED) # Print some debug info about HIP configuration message(STATUS "HIP PLATFORM: ${HIP_PLATFORM}") message(STATUS "HIP COMPILER: ${HIP_COMPILER}") @@ -188,14 +198,16 @@ if(ASGARD_USE_HIP) message(STATUS ${HIP_INCLUDE_DIRS}) #enable_language(HIP) # no cmake support yet? - #set (CMAKE_CUDA_STANDARD 14) - #set (CMAKE_CUDA_STANDARD_REQUIRED ON) - + if(ASGARD_PLATFORM_NVCC) + enable_language(CUDA) + set (CMAKE_CUDA_STANDARD 14) + set (CMAKE_CUDA_STANDARD_REQUIRED ON) + endif() #find_package(hipBLAS) - #if (hipBLAS_FOUND) - # message(STATUS "Found rocBLAS version ${rocBLAS_VERSION}") - #endif() + if (hipBLAS_FOUND) + message(STATUS "Found rocBLAS version ${rocBLAS_VERSION}: ${HIPBLAS_INCLUDE_DIRS}") + endif() endif() @@ -203,10 +215,14 @@ endif() ##TODO: link to kronmult as interface library add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) if(ASGARD_USE_HIP) - # there is no hip language property in cmake yet? - #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # no .cu extension - #set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) - #set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_70 -g -lineinfo --ptxas-options=-O3") + if(ASGARD_PLATFORM_HCC) + set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # no .cu extension + elseif(ASGARD_PLATFORM_NVCC) + # there is no hip language property in cmake yet? + set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension + set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) + set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_70 -g -lineinfo --ptxas-options=-O3") + endif() set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}") # Turn on GPU support in kronmult. 
@@ -338,7 +354,7 @@ if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) endif () if (ASGARD_USE_HIP) - target_link_libraries(kronmult_cuda PUBLIC kron hip::device) + target_link_libraries(kronmult_cuda PUBLIC kron)# hip::device) else () target_link_libraries(kronmult_cuda PUBLIC kron) endif() @@ -363,9 +379,14 @@ else () endif () if (ASGARD_USE_HIP) - target_link_libraries(lib_dispatch PRIVATE ${HIP_LIBRARIES}) - target_include_directories(lib_dispatch PRIVATE ${HIP_INCLUDE_DIRS}) - target_include_directories(kronmult_cuda PRIVATE ${HIP_INCLUDE_DIRS}) + if(ASGARD_PLATFORM_HCC) + target_link_libraries(lib_dispatch PRIVATE hip::device) + elseif(ASGARD_PLATFORM_NVCC) + target_link_libraries(lib_dispatch PRIVATE ${CUDA_LIBRARIES}) + endif() + target_link_libraries(lib_dispatch PRIVATE roc::hipblas) + target_include_directories(lib_dispatch PUBLIC ${HIP_INCLUDE_DIRS} ${HIPBLAS_INCLUDE_DIRS}) + target_include_directories(kronmult_cuda PUBLIC ${HIP_INCLUDE_DIRS} ${HIPBLAS_INCLUDE_DIRS}) endif() if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) From 0d986487804969e25eb7de9238e02d3a5b86a514 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 26 Apr 2021 10:14:11 -0400 Subject: [PATCH 08/36] CMake adjustments and initial platform options --- CMakeLists.txt | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 141f2dd1e..5f2aed325 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -136,10 +136,11 @@ if(ASGARD_USE_OPENMP) endif() endif() +# convenience flags for which HIP platform has been setup set(ASGARD_PLATFORM_NVCC 0) set(ASGARD_PLATFORM_HCC 0) if(ASGARD_USE_HIP) - # find the HIP directory for the cmake files, taken from HIP samples + # search for HIP and libraries if(NOT DEFINED HIP_PATH) if(NOT DEFINED ENV{HIP_PATH}) set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") @@ -154,8 +155,9 @@ if(ASGARD_USE_HIP) else() set(HIPBLAS_PATH $ENV{HIPBLAS_PATH} CACHE PATH "Path to which HIPBLAS has been installed") endif() -endif() + endif() + # try to find hipconfig executable which can help detect platforms and include dirs find_program(ASGARD_HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin") if(ASGARD_HIPCONFIG_PATH) execute_process(COMMAND ${ASGARD_HIPCONFIG_PATH} --platform OUTPUT_VARIABLE ASGARD_HIP_PLATFORM) @@ -165,7 +167,7 @@ endif() message(FATAL_ERROR "Could not determine HIP platform, make sure HIP_PLATFORM is set") endif() - message(STATUS "HIP Platform configured for ${ASGARD_HIP_PLATFORM}") + message(STATUS "HIP platform has been detected as ${ASGARD_HIP_PLATFORM}") if(ASGARD_HIP_PLATFORM STREQUAL "hcc") set(ASGARD_PLATFORM_HCC 1) elseif(ASGARD_HIP_PLATFORM STREQUAL "nvcc") @@ -183,7 +185,12 @@ endif() message(STATUS "Found CUDA_PATH: $ENV{CUDA_PATH}") endif() - set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) + # look for HIP cmake configs in different locations + list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake") + if(ASGARD_PLATFORM_HCC) + # note: causes issues on nvidia, but might be needed on amd platforms? 
+ list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake") + endif() list(APPEND CMAKE_PREFIX_PATH "${HIPBLAS_PATH}/lib/cmake") find_package(HIP 4.0 REQUIRED) @@ -192,23 +199,27 @@ endif() message(STATUS "HIP PLATFORM: ${HIP_PLATFORM}") message(STATUS "HIP COMPILER: ${HIP_COMPILER}") message(STATUS "HIP RUNTIME: ${HIP_RUNTIME}") - message(STATUS "HIP Include dir: ${HIP_INCLUDE_DIRS}") + message(STATUS "HIP Includes: ${HIP_INCLUDE_DIRS}") message(STATUS "HIP Libraries: ${HIP_LIBRARIES}") #include_directories(${HIP_INCLUDE_DIRS}) - message(STATUS ${HIP_INCLUDE_DIRS}) - #enable_language(HIP) # no cmake support yet? if(ASGARD_PLATFORM_NVCC) - enable_language(CUDA) - set (CMAKE_CUDA_STANDARD 14) - set (CMAKE_CUDA_STANDARD_REQUIRED ON) + enable_language(CUDA) # TODO: check if these are necessary with HIP on nvidia? + #set (CMAKE_CUDA_STANDARD 14) + #set (CMAKE_CUDA_STANDARD_REQUIRED ON) + elseif(ASGARD_PLATFORM_HCC) + #enable_language(HIP) # not yet added to latest cmake, but should be available soon endif() - #find_package(hipBLAS) if (hipBLAS_FOUND) message(STATUS "Found rocBLAS version ${rocBLAS_VERSION}: ${HIPBLAS_INCLUDE_DIRS}") endif() + set(ASGARD_HIP_FLAGS "-std=c++14") + set(ASGARD_AMD_ARCH "gfx_906") + set(ASGARD_AMD_FLAGS "") # amdgpu specific options + set(ASGARD_NVCC_ARCH "sm_86") + set(ASGARD_NVCC_FLAGS "") # nvcc specific options endif() # build component to interface with Ed's kronmult lib @@ -216,12 +227,12 @@ endif() add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) if(ASGARD_USE_HIP) if(ASGARD_PLATFORM_HCC) - set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # no .cu extension + set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) elseif(ASGARD_PLATFORM_NVCC) # there is no hip language property in cmake yet? set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) - set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_70 -g -lineinfo --ptxas-options=-O3") + set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_86 -g -lineinfo --ptxas-options=-O3") endif() set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}") From 0a78c9e39a979129d143d72bbfe3ffdb3ee89d71 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 17 May 2021 13:18:42 -0400 Subject: [PATCH 09/36] Add in nvidia platform update for hip 4.2 --- CMakeLists.txt | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f2aed325..45861bfe5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -170,7 +170,8 @@ if(ASGARD_USE_HIP) message(STATUS "HIP platform has been detected as ${ASGARD_HIP_PLATFORM}") if(ASGARD_HIP_PLATFORM STREQUAL "hcc") set(ASGARD_PLATFORM_HCC 1) - elseif(ASGARD_HIP_PLATFORM STREQUAL "nvcc") + # hip <= 4.1 uses "nvcc" to identify nvidia platforms, >= 4.2 uses "nvidia" + elseif(ASGARD_HIP_PLATFORM STREQUAL "nvcc" OR ASGARD_HIP_PLATFORM STREQUAL "nvidia") set(ASGARD_PLATFORM_NVCC 1) endif() @@ -204,11 +205,15 @@ if(ASGARD_USE_HIP) #include_directories(${HIP_INCLUDE_DIRS}) if(ASGARD_PLATFORM_NVCC) + find_package(CUDA 9.0 REQUIRED) + include_directories(${CUDA_INCLUDE_DIRS}) enable_language(CUDA) # TODO: check if these are necessary with HIP on nvidia? 
- #set (CMAKE_CUDA_STANDARD 14) - #set (CMAKE_CUDA_STANDARD_REQUIRED ON) + set (CMAKE_CUDA_STANDARD 14) + set (CMAKE_CUDA_STANDARD_REQUIRED ON) + add_compile_definitions(__HIP_PLATFORM_NVCC__) elseif(ASGARD_PLATFORM_HCC) #enable_language(HIP) # not yet added to latest cmake, but should be available soon + add_compile_definitions(__HIP_PLATFORM_AMD__) endif() if (hipBLAS_FOUND) @@ -220,6 +225,8 @@ if(ASGARD_USE_HIP) set(ASGARD_AMD_FLAGS "") # amdgpu specific options set(ASGARD_NVCC_ARCH "sm_86") set(ASGARD_NVCC_FLAGS "") # nvcc specific options + + #hip_add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) endif() # build component to interface with Ed's kronmult lib From b397160625dc8491bb608c0ac67cdaf5579b1f23 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 17 May 2021 16:33:19 -0400 Subject: [PATCH 10/36] Update tensor linking and missed cublas calls --- CMakeLists.txt | 13 ++++++++++--- src/lib_dispatch.cpp | 44 ++++++++++++++++++++++---------------------- 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 45861bfe5..f10a4bdaf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -226,6 +226,10 @@ if(ASGARD_USE_HIP) set(ASGARD_NVCC_ARCH "sm_86") set(ASGARD_NVCC_FLAGS "") # nvcc specific options + # assume this include path since ${HIP_INCLUDE_DIRS} not being set on nvidia platform + include_directories("${HIP_PATH}/include") + include_directories(${HIPBLAS_INCLUDE_DIRS}) + #hip_add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) endif() @@ -403,8 +407,6 @@ if (ASGARD_USE_HIP) target_link_libraries(lib_dispatch PRIVATE ${CUDA_LIBRARIES}) endif() target_link_libraries(lib_dispatch PRIVATE roc::hipblas) - target_include_directories(lib_dispatch PUBLIC ${HIP_INCLUDE_DIRS} ${HIPBLAS_INCLUDE_DIRS}) - target_include_directories(kronmult_cuda PUBLIC ${HIP_INCLUDE_DIRS} ${HIPBLAS_INCLUDE_DIRS}) endif() if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) @@ -431,7 +433,12 @@ target_link_libraries (solver PRIVATE distribution fast_math lib_dispatch tensor target_link_libraries (tensors PRIVATE lib_dispatch) if (ASGARD_USE_HIP) - target_link_libraries (tensors PRIVATE ${HIP_LIBRARIES}) + if(ASGARD_PLATFORM_HCC) + target_link_libraries(tensors PRIVATE hip::device) + elseif(ASGARD_PLATFORM_NVCC) + target_link_libraries(tensors PRIVATE ${CUDA_LIBRARIES}) + endif() + target_link_libraries (tensors PRIVATE roc::hipblas) endif () if (ASGARD_USE_SCALAPACK) add_compile_definitions (ASGARD_USE_SCALAPACK) diff --git a/src/lib_dispatch.cpp b/src/lib_dispatch.cpp index 0a274a87c..3bbf840f8 100644 --- a/src/lib_dispatch.cpp +++ b/src/lib_dispatch.cpp @@ -156,7 +156,7 @@ void initialize_libraries(int const local_rank) } #ifdef ASGARD_USE_HIP -inline hipblasOperation_t cublas_trans(char trans) +inline hipblasOperation_t hipblas_trans(char trans) { if (trans == 'N' || trans == 'n') { @@ -569,15 +569,15 @@ void gemv(char const *trans, int *m, int *n, P *alpha, P *A, int *lda, P *x, if constexpr (std::is_same::value) { auto const success = - hipblasDgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha, - A, *lda, x, *incx, beta, y, *incy); + hipblasDgemv(device.get_handle(), hipblas_trans(*trans), *m, *n, + alpha, A, *lda, x, *incx, beta, y, *incy); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - hipblasSgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha, - A, *lda, x, *incx, beta, y, *incy); + hipblasSgemv(device.get_handle(), hipblas_trans(*trans), *m, *n, + alpha, A, *lda, x, *incx, 
beta, y, *incy); expect(success == 0); } return; @@ -635,15 +635,15 @@ void gemm(char const *transa, char const *transb, int *m, int *n, int *k, if constexpr (std::is_same::value) { auto const success = hipblasDgemm( - device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, - *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc); + device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb), + *m, *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = hipblasSgemm( - device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, - *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc); + device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb), + *m, *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc); expect(success == 0); } return; @@ -702,14 +702,14 @@ void getrf(int *m, int *n, P *A, int *lda, int *ipiv, int *info, // instantiated for these two fp types if constexpr (std::is_same::value) { - auto const success = cublasDgetrfBatched(device.get_handle(), *n, A_d, - *lda, ipiv, info, 1); + auto const success = hipblasDgetrfBatched(device.get_handle(), *n, A_d, + *lda, ipiv, info, 1); expect(success == 0); } else if constexpr (std::is_same::value) { - auto const success = cublasSgetrfBatched(device.get_handle(), *n, A_d, - *lda, ipiv, info, 1); + auto const success = hipblasSgetrfBatched(device.get_handle(), *n, A_d, + *lda, ipiv, info, 1); expect(success == 0); } return; @@ -755,7 +755,7 @@ void getri(int *n, P *A, int *lda, int *ipiv, P *work, int *lwork, int *info, expect(*lwork == (*n) * (*n)); ignore(lwork); - P const **A_d; + P **A_d; // hipBlas loses const to DgetriBatched P **work_d; auto stat = hipMalloc((void **)&A_d, sizeof(P *)); expect(stat == 0); @@ -770,13 +770,13 @@ void getri(int *n, P *A, int *lda, int *ipiv, P *work, int *lwork, int *info, // instantiated for these two fp types if constexpr (std::is_same::value) { - auto const success = cublasDgetriBatched( + auto const success = hipblasDgetriBatched( device.get_handle(), *n, A_d, *lda, nullptr, work_d, *n, info, 1); expect(success == 0); } else if constexpr (std::is_same::value) { - auto const success = cublasSgetriBatched( + auto const success = hipblasSgetriBatched( device.get_handle(), *n, A_d, *lda, nullptr, work_d, *n, info, 1); expect(success == 0); } @@ -850,8 +850,8 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, if constexpr (std::is_same::value) { auto const success = hipblasDgemmBatched( - device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, - *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); + device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb), + *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); auto const cuda_stat = hipDeviceSynchronize(); expect(cuda_stat == 0); expect(success == 0); @@ -859,8 +859,8 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, else if constexpr (std::is_same::value) { auto const success = hipblasSgemmBatched( - device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, - *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); + device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb), + *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); auto const cuda_stat = hipDeviceSynchronize(); expect(cuda_stat == 0); expect(success == 0); @@ -946,7 +946,7 @@ void batched_gemv(P **const &a, int *lda, char 
const *trans, P **const &x, if constexpr (std::is_same::value) { auto const success = hipblasDgemmBatched( - device.get_handle(), cublas_trans(*trans), cublas_trans(transb), + device.get_handle(), hipblas_trans(*trans), hipblas_trans(transb), gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc, *num_batch); auto const cuda_stat = hipDeviceSynchronize(); @@ -956,7 +956,7 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, else if constexpr (std::is_same::value) { auto const success = hipblasSgemmBatched( - device.get_handle(), cublas_trans(*trans), cublas_trans(transb), + device.get_handle(), hipblas_trans(*trans), hipblas_trans(transb), gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc, *num_batch); auto const cuda_stat = hipDeviceSynchronize(); From 15a9d3598973d49c5f1db7f5c172d8239dc2e46a Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Wed, 26 May 2021 15:41:07 -0400 Subject: [PATCH 11/36] Update hip compiler option variables, hip_add_library for kronmult --- CMakeLists.txt | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f10a4bdaf..3a7c16154 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -194,6 +194,16 @@ if(ASGARD_USE_HIP) endif() list(APPEND CMAKE_PREFIX_PATH "${HIPBLAS_PATH}/lib/cmake") + set(ASGARD_HIP_FLAGS "-std=c++14" CACHE STRING "HIP compiler flags for both AMD and NVIDIA") + set(ASGARD_AMD_ARCH "906" CACHE STRING "AMD GPU architecture number") + set(ASGARD_AMD_FLAGS "--amdgpu-target=gfx_${ASGARD_AMD_ARCH}" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options + set(ASGARD_NVCC_ARCH "86" CACHE STRING "Nvidia architecture number") # -gencode arch=compute_xx,code=compute_xx + set(ASGARD_NVCC_FLAGS "-gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH}" CACHE STRING "Flags to pass to NVCC") # nvcc specific options + + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") + set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") + set(HIP_NVCC_FLAGS "${HIP_NVCC_FLAGS} ${ASGARD_NVCC_FLAGS}" CACHE STRING "") + find_package(HIP 4.0 REQUIRED) find_package(hipblas REQUIRED) # Print some debug info about HIP configuration @@ -202,7 +212,6 @@ if(ASGARD_USE_HIP) message(STATUS "HIP RUNTIME: ${HIP_RUNTIME}") message(STATUS "HIP Includes: ${HIP_INCLUDE_DIRS}") message(STATUS "HIP Libraries: ${HIP_LIBRARIES}") - #include_directories(${HIP_INCLUDE_DIRS}) if(ASGARD_PLATFORM_NVCC) find_package(CUDA 9.0 REQUIRED) @@ -220,22 +229,21 @@ if(ASGARD_USE_HIP) message(STATUS "Found rocBLAS version ${rocBLAS_VERSION}: ${HIPBLAS_INCLUDE_DIRS}") endif() - set(ASGARD_HIP_FLAGS "-std=c++14") - set(ASGARD_AMD_ARCH "gfx_906") - set(ASGARD_AMD_FLAGS "") # amdgpu specific options - set(ASGARD_NVCC_ARCH "sm_86") - set(ASGARD_NVCC_FLAGS "") # nvcc specific options - - # assume this include path since ${HIP_INCLUDE_DIRS} not being set on nvidia platform + include_directories(${HIP_INCLUDE_DIRS}) + # assume this include path since HIP_INCLUDE_DIRS is not being set on nvidia platform include_directories("${HIP_PATH}/include") include_directories(${HIPBLAS_INCLUDE_DIRS}) - #hip_add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) + hip_add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS}" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") +else() + # build component to interface with Ed's 
kronmult lib + ##TODO: link to kronmult as interface library + add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) endif() -# build component to interface with Ed's kronmult lib -##TODO: link to kronmult as interface library -add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) if(ASGARD_USE_HIP) if(ASGARD_PLATFORM_HCC) set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) From b0a81a1b5c190f3e9159a325d5063fd198238e3e Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Wed, 9 Jun 2021 11:56:25 -0400 Subject: [PATCH 12/36] Update nvidia arch flag for kronmult_cuda --- CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a7c16154..74611902e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -194,6 +194,8 @@ if(ASGARD_USE_HIP) endif() list(APPEND CMAKE_PREFIX_PATH "${HIPBLAS_PATH}/lib/cmake") + set(HIP_VERBOSE_BUILD ON CACHE STRING "Verbose compilation for HIP") + set(ASGARD_HIP_FLAGS "-std=c++14" CACHE STRING "HIP compiler flags for both AMD and NVIDIA") set(ASGARD_AMD_ARCH "906" CACHE STRING "AMD GPU architecture number") set(ASGARD_AMD_FLAGS "--amdgpu-target=gfx_${ASGARD_AMD_ARCH}" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options @@ -251,7 +253,7 @@ if(ASGARD_USE_HIP) # there is no hip language property in cmake yet? set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) - set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_86 -g -lineinfo --ptxas-options=-O3") + set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_${ASGARD_NVCC_ARCH} -g -lineinfo --ptxas-options=-O3") endif() set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}") @@ -415,6 +417,7 @@ if (ASGARD_USE_HIP) target_link_libraries(lib_dispatch PRIVATE ${CUDA_LIBRARIES}) endif() target_link_libraries(lib_dispatch PRIVATE roc::hipblas) + #set_target_properties(lib_dispatch PROPERTIES LINKER_LANGUAGE HIP) endif() if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) @@ -447,6 +450,7 @@ if (ASGARD_USE_HIP) target_link_libraries(tensors PRIVATE ${CUDA_LIBRARIES}) endif() target_link_libraries (tensors PRIVATE roc::hipblas) + #set_target_properties(tensors PROPERTIES LINKER_LANGUAGE HIP) endif () if (ASGARD_USE_SCALAPACK) add_compile_definitions (ASGARD_USE_SCALAPACK) From 0889a35395e745b7b066c8ae63c2e52f41bb84cd Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Wed, 16 Jun 2021 13:17:39 -0400 Subject: [PATCH 13/36] Set hip_clang_path in cmake to find for amd platform --- CMakeLists.txt | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 74611902e..f38c5d135 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -149,6 +149,15 @@ if(ASGARD_USE_HIP) endif() endif() + # set HIP_CLANG_PATH for potential installs in non-standard locations (such as rocm with spack) + if (NOT DEFINED HIP_CLANG_PATH) + if(NOT DEFINED ENV{HIP_CLANG_PATH}) + set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin" CACHE PATH "Path to HIP clang binaries") + else() + set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH} CACHE PATH "Path to HIP clang binaries") + endif() + endif() + if(NOT DEFINED HIPBLAS_PATH) if(NOT DEFINED ENV{HIPBLAS_PATH}) set(HIPBLAS_PATH "${HIP_PATH}/../hipblas" CACHE PATH "Path to which HIPBLAS has been installed") @@ -187,10 +196,10 @@ if(ASGARD_USE_HIP) endif() # 
look for HIP cmake configs in different locations - list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake") + list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake" "${ROCM_PATH}") if(ASGARD_PLATFORM_HCC) # note: causes issues on nvidia, but might be needed on amd platforms? - list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake") + list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") endif() list(APPEND CMAKE_PREFIX_PATH "${HIPBLAS_PATH}/lib/cmake") From 3d2f4666ec679156cf1f4f22c21e2e6eb8cc9385 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Fri, 18 Jun 2021 11:40:03 -0400 Subject: [PATCH 14/36] Set the hip_clang_include_path needed for spack installs --- CMakeLists.txt | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f38c5d135..11262f95e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -153,9 +153,18 @@ if(ASGARD_USE_HIP) if (NOT DEFINED HIP_CLANG_PATH) if(NOT DEFINED ENV{HIP_CLANG_PATH}) set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin" CACHE PATH "Path to HIP clang binaries") - else() + else() set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH} CACHE PATH "Path to HIP clang binaries") - endif() + endif() + endif() + + # note: could probably grab this path directly using hipconfig? + if (NOT DEFINED HIP_CLANG_INCLUDE_PATH) + if(NOT DEFINED ENV{HIP_CLANG_INCLUDE_PATH}) + set(HIP_CLANG_INCLUDE_PATH "${HIP_CLANG_PATH}/../lib/clang/12.0.0/include" CACHE PATH "Path to HIP clang include directory") + else() + set(HIP_CLANG_INCLUDE_PATH $ENV{HIP_CLANG_INCLUDE_PATH} CACHE PATH "Path to HIP clang include directory") + endif() endif() if(NOT DEFINED HIPBLAS_PATH) From 119cbb2069ce377e13d5abd5435b2159501e84cb Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 21 Jun 2021 13:51:27 -0600 Subject: [PATCH 15/36] Update hip build for amd platform --- CMakeLists.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 11262f95e..6644787d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -186,7 +186,7 @@ if(ASGARD_USE_HIP) endif() message(STATUS "HIP platform has been detected as ${ASGARD_HIP_PLATFORM}") - if(ASGARD_HIP_PLATFORM STREQUAL "hcc") + if(ASGARD_HIP_PLATFORM STREQUAL "hcc" OR ASGARD_HIP_PLATFORM STREQUAL "amd") set(ASGARD_PLATFORM_HCC 1) # hip <= 4.1 uses "nvcc" to identify nvidia platforms, >= 4.2 uses "nvidia" elseif(ASGARD_HIP_PLATFORM STREQUAL "nvcc" OR ASGARD_HIP_PLATFORM STREQUAL "nvidia") @@ -266,9 +266,9 @@ endif() if(ASGARD_USE_HIP) if(ASGARD_PLATFORM_HCC) - set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) + #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # should work after cmake 3.21 release? + set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) elseif(ASGARD_PLATFORM_NVCC) - # there is no hip language property in cmake yet? 
set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_${ASGARD_NVCC_ARCH} -g -lineinfo --ptxas-options=-O3") @@ -280,8 +280,8 @@ if(ASGARD_USE_HIP) endif() if(ASGARD_USE_MKL) - if(ASGARD_USE_HIP) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options -fopenmp") + if(ASGARD_USE_HIP AND ASGARD_PLATFORM_NVCC) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options -fopenmp") else() target_compile_options (kronmult_cuda PRIVATE "-fopenmp") # CMAKE doesn't handle MKL openmp link properly if(APPLE) # Need to link against the same openmp library as the MKL. @@ -403,8 +403,8 @@ if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) target_link_libraries(kronmult PRIVATE OpenMP::OpenMP_CXX) endif () -if (ASGARD_USE_HIP) - target_link_libraries(kronmult_cuda PUBLIC kron)# hip::device) +if (ASGARD_USE_HIP AND ASGARD_PLATFORM_HCC) + target_link_libraries(kronmult_cuda PUBLIC kron hip::device) else () target_link_libraries(kronmult_cuda PUBLIC kron) endif() From b9002c6d1c797cedf650e45045cb2dd9cb407d32 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 28 Jun 2021 10:13:54 -0600 Subject: [PATCH 16/36] Add hcc platform def for backwards compatability --- CMakeLists.txt | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6644787d6..735f05066 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -186,6 +186,7 @@ if(ASGARD_USE_HIP) endif() message(STATUS "HIP platform has been detected as ${ASGARD_HIP_PLATFORM}") + # hip >= 4.2 is now using "amd" to identify platform if(ASGARD_HIP_PLATFORM STREQUAL "hcc" OR ASGARD_HIP_PLATFORM STREQUAL "amd") set(ASGARD_PLATFORM_HCC 1) # hip <= 4.1 uses "nvcc" to identify nvidia platforms, >= 4.2 uses "nvidia" @@ -218,7 +219,7 @@ if(ASGARD_USE_HIP) set(ASGARD_AMD_ARCH "906" CACHE STRING "AMD GPU architecture number") set(ASGARD_AMD_FLAGS "--amdgpu-target=gfx_${ASGARD_AMD_ARCH}" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options set(ASGARD_NVCC_ARCH "86" CACHE STRING "Nvidia architecture number") # -gencode arch=compute_xx,code=compute_xx - set(ASGARD_NVCC_FLAGS "-gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH}" CACHE STRING "Flags to pass to NVCC") # nvcc specific options + set(ASGARD_NVCC_FLAGS "-gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH};--ptxas-options=-O3;-lineinfo;-g" CACHE STRING "Flags to pass to NVCC") # nvcc specific options set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") @@ -236,13 +237,15 @@ if(ASGARD_USE_HIP) if(ASGARD_PLATFORM_NVCC) find_package(CUDA 9.0 REQUIRED) include_directories(${CUDA_INCLUDE_DIRS}) - enable_language(CUDA) # TODO: check if these are necessary with HIP on nvidia? 
+ enable_language(CUDA) set (CMAKE_CUDA_STANDARD 14) set (CMAKE_CUDA_STANDARD_REQUIRED ON) - add_compile_definitions(__HIP_PLATFORM_NVCC__) + add_compile_definitions(__HIP_PLATFORM_NVCC__ __HIP_PLATFORM_NVIDIA__) elseif(ASGARD_PLATFORM_HCC) - #enable_language(HIP) # not yet added to latest cmake, but should be available soon - add_compile_definitions(__HIP_PLATFORM_AMD__) + #enable_language(HIP) # not yet added to latest cmake, but should be available in 3.21 + # these compile definitions should be added automatically if using amd's clang, but + # may not necessarily be added if compiling with gcc or others + add_compile_definitions(__HIP_PLATFORM_HCC__ __HIP_PLATFORM_AMD__) endif() if (hipBLAS_FOUND) From d08af1e01f6075579149da73da076d9ea907a1eb Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 28 Jun 2021 14:28:33 -0600 Subject: [PATCH 17/36] Add temporary workaround for kronmult hip interface --- src/device/kronmult_cuda.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/device/kronmult_cuda.cpp b/src/device/kronmult_cuda.cpp index 52bf40f90..0a346268b 100644 --- a/src/device/kronmult_cuda.cpp +++ b/src/device/kronmult_cuda.cpp @@ -1,8 +1,10 @@ #include "kronmult_cuda.hpp" #include "build_info.hpp" -#include "hip/hip_runtime.h" #ifdef ASGARD_USE_HIP +// temporary workaround to use hip only on nvidia +// this can be removed once the hipified kronmult is used +#ifdef __HIP_PLATFORM_NVCC__ #include #define USE_GPU #define GLOBAL_FUNCTION __global__ @@ -11,6 +13,14 @@ #define DEVICE_FUNCTION __device__ #define HOST_FUNCTION __host__ #else +#undef ASGARD_USE_HIP +#define GLOBAL_FUNCTION +#define SYNCTHREADS +#define SHARED_MEMORY +#define DEVICE_FUNCTION +#define HOST_FUNCTION +#endif +#else #define GLOBAL_FUNCTION #define SYNCTHREADS #define SHARED_MEMORY @@ -397,6 +407,13 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[], #endif } +// note - temporary workaround for compiling kronmult on amd platforms +// this needs to be removed when hipified kronmult is used +#ifndef __HIP_PLATFORM_NVCC__ +// redefine hip flag +#define ASGARD_USE_HIP +#endif + template void stage_inputs_kronmult(float const *const x, float *const workspace, int const num_elems, int const num_copies); From 3dc539d4b854bef85699fbc8b850c82d2f8ae59d Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 28 Jun 2021 10:07:45 -0600 Subject: [PATCH 18/36] Temporarily suppress clang compiler warnings --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 735f05066..7f46c0a4c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -246,6 +246,8 @@ if(ASGARD_USE_HIP) # these compile definitions should be added automatically if using amd's clang, but # may not necessarily be added if compiling with gcc or others add_compile_definitions(__HIP_PLATFORM_HCC__ __HIP_PLATFORM_AMD__) + # temporarily disable warnings generated when using clang + add_compile_options("-Wno-dtor-name" "-Wno-gnu-anonymous-struct" "-Wno-nested-anon-types") endif() if (hipBLAS_FOUND) From ef16b563445926197093f762401b654152e06670 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 6 Jul 2021 08:26:03 -0600 Subject: [PATCH 19/36] Fix shared flags for hip platform, amd gpu target archs --- CMakeLists.txt | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f46c0a4c..9c3a86ce2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -215,16 +215,19 @@ if(ASGARD_USE_HIP) 
set(HIP_VERBOSE_BUILD ON CACHE STRING "Verbose compilation for HIP") - set(ASGARD_HIP_FLAGS "-std=c++14" CACHE STRING "HIP compiler flags for both AMD and NVIDIA") + set(ASGARD_HIP_FLAGS "-std=c++14 -g" CACHE STRING "HIP compiler flags for both AMD and NVIDIA") set(ASGARD_AMD_ARCH "906" CACHE STRING "AMD GPU architecture number") - set(ASGARD_AMD_FLAGS "--amdgpu-target=gfx_${ASGARD_AMD_ARCH}" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options + set(ASGARD_AMD_FLAGS "${ASGARD_HIP_FLAGS} --amdgpu-target=gfx${ASGARD_AMD_ARCH} -O3" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options set(ASGARD_NVCC_ARCH "86" CACHE STRING "Nvidia architecture number") # -gencode arch=compute_xx,code=compute_xx - set(ASGARD_NVCC_FLAGS "-gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH};--ptxas-options=-O3;-lineinfo;-g" CACHE STRING "Flags to pass to NVCC") # nvcc specific options + set(ASGARD_NVCC_FLAGS "${ASGARD_HIP_FLAGS} -gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH};--ptxas-options=-O3;-lineinfo" CACHE STRING "Flags to pass to NVCC") # nvcc specific options set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") set(HIP_NVCC_FLAGS "${HIP_NVCC_FLAGS} ${ASGARD_NVCC_FLAGS}" CACHE STRING "") + set(AMDGPU_TARGETS "gfx${ASGARD_AMD_ARCH}") + set(GPU_TARGETS "gfx${ASGARD_AMD_ARCH}") + find_package(HIP 4.0 REQUIRED) find_package(hipblas REQUIRED) # Print some debug info about HIP configuration From a745fbe57bf7bf39ed9f6d4973bf2a7773a80515 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 6 Jul 2021 13:31:16 -0600 Subject: [PATCH 20/36] Move kronmult source properties before add_lib --- CMakeLists.txt | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9c3a86ce2..13ae90442 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -215,11 +215,11 @@ if(ASGARD_USE_HIP) set(HIP_VERBOSE_BUILD ON CACHE STRING "Verbose compilation for HIP") - set(ASGARD_HIP_FLAGS "-std=c++14 -g" CACHE STRING "HIP compiler flags for both AMD and NVIDIA") + set(ASGARD_HIP_FLAGS "-std=c++14;-g" CACHE STRING "HIP compiler flags for both AMD and NVIDIA") set(ASGARD_AMD_ARCH "906" CACHE STRING "AMD GPU architecture number") - set(ASGARD_AMD_FLAGS "${ASGARD_HIP_FLAGS} --amdgpu-target=gfx${ASGARD_AMD_ARCH} -O3" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options + set(ASGARD_AMD_FLAGS "${ASGARD_HIP_FLAGS}; --amdgpu-target=gfx${ASGARD_AMD_ARCH};-O3" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options set(ASGARD_NVCC_ARCH "86" CACHE STRING "Nvidia architecture number") # -gencode arch=compute_xx,code=compute_xx - set(ASGARD_NVCC_FLAGS "${ASGARD_HIP_FLAGS} -gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH};--ptxas-options=-O3;-lineinfo" CACHE STRING "Flags to pass to NVCC") # nvcc specific options + set(ASGARD_NVCC_FLAGS "${ASGARD_HIP_FLAGS}; -gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH};--ptxas-options=-O3;-lineinfo" CACHE STRING "Flags to pass to NVCC") # nvcc specific options set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") @@ -262,6 +262,14 @@ if(ASGARD_USE_HIP) include_directories("${HIP_PATH}/include") include_directories(${HIPBLAS_INCLUDE_DIRS}) + # set source file 
language properties + if(ASGARD_PLATFORM_HCC) + #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # should work after cmake 3.21 release? + set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + elseif(ASGARD_PLATFORM_NVCC) + set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension + endif() + hip_add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" NVCC_OPTIONS "${ASGARD_NVCC_FLAGS}" @@ -273,11 +281,7 @@ else() endif() if(ASGARD_USE_HIP) - if(ASGARD_PLATFORM_HCC) - #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # should work after cmake 3.21 release? - set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - elseif(ASGARD_PLATFORM_NVCC) - set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension + if(ASGARD_PLATFORM_NVCC) set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_${ASGARD_NVCC_ARCH} -g -lineinfo --ptxas-options=-O3") endif() From e591cd92527fa48a7c119576979c8a1f6e472fdf Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Thu, 15 Jul 2021 08:54:11 -0600 Subject: [PATCH 21/36] Add check for empty matrix to avoid hipmemcpy2d error --- CMakeLists.txt | 2 +- src/lib_dispatch.cpp | 24 ++++++++++++------------ src/tensors.hpp | 9 +++++++++ 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 13ae90442..9aff4c384 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -219,7 +219,7 @@ if(ASGARD_USE_HIP) set(ASGARD_AMD_ARCH "906" CACHE STRING "AMD GPU architecture number") set(ASGARD_AMD_FLAGS "${ASGARD_HIP_FLAGS}; --amdgpu-target=gfx${ASGARD_AMD_ARCH};-O3" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options set(ASGARD_NVCC_ARCH "86" CACHE STRING "Nvidia architecture number") # -gencode arch=compute_xx,code=compute_xx - set(ASGARD_NVCC_FLAGS "${ASGARD_HIP_FLAGS}; -gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH};--ptxas-options=-O3;-lineinfo" CACHE STRING "Flags to pass to NVCC") # nvcc specific options + set(ASGARD_NVCC_FLAGS "${ASGARD_HIP_FLAGS}; -gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH} --ptxas-options=-O3 -lineinfo" CACHE STRING "Flags to pass to NVCC") # nvcc specific options set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") diff --git a/src/lib_dispatch.cpp b/src/lib_dispatch.cpp index 3bbf840f8..b94ee451a 100644 --- a/src/lib_dispatch.cpp +++ b/src/lib_dispatch.cpp @@ -114,14 +114,14 @@ struct device_handler if (handle) { - auto const cublas_success = hipblasDestroy(handle); - expect(cublas_success == HIPBLAS_STATUS_SUCCESS); + auto const hipblas_success = hipblasDestroy(handle); + expect(hipblas_success == HIPBLAS_STATUS_SUCCESS); } success = hipSetDevice(local_rank); expect(success == hipSuccess); - auto const cublas_success = hipblasCreate(&handle); - expect(cublas_success == HIPBLAS_STATUS_SUCCESS); + auto const hipblas_success = hipblasCreate(&handle); + expect(hipblas_success == HIPBLAS_STATUS_SUCCESS); #else ignore(local_rank); @@ -852,8 +852,8 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, auto const success = 
hipblasDgemmBatched( device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb), *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); - auto const cuda_stat = hipDeviceSynchronize(); - expect(cuda_stat == 0); + auto const hip_stat = hipDeviceSynchronize(); + expect(hip_stat == 0); expect(success == 0); } else if constexpr (std::is_same::value) @@ -861,8 +861,8 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, auto const success = hipblasSgemmBatched( device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb), *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); - auto const cuda_stat = hipDeviceSynchronize(); - expect(cuda_stat == 0); + auto const hip_stat = hipDeviceSynchronize(); + expect(hip_stat == 0); expect(success == 0); } @@ -949,8 +949,8 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, device.get_handle(), hipblas_trans(*trans), hipblas_trans(transb), gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc, *num_batch); - auto const cuda_stat = hipDeviceSynchronize(); - expect(cuda_stat == 0); + auto const hip_stat = hipDeviceSynchronize(); + expect(hip_stat == 0); expect(success == 0); } else if constexpr (std::is_same::value) @@ -959,8 +959,8 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, device.get_handle(), hipblas_trans(*trans), hipblas_trans(transb), gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc, *num_batch); - auto const cuda_stat = hipDeviceSynchronize(); - expect(cuda_stat == 0); + auto const hip_stat = hipDeviceSynchronize(); + expect(hip_stat == 0); expect(success == 0); } diff --git a/src/tensors.hpp b/src/tensors.hpp index 87234e1a9..d6bbb7986 100644 --- a/src/tensors.hpp +++ b/src/tensors.hpp @@ -727,6 +727,9 @@ copy_matrix_on_device(fk::matrix &dest, expect(source.ncols() == dest.ncols()); #ifdef ASGARD_USE_HIP + // on AMD, hipMemcpy2D will give throw an error if dpitch or spitch is 0 + if (source.stride() == 0) + return; auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), @@ -746,6 +749,9 @@ copy_matrix_to_device(fk::matrix &dest, expect(source.nrows() == dest.nrows()); expect(source.ncols() == dest.ncols()); #ifdef ASGARD_USE_HIP + // on AMD, hipMemcpy2D will give throw an error if dpitch or spitch is 0 + if (source.stride() == 0) + return; auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), @@ -765,6 +771,9 @@ copy_matrix_to_host(fk::matrix &dest, expect(source.nrows() == dest.nrows()); expect(source.ncols() == dest.ncols()); #ifdef ASGARD_USE_HIP + // on AMD, hipMemcpy2D will give throw an error if dpitch or spitch is 0 + if (source.stride() == 0) + return; auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), From 770a040d4584e460a3ec038e62787be9f11a397e Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 20 Jul 2021 13:49:07 -0600 Subject: [PATCH 22/36] Fix hardcoded amd clang version and update target flags --- CMakeLists.txt | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9aff4c384..a29d7e3ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,7 +78,7 @@ option (ASGARD_PROFILE_PERF "enable profiling support for using linux perf" "") 
option (ASGARD_PROFILE_VALGRIND "enable profiling support for using valgrind" "") option (ASGARD_GRAPHVIZ_PATH "optional location of bin/ containing dot executable" "") option (ASGARD_IO_HIGHFIVE "Use the HighFive HDF5 header library for I/O" OFF) -option (ASGARD_USE_HIP "Optional HIP support for asgard" ON) +option (ASGARD_USE_HIP "Optional HIP support for asgard" OFF) option (ASGARD_USE_OPENMP "Optional openMP support for asgard" ON) option (ASGARD_USE_MPI "Optional distributed computing support for asgard" OFF) include(CMakeDependentOption) @@ -161,7 +161,8 @@ if(ASGARD_USE_HIP) # note: could probably grab this path directly using hipconfig? if (NOT DEFINED HIP_CLANG_INCLUDE_PATH) if(NOT DEFINED ENV{HIP_CLANG_INCLUDE_PATH}) - set(HIP_CLANG_INCLUDE_PATH "${HIP_CLANG_PATH}/../lib/clang/12.0.0/include" CACHE PATH "Path to HIP clang include directory") + # probably need a better way to get the compiler version.. this will cause non-existent paths for non-clang compilers + set(HIP_CLANG_INCLUDE_PATH "${HIP_CLANG_PATH}/../lib/clang/${CMAKE_CXX_COMPILER_VERSION}/include" CACHE PATH "Path to HIP clang include directory") else() set(HIP_CLANG_INCLUDE_PATH $ENV{HIP_CLANG_INCLUDE_PATH} CACHE PATH "Path to HIP clang include directory") endif() @@ -210,6 +211,11 @@ if(ASGARD_USE_HIP) if(ASGARD_PLATFORM_HCC) # note: causes issues on nvidia, but might be needed on amd platforms? list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") + + # output a warning if compiling for AMD without using amd-clang + if(NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") + message(WARNING "Compiling HIP for AMD without using AMD clang might not work. Use -DCMAKE_CXX_COMPILER=clang++") + endif() endif() list(APPEND CMAKE_PREFIX_PATH "${HIPBLAS_PATH}/lib/cmake") @@ -225,8 +231,8 @@ if(ASGARD_USE_HIP) set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") set(HIP_NVCC_FLAGS "${HIP_NVCC_FLAGS} ${ASGARD_NVCC_FLAGS}" CACHE STRING "") - set(AMDGPU_TARGETS "gfx${ASGARD_AMD_ARCH}") - set(GPU_TARGETS "gfx${ASGARD_AMD_ARCH}") + set(AMDGPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") + set(GPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") find_package(HIP 4.0 REQUIRED) find_package(hipblas REQUIRED) @@ -249,8 +255,6 @@ if(ASGARD_USE_HIP) # these compile definitions should be added automatically if using amd's clang, but # may not necessarily be added if compiling with gcc or others add_compile_definitions(__HIP_PLATFORM_HCC__ __HIP_PLATFORM_AMD__) - # temporarily disable warnings generated when using clang - add_compile_options("-Wno-dtor-name" "-Wno-gnu-anonymous-struct" "-Wno-nested-anon-types") endif() if (hipBLAS_FOUND) From c318c8926ce4dfd89b722428cd915286f29c1dd2 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Wed, 21 Jul 2021 11:07:41 -0600 Subject: [PATCH 23/36] Modify include dirs to hide deprecated cuda messages --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a29d7e3ee..ba7c16b1c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -261,9 +261,9 @@ if(ASGARD_USE_HIP) message(STATUS "Found rocBLAS version ${rocBLAS_VERSION}: ${HIPBLAS_INCLUDE_DIRS}") endif() - include_directories(${HIP_INCLUDE_DIRS}) + include_directories(SYSTEM ${HIP_INCLUDE_DIRS}) # assume this include path since HIP_INCLUDE_DIRS is not being set on nvidia platform - include_directories("${HIP_PATH}/include") + include_directories(SYSTEM "${HIP_PATH}/include") 
include_directories(${HIPBLAS_INCLUDE_DIRS}) # set source file language properties From cf7c04a7f69508cea896a3cd87cbb80fc3f922c5 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Wed, 21 Jul 2021 12:21:13 -0600 Subject: [PATCH 24/36] Update device test tol for batch tests --- src/batch_tests.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/batch_tests.cpp b/src/batch_tests.cpp index 757b3d4e8..4739b0f79 100644 --- a/src/batch_tests.cpp +++ b/src/batch_tests.cpp @@ -864,7 +864,11 @@ void test_batched_gemv(int const m, int const n, int const lda, batched_gemv(a_batch, x_batch, y_batch, alpha, beta); - P const tol_factor = 1e-17; + P tol_factor = 1e-17; + if constexpr (resrc == resource::device) + { + tol_factor = 1e-7; + } for (int i = 0; i < num_batch; ++i) { if constexpr (resrc == resource::host) From d5f34cab7999286aa013cc1383d1256c4045af0b Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Thu, 29 Jul 2021 14:07:29 -0600 Subject: [PATCH 25/36] Fix clang formatting issues --- src/tensors.hpp | 76 ++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/src/tensors.hpp b/src/tensors.hpp index d6bbb7986..0fff81f60 100644 --- a/src/tensors.hpp +++ b/src/tensors.hpp @@ -240,10 +240,10 @@ class vector template> vector

   operator-(vector const &right) const;
   template>
-  P operator*(vector const &) const;
+  P operator*(vector const &)const;
   template>
-  vector operator*(matrix const &) const;
+  vector operator*(matrix const &)const;
   template>
   vector operator*(P const) const;
@@ -475,9 +475,9 @@ class matrix
   template>
   matrix operator*(P const) const;
   template>
-  vector operator*(vector const &) const;
+  vector operator*(vector const &)const;
   template>
-  matrix operator*(matrix const &) const;
+  matrix operator*(matrix const &)const;
   template>
   matrix
operator+(matrix const &) const; template> @@ -1005,8 +1005,8 @@ fk::vector::vector(vector const &a) // http://stackoverflow.com/questions/3279543/what-is-the-copy-and-swap-idiom // template -fk::vector & -fk::vector::operator=(vector const &a) +fk::vector &fk::vector:: +operator=(vector const &a) { static_assert(mem != mem_type::const_view, "cannot copy assign into const_view!"); @@ -1051,8 +1051,8 @@ fk::vector::vector(vector &&a) // vector move assignment // template -fk::vector & -fk::vector::operator=(vector &&a) +fk::vector &fk::vector:: +operator=(vector &&a) { static_assert(mem != mem_type::const_view, "cannot move assign into const_view!"); @@ -1098,8 +1098,8 @@ fk::vector::vector(vector const &a) // template template -fk::vector & -fk::vector::operator=(vector const &a) +fk::vector &fk::vector:: +operator=(vector const &a) { expect(size() == a.size()); @@ -1133,8 +1133,8 @@ fk::vector::vector(vector const &a) // assignment owner <-> view template template -fk::vector & -fk::vector::operator=(vector const &a) +fk::vector &fk::vector:: +operator=(vector const &a) { expect(size() == a.size()); if constexpr (resrc == resource::host) @@ -1200,8 +1200,8 @@ fk::vector &fk::vector::transfer_from( // template template -fk::vector & -fk::vector::operator=(std::vector

const &v)
+fk::vector &fk::vector::
+operator=(std::vector const &v)
 {
   expect(size() == static_cast(v.size()));
   std::memcpy(data_, v.data(), v.size() * sizeof(P));
@@ -1287,8 +1287,8 @@ bool fk::vector::operator<(vector const &other) const
 //
 template
 template
-fk::vector
-fk::vector::operator+(vector const &right) const
+fk::vector fk::vector::
+operator+(vector const &right) const
 {
   expect(size() == right.size());
   vector ans(size());
@@ -1302,8 +1302,8 @@ fk::vector::operator+(vector const &right) const
 //
 template
 template
-fk::vector
-fk::vector::operator-(vector const &right) const
+fk::vector fk::vector::
+operator-(vector const &right) const
 {
   expect(size() == right.size());
   vector ans(size());
@@ -1332,8 +1332,8 @@ P fk::vector::operator*(vector const &right) const
 //
 template
 template
-fk::vector
-fk::vector::operator*(fk::matrix const &A) const
+fk::vector
fk::vector:: +operator*(fk::matrix const &A) const { // check dimension compatibility expect(size() == A.nrows()); @@ -1789,8 +1789,8 @@ fk::matrix::matrix(matrix const &a) // http://stackoverflow.com/questions/3279543/what-is-the-copy-and-swap-idiom // template -fk::matrix & -fk::matrix::operator=(matrix const &a) +fk::matrix &fk::matrix:: +operator=(matrix const &a) { static_assert(mem != mem_type::const_view, "cannot copy assign into const_view!"); @@ -1843,8 +1843,8 @@ fk::matrix::matrix(matrix const &a) // assignment owner <-> view template template -fk::matrix & -fk::matrix::operator=(matrix const &a) +fk::matrix &fk::matrix:: +operator=(matrix const &a) { expect(nrows() == a.nrows()); expect(ncols() == a.ncols()); @@ -1883,8 +1883,8 @@ fk::matrix::matrix(matrix const &a) // template template -fk::matrix & -fk::matrix::operator=(matrix const &a) +fk::matrix &fk::matrix:: +operator=(matrix const &a) { expect((nrows() == a.nrows()) && (ncols() == a.ncols())); @@ -1975,8 +1975,8 @@ fk::matrix::matrix(matrix &&a) // matrix move assignment // template -fk::matrix & -fk::matrix::operator=(matrix &&a) +fk::matrix &fk::matrix:: +operator=(matrix &&a) { static_assert(mem != mem_type::const_view, "cannot move assign into const_view!"); @@ -2007,8 +2007,8 @@ fk::matrix::operator=(matrix &&a) // template template -fk::matrix & -fk::matrix::operator=(fk::vector const &v) +fk::matrix &fk::matrix:: +operator=(fk::vector const &v) { expect(nrows() * ncols() == v.size()); @@ -2094,8 +2094,8 @@ bool fk::matrix::operator<(matrix const &other) const // template template -fk::matrix

-fk::matrix::operator+(matrix const &right) const
+fk::matrix fk::matrix::
+operator+(matrix const &right) const
 {
   expect(nrows() == right.nrows() && ncols() == right.ncols());
@@ -2115,8 +2115,8 @@ fk::matrix::operator+(matrix const &right) const
 //
 template
 template
-fk::matrix
-fk::matrix::operator-(matrix const &right) const
+fk::matrix fk::matrix::
+operator-(matrix const &right) const
 {
   expect(nrows() == right.nrows() && ncols() == right.ncols());
@@ -2154,8 +2154,8 @@ fk::matrix fk::matrix::operator*(P const right) const
 //
 template
 template
-fk::vector
-fk::matrix::operator*(fk::vector const &right) const
+fk::vector fk::matrix::
+operator*(fk::vector const &right) const
 {
   // check dimension compatibility
   expect(ncols() == right.size());
@@ -2181,8 +2181,8 @@ fk::matrix::operator*(fk::vector const &right) const
 //
 template
 template
-fk::matrix
-fk::matrix::operator*(matrix const &B) const
+fk::matrix
fk::matrix:: +operator*(matrix const &B) const { expect(ncols() == B.nrows()); // k == k From 8c69e5702f813099c2e1b1157d4dd4bf069021af Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Wed, 1 Sep 2021 11:48:44 -0600 Subject: [PATCH 26/36] Re-enable kronmult for amd platforms --- src/device/kronmult_cuda.cpp | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/src/device/kronmult_cuda.cpp b/src/device/kronmult_cuda.cpp index 0a346268b..fe6573a2c 100644 --- a/src/device/kronmult_cuda.cpp +++ b/src/device/kronmult_cuda.cpp @@ -2,9 +2,6 @@ #include "build_info.hpp" #ifdef ASGARD_USE_HIP -// temporary workaround to use hip only on nvidia -// this can be removed once the hipified kronmult is used -#ifdef __HIP_PLATFORM_NVCC__ #include #define USE_GPU #define GLOBAL_FUNCTION __global__ @@ -13,14 +10,6 @@ #define DEVICE_FUNCTION __device__ #define HOST_FUNCTION __host__ #else -#undef ASGARD_USE_HIP -#define GLOBAL_FUNCTION -#define SYNCTHREADS -#define SHARED_MEMORY -#define DEVICE_FUNCTION -#define HOST_FUNCTION -#endif -#else #define GLOBAL_FUNCTION #define SYNCTHREADS #define SHARED_MEMORY @@ -407,13 +396,6 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[], #endif } -// note - temporary workaround for compiling kronmult on amd platforms -// this needs to be removed when hipified kronmult is used -#ifndef __HIP_PLATFORM_NVCC__ -// redefine hip flag -#define ASGARD_USE_HIP -#endif - template void stage_inputs_kronmult(float const *const x, float *const workspace, int const num_elems, int const num_copies); From 49f45dca4a7009eae3a3ca1066236617474c40f0 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Wed, 15 Sep 2021 11:22:17 -0400 Subject: [PATCH 27/36] Rename hip platform from hcc to amd --- CMakeLists.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ba7c16b1c..7d08981ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -138,7 +138,7 @@ endif() # convenience flags for which HIP platform has been setup set(ASGARD_PLATFORM_NVCC 0) -set(ASGARD_PLATFORM_HCC 0) +set(ASGARD_PLATFORM_AMD 0) if(ASGARD_USE_HIP) # search for HIP and libraries if(NOT DEFINED HIP_PATH) @@ -189,7 +189,7 @@ if(ASGARD_USE_HIP) message(STATUS "HIP platform has been detected as ${ASGARD_HIP_PLATFORM}") # hip >= 4.2 is now using "amd" to identify platform if(ASGARD_HIP_PLATFORM STREQUAL "hcc" OR ASGARD_HIP_PLATFORM STREQUAL "amd") - set(ASGARD_PLATFORM_HCC 1) + set(ASGARD_PLATFORM_AMD 1) # hip <= 4.1 uses "nvcc" to identify nvidia platforms, >= 4.2 uses "nvidia" elseif(ASGARD_HIP_PLATFORM STREQUAL "nvcc" OR ASGARD_HIP_PLATFORM STREQUAL "nvidia") set(ASGARD_PLATFORM_NVCC 1) @@ -208,7 +208,7 @@ if(ASGARD_USE_HIP) # look for HIP cmake configs in different locations list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake" "${ROCM_PATH}") - if(ASGARD_PLATFORM_HCC) + if(ASGARD_PLATFORM_AMD) # note: causes issues on nvidia, but might be needed on amd platforms? 
list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") @@ -250,7 +250,7 @@ if(ASGARD_USE_HIP) set (CMAKE_CUDA_STANDARD 14) set (CMAKE_CUDA_STANDARD_REQUIRED ON) add_compile_definitions(__HIP_PLATFORM_NVCC__ __HIP_PLATFORM_NVIDIA__) - elseif(ASGARD_PLATFORM_HCC) + elseif(ASGARD_PLATFORM_AMD) #enable_language(HIP) # not yet added to latest cmake, but should be available in 3.21 # these compile definitions should be added automatically if using amd's clang, but # may not necessarily be added if compiling with gcc or others @@ -267,7 +267,7 @@ if(ASGARD_USE_HIP) include_directories(${HIPBLAS_INCLUDE_DIRS}) # set source file language properties - if(ASGARD_PLATFORM_HCC) + if(ASGARD_PLATFORM_AMD) #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # should work after cmake 3.21 release? set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) elseif(ASGARD_PLATFORM_NVCC) @@ -419,7 +419,7 @@ if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) target_link_libraries(kronmult PRIVATE OpenMP::OpenMP_CXX) endif () -if (ASGARD_USE_HIP AND ASGARD_PLATFORM_HCC) +if (ASGARD_USE_HIP AND ASGARD_PLATFORM_AMD) target_link_libraries(kronmult_cuda PUBLIC kron hip::device) else () target_link_libraries(kronmult_cuda PUBLIC kron) @@ -445,7 +445,7 @@ else () endif () if (ASGARD_USE_HIP) - if(ASGARD_PLATFORM_HCC) + if(ASGARD_PLATFORM_AMD) target_link_libraries(lib_dispatch PRIVATE hip::device) elseif(ASGARD_PLATFORM_NVCC) target_link_libraries(lib_dispatch PRIVATE ${CUDA_LIBRARIES}) @@ -478,7 +478,7 @@ target_link_libraries (solver PRIVATE distribution fast_math lib_dispatch tensor target_link_libraries (tensors PRIVATE lib_dispatch) if (ASGARD_USE_HIP) - if(ASGARD_PLATFORM_HCC) + if(ASGARD_PLATFORM_AMD) target_link_libraries(tensors PRIVATE hip::device) elseif(ASGARD_PLATFORM_NVCC) target_link_libraries(tensors PRIVATE ${CUDA_LIBRARIES}) From 8cd9e452a09424511b833c8084ff60b27aea6191 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 5 Oct 2021 16:13:56 -0400 Subject: [PATCH 28/36] Adjust lib dispatch device test tol for amd --- src/lib_dispatch_tests.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/lib_dispatch_tests.cpp b/src/lib_dispatch_tests.cpp index a6095cec0..6ffa36aaf 100644 --- a/src/lib_dispatch_tests.cpp +++ b/src/lib_dispatch_tests.cpp @@ -1179,7 +1179,11 @@ TEMPLATE_TEST_CASE_SIG("batched gemv", "[lib_dispatch]", (double, resource::host), (double, resource::device), (float, resource::host), (float, resource::device)) { - TestType const tol_factor = 1e-18; + TestType tol_factor = 1e-18; + if constexpr (resrc == resource::device) + { + tol_factor = 1e-8; + } SECTION("batched gemv: no trans, alpha = 1.0, beta = 0.0") { From 2de03a3bf848f2608aaff475e3f54783e49f7ede Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Fri, 8 Oct 2021 10:29:41 -0400 Subject: [PATCH 29/36] Clean up cmake, add hipblas version check for amd --- CMakeLists.txt | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7d08981ea..02bf50b8e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required (VERSION 3.19) +cmake_minimum_required (VERSION 3.21) project (asgard VERSION 0.3.0 @@ -198,7 +198,7 @@ if(ASGARD_USE_HIP) # double check for cuda path since HIP uses it internally if(ASGARD_PLATFORM_NVCC) if (NOT DEFINED ENV{CUDA_PATH}) - find_path(ASGARD_HIP_DEFAULT_CUDA_PATH "cuda.h" PATH /usr/local/cuda/include 
NO_DEFAULT_PATH) + find_path(ASGARD_HIP_DEFAULT_CUDA_PATH "cuda.h" PATH /usr/local/cuda/include) if (NOT ASGARD_HIP_DEFAULT_CUDA_PATH) message(FATAL_ERROR "Make sure the CUDA_PATH env is set to locate for HIP") endif() @@ -224,7 +224,7 @@ if(ASGARD_USE_HIP) set(ASGARD_HIP_FLAGS "-std=c++14;-g" CACHE STRING "HIP compiler flags for both AMD and NVIDIA") set(ASGARD_AMD_ARCH "906" CACHE STRING "AMD GPU architecture number") set(ASGARD_AMD_FLAGS "${ASGARD_HIP_FLAGS}; --amdgpu-target=gfx${ASGARD_AMD_ARCH};-O3" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options - set(ASGARD_NVCC_ARCH "86" CACHE STRING "Nvidia architecture number") # -gencode arch=compute_xx,code=compute_xx + set(ASGARD_NVCC_ARCH "70" CACHE STRING "Nvidia architecture number") # -gencode arch=compute_xx,code=compute_xx set(ASGARD_NVCC_FLAGS "${ASGARD_HIP_FLAGS}; -gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH} --ptxas-options=-O3 -lineinfo" CACHE STRING "Flags to pass to NVCC") # nvcc specific options set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") @@ -234,8 +234,14 @@ if(ASGARD_USE_HIP) set(AMDGPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") set(GPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") - find_package(HIP 4.0 REQUIRED) - find_package(hipblas REQUIRED) + if(ASGARD_PLATFORM_AMD) + # need a much later version for AMD since ipiv=nullptr fix not in hipblas until >4.3.1 + find_package(HIP 4.3.0 REQUIRED) + find_package(hipblas 0.49 REQUIRED) + else() + find_package(HIP 4.0 REQUIRED) + find_package(hipblas REQUIRED) + endif() # Print some debug info about HIP configuration message(STATUS "HIP PLATFORM: ${HIP_PLATFORM}") message(STATUS "HIP COMPILER: ${HIP_COMPILER}") @@ -451,7 +457,6 @@ if (ASGARD_USE_HIP) target_link_libraries(lib_dispatch PRIVATE ${CUDA_LIBRARIES}) endif() target_link_libraries(lib_dispatch PRIVATE roc::hipblas) - #set_target_properties(lib_dispatch PROPERTIES LINKER_LANGUAGE HIP) endif() if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) @@ -484,7 +489,6 @@ if (ASGARD_USE_HIP) target_link_libraries(tensors PRIVATE ${CUDA_LIBRARIES}) endif() target_link_libraries (tensors PRIVATE roc::hipblas) - #set_target_properties(tensors PROPERTIES LINKER_LANGUAGE HIP) endif () if (ASGARD_USE_SCALAPACK) add_compile_definitions (ASGARD_USE_SCALAPACK) From c5ed1fd43587d49b0811e849398e4167b7fd5b6f Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 12 Oct 2021 13:48:16 -0400 Subject: [PATCH 30/36] Pass gpu arch to kronmult, set amd flags only on amd --- CMakeLists.txt | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 02bf50b8e..3683d7c13 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -231,10 +231,10 @@ if(ASGARD_USE_HIP) set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") set(HIP_NVCC_FLAGS "${HIP_NVCC_FLAGS} ${ASGARD_NVCC_FLAGS}" CACHE STRING "") - set(AMDGPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") - set(GPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") - if(ASGARD_PLATFORM_AMD) + set(AMDGPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") + set(GPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") + # need a much later version for AMD since ipiv=nullptr fix not in hipblas until >4.3.1 
find_package(HIP 4.3.0 REQUIRED) find_package(hipblas 0.49 REQUIRED) @@ -299,6 +299,13 @@ if(ASGARD_USE_HIP) # Turn on GPU support in kronmult. set (USE_GPU ON CACHE BOOL "Turn on kronmult gpu support" FORCE) + + # pass gpu arch code to kronmult + if(ASGARD_PLATFORM_NVCC) + set(GPU_ARCH "${ASGARD_NVCC_ARCH}" CACHE STRING "GPU architecture code for AMD/NVIDIA" FORCE) + else() + set(GPU_ARCH "${ASGARD_AMD_ARCH}" CACHE STRING "GPU architecture code for AMD/NVIDIA" FORCE) + endif() endif() if(ASGARD_USE_MKL) From c79997b6a0f335bcda3e85a27fb1891717e9b5a8 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Thu, 28 Oct 2021 14:37:53 -0400 Subject: [PATCH 31/36] Change kronmult fetch content to after configuration --- CMakeLists.txt | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3683d7c13..f11cb87b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ include (FetchContent) find_package (Git) # Define a macro to register new projects. -function (register_project name dir url default_tag) +function (register_project name dir url default_tag make_avail) set (BUILD_TAG_${dir} ${default_tag} CACHE STRING "Name of the tag to checkout.") set (BUILD_REPO_${dir} ${url} CACHE STRING "URL of the repo to clone.") @@ -24,7 +24,9 @@ function (register_project name dir url default_tag) SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/contrib/${dir} ) - FetchContent_MakeAvailable(${name}) + if (${make_avail}) + FetchContent_MakeAvailable(${name}) + endif() endfunction () # Changes to the current version of kromult should proceed through a pull @@ -33,6 +35,7 @@ register_project (kronmult KRONMULT https://github.com/project-asgard/kronmult.git f941819685bbd3026a85145dde286f593683c1f4 + OFF ) ############################################################################### @@ -280,23 +283,6 @@ if(ASGARD_USE_HIP) set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension endif() - hip_add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp - HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" - NVCC_OPTIONS "${ASGARD_NVCC_FLAGS}" - CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") -else() - # build component to interface with Ed's kronmult lib - ##TODO: link to kronmult as interface library - add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) -endif() - -if(ASGARD_USE_HIP) - if(ASGARD_PLATFORM_NVCC) - set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) - set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_${ASGARD_NVCC_ARCH} -g -lineinfo --ptxas-options=-O3") - endif() - set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}") - # Turn on GPU support in kronmult. 
set (USE_GPU ON CACHE BOOL "Turn on kronmult gpu support" FORCE) @@ -308,6 +294,25 @@ if(ASGARD_USE_HIP) endif() endif() +# Fetch kronmult after configuring everything, but before adding libraries +FetchContent_MakeAvailable(kronmult) + +if(ASGARD_USE_HIP) + hip_add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS}" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + if(ASGARD_PLATFORM_NVCC) + set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) + set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_${ASGARD_NVCC_ARCH} -g -lineinfo --ptxas-options=-O3") + endif() + set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}") +else() + # build component to interface with Ed's kronmult lib + ##TODO: link to kronmult as interface library + add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) +endif() + if(ASGARD_USE_MKL) if(ASGARD_USE_HIP AND ASGARD_PLATFORM_NVCC) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options -fopenmp") From 91b0626a24fd77dcf869c40a9421c14eb2c65ac9 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Thu, 28 Oct 2021 16:15:25 -0400 Subject: [PATCH 32/36] Fix kron linking on nvidia --- CMakeLists.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f11cb87b2..9844ad117 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -395,9 +395,7 @@ if (build_hdf5) add_dependencies (io hdf5-ext) endif () -if (build_kron) - add_dependencies (kronmult_cuda kronmult-ext) -endif () +add_dependencies (kronmult_cuda kron) if (ASGARD_USE_SCALAPACK) target_link_libraries (tensors PRIVATE scalapack_matrix_info cblacs_grid) @@ -440,7 +438,7 @@ endif () if (ASGARD_USE_HIP AND ASGARD_PLATFORM_AMD) target_link_libraries(kronmult_cuda PUBLIC kron hip::device) else () - target_link_libraries(kronmult_cuda PUBLIC kron) + target_link_libraries(kronmult_cuda PRIVATE kron) endif() if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) From acf6305daea8cf314ae3a99a349e3962ccc94870 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 1 Nov 2021 16:40:26 -0400 Subject: [PATCH 33/36] Consolidate gpu arch flags and reorder hip flags --- CMakeLists.txt | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9844ad117..53aa08939 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -225,18 +225,11 @@ if(ASGARD_USE_HIP) set(HIP_VERBOSE_BUILD ON CACHE STRING "Verbose compilation for HIP") set(ASGARD_HIP_FLAGS "-std=c++14;-g" CACHE STRING "HIP compiler flags for both AMD and NVIDIA") - set(ASGARD_AMD_ARCH "906" CACHE STRING "AMD GPU architecture number") - set(ASGARD_AMD_FLAGS "${ASGARD_HIP_FLAGS}; --amdgpu-target=gfx${ASGARD_AMD_ARCH};-O3" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options - set(ASGARD_NVCC_ARCH "70" CACHE STRING "Nvidia architecture number") # -gencode arch=compute_xx,code=compute_xx - set(ASGARD_NVCC_FLAGS "${ASGARD_HIP_FLAGS}; -gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH} --ptxas-options=-O3 -lineinfo" CACHE STRING "Flags to pass to NVCC") # nvcc specific options - - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") - set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") - set(HIP_NVCC_FLAGS "${HIP_NVCC_FLAGS} ${ASGARD_NVCC_FLAGS}" CACHE STRING "") + set(GPU_ARCH "70" CACHE STRING "AMD/NVIDIA GPU 
architecture number (such as 906 or 70)") if(ASGARD_PLATFORM_AMD) - set(AMDGPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") - set(GPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") + set(AMDGPU_TARGETS "gfx${GPU_ARCH}" CACHE STRING "GPU target architectures to compile for" FORCE) + set(GPU_TARGETS "gfx${GPU_ARCH}" CACHE STRING "GPU target architectures to compile for" FORCE) # need a much later version for AMD since ipiv=nullptr fix not in hipblas until >4.3.1 find_package(HIP 4.3.0 REQUIRED) @@ -259,11 +252,13 @@ if(ASGARD_USE_HIP) set (CMAKE_CUDA_STANDARD 14) set (CMAKE_CUDA_STANDARD_REQUIRED ON) add_compile_definitions(__HIP_PLATFORM_NVCC__ __HIP_PLATFORM_NVIDIA__) + set(ASGARD_NVCC_FLAGS "${ASGARD_HIP_FLAGS}; -gencode arch=compute_${GPU_ARCH},code=compute_${GPU_ARCH} --ptxas-options=-O3 -lineinfo" CACHE STRING "Flags to pass to NVCC" FORCE) # nvcc specific options elseif(ASGARD_PLATFORM_AMD) #enable_language(HIP) # not yet added to latest cmake, but should be available in 3.21 # these compile definitions should be added automatically if using amd's clang, but # may not necessarily be added if compiling with gcc or others add_compile_definitions(__HIP_PLATFORM_HCC__ __HIP_PLATFORM_AMD__) + set(ASGARD_AMD_FLAGS "${ASGARD_HIP_FLAGS}; --amdgpu-target=gfx${GPU_ARCH};-O3" CACHE STRING "Flags to pass to amd-clang for HIP" FORCE) # amdgpu specific options endif() if (hipBLAS_FOUND) @@ -285,26 +280,23 @@ if(ASGARD_USE_HIP) # Turn on GPU support in kronmult. set (USE_GPU ON CACHE BOOL "Turn on kronmult gpu support" FORCE) - - # pass gpu arch code to kronmult - if(ASGARD_PLATFORM_NVCC) - set(GPU_ARCH "${ASGARD_NVCC_ARCH}" CACHE STRING "GPU architecture code for AMD/NVIDIA" FORCE) - else() - set(GPU_ARCH "${ASGARD_AMD_ARCH}" CACHE STRING "GPU architecture code for AMD/NVIDIA" FORCE) - endif() endif() # Fetch kronmult after configuring everything, but before adding libraries FetchContent_MakeAvailable(kronmult) if(ASGARD_USE_HIP) + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") + set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") + set(HIP_NVCC_FLAGS "${HIP_NVCC_FLAGS} ${ASGARD_NVCC_FLAGS}" CACHE STRING "") + hip_add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" NVCC_OPTIONS "${ASGARD_NVCC_FLAGS}" CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") if(ASGARD_PLATFORM_NVCC) set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) - set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_${ASGARD_NVCC_ARCH} -g -lineinfo --ptxas-options=-O3") + set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_${GPU_ARCH} -g -lineinfo --ptxas-options=-O3") endif() set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}") else() From 37ebc252fafa6ecaeccead2f98324dca0e133aa5 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 2 Nov 2021 11:59:41 -0400 Subject: [PATCH 34/36] Add new register project arg when building openblas --- contrib/FindLINALG.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/FindLINALG.cmake b/contrib/FindLINALG.cmake index b92d94607..7f1f06af4 100644 --- a/contrib/FindLINALG.cmake +++ b/contrib/FindLINALG.cmake @@ -54,6 +54,7 @@ if (${ASGARD_BUILD_OPENBLAS}) OPENBLAS https://github.com/xianyi/OpenBLAS.git v0.3.18 + ON ) # Fetch content does not run the install phase so the headers for openblas are From 
8909eab6b283369037db47230d217130c459fe04 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 9 Nov 2021 12:03:24 -0500 Subject: [PATCH 35/36] Decrease batched gemv test tol for amd gpu --- src/lib_dispatch_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib_dispatch_tests.cpp b/src/lib_dispatch_tests.cpp index 6ffa36aaf..8c0eb3ca9 100644 --- a/src/lib_dispatch_tests.cpp +++ b/src/lib_dispatch_tests.cpp @@ -1182,7 +1182,7 @@ TEMPLATE_TEST_CASE_SIG("batched gemv", "[lib_dispatch]", TestType tol_factor = 1e-18; if constexpr (resrc == resource::device) { - tol_factor = 1e-8; + tol_factor = 1e-7; } SECTION("batched gemv: no trans, alpha = 1.0, beta = 0.0") From ef3c11b8ad00c3fb11fa4eff98ddb74532870211 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 23 Nov 2021 11:01:46 -0500 Subject: [PATCH 36/36] Change CMake HIP linking --- CMakeLists.txt | 98 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 53aa08939..ff5d239d5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -246,8 +246,6 @@ if(ASGARD_USE_HIP) message(STATUS "HIP Libraries: ${HIP_LIBRARIES}") if(ASGARD_PLATFORM_NVCC) - find_package(CUDA 9.0 REQUIRED) - include_directories(${CUDA_INCLUDE_DIRS}) enable_language(CUDA) set (CMAKE_CUDA_STANDARD 14) set (CMAKE_CUDA_STANDARD_REQUIRED ON) @@ -265,11 +263,6 @@ if(ASGARD_USE_HIP) message(STATUS "Found rocBLAS version ${rocBLAS_VERSION}: ${HIPBLAS_INCLUDE_DIRS}") endif() - include_directories(SYSTEM ${HIP_INCLUDE_DIRS}) - # assume this include path since HIP_INCLUDE_DIRS is not being set on nvidia platform - include_directories(SYSTEM "${HIP_PATH}/include") - include_directories(${HIPBLAS_INCLUDE_DIRS}) - # set source file language properties if(ASGARD_PLATFORM_AMD) #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # should work after cmake 3.21 release? 
@@ -286,6 +279,7 @@ endif() FetchContent_MakeAvailable(kronmult) if(ASGARD_USE_HIP) + set(CMAKE_HIP_ARCHITECTURES OFF) set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") set(HIP_NVCC_FLAGS "${HIP_NVCC_FLAGS} ${ASGARD_NVCC_FLAGS}" CACHE STRING "") @@ -294,6 +288,10 @@ if(ASGARD_USE_HIP) HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" NVCC_OPTIONS "${ASGARD_NVCC_FLAGS}" CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + target_include_directories(kronmult_cuda PUBLIC ${HIP_INCLUDE_DIRS}) + # assume this include path since HIP_INCLUDE_DIRS is not being set on nvidia platform + target_include_directories(kronmult_cuda PUBLIC "${HIP_PATH}/include") + target_include_directories(kronmult_cuda PUBLIC ${HIPBLAS_INCLUDE_DIRS}) if(ASGARD_PLATFORM_NVCC) set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_${GPU_ARCH} -g -lineinfo --ptxas-options=-O3") @@ -376,7 +374,22 @@ if (ASGARD_USE_MATLAB) endif () foreach (component IN LISTS components) - add_library (${component} src/${component}.cpp) + if (ASGARD_USE_HIP) + hip_add_library (${component} src/${component}.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS} -lhipblas" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + set_target_properties(${component} PROPERTIES LINKER_LANGUAGE HIP) + target_include_directories(${component} SYSTEM PUBLIC ${HIP_INCLUDE_DIRS}) + target_include_directories(${component} SYSTEM PUBLIC "${HIP_PATH}/include") + target_include_directories(${component} SYSTEM PUBLIC ${HIPBLAS_INCLUDE_DIRS}) + if(ASGARD_PLATFORM_NVCC) + target_include_directories(${component} SYSTEM PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + endif() + target_link_directories(${component} PRIVATE ${HIPBLAS_PATH}/lib/) # roc::hipblas target isn't linking properly on nvidia platforms + else() + add_library (${component} src/${component}.cpp) + endif() target_include_directories (${component} PRIVATE ${CMAKE_BINARY_DIR}) if(ASGARD_USE_MKL) target_compile_options (${component} PRIVATE "-fopenmp") # CMAKE doesn't handle MKL openmp link properly @@ -455,10 +468,8 @@ endif () if (ASGARD_USE_HIP) if(ASGARD_PLATFORM_AMD) target_link_libraries(lib_dispatch PRIVATE hip::device) - elseif(ASGARD_PLATFORM_NVCC) - target_link_libraries(lib_dispatch PRIVATE ${CUDA_LIBRARIES}) + target_link_libraries(lib_dispatch PRIVATE roc::hipblas) endif() - target_link_libraries(lib_dispatch PRIVATE roc::hipblas) endif() if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) @@ -487,10 +498,8 @@ target_link_libraries (tensors PRIVATE lib_dispatch) if (ASGARD_USE_HIP) if(ASGARD_PLATFORM_AMD) target_link_libraries(tensors PRIVATE hip::device) - elseif(ASGARD_PLATFORM_NVCC) - target_link_libraries(tensors PRIVATE ${CUDA_LIBRARIES}) + target_link_libraries (tensors PRIVATE roc::hipblas) endif() - target_link_libraries (tensors PRIVATE roc::hipblas) endif () if (ASGARD_USE_SCALAPACK) add_compile_definitions (ASGARD_USE_SCALAPACK) @@ -504,7 +513,16 @@ target_link_libraries (transformations quadrature tensors) # define the main application and its linking -add_executable (asgard src/main.cpp) +if (ASGARD_USE_HIP) + hip_add_executable (asgard src/main.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS} -lhipblas" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + set_target_properties(asgard PROPERTIES LINKER_LANGUAGE HIP) + target_link_directories(asgard PRIVATE ${HIPBLAS_PATH}/lib/) 
+else() + add_executable (asgard src/main.cpp) +endif() # link in components needed directly by main set (main_app_link_deps @@ -559,7 +577,21 @@ if (ASGARD_BUILD_TESTS) enable_testing () # Define ctest tests and their executables - add_library (tests_general testing/tests_general.cpp) + if (ASGARD_USE_HIP) + hip_add_library (tests_general testing/tests_general.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS}" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + target_include_directories(tests_general SYSTEM PUBLIC ${HIP_INCLUDE_DIRS}) + target_include_directories(tests_general SYSTEM PUBLIC "${HIP_PATH}/include") + target_include_directories(tests_general SYSTEM PUBLIC ${HIPBLAS_INCLUDE_DIRS}) + if (ASGARD_PLATFORM_NVCC) + target_include_directories(tests_general SYSTEM PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_link_libraries(tests_general PRIVATE ${CMAKE_CUDA_RUNTIME_LIBRARY}) + endif() + else() + add_library (tests_general testing/tests_general.cpp) + endif() target_link_libraries (tests_general PUBLIC Catch PRIVATE pde program_options ) target_include_directories(tests_general PRIVATE ${CMAKE_BINARY_DIR}) @@ -575,7 +607,22 @@ if (ASGARD_BUILD_TESTS) endif() foreach (component IN LISTS components) - add_executable (${component}-tests src/${component}_tests.cpp) + if (ASGARD_USE_HIP) + hip_add_executable (${component}-tests src/${component}_tests.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS} -lhipblas" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + if (ASGARD_PLATFORM_NVCC) + target_include_directories(${component}-tests SYSTEM PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_link_libraries(${component}-tests PRIVATE ${CMAKE_CUDA_RUNTIME_LIBRARY}) + target_link_directories(${component}-tests PRIVATE ${HIPBLAS_PATH}/lib/) + endif() + target_include_directories(${component}-tests SYSTEM PUBLIC ${HIP_INCLUDE_DIRS}) + target_include_directories(${component}-tests SYSTEM PUBLIC "${HIP_PATH}/include") + target_include_directories(${component}-tests SYSTEM PUBLIC ${HIPBLAS_INCLUDE_DIRS}) + else() + add_executable (${component}-tests src/${component}_tests.cpp) + endif() target_include_directories (${component}-tests PRIVATE ${CMAKE_SOURCE_DIR}/testing) target_include_directories (${component}-tests PRIVATE ${CMAKE_BINARY_DIR}) target_link_libraries (${component}-tests PRIVATE ${component} tests_general) @@ -609,7 +656,22 @@ if (ASGARD_BUILD_TESTS) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endforeach () - add_executable (kronmult_cuda-tests src/device/kronmult_cuda_tests.cpp) + if (ASGARD_USE_HIP) + hip_add_executable (kronmult_cuda-tests src/device/kronmult_cuda_tests.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS} -lhipblas" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + if (ASGARD_PLATFORM_NVCC) + target_include_directories(kronmult_cuda-tests SYSTEM PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_link_libraries(kronmult_cuda-tests PRIVATE ${CMAKE_CUDA_RUNTIME_LIBRARY}) + target_link_directories(kronmult_cuda-tests PRIVATE ${HIPBLAS_PATH}/lib/) + endif() + target_include_directories(kronmult_cuda-tests SYSTEM PUBLIC ${HIP_INCLUDE_DIRS}) + target_include_directories(kronmult_cuda-tests SYSTEM PUBLIC "${HIP_PATH}/include") + target_include_directories(kronmult_cuda-tests SYSTEM PUBLIC ${HIPBLAS_INCLUDE_DIRS}) + else() + add_executable (kronmult_cuda-tests src/device/kronmult_cuda_tests.cpp) + endif() target_include_directories (kronmult_cuda-tests PRIVATE 
${CMAKE_SOURCE_DIR}/testing) target_include_directories (kronmult_cuda-tests PRIVATE ${CMAKE_BINARY_DIR}) target_link_libraries (kronmult_cuda-tests PRIVATE coefficients kronmult_cuda tests_general)
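
For reference, a minimal sketch of how the HIP-related cache variables introduced in these patches (ASGARD_USE_HIP, GPU_ARCH, HIP_PATH) might be seeded at configure time. The file name, the ROCm install prefix, and the chosen architecture value are illustrative assumptions, not part of the patch series itself.

# hypothetical initial-cache script, e.g. amd_gfx906.cmake,
# loaded with:  cmake -C amd_gfx906.cmake <source dir>
set(ASGARD_USE_HIP ON CACHE BOOL "Build with HIP support")
set(GPU_ARCH "906" CACHE STRING "AMD/NVIDIA GPU architecture number (such as 906 or 70)")
set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Assumed HIP install prefix; adjust to the local ROCm layout")
set(ASGARD_BUILD_TESTS ON CACHE BOOL "Build the unit tests")
# On an NVIDIA machine the same script would set GPU_ARCH to a compute
# capability such as 70; platform detection (amd vs. nvidia) and the
# CUDA_PATH check are handled by the CMake logic added in the patches above.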