diff --git a/accessor/cuda_hip_helper.hpp b/accessor/cuda_hip_helper.hpp new file mode 100644 index 00000000000..225fdfe1b15 --- /dev/null +++ b/accessor/cuda_hip_helper.hpp @@ -0,0 +1,38 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_ACCESSOR_CUDA_HIP_HELPER_HPP_ +#define GKO_ACCESSOR_CUDA_HIP_HELPER_HPP_ + + +#include <utility> + + +#ifdef GKO_COMPILING_HIP +#include "accessor/hip_helper.hpp" +#else // GKO_COMPILING_CUDA +#include "accessor/cuda_helper.hpp" +#endif + + +namespace gko { +namespace acc { + + +template <typename AccType> +GKO_ACC_INLINE auto as_device_range(AccType&& acc) +{ +#ifdef GKO_COMPILING_HIP + return as_hip_range(std::forward<AccType>(acc)); +#else // GKO_COMPILING_CUDA + return as_cuda_range(std::forward<AccType>(acc)); +#endif +} + + +} // namespace acc +} // namespace gko + + +#endif // GKO_ACCESSOR_CUDA_HIP_HELPER_HPP_ diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index ca209e65057..306655d2315 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -20,6 +20,7 @@ function(ginkgo_benchmark_cusparse_linops type def) endif() # make the dependency public to catch issues target_compile_definitions(cusparse_linops_${type} PUBLIC ${def}) + target_compile_definitions(cusparse_linops_${type} PRIVATE GKO_COMPILING_CUDA) target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse) endfunction() @@ -27,6 +28,7 @@ function(ginkgo_benchmark_hipsparse_linops type def) add_library(hipsparse_linops_${type} utils/hip_linops.hip.cpp) set_source_files_properties(utils/hip_linops.hip.cpp PROPERTIES LANGUAGE HIP) target_compile_definitions(hipsparse_linops_${type} PUBLIC ${def}) + target_compile_definitions(hipsparse_linops_${type} PRIVATE GKO_COMPILING_HIP) target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS}) target_link_libraries(hipsparse_linops_${type} Ginkgo::ginkgo 
${HIPSPARSE_LIBRARIES}) endfunction() diff --git a/benchmark/utils/cuda_linops.cpp b/benchmark/utils/cuda_linops.cpp index f239740d655..a404f9151ea 100644 --- a/benchmark/utils/cuda_linops.cpp +++ b/benchmark/utils/cuda_linops.cpp @@ -139,7 +139,7 @@ class CusparseCsrmp auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::spmv_mp( - this->get_gpu_exec()->get_cusparse_handle(), trans_, + this->get_gpu_exec()->get_sparselib_handle(), trans_, this->get_size()[0], this->get_size()[1], csr_->get_num_stored_elements(), &scalars.get_const_data()[0], this->get_descr(), csr_->get_const_values(), @@ -156,7 +156,7 @@ class CusparseCsrmp : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), - trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE) {} private: @@ -213,7 +213,7 @@ class CusparseCsr auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::spmv( - this->get_gpu_exec()->get_cusparse_handle(), trans_, + this->get_gpu_exec()->get_sparselib_handle(), trans_, this->get_size()[0], this->get_size()[1], csr_->get_num_stored_elements(), &scalars.get_const_data()[0], this->get_descr(), csr_->get_const_values(), @@ -230,7 +230,7 @@ class CusparseCsr : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), - trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE) {} private: @@ -288,7 +288,7 @@ class CusparseCsrmm auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::spmm( - this->get_gpu_exec()->get_cusparse_handle(), trans_, + this->get_gpu_exec()->get_sparselib_handle(), trans_, this->get_size()[0], dense_b->get_size()[1], this->get_size()[1], csr_->get_num_stored_elements(), &scalars.get_const_data()[0], this->get_descr(), csr_->get_const_values(), @@ -306,7 +306,7 @@ class CusparseCsrmm : gko::EnableLinOp(exec, size), 
csr_(std::move( csr::create(exec, std::make_shared()))), - trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE) {} private: @@ -376,7 +376,7 @@ class CusparseCsrEx gko::size_type buffer_size = 0; auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); - auto handle = this->get_gpu_exec()->get_cusparse_handle(); + auto handle = this->get_gpu_exec()->get_sparselib_handle(); // This function seems to require the pointer mode to be set to HOST. // Ginkgo use pointer mode DEVICE by default, so we change this // temporarily. @@ -407,7 +407,7 @@ class CusparseCsrEx : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), - trans_(CUSPARSE_OPERATION_NON_TRANSPOSE), + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE), buffer_(exec) { algmode_ = CUSPARSE_ALG_MERGE_PATH; @@ -465,7 +465,7 @@ class CusparseHybrid auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::csr2hyb( - this->get_gpu_exec()->get_cusparse_handle(), this->get_size()[0], + this->get_gpu_exec()->get_sparselib_handle(), this->get_size()[0], this->get_size()[1], this->get_descr(), t_csr->get_const_values(), t_csr->get_const_row_ptrs(), t_csr->get_const_col_idxs(), hyb_, Threshold, Partition); @@ -496,7 +496,7 @@ class CusparseHybrid auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::spmv( - this->get_gpu_exec()->get_cusparse_handle(), trans_, + this->get_gpu_exec()->get_sparselib_handle(), trans_, &scalars.get_const_data()[0], this->get_descr(), hyb_, db, &scalars.get_const_data()[1], dx); } @@ -508,7 +508,7 @@ class CusparseHybrid CusparseHybrid(std::shared_ptr exec, const gko::dim<2>& size = gko::dim<2>{}) : gko::EnableLinOp(exec, size), - trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE) { auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateHybMat(&hyb_)); @@ -555,13 
+555,13 @@ void cusparse_generic_spmv(std::shared_ptr gpu_exec, gko::size_type buffer_size = 0; GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV_bufferSize( - gpu_exec->get_cusparse_handle(), trans, &scalars.get_const_data()[0], + gpu_exec->get_sparselib_handle(), trans, &scalars.get_const_data()[0], mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg, &buffer_size)); gko::array buffer_array(gpu_exec, buffer_size); auto dbuffer = buffer_array.get_data(); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV( - gpu_exec->get_cusparse_handle(), trans, &scalars.get_const_data()[0], + gpu_exec->get_sparselib_handle(), trans, &scalars.get_const_data()[0], mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg, dbuffer)); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecx)); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecb)); @@ -654,7 +654,7 @@ class CusparseGenericCsr : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), - trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE) {} private: @@ -745,7 +745,7 @@ class CusparseGenericCoo const gko::dim<2>& size = gko::dim<2>{}) : gko::EnableLinOp(exec, size), coo_(std::move(coo::create(exec))), - trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE) {} private: diff --git a/benchmark/utils/hip_linops.hip.cpp b/benchmark/utils/hip_linops.hip.cpp index 2d952ce60e9..f0d7edb45c3 100644 --- a/benchmark/utils/hip_linops.hip.cpp +++ b/benchmark/utils/hip_linops.hip.cpp @@ -126,7 +126,7 @@ class HipsparseCsr auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::hip::hipsparse::spmv( - this->get_gpu_exec()->get_hipsparse_handle(), trans_, + this->get_gpu_exec()->get_sparselib_handle(), trans_, this->get_size()[0], this->get_size()[1], csr_->get_num_stored_elements(), &scalars.get_const_data()[0], this->get_descr(), csr_->get_const_values(), @@ -143,7 +143,7 @@ class HipsparseCsr : gko::EnableLinOp(exec, 
size), csr_(std::move( csr::create(exec, std::make_shared()))), - trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE) + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE) {} private: @@ -201,7 +201,7 @@ class HipsparseCsrmm auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::hip::hipsparse::spmm( - this->get_gpu_exec()->get_hipsparse_handle(), trans_, + this->get_gpu_exec()->get_sparselib_handle(), trans_, this->get_size()[0], dense_b->get_size()[1], this->get_size()[1], csr_->get_num_stored_elements(), &scalars.get_const_data()[0], this->get_descr(), csr_->get_const_values(), @@ -219,7 +219,7 @@ class HipsparseCsrmm : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), - trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE) + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE) {} private: @@ -269,7 +269,7 @@ class HipsparseHybrid auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::hip::hipsparse::csr2hyb( - this->get_gpu_exec()->get_hipsparse_handle(), this->get_size()[0], + this->get_gpu_exec()->get_sparselib_handle(), this->get_size()[0], this->get_size()[1], this->get_descr(), t_csr->get_const_values(), t_csr->get_const_row_ptrs(), t_csr->get_const_col_idxs(), hyb_, Threshold, Partition); @@ -300,7 +300,7 @@ class HipsparseHybrid auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::hip::hipsparse::spmv( - this->get_gpu_exec()->get_hipsparse_handle(), trans_, + this->get_gpu_exec()->get_sparselib_handle(), trans_, &scalars.get_const_data()[0], this->get_descr(), hyb_, db, &scalars.get_const_data()[1], dx); } @@ -312,7 +312,7 @@ class HipsparseHybrid HipsparseHybrid(std::shared_ptr exec, const gko::dim<2>& size = gko::dim<2>{}) : gko::EnableLinOp(exec, size), - trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE) + trans_(SPARSELIB_OPERATION_NON_TRANSPOSE) { auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateHybMat(&hyb_)); diff --git 
a/cmake/create_test.cmake b/cmake/create_test.cmake index 0aa93a3b141..9f7079f60a3 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -160,7 +160,7 @@ endfunction(ginkgo_create_cuda_test) ## Internal function allowing separate test name, filename and target name function(ginkgo_create_cuda_test_internal test_name filename test_target_name) add_executable(${test_target_name} ${filename}) - target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_CUDA) + target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda) if(MSVC) target_compile_options(${test_target_name} PRIVATE @@ -188,7 +188,7 @@ endfunction(ginkgo_create_hip_test) function(ginkgo_create_hip_test_internal test_name filename test_target_name) set_source_files_properties(${filename} PROPERTIES LANGUAGE HIP) add_executable(${test_target_name} ${filename}) - target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_HIP) + target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip) ginkgo_set_test_target_properties(${test_target_name} "_hip" ${ARGN}) ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE hipgpu) endfunction(ginkgo_create_hip_test_internal) @@ -203,7 +203,7 @@ endfunction() function(ginkgo_create_omp_test_internal test_name filename test_target_name) ginkgo_build_test_name(${test_name} test_target_name) add_executable(${test_target_name} ${test_name}.cpp) - target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_OMP) + target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_OMP GKO_DEVICE_NAMESPACE=omp) target_link_libraries(${test_target_name} PRIVATE OpenMP::OpenMP_CXX) ginkgo_set_test_target_properties(${test_target_name} "_omp" ${ARGN}) ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE cpu) @@ -253,7 +253,7 @@ function(ginkgo_create_common_test_internal test_name exec_type exec) 
target_link_libraries(${test_target_name} PRIVATE OpenMP::OpenMP_CXX) endif () - target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=${exec_type} EXEC_NAMESPACE=${exec} GKO_COMPILING_${exec_upper}) + target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=${exec_type} GKO_DEVICE_NAMESPACE=${exec} GKO_COMPILING_${exec_upper}) target_link_libraries(${test_target_name} PRIVATE ${common_test_ADDITIONAL_LIBRARIES}) # use float for DPC++ if necessary if((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE) @@ -285,13 +285,13 @@ function(ginkgo_create_common_device_test test_name) # need to make a separate file for this, since we can't set conflicting properties on the same file configure_file(${test_name}.cpp ${test_name}.cu COPYONLY) ginkgo_create_cuda_test_internal(${test_name}_cuda ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.cu ${test_target_name}_cuda ${ARGN}) - target_compile_definitions(${test_target_name}_cuda PRIVATE EXEC_TYPE=CudaExecutor EXEC_NAMESPACE=cuda) + target_compile_definitions(${test_target_name}_cuda PRIVATE EXEC_TYPE=CudaExecutor GKO_DEVICE_NAMESPACE=cuda) endif() if(GINKGO_BUILD_HIP) # need to make a separate file for this, since we can't set conflicting properties on the same file configure_file(${test_name}.cpp ${test_name}.hip.cpp COPYONLY) ginkgo_create_hip_test_internal(${test_name}_hip ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.hip.cpp ${test_target_name}_hip ${ARGN}) - target_compile_definitions(${test_target_name}_hip PRIVATE EXEC_TYPE=HipExecutor EXEC_NAMESPACE=hip) + target_compile_definitions(${test_target_name}_hip PRIVATE EXEC_TYPE=HipExecutor GKO_DEVICE_NAMESPACE=hip) endif() endfunction(ginkgo_create_common_device_test) diff --git a/common/cuda_hip/base/blas_bindings.hpp b/common/cuda_hip/base/blas_bindings.hpp new file mode 100644 index 00000000000..e59bbf0d7a0 --- /dev/null +++ b/common/cuda_hip/base/blas_bindings.hpp @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// 
SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_BASE_BLAS_BINDINGS_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_BLAS_BINDINGS_HPP_ + + +#if defined(GKO_COMPILING_CUDA) +#include "cuda/base/cublas_bindings.hpp" +#elif defined(GKO_COMPILING_HIP) +#include "hip/base/hipblas_bindings.hip.hpp" +#else +#error "Executor definition missing" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_BASE_BLAS_BINDINGS_HPP_ diff --git a/common/cuda_hip/base/config.hpp b/common/cuda_hip/base/config.hpp new file mode 100644 index 00000000000..00825fe8b72 --- /dev/null +++ b/common/cuda_hip/base/config.hpp @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_BASE_CONFIG_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_CONFIG_HPP_ + + +#if defined(GKO_COMPILING_CUDA) +#include "cuda/base/config.hpp" +#elif defined(GKO_COMPILING_HIP) +#include "hip/base/config.hip.hpp" +#else +#error "Executor definition missing" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_BASE_CONFIG_HPP_ diff --git a/common/cuda_hip/base/pointer_mode_guard.hpp b/common/cuda_hip/base/pointer_mode_guard.hpp new file mode 100644 index 00000000000..40bf694ef73 --- /dev/null +++ b/common/cuda_hip/base/pointer_mode_guard.hpp @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_BASE_POINTER_MODE_GUARD_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_POINTER_MODE_GUARD_HPP_ + + +#if defined(GKO_COMPILING_CUDA) +#include "cuda/base/pointer_mode_guard.hpp" +#elif defined(GKO_COMPILING_HIP) +#include "hip/base/pointer_mode_guard.hip.hpp" +#else +#error "Executor definition missing" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_BASE_POINTER_MODE_GUARD_HPP_ diff --git a/common/cuda_hip/base/randlib_bindings.hpp b/common/cuda_hip/base/randlib_bindings.hpp new file mode 100644 index 00000000000..7797ad38c64 --- /dev/null +++ 
b/common/cuda_hip/base/randlib_bindings.hpp @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_BASE_RANDLIB_BINDINGS_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_RANDLIB_BINDINGS_HPP_ + + +#if defined(GKO_COMPILING_CUDA) +#include "cuda/base/curand_bindings.hpp" +#elif defined(GKO_COMPILING_HIP) +#include "hip/base/hiprand_bindings.hip.hpp" +#else +#error "Executor definition missing" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_BASE_RANDLIB_BINDINGS_HPP_ diff --git a/common/cuda_hip/base/runtime.hpp b/common/cuda_hip/base/runtime.hpp new file mode 100644 index 00000000000..6a7a7a3c4a2 --- /dev/null +++ b/common/cuda_hip/base/runtime.hpp @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_BASE_RUNTIME_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_RUNTIME_HPP_ + + +#if defined(GKO_COMPILING_CUDA) +// nothing needed here +#elif defined(GKO_COMPILING_HIP) +#include <hip/hip_runtime.h> +#else +#error "Executor definition missing" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_BASE_RUNTIME_HPP_ diff --git a/common/cuda_hip/base/sparselib_bindings.hpp b/common/cuda_hip/base/sparselib_bindings.hpp new file mode 100644 index 00000000000..26c0bda236d --- /dev/null +++ b/common/cuda_hip/base/sparselib_bindings.hpp @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BINDINGS_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BINDINGS_HPP_ + + +#if defined(GKO_COMPILING_CUDA) +#include "cuda/base/cusparse_bindings.hpp" +#elif defined(GKO_COMPILING_HIP) +#include "hip/base/hipsparse_bindings.hip.hpp" +#else +#error "Executor definition missing" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BINDINGS_HPP_ diff --git a/common/cuda_hip/base/thrust.hpp b/common/cuda_hip/base/thrust.hpp new file 
mode 100644 index 00000000000..02aaebc9f3d --- /dev/null +++ b/common/cuda_hip/base/thrust.hpp @@ -0,0 +1,53 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_BASE_THRUST_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_THRUST_HPP_ + + +#include + + +#include +#include + + +#if defined(GKO_COMPILING_CUDA) || \ + (defined(GKO_COMPILING_HIP) && !GINKGO_HIP_PLATFORM_HCC) +#include +#else +#include +#endif + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + +#if defined(GKO_COMPILING_CUDA) +inline auto thrust_policy(std::shared_ptr exec) +{ + return thrust::cuda::par.on(exec->get_stream()); +} +#elif defined(GKO_COMPILING_HIP) +inline auto thrust_policy(std::shared_ptr exec) +{ +#if GINKGO_HIP_PLATFORM_HCC + return thrust::hip::par.on(exec->get_stream()); +#else + return thrust::cuda::par.on(exec->get_stream()); +#endif +} +#else +#error "Executor definition missing" +#endif + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_BASE_THRUST_HPP_ diff --git a/common/cuda_hip/base/types.hpp b/common/cuda_hip/base/types.hpp new file mode 100644 index 00000000000..08f0516d691 --- /dev/null +++ b/common/cuda_hip/base/types.hpp @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_ + + +#if defined(GKO_COMPILING_CUDA) +#include "cuda/base/types.hpp" +#elif defined(GKO_COMPILING_HIP) +#include "hip/base/types.hip.hpp" +#else +#error "Executor definition missing" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_ diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp.inc index 3d76cfdcb79..60eaf5a9dd9 100644 --- a/common/cuda_hip/components/atomic.hpp.inc +++ b/common/cuda_hip/components/atomic.hpp.inc @@ 
-196,3 +196,35 @@ GKO_BIND_ATOMIC_MAX(unsigned long long int); #undef GKO_BIND_ATOMIC_MAX + + +/** + * @internal + * + * @note It is not 'real' complex atomic add operation + */ +__forceinline__ __device__ thrust::complex<float> atomic_add( + thrust::complex<float>* __restrict__ address, thrust::complex<float> val) +{ + auto addr = reinterpret_cast<float*>(address); + // Separate to real part and imag part + auto real = atomic_add(addr, val.real()); + auto imag = atomic_add(addr + 1, val.imag()); + return {real, imag}; +} + + +/** + * @internal + * + * @note It is not 'real' complex atomic add operation + */ +__forceinline__ __device__ thrust::complex<double> atomic_add( + thrust::complex<double>* __restrict__ address, thrust::complex<double> val) +{ + auto addr = reinterpret_cast<double*>(address); + // Separate to real part and imag part + auto real = atomic_add(addr, val.real()); + auto imag = atomic_add(addr + 1, val.imag()); + return {real, imag}; +} diff --git a/common/cuda_hip/components/cooperative_groups.hpp b/common/cuda_hip/components/cooperative_groups.hpp new file mode 100644 index 00000000000..a57440f6d30 --- /dev/null +++ b/common/cuda_hip/components/cooperative_groups.hpp @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_COOPERATIVE_GROUPS_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_COOPERATIVE_GROUPS_HPP_ + + +#if defined(GKO_COMPILING_CUDA) +#include "cuda/components/cooperative_groups.cuh" +#elif defined(GKO_COMPILING_HIP) +#include "hip/components/cooperative_groups.hip.hpp" +#else +#error "Executor definition missing" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_COOPERATIVE_GROUPS_HPP_ diff --git a/common/cuda_hip/components/format_conversion.hpp b/common/cuda_hip/components/format_conversion.hpp new file mode 100644 index 00000000000..9faf7a58c25 --- /dev/null +++ b/common/cuda_hip/components/format_conversion.hpp @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo 
authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_FORMAT_CONVERSION_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_FORMAT_CONVERSION_HPP_ + + +#if defined(GKO_COMPILING_CUDA) +#include "cuda/components/format_conversion.cuh" +#elif defined(GKO_COMPILING_HIP) +#include "hip/components/format_conversion.hip.hpp" +#else +#error "Executor definition missing" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_FORMAT_CONVERSION_HPP_ diff --git a/common/cuda_hip/components/memory.hpp b/common/cuda_hip/components/memory.hpp new file mode 100644 index 00000000000..9bfd9cba1e0 --- /dev/null +++ b/common/cuda_hip/components/memory.hpp @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_MEMORY_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_MEMORY_HPP_ + + +#if defined(GKO_COMPILING_CUDA) +#include "cuda/components/memory.cuh" +#elif defined(GKO_COMPILING_HIP) +#include "hip/components/memory.hip.hpp" +#else +#error "Executor definition missing" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_MEMORY_HPP_ diff --git a/common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc b/common/cuda_hip/factorization/par_ict_kernels.hpp.inc similarity index 75% rename from common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc rename to common/cuda_hip/factorization/par_ict_kernels.hpp.inc index 93a49e56d21..87aa8297345 100644 --- a/common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ict_kernels.hpp.inc @@ -206,4 +206,72 @@ __global__ __launch_bounds__(default_block_size) void ict_tri_spgeam_init( } +template +__global__ __launch_bounds__(default_block_size) void ict_sweep( + const IndexType* __restrict__ a_row_ptrs, + const IndexType* __restrict__ a_col_idxs, + const ValueType* __restrict__ a_vals, + const IndexType* __restrict__ l_row_ptrs, + const IndexType* 
__restrict__ l_row_idxs, + const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals, + IndexType l_nnz) +{ + auto l_nz = thread::get_subwarp_id_flat(); + if (l_nz >= l_nnz) { + return; + } + auto row = l_row_idxs[l_nz]; + auto col = l_col_idxs[l_nz]; + auto subwarp = + group::tiled_partition(group::this_thread_block()); + // find entry of A at (row, col) + auto a_row_begin = a_row_ptrs[row]; + auto a_row_end = a_row_ptrs[row + 1]; + auto a_row_size = a_row_end - a_row_begin; + auto a_idx = + group_wide_search(a_row_begin, a_row_size, subwarp, + [&](IndexType i) { return a_col_idxs[i] >= col; }); + bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col; + auto a_val = has_a ? a_vals[a_idx] : zero(); + auto l_row_begin = l_row_ptrs[row]; + auto l_row_size = l_row_ptrs[row + 1] - l_row_begin; + auto lh_col_begin = l_row_ptrs[col]; + auto lh_col_size = l_row_ptrs[col + 1] - lh_col_begin; + ValueType sum{}; + IndexType lh_nz{}; + auto last_entry = col; + group_merge( + l_col_idxs + l_row_begin, l_row_size, l_col_idxs + lh_col_begin, + lh_col_size, subwarp, + [&](IndexType l_idx, IndexType l_col, IndexType lh_idx, + IndexType lh_row, IndexType, bool) { + // we don't need to use the `bool valid` because last_entry is + // already a smaller sentinel value than the one used in group_merge + if (l_col == lh_row && l_col < last_entry) { + sum += load_relaxed(l_vals + (l_idx + l_row_begin)) * + conj(load_relaxed(l_vals + (lh_idx + lh_col_begin))); + } + // remember the transposed element + auto found_transp = subwarp.ballot(lh_row == row); + if (found_transp) { + lh_nz = + subwarp.shfl(lh_idx + lh_col_begin, ffs(found_transp) - 1); + } + return true; + }); + // accumulate result from all threads + sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; }); + + if (subwarp.thread_rank() == 0) { + auto to_write = + row == col ? 
sqrt(a_val - sum) + : (a_val - sum) / + load_relaxed(l_vals + (l_row_ptrs[col + 1] - 1)); + if (is_finite(to_write)) { + store_relaxed(l_vals + l_nz, to_write); + } + } +} + + } // namespace kernel diff --git a/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc b/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc deleted file mode 100644 index bc58f0a9799..00000000000 --- a/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc +++ /dev/null @@ -1,76 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -namespace kernel { - - -template -__global__ __launch_bounds__(default_block_size) void ict_sweep( - const IndexType* __restrict__ a_row_ptrs, - const IndexType* __restrict__ a_col_idxs, - const ValueType* __restrict__ a_vals, - const IndexType* __restrict__ l_row_ptrs, - const IndexType* __restrict__ l_row_idxs, - const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals, - IndexType l_nnz) -{ - auto l_nz = thread::get_subwarp_id_flat(); - if (l_nz >= l_nnz) { - return; - } - auto row = l_row_idxs[l_nz]; - auto col = l_col_idxs[l_nz]; - auto subwarp = - group::tiled_partition(group::this_thread_block()); - // find entry of A at (row, col) - auto a_row_begin = a_row_ptrs[row]; - auto a_row_end = a_row_ptrs[row + 1]; - auto a_row_size = a_row_end - a_row_begin; - auto a_idx = - group_wide_search(a_row_begin, a_row_size, subwarp, - [&](IndexType i) { return a_col_idxs[i] >= col; }); - bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col; - auto a_val = has_a ? 
a_vals[a_idx] : zero(); - auto l_row_begin = l_row_ptrs[row]; - auto l_row_size = l_row_ptrs[row + 1] - l_row_begin; - auto lh_col_begin = l_row_ptrs[col]; - auto lh_col_size = l_row_ptrs[col + 1] - lh_col_begin; - ValueType sum{}; - IndexType lh_nz{}; - auto last_entry = col; - group_merge( - l_col_idxs + l_row_begin, l_row_size, l_col_idxs + lh_col_begin, - lh_col_size, subwarp, - [&](IndexType l_idx, IndexType l_col, IndexType lh_idx, - IndexType lh_row, IndexType, bool) { - // we don't need to use the `bool valid` because last_entry is - // already a smaller sentinel value than the one used in group_merge - if (l_col == lh_row && l_col < last_entry) { - sum += load_relaxed(l_vals + (l_idx + l_row_begin)) * - conj(load_relaxed(l_vals + (lh_idx + lh_col_begin))); - } - // remember the transposed element - auto found_transp = subwarp.ballot(lh_row == row); - if (found_transp) { - lh_nz = - subwarp.shfl(lh_idx + lh_col_begin, ffs(found_transp) - 1); - } - return true; - }); - // accumulate result from all threads - sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; }); - - if (subwarp.thread_rank() == 0) { - auto to_write = - row == col ? 
sqrt(a_val - sum) - : (a_val - sum) / - load_relaxed(l_vals + (l_row_ptrs[col + 1] - 1)); - if (is_finite(to_write)) { - store_relaxed(l_vals + l_nz, to_write); - } - } -} - - -} // namespace kernel diff --git a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc similarity index 100% rename from common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc rename to common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc b/common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc similarity index 100% rename from common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc rename to common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc diff --git a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc similarity index 100% rename from common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc rename to common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp index b32572546f0..5ca25ecb1e3 100644 --- a/common/unified/base/kernel_launch.hpp +++ b/common/unified/base/kernel_launch.hpp @@ -19,7 +19,7 @@ #define GKO_DEVICE_NAMESPACE cuda #define GKO_KERNEL __device__ -#include "cuda/base/types.hpp" +#include "common/cuda_hip/base/types.hpp" namespace gko { @@ -46,7 +46,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type unpack_member(T value) #define GKO_DEVICE_NAMESPACE hip #define GKO_KERNEL __device__ -#include "hip/base/types.hip.hpp" +#include "common/cuda_hip/base/types.hpp" namespace gko { diff --git a/core/test/gtest/CMakeLists.txt b/core/test/gtest/CMakeLists.txt index 56f83181375..f500ddb6ae5 100644 --- a/core/test/gtest/CMakeLists.txt +++ b/core/test/gtest/CMakeLists.txt 
@@ -25,14 +25,14 @@ if (GINKGO_BUILD_MPI) add_library(ginkgo_gtest_main_mpi_cpu ALIAS ginkgo_gtest_main_mpi) endif() if (GINKGO_BUILD_OMP) - add_gtest_main("_omp" "GKO_COMPILING_OMP") + add_gtest_main("_omp" "GKO_COMPILING_OMP;GKO_DEVICE_NAMESPACE=omp") endif() if (GINKGO_BUILD_CUDA) - add_gtest_main("_cuda" "GKO_COMPILING_CUDA") + add_gtest_main("_cuda" "GKO_COMPILING_CUDA;GKO_DEVICE_NAMESPACE=cuda") endif() if (GINKGO_BUILD_HIP) - add_gtest_main("_hip" "GKO_COMPILING_HIP") + add_gtest_main("_hip" "GKO_COMPILING_HIP;GKO_DEVICE_NAMESPACE=hip") endif() if (GINKGO_BUILD_SYCL) - add_gtest_main("_dpcpp" "GKO_COMPILING_DPCPP") + add_gtest_main("_dpcpp" "GKO_COMPILING_DPCPP;GKO_DEVICE_NAMESPACE=dpcpp") endif() diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index bd214691a2e..3d251ecfa82 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -33,12 +33,12 @@ target_sources(ginkgo_cuda factorization/par_ic_kernels.cu factorization/par_ict_kernels.cu factorization/par_ilu_kernels.cu - factorization/par_ilut_approx_filter_kernel.cu - factorization/par_ilut_filter_kernel.cu + factorization/par_ilut_approx_filter_kernels.cu + factorization/par_ilut_filter_kernels.cu factorization/par_ilut_select_common.cu - factorization/par_ilut_select_kernel.cu - factorization/par_ilut_spgeam_kernel.cu - factorization/par_ilut_sweep_kernel.cu + factorization/par_ilut_select_kernels.cu + factorization/par_ilut_spgeam_kernels.cu + factorization/par_ilut_sweep_kernels.cu matrix/batch_csr_kernels.cu matrix/batch_dense_kernels.cu matrix/batch_ell_kernels.cu @@ -54,10 +54,10 @@ target_sources(ginkgo_cuda multigrid/pgm_kernels.cu preconditioner/batch_jacobi_kernels.cu preconditioner/isai_kernels.cu - preconditioner/jacobi_advanced_apply_kernel.cu - preconditioner/jacobi_generate_kernel.cu + preconditioner/jacobi_advanced_apply_kernels.cu + preconditioner/jacobi_generate_kernels.cu preconditioner/jacobi_kernels.cu - preconditioner/jacobi_simple_apply_kernel.cu + 
preconditioner/jacobi_simple_apply_kernels.cu reorder/rcm_kernels.cu solver/batch_bicgstab_kernels.cu solver/batch_cg_kernels.cu @@ -85,18 +85,18 @@ endif() set(GKO_CUDA_JACOBI_SOURCES) foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_CUDA_JACOBI_BLOCK_SIZES) configure_file( - preconditioner/jacobi_generate_instantiate.inc.cu - preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) + preconditioner/jacobi_generate_kernels.instantiate.cu + preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) configure_file( - preconditioner/jacobi_simple_apply_instantiate.inc.cu - preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) + preconditioner/jacobi_simple_apply_kernels.instantiate.cu + preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) configure_file( - preconditioner/jacobi_advanced_apply_instantiate.inc.cu - preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) + preconditioner/jacobi_advanced_apply_kernels.instantiate.cu + preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) list(APPEND GKO_CUDA_JACOBI_SOURCES - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) endforeach() target_sources(ginkgo_cuda PRIVATE ${GKO_CUDA_JACOBI_SOURCES}) string(REPLACE ";" "," GKO_CUDA_JACOBI_BLOCK_SIZES_CODE "${GKO_CUDA_JACOBI_BLOCK_SIZES}") @@ -120,7 
+120,7 @@ if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") endif() ginkgo_compile_features(ginkgo_cuda) -target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA) +target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda) # include path for generated headers like jacobi_common.hpp target_include_directories(ginkgo_cuda @@ -133,7 +133,7 @@ ginkgo_default_includes(ginkgo_cuda) ginkgo_install_library(ginkgo_cuda) if (GINKGO_CHECK_CIRCULAR_DEPS) - ginkgo_check_headers(ginkgo_cuda GKO_COMPILING_CUDA) + ginkgo_check_headers(ginkgo_cuda "GKO_COMPILING_CUDA;GKO_DEVICE_NAMESPACE=cuda") endif() if(GINKGO_BUILD_TESTS) diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu index 5bc899c11ed..dcaafd5a46c 100644 --- a/cuda/base/batch_multi_vector_kernels.cu +++ b/cuda/base/batch_multi_vector_kernels.cu @@ -13,13 +13,14 @@ #include +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/base/pointer_mode_guard.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" @@ -39,6 +40,7 @@ namespace batch_multi_vector { constexpr auto default_block_size = 256; constexpr int sm_oversubscription = 4; + // clang-format off // NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES diff --git a/cuda/base/batch_struct.hpp b/cuda/base/batch_struct.hpp index 7c968ec2c6e..5251c594d42 100644 --- a/cuda/base/batch_struct.hpp +++ b/cuda/base/batch_struct.hpp @@ -10,9 +10,9 @@ #include +#include 
"common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" #include "core/base/batch_struct.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/types.hpp" namespace gko { diff --git a/cuda/base/cublas_bindings.hpp b/cuda/base/cublas_bindings.hpp index 485249b7665..c1cdf1f996e 100644 --- a/cuda/base/cublas_bindings.hpp +++ b/cuda/base/cublas_bindings.hpp @@ -12,8 +12,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" namespace gko { @@ -249,6 +249,20 @@ inline void destroy(cublasHandle_t handle) } // namespace cublas + + +namespace blas { + + +using namespace cublas; + + +#define BLAS_OP_N CUBLAS_OP_N +#define BLAS_OP_T CUBLAS_OP_T +#define BLAS_OP_C CUBLAS_OP_C + + +} // namespace blas } // namespace cuda } // namespace kernels } // namespace gko diff --git a/cuda/base/curand_bindings.hpp b/cuda/base/curand_bindings.hpp index b0ae52c5f00..10e09f4a356 100644 --- a/cuda/base/curand_bindings.hpp +++ b/cuda/base/curand_bindings.hpp @@ -12,8 +12,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" namespace gko { @@ -85,6 +85,18 @@ GKO_BIND_CURAND_RANDOM_VECTOR(std::complex, curandGenerateNormalDouble); } // namespace curand + + +namespace randlib { + + +using namespace curand; + + +#define RANDLIB_RNG_PSEUDO_DEFAULT CURAND_RNG_PSEUDO_DEFAULT + + +} // namespace randlib } // namespace cuda } // namespace kernels } // namespace gko diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp index 87737e8865e..c18e1d7e9a6 100644 --- a/cuda/base/cusparse_bindings.hpp +++ b/cuda/base/cusparse_bindings.hpp @@ -13,7 +13,7 @@ #include -#include "cuda/base/types.hpp" +#include "common/cuda_hip/base/types.hpp" namespace gko { @@ -940,6 +940,7 @@ inline void destroy(csrsm2Info_t info) #endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11031) +GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS inline csrilu02Info_t 
create_ilu0_info() { csrilu02Info_t info{}; @@ -948,7 +949,7 @@ inline csrilu02Info_t create_ilu0_info() } -inline void destroy(csrilu02Info_t info) +inline void destroy_ilu0_info(csrilu02Info_t info) { GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyCsrilu02Info(info)); } @@ -962,10 +963,11 @@ inline csric02Info_t create_ic0_info() } -inline void destroy(csric02Info_t info) +inline void destroy_ic0_info(csric02Info_t info) { GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyCsric02Info(info)); } +GKO_END_DISABLE_DEPRECATION_WARNINGS #if (defined(CUDA_VERSION) && (CUDA_VERSION < 11031)) @@ -1174,19 +1176,6 @@ void spsm_solve(cusparseHandle_t handle, cusparseOperation_t op_a, #endif // (defined(CUDA_VERSION) && (CUDA_VERSION >= 11031)) -template -void create_identity_permutation(cusparseHandle_t handle, IndexType size, - IndexType* permutation) GKO_NOT_IMPLEMENTED; - -template <> -inline void create_identity_permutation(cusparseHandle_t handle, - int32 size, int32* permutation) -{ - GKO_ASSERT_NO_CUSPARSE_ERRORS( - cusparseCreateIdentityPermutation(handle, size, permutation)); -} - - template void csrsort_buffer_size(cusparseHandle_t handle, IndexType m, IndexType n, IndexType nnz, const IndexType* row_ptrs, @@ -1264,6 +1253,7 @@ inline void gather(cusparseHandle_t handle, cusparseDnVecDescr_t in, #endif +GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS template void ilu0_buffer_size(cusparseHandle_t handle, IndexType m, IndexType nnz, const cusparseMatDescr_t descr, const ValueType* vals, @@ -1458,11 +1448,26 @@ GKO_BIND_CUSPARSE_IC0(float, cusparseScsric02); GKO_BIND_CUSPARSE_IC0(double, cusparseDcsric02); GKO_BIND_CUSPARSE_IC0(std::complex, cusparseCcsric02); GKO_BIND_CUSPARSE_IC0(std::complex, cusparseZcsric02); +GKO_END_DISABLE_DEPRECATION_WARNINGS #undef GKO_BIND_CUSPARSE_IC0 } // namespace cusparse + + +namespace sparselib { + + +using namespace cusparse; + + +#define SPARSELIB_OPERATION_TRANSPOSE CUSPARSE_OPERATION_TRANSPOSE +#define SPARSELIB_OPERATION_NON_TRANSPOSE 
CUSPARSE_OPERATION_NON_TRANSPOSE +#define SPARSELIB_SOLVE_POLICY_USE_LEVEL CUSPARSE_SOLVE_POLICY_USE_LEVEL + + +} // namespace sparselib } // namespace cuda } // namespace kernels } // namespace gko diff --git a/cuda/base/cusparse_block_bindings.hpp b/cuda/base/cusparse_block_bindings.hpp index eddf249a22b..c3db763f0da 100644 --- a/cuda/base/cusparse_block_bindings.hpp +++ b/cuda/base/cusparse_block_bindings.hpp @@ -13,8 +13,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/cusparse_bindings.hpp" -#include "cuda/base/types.hpp" namespace gko { @@ -190,6 +190,7 @@ GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32(std::complex, cusparseZgebsr2gebsc); #undef GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32 +GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS inline std::unique_ptr, std::function> create_bsr_trsm_info() @@ -457,6 +458,7 @@ GKO_BIND_CUSPARSE_BILU0(std::complex, cusparseCbsrilu02); GKO_BIND_CUSPARSE_BILU0(std::complex, cusparseZbsrilu02); #undef GKO_BIND_CUSPARSE_BILU0 +GKO_END_DISABLE_DEPRECATION_WARNINGS } // namespace cusparse diff --git a/cuda/base/device_matrix_data_kernels.cu b/cuda/base/device_matrix_data_kernels.cu index ed5601f57a5..554abe8bc37 100644 --- a/cuda/base/device_matrix_data_kernels.cu +++ b/cuda/base/device_matrix_data_kernels.cu @@ -14,8 +14,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" namespace gko { diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index 52a92132689..3d1dbf7c92c 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -20,7 +20,7 @@ #include -#include "cuda/base/config.hpp" +#include "common/cuda_hip/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" #include "cuda/base/cusparse_handle.hpp" #include "cuda/base/scoped_device_id.hpp" diff --git a/cuda/base/kernel_launch.cuh b/cuda/base/kernel_launch.cuh index ec8d31ba747..0d4bc4eebd5 100644 --- a/cuda/base/kernel_launch.cuh +++ b/cuda/base/kernel_launch.cuh @@ -11,8 +11,9 @@ 
#include -#include "accessor/cuda_helper.hpp" -#include "cuda/base/types.hpp" +#include "accessor/cuda_hip_helper.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" #include "cuda/components/thread_ids.cuh" @@ -23,21 +24,21 @@ namespace cuda { template struct to_device_type_impl&> { - using type = std::decay_t>()))>; static type map_to_device(gko::acc::range& range) { - return gko::acc::as_cuda_range(range); + return gko::acc::as_device_range(range); } }; template struct to_device_type_impl&> { - using type = std::decay_t>()))>; static type map_to_device(const gko::acc::range& range) { - return gko::acc::as_cuda_range(range); + return gko::acc::as_device_range(range); } }; diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index 6146d7248d0..817d19006bc 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -8,9 +8,9 @@ #endif +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/base/kernel_launch_solver.cuh b/cuda/base/kernel_launch_solver.cuh index 17988755517..0d9eaeb2653 100644 --- a/cuda/base/kernel_launch_solver.cuh +++ b/cuda/base/kernel_launch_solver.cuh @@ -8,6 +8,9 @@ #endif +#include "common/cuda_hip/base/runtime.hpp" + + namespace gko { namespace kernels { namespace cuda { diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 88e9eb17a35..561612f2869 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -394,6 +394,10 @@ GKO_INLINE GKO_ATTRIBUTES constexpr } +using deviceComplex = cuComplex; +using deviceDoubleComplex = cuDoubleComplex; + + } // namespace cuda } // namespace kernels } // namespace gko diff --git 
a/cuda/components/atomic.cuh b/cuda/components/atomic.cuh index 6dbed0b0d25..1964f0ae196 100644 --- a/cuda/components/atomic.cuh +++ b/cuda/components/atomic.cuh @@ -9,8 +9,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" namespace gko { @@ -21,38 +21,6 @@ namespace cuda { #include "common/cuda_hip/components/atomic.hpp.inc" -/** - * @internal - * - * @note It is not 'real' complex atomic add operation - */ -__forceinline__ __device__ thrust::complex atomic_add( - thrust::complex* __restrict__ address, thrust::complex val) -{ - cuComplex* addr = reinterpret_cast(address); - // Separate to real part and imag part - auto real = atomic_add(&(addr->x), val.real()); - auto imag = atomic_add(&(addr->y), val.imag()); - return {real, imag}; -} - - -/** - * @internal - * - * @note It is not 'real' complex atomic add operation - */ -__forceinline__ __device__ thrust::complex atomic_add( - thrust::complex* __restrict__ address, thrust::complex val) -{ - cuDoubleComplex* addr = reinterpret_cast(address); - // Separate to real part and imag part - auto real = atomic_add(&(addr->x), val.real()); - auto imag = atomic_add(&(addr->y), val.imag()); - return {real, imag}; -} - - } // namespace cuda } // namespace kernels } // namespace gko diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh index eae0c957f21..70643a3b16a 100644 --- a/cuda/components/cooperative_groups.cuh +++ b/cuda/components/cooperative_groups.cuh @@ -13,7 +13,7 @@ #include -#include "cuda/base/config.hpp" +#include "common/cuda_hip/base/config.hpp" namespace gko { diff --git a/cuda/components/diagonal_block_manipulation.cuh b/cuda/components/diagonal_block_manipulation.cuh index d748fcab2e5..a8f27d3a81f 100644 --- a/cuda/components/diagonal_block_manipulation.cuh +++ b/cuda/components/diagonal_block_manipulation.cuh @@ -9,9 +9,9 @@ #include -#include "cuda/base/config.hpp" -#include "cuda/base/types.hpp" 
-#include "cuda/components/cooperative_groups.cuh" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" namespace gko { diff --git a/cuda/components/format_conversion.cuh b/cuda/components/format_conversion.cuh index bccc927c9cd..f0ef007c53c 100644 --- a/cuda/components/format_conversion.cuh +++ b/cuda/components/format_conversion.cuh @@ -10,7 +10,7 @@ #include -#include "cuda/components/cooperative_groups.cuh" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh index 22bedca9699..97e5d67c23a 100644 --- a/cuda/components/memory.cuh +++ b/cuda/components/memory.cuh @@ -12,7 +12,7 @@ #include -#include "cuda/base/types.hpp" +#include "common/cuda_hip/base/types.hpp" namespace gko { diff --git a/cuda/components/prefix_sum.cuh b/cuda/components/prefix_sum.cuh index 653de4e9e15..2f6f145e304 100644 --- a/cuda/components/prefix_sum.cuh +++ b/cuda/components/prefix_sum.cuh @@ -9,8 +9,8 @@ #include -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/components/cooperative_groups.cuh" +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh index ded80fae40a..250c560d44b 100644 --- a/cuda/components/reduction.cuh +++ b/cuda/components/reduction.cuh @@ -13,10 +13,11 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/array_access.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include 
"cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" diff --git a/cuda/components/searching.cuh b/cuda/components/searching.cuh index 1dc1304a82a..5472ac46ed1 100644 --- a/cuda/components/searching.cuh +++ b/cuda/components/searching.cuh @@ -6,7 +6,7 @@ #define GKO_CUDA_COMPONENTS_SEARCHING_CUH_ -#include "cuda/base/config.hpp" +#include "common/cuda_hip/base/config.hpp" #include "cuda/components/intrinsics.cuh" diff --git a/cuda/components/segment_scan.cuh b/cuda/components/segment_scan.cuh index 842f1e06760..6ffb8028334 100644 --- a/cuda/components/segment_scan.cuh +++ b/cuda/components/segment_scan.cuh @@ -6,7 +6,7 @@ #define GKO_CUDA_COMPONENTS_SEGMENT_SCAN_CUH_ -#include "cuda/components/cooperative_groups.cuh" +#include "common/cuda_hip/components/cooperative_groups.hpp" namespace gko { diff --git a/cuda/components/sorting.cuh b/cuda/components/sorting.cuh index e6eb17ec8e4..59e44d1bb82 100644 --- a/cuda/components/sorting.cuh +++ b/cuda/components/sorting.cuh @@ -6,8 +6,8 @@ #define GKO_CUDA_COMPONENTS_SORTING_CUH_ -#include "cuda/base/config.hpp" -#include "cuda/components/cooperative_groups.cuh" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" namespace gko { diff --git a/cuda/components/syncfree.cuh b/cuda/components/syncfree.cuh index 0d45c8db516..0d5c0d11f43 100644 --- a/cuda/components/syncfree.cuh +++ b/cuda/components/syncfree.cuh @@ -9,11 +9,11 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "core/components/fill_array_kernels.hpp" -#include "cuda/base/config.hpp" #include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/memory.cuh" namespace gko { diff --git a/cuda/components/thread_ids.cuh b/cuda/components/thread_ids.cuh index c3e517e0f9d..1113ea75fc6 100644 --- 
a/cuda/components/thread_ids.cuh +++ b/cuda/components/thread_ids.cuh @@ -6,17 +6,12 @@ #define GKO_CUDA_COMPONENTS_THREAD_IDS_CUH_ -#include "cuda/base/config.hpp" +#include "common/cuda_hip/base/config.hpp" namespace gko { namespace kernels { namespace cuda { -/** - * @brief The CUDA thread namespace. - * - * @ingroup cuda_thread - */ namespace thread { diff --git a/cuda/factorization/cholesky_kernels.cu b/cuda/factorization/cholesky_kernels.cu index 79779f2f54b..e05b0803dc2 100644 --- a/cuda/factorization/cholesky_kernels.cu +++ b/cuda/factorization/cholesky_kernels.cu @@ -20,15 +20,15 @@ #include +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" #include "core/factorization/elimination_forest.hpp" #include "core/factorization/lu_kernels.hpp" #include "core/matrix/csr_lookup.hpp" -#include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/syncfree.cuh" @@ -80,19 +80,19 @@ void symbolic_count(std::shared_ptr exec, } // sort postorder_cols inside rows { - const auto handle = exec->get_cusparse_handle(); - auto descr = cusparse::create_mat_descr(); + const auto handle = exec->get_sparselib_handle(); + auto descr = sparselib::create_mat_descr(); array permutation_array(exec, mtx_nnz); auto permutation = permutation_array.get_data(); components::fill_seq_array(exec, permutation, mtx_nnz); size_type buffer_size{}; - cusparse::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz, - row_ptrs, postorder_cols, buffer_size); + sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz, + row_ptrs, postorder_cols, buffer_size); array buffer_array{exec, buffer_size}; auto buffer = 
buffer_array.get_data(); - cusparse::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs, - postorder_cols, permutation, buffer); - cusparse::destroy(descr); + sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs, + postorder_cols, permutation, buffer); + sparselib::destroy(descr); } // count nonzeros per row of L { diff --git a/cuda/factorization/factorization_kernels.cu b/cuda/factorization/factorization_kernels.cu index 4ea03981a15..309ded37d34 100644 --- a/cuda/factorization/factorization_kernels.cu +++ b/cuda/factorization/factorization_kernels.cu @@ -8,12 +8,13 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/array_access.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/searching.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/factorization/ic_kernels.cu b/cuda/factorization/ic_kernels.cu index 1afb10ce57a..9d55856f139 100644 --- a/cuda/factorization/ic_kernels.cu +++ b/cuda/factorization/ic_kernels.cu @@ -8,7 +8,7 @@ #include -#include "cuda/base/cusparse_bindings.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" namespace gko { @@ -27,37 +27,37 @@ void compute(std::shared_ptr exec, matrix::Csr* m) { const auto id = exec->get_device_id(); - auto handle = exec->get_cusparse_handle(); - auto desc = cusparse::create_mat_descr(); - auto info = cusparse::create_ic0_info(); + auto handle = exec->get_sparselib_handle(); + auto desc = sparselib::create_mat_descr(); + auto info = sparselib::create_ic0_info(); // get buffer size for IC IndexType num_rows = m->get_size()[0]; IndexType nnz = 
m->get_num_stored_elements(); size_type buffer_size{}; - cusparse::ic0_buffer_size(handle, num_rows, nnz, desc, - m->get_const_values(), m->get_const_row_ptrs(), - m->get_const_col_idxs(), info, buffer_size); + sparselib::ic0_buffer_size(handle, num_rows, nnz, desc, + m->get_const_values(), m->get_const_row_ptrs(), + m->get_const_col_idxs(), info, buffer_size); array buffer{exec, buffer_size}; // set up IC(0) - cusparse::ic0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), - m->get_const_row_ptrs(), m->get_const_col_idxs(), - info, CUSPARSE_SOLVE_POLICY_USE_LEVEL, - buffer.get_data()); + sparselib::ic0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), + m->get_const_row_ptrs(), m->get_const_col_idxs(), + info, SPARSELIB_SOLVE_POLICY_USE_LEVEL, + buffer.get_data()); - cusparse::ic0(handle, num_rows, nnz, desc, m->get_values(), - m->get_const_row_ptrs(), m->get_const_col_idxs(), info, - CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); + sparselib::ic0(handle, num_rows, nnz, desc, m->get_values(), + m->get_const_row_ptrs(), m->get_const_col_idxs(), info, + SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); // CUDA 11.4 has a use-after-free bug on Turing #if (CUDA_VERSION >= 11040) exec->synchronize(); #endif - cusparse::destroy(info); - cusparse::destroy(desc); + sparselib::destroy_ic0_info(info); + sparselib::destroy(desc); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL); diff --git a/cuda/factorization/ilu_kernels.cu b/cuda/factorization/ilu_kernels.cu index 33e59bb54c9..acebec6e94c 100644 --- a/cuda/factorization/ilu_kernels.cu +++ b/cuda/factorization/ilu_kernels.cu @@ -8,7 +8,7 @@ #include -#include "cuda/base/cusparse_bindings.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" namespace gko { @@ -27,37 +27,37 @@ void compute_lu(std::shared_ptr exec, matrix::Csr* m) { const auto id = exec->get_device_id(); - auto handle = exec->get_cusparse_handle(); - auto desc = cusparse::create_mat_descr(); 
- auto info = cusparse::create_ilu0_info(); + auto handle = exec->get_sparselib_handle(); + auto desc = sparselib::create_mat_descr(); + auto info = sparselib::create_ilu0_info(); // get buffer size for ILU IndexType num_rows = m->get_size()[0]; IndexType nnz = m->get_num_stored_elements(); size_type buffer_size{}; - cusparse::ilu0_buffer_size(handle, num_rows, nnz, desc, - m->get_const_values(), m->get_const_row_ptrs(), - m->get_const_col_idxs(), info, buffer_size); + sparselib::ilu0_buffer_size(handle, num_rows, nnz, desc, + m->get_const_values(), m->get_const_row_ptrs(), + m->get_const_col_idxs(), info, buffer_size); array buffer{exec, buffer_size}; // set up ILU(0) - cusparse::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), - m->get_const_row_ptrs(), m->get_const_col_idxs(), - info, CUSPARSE_SOLVE_POLICY_USE_LEVEL, - buffer.get_data()); + sparselib::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), + m->get_const_row_ptrs(), m->get_const_col_idxs(), + info, SPARSELIB_SOLVE_POLICY_USE_LEVEL, + buffer.get_data()); - cusparse::ilu0(handle, num_rows, nnz, desc, m->get_values(), - m->get_const_row_ptrs(), m->get_const_col_idxs(), info, - CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); + sparselib::ilu0(handle, num_rows, nnz, desc, m->get_values(), + m->get_const_row_ptrs(), m->get_const_col_idxs(), info, + SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); // CUDA 11.4 has a use-after-free bug on Turing #if (CUDA_VERSION >= 11040) exec->synchronize(); #endif - cusparse::destroy(info); - cusparse::destroy(desc); + sparselib::destroy_ilu0_info(info); + sparselib::destroy(desc); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/factorization/lu_kernels.cu b/cuda/factorization/lu_kernels.cu index 583bf51fb67..9c3069f62cf 100644 --- a/cuda/factorization/lu_kernels.cu +++ b/cuda/factorization/lu_kernels.cu @@ -17,11 +17,11 @@ #include +#include "common/cuda_hip/base/types.hpp" +#include 
"common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/allocator.hpp" #include "core/matrix/csr_lookup.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/syncfree.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/factorization/par_ic_kernels.cu b/cuda/factorization/par_ic_kernels.cu index a9de634f1f9..f493cb11fd1 100644 --- a/cuda/factorization/par_ic_kernels.cu +++ b/cuda/factorization/par_ic_kernels.cu @@ -10,9 +10,9 @@ #include +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/memory.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu index 5f48ceef2f8..d958f81d2f4 100644 --- a/cuda/factorization/par_ict_kernels.cu +++ b/cuda/factorization/par_ict_kernels.cu @@ -12,6 +12,8 @@ #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" @@ -19,7 +21,6 @@ #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/math.hpp" #include "cuda/components/intrinsics.cuh" -#include "cuda/components/memory.cuh" #include "cuda/components/merging.cuh" #include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" @@ -46,8 +47,7 @@ using compiled_kernels = syn::value_list; -#include "common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc" -#include "common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc" +#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc" namespace { diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu index 
7a770a39353..755723e7d4c 100644 --- a/cuda/factorization/par_ilu_kernels.cu +++ b/cuda/factorization/par_ilu_kernels.cu @@ -8,9 +8,10 @@ #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/memory.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/factorization/par_ilut_approx_filter_kernel.cu b/cuda/factorization/par_ilut_approx_filter_kernels.cu similarity index 97% rename from cuda/factorization/par_ilut_approx_filter_kernel.cu rename to cuda/factorization/par_ilut_approx_filter_kernels.cu index 853519cd36b..ae544939e17 100644 --- a/cuda/factorization/par_ilut_approx_filter_kernel.cu +++ b/cuda/factorization/par_ilut_approx_filter_kernels.cu @@ -15,16 +15,16 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/prefix_sum.cuh" #include "cuda/components/sorting.cuh" diff --git a/cuda/factorization/par_ilut_filter_kernel.cu b/cuda/factorization/par_ilut_filter_kernels.cu similarity index 96% rename from cuda/factorization/par_ilut_filter_kernel.cu rename to cuda/factorization/par_ilut_filter_kernels.cu index 0e63f102b72..4a24c5f305b 100644 --- a/cuda/factorization/par_ilut_filter_kernel.cu +++ b/cuda/factorization/par_ilut_filter_kernels.cu @@ -12,15 +12,16 @@ #include +#include 
"common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/factorization/par_ilut_select_kernel.cu b/cuda/factorization/par_ilut_select_kernels.cu similarity index 98% rename from cuda/factorization/par_ilut_select_kernel.cu rename to cuda/factorization/par_ilut_select_kernels.cu index ca8b55e504b..6a7bd53c1c4 100644 --- a/cuda/factorization/par_ilut_select_kernel.cu +++ b/cuda/factorization/par_ilut_select_kernels.cu @@ -13,6 +13,7 @@ #include +#include "common/cuda_hip/base/runtime.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "cuda/base/math.hpp" #include "cuda/components/atomic.cuh" @@ -147,7 +148,7 @@ void threshold_select(std::shared_ptr exec, auto out_ptr = reinterpret_cast(tmp1.get_data()); kernel::basecase_select<<<1, kernel::basecase_block_size, 0, exec->get_stream()>>>( - as_cuda_type(tmp22), bucket.size, rank, as_cuda_type(out_ptr)); + as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr)); threshold = exec->copy_val_to_host(out_ptr); } diff --git a/cuda/factorization/par_ilut_spgeam_kernel.cu b/cuda/factorization/par_ilut_spgeam_kernels.cu similarity index 97% rename from cuda/factorization/par_ilut_spgeam_kernel.cu rename to cuda/factorization/par_ilut_spgeam_kernels.cu index c4372f66219..0a751c2f48f 100644 --- a/cuda/factorization/par_ilut_spgeam_kernel.cu +++ b/cuda/factorization/par_ilut_spgeam_kernels.cu @@ -12,13 +12,14 @@ #include 
+#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/math.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/merging.cuh" #include "cuda/components/prefix_sum.cuh" @@ -80,8 +81,8 @@ void add_candidates(syn::value_list, auto u_vals = u->get_const_values(); auto l_new_row_ptrs = l_new->get_row_ptrs(); auto u_new_row_ptrs = u_new->get_row_ptrs(); - // count non-zeros per row if (num_blocks > 0) { + // count non-zeros per row kernel::tri_spgeam_nnz <<get_stream()>>>( lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs, @@ -105,8 +106,8 @@ void add_candidates(syn::value_list, auto u_new_col_idxs = u_new->get_col_idxs(); auto u_new_vals = u_new->get_values(); - // fill columns and values if (num_blocks > 0) { + // fill columns and values kernel::tri_spgeam_init <<get_stream()>>>( lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs, diff --git a/cuda/factorization/par_ilut_sweep_kernel.cu b/cuda/factorization/par_ilut_sweep_kernels.cu similarity index 97% rename from cuda/factorization/par_ilut_sweep_kernel.cu rename to cuda/factorization/par_ilut_sweep_kernels.cu index 85fb3f26e21..5924ebe328d 100644 --- a/cuda/factorization/par_ilut_sweep_kernel.cu +++ b/cuda/factorization/par_ilut_sweep_kernels.cu @@ -12,6 +12,8 @@ #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" @@ -19,7 +21,6 @@ #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/math.hpp" #include "cuda/components/intrinsics.cuh" -#include 
"cuda/components/memory.cuh" #include "cuda/components/merging.cuh" #include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" diff --git a/cuda/log/batch_logger.cuh b/cuda/log/batch_logger.cuh index 26c60ae78eb..3e53d6ef0a6 100644 --- a/cuda/log/batch_logger.cuh +++ b/cuda/log/batch_logger.cuh @@ -23,4 +23,5 @@ namespace batch_log { } // namespace kernels } // namespace gko + #endif // GKO_CUDA_LOG_BATCH_LOGGER_CUH_ diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu index 6be0a2cab3b..6ec20480405 100644 --- a/cuda/matrix/batch_csr_kernels.cu +++ b/cuda/matrix/batch_csr_kernels.cu @@ -13,12 +13,13 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu index 56268d8d6b4..673b08e5db1 100644 --- a/cuda/matrix/batch_dense_kernels.cu +++ b/cuda/matrix/batch_dense_kernels.cu @@ -9,15 +9,17 @@ #include +#include #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" diff --git 
a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu index 3c824cf8da4..8f0160bd154 100644 --- a/cuda/matrix/batch_ell_kernels.cu +++ b/cuda/matrix/batch_ell_kernels.cu @@ -13,12 +13,13 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp index 1c17aea3bfe..5e9c803c9f6 100644 --- a/cuda/matrix/batch_struct.hpp +++ b/cuda/matrix/batch_struct.hpp @@ -13,8 +13,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "core/base/batch_struct.hpp" -#include "cuda/base/types.hpp" namespace gko { diff --git a/cuda/matrix/coo_kernels.cu b/cuda/matrix/coo_kernels.cu index 3d67144c9ec..f138d0b934e 100644 --- a/cuda/matrix/coo_kernels.cu +++ b/cuda/matrix/coo_kernels.cu @@ -12,25 +12,21 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" #include "core/matrix/dense_kernels.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/format_conversion.cuh" #include "cuda/components/segment_scan.cuh" #include "cuda/components/thread_ids.cuh" namespace 
gko { namespace kernels { -/** - * @brief The CUDA namespace. - * - * @ingroup cuda - */ namespace cuda { /** * @brief The Coordinate matrix format namespace. diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu index 4a779775670..a0a7e4e97b8 100644 --- a/cuda/matrix/csr_kernels.template.cu +++ b/cuda/matrix/csr_kernels.template.cu @@ -27,7 +27,13 @@ #include -#include "accessor/cuda_helper.hpp" +#include "accessor/cuda_hip_helper.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" #include "core/base/array_access.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" @@ -38,15 +44,9 @@ #include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/pointer_mode_guard.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/format_conversion.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/merging.cuh" #include "cuda/components/prefix_sum.cuh" @@ -133,10 +133,11 @@ void merge_path_spmv(syn::value_list, kernel::abstract_merge_path_spmv <<get_stream()>>>( static_cast(a->get_size()[0]), - acc::as_cuda_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), as_device_type(a->get_const_srow()), - acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals), + acc::as_device_range(b_vals), + 
acc::as_device_range(c_vals), as_device_type(row_out.get_data()), as_device_type(val_out.get_data())); } @@ -144,7 +145,7 @@ void merge_path_spmv(syn::value_list, abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>( grid_num, as_device_type(val_out.get_data()), as_device_type(row_out.get_data()), - acc::as_cuda_range(c_vals)); + acc::as_device_range(c_vals)); } else if (alpha != nullptr && beta != nullptr) { if (grid_num > 0) { @@ -152,12 +153,12 @@ void merge_path_spmv(syn::value_list, <<get_stream()>>>( static_cast(a->get_size()[0]), as_device_type(alpha->get_const_values()), - acc::as_cuda_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), as_device_type(a->get_const_srow()), - acc::as_cuda_range(b_vals), + acc::as_device_range(b_vals), as_device_type(beta->get_const_values()), - acc::as_cuda_range(c_vals), + acc::as_device_range(c_vals), as_device_type(row_out.get_data()), as_device_type(val_out.get_data())); } @@ -166,7 +167,7 @@ void merge_path_spmv(syn::value_list, grid_num, as_device_type(val_out.get_data()), as_device_type(row_out.get_data()), as_device_type(alpha->get_const_values()), - acc::as_cuda_range(c_vals)); + acc::as_device_range(c_vals)); } else { GKO_KERNEL_NOT_FOUND; } @@ -245,21 +246,21 @@ void classical_spmv(syn::value_list, if (grid.x > 0 && grid.y > 0) { kernel::abstract_classical_spmv <<get_stream()>>>( - a->get_size()[0], acc::as_cuda_range(a_vals), + a->get_size()[0], acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), - acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals)); + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); } } else if (alpha != nullptr && beta != nullptr) { if (grid.x > 0 && grid.y > 0) { kernel::abstract_classical_spmv <<get_stream()>>>( a->get_size()[0], as_device_type(alpha->get_const_values()), - acc::as_cuda_range(a_vals), a->get_const_col_idxs(), + 
acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), - acc::as_cuda_range(b_vals), + acc::as_device_range(b_vals), as_device_type(beta->get_const_values()), - acc::as_cuda_range(c_vals)); + acc::as_device_range(c_vals)); } } else { GKO_KERNEL_NOT_FOUND; @@ -301,20 +302,20 @@ void load_balance_spmv(std::shared_ptr exec, exec->get_stream()>>>( nwarps, static_cast(a->get_size()[0]), as_device_type(alpha->get_const_values()), - acc::as_cuda_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), as_device_type(a->get_const_srow()), - acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals)); + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); } } else { if (csr_grid.x > 0 && csr_grid.y > 0) { kernel::abstract_spmv<<get_stream()>>>( nwarps, static_cast(a->get_size()[0]), - acc::as_cuda_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), as_device_type(a->get_const_srow()), - acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals)); + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); } } } @@ -329,55 +330,55 @@ bool try_general_sparselib_spmv(std::shared_ptr exec, const ValueType* beta, matrix::Dense* c) { - auto handle = exec->get_cusparse_handle(); + auto handle = exec->get_sparselib_handle(); #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - if (!cusparse::is_supported::value || + if (!sparselib::is_supported::value || b->get_stride() != 1 || c->get_stride() != 1 || b->get_size()[0] == 0 || c->get_size()[0] == 0) { return false; } - auto descr = cusparse::create_mat_descr(); + auto descr = sparselib::create_mat_descr(); auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - cusparse::spmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, a->get_size()[0], - a->get_size()[1], a->get_num_stored_elements(), alpha, descr, 
- a->get_const_values(), row_ptrs, col_idxs, - b->get_const_values(), beta, c->get_values()); + sparselib::spmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, a->get_size()[0], + a->get_size()[1], a->get_num_stored_elements(), alpha, + descr, a->get_const_values(), row_ptrs, col_idxs, + b->get_const_values(), beta, c->get_values()); - cusparse::destroy(descr); + sparselib::destroy(descr); #else // CUDA_VERSION >= 11000 // workaround for a division by zero in cuSPARSE 11.? if (a->get_size()[1] == 0) { return false; } - cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseOperation_t trans = SPARSELIB_OPERATION_NON_TRANSPOSE; auto row_ptrs = const_cast(a->get_const_row_ptrs()); auto col_idxs = const_cast(a->get_const_col_idxs()); auto values = const_cast(a->get_const_values()); - auto mat = cusparse::create_csr(a->get_size()[0], a->get_size()[1], - a->get_num_stored_elements(), row_ptrs, - col_idxs, values); + auto mat = sparselib::create_csr(a->get_size()[0], a->get_size()[1], + a->get_num_stored_elements(), row_ptrs, + col_idxs, values); auto b_val = const_cast(b->get_const_values()); auto c_val = c->get_values(); if (b->get_stride() == 1 && c->get_stride() == 1) { - auto vecb = cusparse::create_dnvec(b->get_size()[0], b_val); - auto vecc = cusparse::create_dnvec(c->get_size()[0], c_val); + auto vecb = sparselib::create_dnvec(b->get_size()[0], b_val); + auto vecc = sparselib::create_dnvec(c->get_size()[0], c_val); #if CUDA_VERSION >= 11021 constexpr auto alg = CUSPARSE_SPMV_CSR_ALG1; #else constexpr auto alg = CUSPARSE_CSRMV_ALG1; #endif size_type buffer_size = 0; - cusparse::spmv_buffersize(handle, trans, alpha, mat, vecb, - beta, vecc, alg, &buffer_size); + sparselib::spmv_buffersize(handle, trans, alpha, mat, vecb, + beta, vecc, alg, &buffer_size); array buffer_array(exec, buffer_size); auto buffer = buffer_array.get_data(); - cusparse::spmv(handle, trans, alpha, mat, vecb, beta, vecc, - alg, buffer); - cusparse::destroy(vecb); - 
cusparse::destroy(vecc); + sparselib::spmv(handle, trans, alpha, mat, vecb, beta, vecc, + alg, buffer); + sparselib::destroy(vecb); + sparselib::destroy(vecc); } else { #if CUDA_VERSION >= 11060 if (b->get_size()[1] == 1) { @@ -388,22 +389,22 @@ bool try_general_sparselib_spmv(std::shared_ptr exec, #endif // CUDA_VERSION >= 11060 cusparseSpMMAlg_t alg = CUSPARSE_SPMM_CSR_ALG2; auto vecb = - cusparse::create_dnmat(b->get_size(), b->get_stride(), b_val); + sparselib::create_dnmat(b->get_size(), b->get_stride(), b_val); auto vecc = - cusparse::create_dnmat(c->get_size(), c->get_stride(), c_val); + sparselib::create_dnmat(c->get_size(), c->get_stride(), c_val); size_type buffer_size = 0; - cusparse::spmm_buffersize(handle, trans, trans, alpha, mat, - vecb, beta, vecc, alg, - &buffer_size); + sparselib::spmm_buffersize(handle, trans, trans, alpha, mat, + vecb, beta, vecc, alg, + &buffer_size); array buffer_array(exec, buffer_size); auto buffer = buffer_array.get_data(); - cusparse::spmm(handle, trans, trans, alpha, mat, vecb, beta, - vecc, alg, buffer); - cusparse::destroy(vecb); - cusparse::destroy(vecc); + sparselib::spmm(handle, trans, trans, alpha, mat, vecb, beta, + vecc, alg, buffer); + sparselib::destroy(vecb); + sparselib::destroy(vecc); } - cusparse::destroy(mat); + sparselib::destroy(mat); #endif return true; } @@ -437,8 +438,8 @@ bool try_sparselib_spmv(std::shared_ptr exec, return try_general_sparselib_spmv(exec, alpha->get_const_values(), a, b, beta->get_const_values(), c); } else { - auto handle = exec->get_cusparse_handle(); - cusparse::pointer_mode_guard pm_guard(handle); + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); const auto valpha = one(); const auto vbeta = zero(); return try_general_sparselib_spmv(exec, &valpha, a, b, &vbeta, c); @@ -583,8 +584,8 @@ void spgemm(std::shared_ptr exec, auto b_col_idxs = b->get_const_col_idxs(); auto c_row_ptrs = c->get_row_ptrs(); - auto handle = 
exec->get_cusparse_handle(); - cusparse::pointer_mode_guard pm_guard(handle); + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); auto alpha = one(); auto a_nnz = static_cast(a->get_num_stored_elements()); @@ -600,18 +601,18 @@ void spgemm(std::shared_ptr exec, auto& c_vals_array = c_builder.get_value_array(); #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - if (!cusparse::is_supported::value) { + if (!sparselib::is_supported::value) { GKO_NOT_IMPLEMENTED; } - auto a_descr = cusparse::create_mat_descr(); - auto b_descr = cusparse::create_mat_descr(); - auto c_descr = cusparse::create_mat_descr(); - auto d_descr = cusparse::create_mat_descr(); - auto info = cusparse::create_spgemm_info(); + auto a_descr = sparselib::create_mat_descr(); + auto b_descr = sparselib::create_mat_descr(); + auto c_descr = sparselib::create_mat_descr(); + auto d_descr = sparselib::create_mat_descr(); + auto info = sparselib::create_spgemm_info(); // allocate buffer size_type buffer_size{}; - cusparse::spgemm_buffer_size( + sparselib::spgemm_buffer_size( handle, m, n, k, &alpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz, null_index, null_index, info, buffer_size); @@ -620,74 +621,75 @@ void spgemm(std::shared_ptr exec, // count nnz IndexType c_nnz{}; - cusparse::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs, - a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, - d_descr, zero_nnz, null_index, null_index, c_descr, - c_row_ptrs, &c_nnz, info, buffer); + sparselib::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs, + a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, + d_descr, zero_nnz, null_index, null_index, c_descr, + c_row_ptrs, &c_nnz, info, buffer); // accumulate non-zeros c_col_idxs_array.resize_and_reset(c_nnz); c_vals_array.resize_and_reset(c_nnz); auto c_col_idxs = c_col_idxs_array.get_data(); auto c_vals = c_vals_array.get_data(); - 
cusparse::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals, - a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, b_row_ptrs, - b_col_idxs, null_value, d_descr, zero_nnz, null_value, - null_index, null_index, c_descr, c_vals, c_row_ptrs, - c_col_idxs, info, buffer); - - cusparse::destroy(info); - cusparse::destroy(d_descr); - cusparse::destroy(c_descr); - cusparse::destroy(b_descr); - cusparse::destroy(a_descr); + sparselib::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals, + a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, + b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz, + null_value, null_index, null_index, c_descr, c_vals, + c_row_ptrs, c_col_idxs, info, buffer); + + sparselib::destroy(info); + sparselib::destroy(d_descr); + sparselib::destroy(c_descr); + sparselib::destroy(b_descr); + sparselib::destroy(a_descr); #else // CUDA_VERSION >= 11000 const auto beta = zero(); - auto spgemm_descr = cusparse::create_spgemm_descr(); - auto a_descr = cusparse::create_csr( + auto spgemm_descr = sparselib::create_spgemm_descr(); + auto a_descr = sparselib::create_csr( m, k, a_nnz, const_cast(a_row_ptrs), const_cast(a_col_idxs), const_cast(a_vals)); - auto b_descr = cusparse::create_csr( + auto b_descr = sparselib::create_csr( k, n, b_nnz, const_cast(b_row_ptrs), const_cast(b_col_idxs), const_cast(b_vals)); - auto c_descr = cusparse::create_csr(m, n, zero_nnz, null_index, null_index, - null_value); + auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index, + null_value); // estimate work size_type buffer1_size{}; - cusparse::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta, - c_descr, spgemm_descr, buffer1_size, - nullptr); + sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta, + c_descr, spgemm_descr, buffer1_size, + nullptr); array buffer1{exec, buffer1_size}; - cusparse::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta, - c_descr, spgemm_descr, buffer1_size, - buffer1.get_data()); + 
sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta, + c_descr, spgemm_descr, buffer1_size, + buffer1.get_data()); // compute spgemm size_type buffer2_size{}; - cusparse::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr, - spgemm_descr, buffer1.get_data(), buffer2_size, - nullptr); + sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr, + spgemm_descr, buffer1.get_data(), buffer2_size, + nullptr); array buffer2{exec, buffer2_size}; - cusparse::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr, - spgemm_descr, buffer1.get_data(), buffer2_size, - buffer2.get_data()); + sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr, + spgemm_descr, buffer1.get_data(), buffer2_size, + buffer2.get_data()); // copy data to result - auto c_nnz = cusparse::sparse_matrix_nnz(c_descr); + auto c_nnz = sparselib::sparse_matrix_nnz(c_descr); c_col_idxs_array.resize_and_reset(c_nnz); c_vals_array.resize_and_reset(c_nnz); - cusparse::csr_set_pointers(c_descr, c_row_ptrs, c_col_idxs_array.get_data(), - c_vals_array.get_data()); + sparselib::csr_set_pointers(c_descr, c_row_ptrs, + c_col_idxs_array.get_data(), + c_vals_array.get_data()); - cusparse::spgemm_copy(handle, &alpha, a_descr, b_descr, &beta, c_descr, - spgemm_descr); + sparselib::spgemm_copy(handle, &alpha, a_descr, b_descr, &beta, c_descr, + spgemm_descr); - cusparse::destroy(c_descr); - cusparse::destroy(b_descr); - cusparse::destroy(a_descr); - cusparse::destroy(spgemm_descr); + sparselib::destroy(c_descr); + sparselib::destroy(b_descr); + sparselib::destroy(a_descr); + sparselib::destroy(spgemm_descr); #endif // CUDA_VERSION >= 11000 } @@ -701,8 +703,8 @@ void advanced_spgemm(std::shared_ptr exec, const matrix::Csr* d, matrix::Csr* c) { - auto handle = exec->get_cusparse_handle(); - cusparse::pointer_mode_guard pm_guard(handle); + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); auto valpha 
= exec->copy_val_to_host(alpha->get_const_values()); auto a_nnz = IndexType(a->get_num_stored_elements()); @@ -724,102 +726,102 @@ void advanced_spgemm(std::shared_ptr exec, auto c_row_ptrs = c->get_row_ptrs(); #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - if (!cusparse::is_supported::value) { + if (!sparselib::is_supported::value) { GKO_NOT_IMPLEMENTED; } matrix::CsrBuilder c_builder{c}; auto& c_col_idxs_array = c_builder.get_col_idx_array(); auto& c_vals_array = c_builder.get_value_array(); - auto a_descr = cusparse::create_mat_descr(); - auto b_descr = cusparse::create_mat_descr(); - auto c_descr = cusparse::create_mat_descr(); - auto d_descr = cusparse::create_mat_descr(); - auto info = cusparse::create_spgemm_info(); + auto a_descr = sparselib::create_mat_descr(); + auto b_descr = sparselib::create_mat_descr(); + auto c_descr = sparselib::create_mat_descr(); + auto d_descr = sparselib::create_mat_descr(); + auto info = sparselib::create_spgemm_info(); // allocate buffer size_type buffer_size{}; - cusparse::spgemm_buffer_size(handle, m, n, k, &valpha, a_descr, a_nnz, - a_row_ptrs, a_col_idxs, b_descr, b_nnz, - b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz, - d_row_ptrs, d_col_idxs, info, buffer_size); + sparselib::spgemm_buffer_size( + handle, m, n, k, &valpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs, + b_descr, b_nnz, b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz, + d_row_ptrs, d_col_idxs, info, buffer_size); array buffer_array(exec, buffer_size); auto buffer = buffer_array.get_data(); // count nnz IndexType c_nnz{}; - cusparse::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs, - a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, - d_descr, d_nnz, d_row_ptrs, d_col_idxs, c_descr, - c_row_ptrs, &c_nnz, info, buffer); + sparselib::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs, + a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, + d_descr, d_nnz, d_row_ptrs, d_col_idxs, c_descr, + c_row_ptrs, &c_nnz, info, buffer); // accumulate 
non-zeros c_col_idxs_array.resize_and_reset(c_nnz); c_vals_array.resize_and_reset(c_nnz); auto c_col_idxs = c_col_idxs_array.get_data(); auto c_vals = c_vals_array.get_data(); - cusparse::spgemm(handle, m, n, k, &valpha, a_descr, a_nnz, a_vals, - a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, b_row_ptrs, - b_col_idxs, &vbeta, d_descr, d_nnz, d_vals, d_row_ptrs, - d_col_idxs, c_descr, c_vals, c_row_ptrs, c_col_idxs, info, - buffer); - - cusparse::destroy(info); - cusparse::destroy(d_descr); - cusparse::destroy(c_descr); - cusparse::destroy(b_descr); - cusparse::destroy(a_descr); + sparselib::spgemm(handle, m, n, k, &valpha, a_descr, a_nnz, a_vals, + a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, + b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz, d_vals, + d_row_ptrs, d_col_idxs, c_descr, c_vals, c_row_ptrs, + c_col_idxs, info, buffer); + + sparselib::destroy(info); + sparselib::destroy(d_descr); + sparselib::destroy(c_descr); + sparselib::destroy(b_descr); + sparselib::destroy(a_descr); #else // CUDA_VERSION >= 11000 auto null_value = static_cast(nullptr); auto null_index = static_cast(nullptr); auto one_val = one(); auto zero_val = zero(); auto zero_nnz = IndexType{}; - auto spgemm_descr = cusparse::create_spgemm_descr(); - auto a_descr = cusparse::create_csr( + auto spgemm_descr = sparselib::create_spgemm_descr(); + auto a_descr = sparselib::create_csr( m, k, a_nnz, const_cast(a_row_ptrs), const_cast(a_col_idxs), const_cast(a_vals)); - auto b_descr = cusparse::create_csr( + auto b_descr = sparselib::create_csr( k, n, b_nnz, const_cast(b_row_ptrs), const_cast(b_col_idxs), const_cast(b_vals)); - auto c_descr = cusparse::create_csr(m, n, zero_nnz, null_index, null_index, - null_value); + auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index, + null_value); // estimate work size_type buffer1_size{}; - cusparse::spgemm_work_estimation(handle, &one_val, a_descr, b_descr, - &zero_val, c_descr, spgemm_descr, - buffer1_size, nullptr); + 
sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr, + &zero_val, c_descr, spgemm_descr, + buffer1_size, nullptr); array buffer1{exec, buffer1_size}; - cusparse::spgemm_work_estimation(handle, &one_val, a_descr, b_descr, - &zero_val, c_descr, spgemm_descr, - buffer1_size, buffer1.get_data()); + sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr, + &zero_val, c_descr, spgemm_descr, + buffer1_size, buffer1.get_data()); // compute spgemm size_type buffer2_size{}; - cusparse::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val, - c_descr, spgemm_descr, buffer1.get_data(), - buffer2_size, nullptr); + sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val, + c_descr, spgemm_descr, buffer1.get_data(), + buffer2_size, nullptr); array buffer2{exec, buffer2_size}; - cusparse::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val, - c_descr, spgemm_descr, buffer1.get_data(), - buffer2_size, buffer2.get_data()); + sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val, + c_descr, spgemm_descr, buffer1.get_data(), + buffer2_size, buffer2.get_data()); // write result to temporary storage - auto c_tmp_nnz = cusparse::sparse_matrix_nnz(c_descr); + auto c_tmp_nnz = sparselib::sparse_matrix_nnz(c_descr); array c_tmp_row_ptrs_array(exec, m + 1); array c_tmp_col_idxs_array(exec, c_tmp_nnz); array c_tmp_vals_array(exec, c_tmp_nnz); - cusparse::csr_set_pointers(c_descr, c_tmp_row_ptrs_array.get_data(), - c_tmp_col_idxs_array.get_data(), - c_tmp_vals_array.get_data()); + sparselib::csr_set_pointers(c_descr, c_tmp_row_ptrs_array.get_data(), + c_tmp_col_idxs_array.get_data(), + c_tmp_vals_array.get_data()); - cusparse::spgemm_copy(handle, &one_val, a_descr, b_descr, &zero_val, - c_descr, spgemm_descr); + sparselib::spgemm_copy(handle, &one_val, a_descr, b_descr, &zero_val, + c_descr, spgemm_descr); - cusparse::destroy(c_descr); - cusparse::destroy(b_descr); - cusparse::destroy(a_descr); - 
cusparse::destroy(spgemm_descr); + sparselib::destroy(c_descr); + sparselib::destroy(b_descr); + sparselib::destroy(a_descr); + sparselib::destroy(spgemm_descr); auto spgeam_total_nnz = c_tmp_nnz + d->get_num_stored_elements(); auto nnz_per_row = spgeam_total_nnz / m; @@ -846,13 +848,13 @@ void transpose(std::shared_ptr exec, if (orig->get_size()[0] == 0) { return; } - if (cusparse::is_supported::value) { + if (sparselib::is_supported::value) { #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; - cusparse::transpose( - exec->get_cusparse_handle(), orig->get_size()[0], + sparselib::transpose( + exec->get_sparselib_handle(), orig->get_size()[0], orig->get_size()[1], orig->get_num_stored_elements(), orig->get_const_values(), orig->get_const_row_ptrs(), orig->get_const_col_idxs(), trans->get_values(), @@ -864,8 +866,8 @@ void transpose(std::shared_ptr exec, cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG1; size_type buffer_size = 0; - cusparse::transpose_buffersize( - exec->get_cusparse_handle(), orig->get_size()[0], + sparselib::transpose_buffersize( + exec->get_sparselib_handle(), orig->get_size()[0], orig->get_size()[1], orig->get_num_stored_elements(), orig->get_const_values(), orig->get_const_row_ptrs(), orig->get_const_col_idxs(), trans->get_values(), @@ -873,8 +875,8 @@ void transpose(std::shared_ptr exec, idxBase, alg, &buffer_size); array buffer_array(exec, buffer_size); auto buffer = buffer_array.get_data(); - cusparse::transpose( - exec->get_cusparse_handle(), orig->get_size()[0], + sparselib::transpose( + exec->get_sparselib_handle(), orig->get_size()[0], orig->get_size()[1], orig->get_num_stored_elements(), orig->get_const_values(), orig->get_const_row_ptrs(), orig->get_const_col_idxs(), trans->get_values(), @@ -898,13 +900,13 @@ void conj_transpose(std::shared_ptr exec, const auto 
block_size = default_block_size; const auto grid_size = ceildiv(trans->get_num_stored_elements(), block_size); - if (cusparse::is_supported::value) { + if (sparselib::is_supported::value) { #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; - cusparse::transpose( - exec->get_cusparse_handle(), orig->get_size()[0], + sparselib::transpose( + exec->get_sparselib_handle(), orig->get_size()[0], orig->get_size()[1], orig->get_num_stored_elements(), orig->get_const_values(), orig->get_const_row_ptrs(), orig->get_const_col_idxs(), trans->get_values(), @@ -916,8 +918,8 @@ void conj_transpose(std::shared_ptr exec, cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG1; size_type buffer_size = 0; - cusparse::transpose_buffersize( - exec->get_cusparse_handle(), orig->get_size()[0], + sparselib::transpose_buffersize( + exec->get_sparselib_handle(), orig->get_size()[0], orig->get_size()[1], orig->get_num_stored_elements(), orig->get_const_values(), orig->get_const_row_ptrs(), orig->get_const_col_idxs(), trans->get_values(), @@ -925,8 +927,8 @@ void conj_transpose(std::shared_ptr exec, idxBase, alg, &buffer_size); array buffer_array(exec, buffer_size); auto buffer = buffer_array.get_data(); - cusparse::transpose( - exec->get_cusparse_handle(), orig->get_size()[0], + sparselib::transpose( + exec->get_sparselib_handle(), orig->get_size()[0], orig->get_size()[1], orig->get_num_stored_elements(), orig->get_const_values(), orig->get_const_row_ptrs(), orig->get_const_col_idxs(), trans->get_values(), @@ -948,9 +950,9 @@ template void sort_by_column_index(std::shared_ptr exec, matrix::Csr* to_sort) { - if (cusparse::is_supported::value) { - auto handle = exec->get_cusparse_handle(); - auto descr = cusparse::create_mat_descr(); + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + auto descr = 
sparselib::create_mat_descr(); auto m = IndexType(to_sort->get_size()[0]); auto n = IndexType(to_sort->get_size()[1]); auto nnz = IndexType(to_sort->get_num_stored_elements()); @@ -966,30 +968,30 @@ void sort_by_column_index(std::shared_ptr exec, // init identity permutation array permutation_array(exec, nnz); auto permutation = permutation_array.get_data(); - cusparse::create_identity_permutation(handle, nnz, permutation); + components::fill_seq_array(exec, permutation, nnz); // allocate buffer size_type buffer_size{}; - cusparse::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs, - buffer_size); + sparselib::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs, + buffer_size); array buffer_array{exec, buffer_size}; auto buffer = buffer_array.get_data(); // sort column indices - cusparse::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs, - permutation, buffer); + sparselib::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs, + permutation, buffer); // sort values #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cusparse::gather(handle, nnz, tmp_vals, vals, permutation); + sparselib::gather(handle, nnz, tmp_vals, vals, permutation); #else // CUDA_VERSION >= 11000 - auto val_vec = cusparse::create_spvec(nnz, nnz, permutation, vals); + auto val_vec = sparselib::create_spvec(nnz, nnz, permutation, vals); auto tmp_vec = - cusparse::create_dnvec(nnz, const_cast(tmp_vals)); - cusparse::gather(handle, tmp_vec, val_vec); + sparselib::create_dnvec(nnz, const_cast(tmp_vals)); + sparselib::gather(handle, tmp_vec, val_vec); #endif - cusparse::destroy(descr); + sparselib::destroy(descr); } else { fallback_sort(exec, to_sort); } diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 04b34953c6a..b117c39107b 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -17,12 +17,13 @@ #include +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include 
"common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/base/pointer_mode_guard.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" @@ -53,11 +54,11 @@ void compute_dot_dispatch(std::shared_ptr exec, matrix::Dense* result, array& tmp) { if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); - cublas::dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), y->get_stride(), - result->get_values()); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + blas::dot(handle, x->get_size()[0], x->get_const_values(), + x->get_stride(), y->get_const_values(), y->get_stride(), + result->get_values()); } else { compute_dot(exec, x, y, result, tmp); } @@ -78,11 +79,11 @@ void compute_conj_dot_dispatch(std::shared_ptr exec, array& tmp) { if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); - cublas::conj_dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), - y->get_stride(), result->get_values()); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + blas::conj_dot(handle, x->get_size()[0], x->get_const_values(), + x->get_stride(), y->get_const_values(), + y->get_stride(), result->get_values()); } else { compute_conj_dot(exec, x, y, result, tmp); } @@ -102,10 +103,10 @@ void compute_norm2_dispatch(std::shared_ptr exec, array& tmp) { if (x->get_size()[1] == 1) { - if (cublas::is_supported::value) { - auto 
handle = exec->get_cublas_handle(); - cublas::norm2(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), result->get_values()); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + blas::norm2(handle, x->get_size()[0], x->get_const_values(), + x->get_stride(), result->get_values()); } else { compute_norm2(exec, x, result, tmp); } @@ -124,18 +125,18 @@ void simple_apply(std::shared_ptr exec, const matrix::Dense* b, matrix::Dense* c) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { if (a->get_size()[1] > 0) { - cublas::pointer_mode_guard pm_guard(handle); + blas::pointer_mode_guard pm_guard(handle); auto alpha = one(); auto beta = zero(); - cublas::gemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, c->get_size()[1], - c->get_size()[0], a->get_size()[1], &alpha, - b->get_const_values(), b->get_stride(), - a->get_const_values(), a->get_stride(), &beta, - c->get_values(), c->get_stride()); + blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1], + c->get_size()[0], a->get_size()[1], &alpha, + b->get_const_values(), b->get_stride(), + a->get_const_values(), a->get_stride(), &beta, + c->get_values(), c->get_stride()); } else { dense::fill(exec, c, zero()); } @@ -154,15 +155,15 @@ void apply(std::shared_ptr exec, const matrix::Dense* a, const matrix::Dense* b, const matrix::Dense* beta, matrix::Dense* c) { - if (cublas::is_supported::value) { + if (blas::is_supported::value) { if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { if (a->get_size()[1] > 0) { - cublas::gemm( - exec->get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, - c->get_size()[1], c->get_size()[0], a->get_size()[1], - alpha->get_const_values(), b->get_const_values(), - b->get_stride(), a->get_const_values(), a->get_stride(), - beta->get_const_values(), c->get_values(), c->get_stride()); + 
blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N, + c->get_size()[1], c->get_size()[0], a->get_size()[1], + alpha->get_const_values(), b->get_const_values(), + b->get_stride(), a->get_const_values(), + a->get_stride(), beta->get_const_values(), + c->get_values(), c->get_stride()); } else { dense::scale(exec, beta, c); } @@ -180,17 +181,17 @@ void transpose(std::shared_ptr exec, const matrix::Dense* orig, matrix::Dense* trans) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { - cublas::pointer_mode_guard pm_guard(handle); + blas::pointer_mode_guard pm_guard(handle); auto alpha = one(); auto beta = zero(); - cublas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, orig->get_size()[0], - orig->get_size()[1], &alpha, orig->get_const_values(), - orig->get_stride(), &beta, trans->get_values(), - trans->get_stride(), trans->get_values(), - trans->get_stride()); + blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0], + orig->get_size()[1], &alpha, orig->get_const_values(), + orig->get_stride(), &beta, trans->get_const_values(), + trans->get_stride(), trans->get_values(), + trans->get_stride()); } } else { GKO_NOT_IMPLEMENTED; @@ -205,17 +206,17 @@ void conj_transpose(std::shared_ptr exec, const matrix::Dense* orig, matrix::Dense* trans) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { - cublas::pointer_mode_guard pm_guard(handle); + blas::pointer_mode_guard pm_guard(handle); auto alpha = one(); auto beta = zero(); - cublas::geam(handle, CUBLAS_OP_C, CUBLAS_OP_N, orig->get_size()[0], - orig->get_size()[1], &alpha, orig->get_const_values(), - orig->get_stride(), &beta, trans->get_values(), - trans->get_stride(), trans->get_values(), - 
trans->get_stride()); + blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0], + orig->get_size()[1], &alpha, orig->get_const_values(), + orig->get_stride(), &beta, trans->get_const_values(), + trans->get_stride(), trans->get_values(), + trans->get_stride()); } } else { GKO_NOT_IMPLEMENTED; diff --git a/cuda/matrix/diagonal_kernels.cu b/cuda/matrix/diagonal_kernels.cu index b81329e29a0..e362ff0462b 100644 --- a/cuda/matrix/diagonal_kernels.cu +++ b/cuda/matrix/diagonal_kernels.cu @@ -9,9 +9,10 @@ #include -#include "cuda/base/config.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index 9c23abc9dc4..105122ec4a9 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -15,19 +15,20 @@ #include -#include "accessor/cuda_helper.hpp" +#include "accessor/cuda_hip_helper.hpp" #include "accessor/reduced_row_major.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cusparse_bindings.hpp" -#include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/format_conversion.cuh" 
#include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" @@ -97,9 +98,9 @@ void abstract_spmv(syn::value_list, using arithmetic_type = highest_precision; using a_accessor = - gko::acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>; + acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>; using b_accessor = - gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>; + acc::reduced_row_major<2, arithmetic_type, const InputValueType>; const auto nrows = a->get_size()[0]; const auto stride = a->get_stride(); @@ -114,11 +115,11 @@ void abstract_spmv(syn::value_list, const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x), b->get_size()[1], 1); - const auto a_vals = gko::acc::range( + const auto a_vals = acc::range( std::array{{static_cast( num_stored_elements_per_row * stride)}}, a->get_const_values()); - const auto b_vals = gko::acc::range( + const auto b_vals = acc::range( std::array{ {static_cast(b->get_size()[0]), static_cast(b->get_size()[1])}}, @@ -130,20 +131,21 @@ void abstract_spmv(syn::value_list, if (grid_size.x > 0 && grid_size.y > 0) { kernel::spmv <<get_stream()>>>( - nrows, num_worker_per_row, acc::as_cuda_range(a_vals), + nrows, num_worker_per_row, acc::as_device_range(a_vals), a->get_const_col_idxs(), stride, - num_stored_elements_per_row, acc::as_cuda_range(b_vals), + num_stored_elements_per_row, acc::as_device_range(b_vals), as_device_type(c->get_values()), c->get_stride()); } } else if (alpha != nullptr && beta != nullptr) { - const auto alpha_val = gko::acc::range( + const auto alpha_val = acc::range( std::array{1}, alpha->get_const_values()); if (grid_size.x > 0 && grid_size.y > 0) { kernel::spmv <<get_stream()>>>( - nrows, num_worker_per_row, acc::as_cuda_range(alpha_val), - acc::as_cuda_range(a_vals), a->get_const_col_idxs(), stride, - num_stored_elements_per_row, acc::as_cuda_range(b_vals), + nrows, num_worker_per_row, acc::as_device_range(alpha_val), + 
acc::as_device_range(a_vals), a->get_const_col_idxs(), + stride, num_stored_elements_per_row, + acc::as_device_range(b_vals), as_device_type(beta->get_const_values()), as_device_type(c->get_values()), c->get_stride()); } @@ -212,7 +214,7 @@ void spmv(std::shared_ptr exec, const int num_worker_per_row = std::get<2>(data); /** - * info is the parameter for selecting the cuda kernel. + * info is the parameter for selecting the device kernel. * for info == 0, it uses the kernel by warp_size threads with atomic * operation for other value, it uses the kernel without atomic_add */ @@ -246,7 +248,7 @@ void advanced_spmv(std::shared_ptr exec, const int num_worker_per_row = std::get<2>(data); /** - * info is the parameter for selecting the cuda kernel. + * info is the parameter for selecting the device kernel. * for info == 0, it uses the kernel by warp_size threads with atomic * operation for other value, it uses the kernel without atomic_add */ diff --git a/cuda/matrix/fbcsr_kernels.template.cu b/cuda/matrix/fbcsr_kernels.template.cu index 8b835c6fd7d..ad36c84216e 100644 --- a/cuda/matrix/fbcsr_kernels.template.cu +++ b/cuda/matrix/fbcsr_kernels.template.cu @@ -24,6 +24,13 @@ #include +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/unified/base/kernel_launch.hpp" #include "core/base/array_access.hpp" #include "core/base/block_sizes.hpp" @@ -33,16 +40,10 @@ #include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/base/cusparse_bindings.hpp" #include 
"cuda/base/cusparse_block_bindings.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/pointer_mode_guard.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/merging.cuh" #include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" @@ -72,6 +73,7 @@ constexpr int default_block_size{512}; namespace { + template void dense_transpose(std::shared_ptr exec, const size_type nrows, const size_type ncols, @@ -81,21 +83,22 @@ void dense_transpose(std::shared_ptr exec, if (nrows == 0) { return; } - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); { - cublas::pointer_mode_guard pm_guard(handle); + blas::pointer_mode_guard pm_guard(handle); auto alpha = one(); auto beta = zero(); - cublas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, nrows, ncols, &alpha, - orig, orig_stride, &beta, trans, trans_stride, trans, - trans_stride); + blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig, + orig_stride, &beta, trans, trans_stride, trans, + trans_stride); } } else { GKO_NOT_IMPLEMENTED; } } + } // namespace @@ -114,12 +117,12 @@ void spmv(std::shared_ptr exec, dense::fill(exec, c, zero()); return; } - if (cusparse::is_supported::value) { - auto handle = exec->get_cusparse_handle(); - cusparse::pointer_mode_guard pm_guard(handle); + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); const auto alpha = one(); const auto beta = zero(); - auto descr = cusparse::create_mat_descr(); + auto descr = sparselib::create_mat_descr(); const auto row_ptrs = a->get_const_row_ptrs(); const auto col_idxs = a->get_const_col_idxs(); const auto values = a->get_const_values(); @@ -133,21 +136,21 @@ void spmv(std::shared_ptr exec, const auto in_stride = 
b->get_stride(); const auto out_stride = c->get_stride(); if (nrhs == 1 && in_stride == 1 && out_stride == 1) { - cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, - nnzb, &alpha, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), &beta, c->get_values()); + sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, + nnzb, &alpha, descr, values, row_ptrs, col_idxs, + bs, b->get_const_values(), &beta, c->get_values()); } else { const auto trans_stride = nrows; auto trans_c = array(exec, nrows * nrhs); - cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, - &alpha, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), in_stride, &beta, - trans_c.get_data(), trans_stride); + sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + &alpha, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), in_stride, &beta, + trans_c.get_data(), trans_stride); dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), out_stride, c->get_values()); } - cusparse::destroy(descr); + sparselib::destroy(descr); } else { GKO_NOT_IMPLEMENTED; } @@ -171,11 +174,11 @@ void advanced_spmv(std::shared_ptr exec, dense::scale(exec, beta, c); return; } - if (cusparse::is_supported::value) { - auto handle = exec->get_cusparse_handle(); + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); const auto alphp = alpha->get_const_values(); const auto betap = beta->get_const_values(); - auto descr = cusparse::create_mat_descr(); + auto descr = sparselib::create_mat_descr(); const auto row_ptrs = a->get_const_row_ptrs(); const auto col_idxs = a->get_const_col_idxs(); const auto values = a->get_const_values(); @@ -189,23 +192,23 @@ void advanced_spmv(std::shared_ptr exec, const auto in_stride = b->get_stride(); const auto out_stride = c->get_stride(); if (nrhs == 1 && in_stride == 1 && out_stride 
== 1) { - cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, - nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), betap, c->get_values()); + sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, + nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), betap, c->get_values()); } else { const auto trans_stride = nrows; auto trans_c = array(exec, nrows * nrhs); dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(), trans_stride, trans_c.get_data()); - cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, - alphp, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), in_stride, betap, - trans_c.get_data(), trans_stride); + sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + alphp, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), in_stride, betap, + trans_c.get_data(), trans_stride); dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), out_stride, c->get_values()); } - cusparse::destroy(descr); + sparselib::destroy(descr); } else { GKO_NOT_IMPLEMENTED; } @@ -244,20 +247,21 @@ void transpose(const std::shared_ptr exec, const matrix::Fbcsr* const orig, matrix::Fbcsr* const trans) { - if (cusparse::is_supported::value) { +#ifdef GKO_COMPILING_CUDA + if (sparselib::is_supported::value) { const int bs = orig->get_block_size(); const IndexType nnzb = static_cast(orig->get_num_stored_blocks()); cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; - const IndexType buffer_size = cusparse::bsr_transpose_buffersize( - exec->get_cusparse_handle(), orig->get_num_block_rows(), + const IndexType buffer_size = sparselib::bsr_transpose_buffersize( + exec->get_sparselib_handle(), orig->get_num_block_rows(), orig->get_num_block_cols(), nnzb, orig->get_const_values(), 
orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs); array buffer_array(exec, buffer_size); auto buffer = buffer_array.get_data(); - cusparse::bsr_transpose( - exec->get_cusparse_handle(), orig->get_num_block_rows(), + sparselib::bsr_transpose( + exec->get_sparselib_handle(), orig->get_num_block_rows(), orig->get_num_block_cols(), nnzb, orig->get_const_values(), orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs, trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(), @@ -268,7 +272,9 @@ void transpose(const std::shared_ptr exec, fixedblock::compiled_kernels(), [bs](int compiled_block_size) { return bs == compiled_block_size; }, syn::value_list(), syn::type_list<>(), exec, trans); - } else { + } else +#endif + { fallback_transpose(exec, orig, trans); } } diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu index 5eadf0d3858..d6c20075ef4 100644 --- a/cuda/matrix/sellp_kernels.cu +++ b/cuda/matrix/sellp_kernels.cu @@ -12,10 +12,11 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" #include "core/components/prefix_sum_kernels.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cusparse_bindings.hpp" -#include "cuda/base/types.hpp" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu index 3a3d60b24e0..311e4d3782c 100644 --- a/cuda/matrix/sparsity_csr_kernels.cu +++ b/cuda/matrix/sparsity_csr_kernels.cu @@ -11,18 +11,19 @@ #include -#include "accessor/cuda_helper.hpp" +#include "accessor/cuda_hip_helper.hpp" #include "accessor/reduced_row_major.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include 
"common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" @@ -41,7 +42,11 @@ namespace sparsity_csr { constexpr int classical_oversubscription = 32; constexpr int default_block_size = 512; +#ifdef GKO_COMPILING_HIP +constexpr int spmv_block_size = 256; +#else constexpr int spmv_block_size = 128; +#endif constexpr int warps_in_block = 4; @@ -105,16 +110,16 @@ void classical_spmv(syn::value_list, a->get_size()[0], as_device_type(a->get_const_value()), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), - acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals)); + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); } else if (alpha != nullptr && beta != nullptr) { kernel::abstract_classical_spmv <<get_stream()>>>( a->get_size()[0], as_device_type(alpha->get_const_values()), as_device_type(a->get_const_value()), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), - acc::as_cuda_range(b_vals), + acc::as_device_range(b_vals), as_device_type(beta->get_const_values()), - acc::as_cuda_range(c_vals)); + acc::as_device_range(c_vals)); } else { GKO_KERNEL_NOT_FOUND; } @@ -168,21 +173,21 @@ void sort_by_column_index(std::shared_ptr exec, const auto num_cols = static_cast(to_sort->get_size()[1]); const auto row_ptrs = to_sort->get_const_row_ptrs(); const auto col_idxs = to_sort->get_col_idxs(); - if (cusparse::is_supported::value) { - const auto handle = exec->get_cusparse_handle(); - auto 
descr = cusparse::create_mat_descr(); + if (sparselib::is_supported::value) { + const auto handle = exec->get_sparselib_handle(); + auto descr = sparselib::create_mat_descr(); array permutation_array(exec, to_sort->get_num_nonzeros()); auto permutation = permutation_array.get_data(); components::fill_seq_array(exec, permutation, to_sort->get_num_nonzeros()); size_type buffer_size{}; - cusparse::csrsort_buffer_size(handle, num_rows, num_cols, nnz, row_ptrs, - col_idxs, buffer_size); + sparselib::csrsort_buffer_size(handle, num_rows, num_cols, nnz, + row_ptrs, col_idxs, buffer_size); array buffer_array{exec, buffer_size}; auto buffer = buffer_array.get_data(); - cusparse::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs, - col_idxs, permutation, buffer); - cusparse::destroy(descr); + sparselib::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs, + col_idxs, permutation, buffer); + sparselib::destroy(descr); } else { fallback_sort(exec, to_sort); } diff --git a/cuda/multigrid/pgm_kernels.cu b/cuda/multigrid/pgm_kernels.cu index b5e9fa1612d..75c3dd911ad 100644 --- a/cuda/multigrid/pgm_kernels.cu +++ b/cuda/multigrid/pgm_kernels.cu @@ -8,8 +8,6 @@ #include -#include -#include #include #include #include @@ -21,8 +19,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" namespace gko { diff --git a/cuda/preconditioner/batch_preconditioners.cuh b/cuda/preconditioner/batch_preconditioners.cuh index 0eae8650bdc..e83d6e04ee9 100644 --- a/cuda/preconditioner/batch_preconditioners.cuh +++ b/cuda/preconditioner/batch_preconditioners.cuh @@ -6,9 +6,9 @@ #define GKO_CUDA_PRECONDITIONER_BATCH_PRECONDITIONERS_CUH_ +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/matrix/batch_struct.hpp" #include "core/preconditioner/batch_jacobi_helpers.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" diff --git 
a/cuda/preconditioner/isai_kernels.cu b/cuda/preconditioner/isai_kernels.cu index 6551f32bb86..d0dd516466a 100644 --- a/cuda/preconditioner/isai_kernels.cu +++ b/cuda/preconditioner/isai_kernels.cu @@ -10,12 +10,13 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/merging.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernel.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.cu similarity index 100% rename from cuda/preconditioner/jacobi_advanced_apply_kernel.cu rename to cuda/preconditioner/jacobi_advanced_apply_kernels.cu diff --git a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu similarity index 95% rename from cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu rename to cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu index 5633ad15a4b..10ede90da7e 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu @@ -8,14 +8,14 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include 
"cuda/components/cooperative_groups.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/warp_blas.cuh" #include "cuda/preconditioner/jacobi_common.hpp" @@ -32,7 +32,7 @@ namespace cuda { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc" +#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc" // clang-format off diff --git a/cuda/preconditioner/jacobi_common.hpp.in b/cuda/preconditioner/jacobi_common.hpp.in index fe99fd88227..aeb47fec97e 100644 --- a/cuda/preconditioner/jacobi_common.hpp.in +++ b/cuda/preconditioner/jacobi_common.hpp.in @@ -6,7 +6,7 @@ #include -#include "cuda/base/config.hpp" +#include "common/cuda_hip/base/config.hpp" namespace gko { namespace kernels { diff --git a/cuda/preconditioner/jacobi_generate_kernel.cu b/cuda/preconditioner/jacobi_generate_kernels.cu similarity index 100% rename from cuda/preconditioner/jacobi_generate_kernel.cu rename to cuda/preconditioner/jacobi_generate_kernels.cu diff --git a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu similarity index 94% rename from cuda/preconditioner/jacobi_generate_instantiate.inc.cu rename to cuda/preconditioner/jacobi_generate_kernels.instantiate.cu index a76c4fba271..129c50625f4 100644 --- a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu @@ -9,14 +9,14 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include 
"cuda/components/diagonal_block_manipulation.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" @@ -35,7 +35,7 @@ namespace cuda { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc" +#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc" // clang-format off diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu index 2508f33acb9..bce2ff23303 100644 --- a/cuda/preconditioner/jacobi_kernels.cu +++ b/cuda/preconditioner/jacobi_kernels.cu @@ -8,13 +8,14 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/preconditioner/jacobi_common.hpp" @@ -30,8 +31,12 @@ namespace cuda { namespace jacobi { -// a total of 32 warps (1024 threads) +// a total of 32/16 warps (1024 threads) +#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC +constexpr int default_num_warps = 16; +#else // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC constexpr int default_num_warps = 32; +#endif // with current architectures, at most 32 warps can be scheduled per SM (and // current GPUs have at most 84 SMs) constexpr int default_grid_size = 32 * 32 * 128; diff --git a/cuda/preconditioner/jacobi_simple_apply_kernel.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.cu similarity index 100% rename from cuda/preconditioner/jacobi_simple_apply_kernel.cu rename to cuda/preconditioner/jacobi_simple_apply_kernels.cu diff --git 
a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu similarity index 95% rename from cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu rename to cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu index 07689daa815..15f6dc138ad 100644 --- a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu @@ -8,14 +8,14 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/warp_blas.cuh" #include "cuda/preconditioner/jacobi_common.hpp" @@ -32,7 +32,7 @@ namespace cuda { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc" +#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc" // clang-format off diff --git a/cuda/reorder/rcm_kernels.cu b/cuda/reorder/rcm_kernels.cu index d699d00dfb6..72322016fba 100644 --- a/cuda/reorder/rcm_kernels.cu +++ b/cuda/reorder/rcm_kernels.cu @@ -25,9 +25,9 @@ #include +#include "common/cuda_hip/components/memory.hpp" #include "core/base/array_access.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/components/memory.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 0ce95e2d34f..58e1a6b7b0d 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -13,15 +13,16 @@ #include +#include 
"common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/kernel_config.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index f429e5f22f0..398e831eb09 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -13,15 +13,15 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "cuda/base/batch_struct.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/kernel_config.hpp" #include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu index 107835ca1b5..3dbefadf22a 100644 --- a/cuda/solver/cb_gmres_kernels.cu +++ b/cuda/solver/cb_gmres_kernels.cu @@ -8,25 +8,25 @@ #include +#include #include -#include #include #include -#include "accessor/cuda_helper.hpp" +#include "accessor/cuda_hip_helper.hpp" #include "accessor/range.hpp" #include "accessor/reduced_row_major.hpp" #include "accessor/scaled_reduced_row_major.hpp" +#include 
"common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/array_access.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/solver/cb_gmres_accessor.hpp" -#include "cuda/base/config.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" @@ -44,6 +44,8 @@ namespace cb_gmres { constexpr int default_block_size = 512; +// default_dot_dim can not be 64 in hip because 64 * 64 exceeds their max block +// size limit. constexpr int default_dot_dim = 32; constexpr int default_dot_size = default_dot_dim * default_dot_dim; @@ -116,7 +118,7 @@ void restart(std::shared_ptr exec, restart_1_kernel <<get_stream()>>>( residual->get_size()[0], residual->get_size()[1], krylov_dim, - acc::as_cuda_range(krylov_bases), + acc::as_device_range(krylov_bases), as_device_type(residual_norm_collection->get_values()), residual_norm_collection->get_stride()); kernels::cuda::dense::compute_norm2_dispatch(exec, residual, residual_norm, @@ -145,7 +147,7 @@ void restart(std::shared_ptr exec, residual_norm->get_stride(), as_device_type(arnoldi_norm->get_const_values() + 2 * stride_arnoldi), - stride_arnoldi, acc::as_cuda_range(krylov_bases)); + stride_arnoldi, acc::as_device_range(krylov_bases)); } const auto grid_dim_2 = @@ -158,7 +160,7 @@ void restart(std::shared_ptr exec, residual->get_stride(), as_device_type(residual_norm->get_const_values()), as_device_type(residual_norm_collection->get_values()), - acc::as_cuda_range(krylov_bases), + acc::as_device_range(krylov_bases), as_device_type(next_krylov_basis->get_values()), next_krylov_basis->get_stride(), as_device_type(final_iter_nums->get_data())); @@ -212,6 
+214,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, as_device_type(next_krylov_basis->get_const_values()), stride_next_krylov, as_device_type(arnoldi_norm->get_values()), as_device_type(stop_status)); + // nrmP = norm(next_krylov_basis) zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg, hessenberg_iter->get_values()); if (dim_size[1] > 1) { @@ -219,7 +222,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, <<get_stream()>>>( dim_size[0], dim_size[1], as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_cuda_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_values()), stride_hessenberg, as_device_type(stop_status)); } else { @@ -228,7 +231,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, exec->get_stream()>>>( dim_size[0], as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_cuda_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_values()), stride_hessenberg, as_device_type(stop_status)); } @@ -240,7 +243,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, default_block_size, 0, exec->get_stream()>>>( iter + 1, dim_size[0], dim_size[1], as_device_type(next_krylov_basis->get_values()), stride_next_krylov, - acc::as_cuda_range(krylov_bases), + acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_const_values()), stride_hessenberg, as_device_type(stop_status)); @@ -269,7 +272,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, exec->get_stream()>>>( dim_size[1], as_device_type(arnoldi_norm->get_values()), stride_arnoldi, as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, iter + 1, acc::as_cuda_range(krylov_bases), + stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases), as_device_type(stop_status), as_device_type(reorth_status), as_device_type(num_reorth->get_data())); num_reorth_host = get_element(*num_reorth, 0); @@ 
-282,7 +285,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, <<get_stream()>>>( dim_size[0], dim_size[1], as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_cuda_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(buffer_iter->get_values()), stride_buffer, as_device_type(stop_status)); } else { @@ -291,7 +294,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, exec->get_stream()>>>( dim_size[0], as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_cuda_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(buffer_iter->get_values()), stride_buffer, as_device_type(stop_status)); } @@ -303,7 +306,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, default_block_size, 0, exec->get_stream()>>>( iter + 1, dim_size[0], dim_size[1], as_device_type(next_krylov_basis->get_values()), - stride_next_krylov, acc::as_cuda_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_values()), stride_hessenberg, as_device_type(buffer_iter->get_const_values()), stride_buffer, @@ -335,18 +338,19 @@ void finish_arnoldi_CGS(std::shared_ptr exec, exec->get_stream()>>>( dim_size[1], as_device_type(arnoldi_norm->get_values()), stride_arnoldi, as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, iter + 1, acc::as_cuda_range(krylov_bases), + stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases), as_device_type(stop_status), as_device_type(reorth_status), num_reorth->get_data()); num_reorth_host = get_element(*num_reorth, 0); + // num_reorth_host := number of next_krylov vector to be + // reorthogonalization } - update_krylov_next_krylov_kernel <<get_stream()>>>( iter, dim_size[0], dim_size[1], as_device_type(next_krylov_basis->get_values()), stride_next_krylov, - acc::as_cuda_range(krylov_bases), + acc::as_device_range(krylov_bases), 
as_device_type(hessenberg_iter->get_const_values()), stride_hessenberg, as_device_type(stop_status)); // next_krylov_basis /= hessenberg(iter, iter + 1) @@ -460,7 +464,7 @@ void calculate_qy(std::shared_ptr exec, calculate_Qy_kernel <<get_stream()>>>( - num_rows, num_cols, acc::as_cuda_range(krylov_bases), + num_rows, num_cols, acc::as_device_range(krylov_bases), as_device_type(y->get_const_values()), y->get_stride(), as_device_type(before_preconditioner->get_values()), stride_before_preconditioner, diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index cb627e04b53..992974e95ef 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -20,15 +20,15 @@ #include +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "core/base/array_access.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/pointer_mode_guard.hpp" -#include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" -#include "cuda/components/memory.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" @@ -66,7 +66,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct { CudaSolveStruct(std::shared_ptr exec, const matrix::Csr* matrix, size_type num_rhs, bool is_upper, bool unit_diag) - : handle{exec->get_cusparse_handle()}, + : handle{exec->get_sparselib_handle()}, spsm_descr{}, descr_a{}, num_rhs{num_rhs}, @@ -75,18 +75,18 @@ struct CudaSolveStruct : gko::solver::SolveStruct { if (num_rhs == 0) { return; } - cusparse::pointer_mode_guard pm_guard(handle); - spsm_descr = cusparse::create_spsm_descr(); - descr_a = cusparse::create_csr( + sparselib::pointer_mode_guard pm_guard(handle); + 
spsm_descr = sparselib::create_spsm_descr(); + descr_a = sparselib::create_csr( matrix->get_size()[0], matrix->get_size()[1], matrix->get_num_stored_elements(), const_cast(matrix->get_const_row_ptrs()), const_cast(matrix->get_const_col_idxs()), const_cast(matrix->get_const_values())); - cusparse::set_attribute( + sparselib::set_attribute( descr_a, CUSPARSE_SPMAT_FILL_MODE, is_upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER); - cusparse::set_attribute( + sparselib::set_attribute( descr_a, CUSPARSE_SPMAT_DIAG_TYPE, unit_diag ? CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT); @@ -94,28 +94,28 @@ struct CudaSolveStruct : gko::solver::SolveStruct { // workaround suggested by NVIDIA engineers: for some reason // cusparse needs non-nullptr input vectors even for analysis // also make sure they are aligned by 16 bytes - auto descr_b = cusparse::create_dnmat( + auto descr_b = sparselib::create_dnmat( dim<2>{matrix->get_size()[0], num_rhs}, matrix->get_size()[1], reinterpret_cast(0xDEAD0)); - auto descr_c = cusparse::create_dnmat( + auto descr_c = sparselib::create_dnmat( dim<2>{matrix->get_size()[0], num_rhs}, matrix->get_size()[1], reinterpret_cast(0xDEAF0)); - auto work_size = cusparse::spsm_buffer_size( - handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, one(), descr_a, + auto work_size = sparselib::spsm_buffer_size( + handle, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_NON_TRANSPOSE, one(), descr_a, descr_b, descr_c, CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr); work.resize_and_reset(work_size); - cusparse::spsm_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, - one(), descr_a, descr_b, descr_c, - CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr, - work.get_data()); + sparselib::spsm_analysis(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_NON_TRANSPOSE, + one(), descr_a, descr_b, descr_c, + CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr, + work.get_data()); - 
cusparse::destroy(descr_b); - cusparse::destroy(descr_c); + sparselib::destroy(descr_b); + sparselib::destroy(descr_c); } void solve(const matrix::Csr*, @@ -134,30 +134,30 @@ struct CudaSolveStruct : gko::solver::SolveStruct { "provided at generation time. Check the value specified in " ".with_num_rhs(...)."}; } - cusparse::pointer_mode_guard pm_guard(handle); - auto descr_b = cusparse::create_dnmat( + sparselib::pointer_mode_guard pm_guard(handle); + auto descr_b = sparselib::create_dnmat( input->get_size(), input->get_stride(), const_cast(input->get_const_values())); - auto descr_c = cusparse::create_dnmat( + auto descr_c = sparselib::create_dnmat( output->get_size(), output->get_stride(), output->get_values()); - cusparse::spsm_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, one(), - descr_a, descr_b, descr_c, - CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr); + sparselib::spsm_solve(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_NON_TRANSPOSE, + one(), descr_a, descr_b, descr_c, + CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr); - cusparse::destroy(descr_b); - cusparse::destroy(descr_c); + sparselib::destroy(descr_b); + sparselib::destroy(descr_c); } ~CudaSolveStruct() { if (descr_a) { - cusparse::destroy(descr_a); + sparselib::destroy(descr_a); descr_a = nullptr; } if (spsm_descr) { - cusparse::destroy(spsm_descr); + sparselib::destroy(spsm_descr); spsm_descr = nullptr; } } @@ -189,7 +189,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct { const matrix::Csr* matrix, size_type num_rhs, bool is_upper, bool unit_diag) : exec{exec}, - handle{exec->get_cusparse_handle()}, + handle{exec->get_sparselib_handle()}, algorithm{}, solve_info{}, policy{}, @@ -200,23 +200,23 @@ struct CudaSolveStruct : gko::solver::SolveStruct { if (num_rhs == 0) { return; } - cusparse::pointer_mode_guard pm_guard(handle); - factor_descr = cusparse::create_mat_descr(); - solve_info = cusparse::create_solve_info(); - cusparse::set_mat_fill_mode( + 
sparselib::pointer_mode_guard pm_guard(handle); + factor_descr = sparselib::create_mat_descr(); + solve_info = sparselib::create_solve_info(); + sparselib::set_mat_fill_mode( factor_descr, is_upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER); - cusparse::set_mat_diag_type( + sparselib::set_mat_diag_type( factor_descr, unit_diag ? CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT); algorithm = 0; - policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; + policy = SPARSELIB_SOLVE_POLICY_USE_LEVEL; size_type work_size{}; - cusparse::buffer_size_ext( - handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, + sparselib::buffer_size_ext( + handle, algorithm, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, @@ -225,9 +225,9 @@ struct CudaSolveStruct : gko::solver::SolveStruct { // allocate workspace work.resize_and_reset(work_size); - cusparse::csrsm2_analysis( - handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, + sparselib::csrsm2_analysis( + handle, algorithm, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, @@ -250,11 +250,11 @@ struct CudaSolveStruct : gko::solver::SolveStruct { "provided at generation time. 
Check the value specified in " ".with_num_rhs(...)."}; } - cusparse::pointer_mode_guard pm_guard(handle); + sparselib::pointer_mode_guard pm_guard(handle); dense::copy(exec, input, output); - cusparse::csrsm2_solve( - handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], + sparselib::csrsm2_solve( + handle, algorithm, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], output->get_stride(), matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), @@ -265,11 +265,11 @@ struct CudaSolveStruct : gko::solver::SolveStruct { ~CudaSolveStruct() { if (factor_descr) { - cusparse::destroy(factor_descr); + sparselib::destroy(factor_descr); factor_descr = nullptr; } if (solve_info) { - cusparse::destroy(solve_info); + sparselib::destroy(solve_info); solve_info = nullptr; } } @@ -304,7 +304,7 @@ void generate_kernel(std::shared_ptr exec, if (matrix->get_size()[0] == 0) { return; } - if (cusparse::is_supported::value) { + if (sparselib::is_supported::value) { solve_struct = std::make_shared>( exec, matrix, num_rhs, is_upper, unit_diag); } else { @@ -327,7 +327,7 @@ void solve_kernel(std::shared_ptr exec, } using vec = matrix::Dense; - if (cusparse::is_supported::value) { + if (sparselib::is_supported::value) { if (auto cuda_solve_struct = dynamic_cast*>( solve_struct)) { diff --git a/cuda/solver/idr_kernels.cu b/cuda/solver/idr_kernels.cu index 9c97d99f13c..f7e89c9d9d8 100644 --- a/cuda/solver/idr_kernels.cu +++ b/cuda/solver/idr_kernels.cu @@ -13,14 +13,15 @@ #include +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/randlib_bindings.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include 
"core/components/fill_array_kernels.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/base/curand_bindings.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" @@ -69,14 +70,14 @@ void initialize_subspace_vectors(std::shared_ptr exec, bool deterministic) { if (!deterministic) { - auto gen = curand::rand_generator(std::random_device{}(), - CURAND_RNG_PSEUDO_DEFAULT, - exec->get_stream()); - curand::rand_vector( + auto gen = randlib::rand_generator(std::random_device{}(), + RANDLIB_RNG_PSEUDO_DEFAULT, + exec->get_stream()); + randlib::rand_vector( gen, subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), 0.0, 1.0, subspace_vectors->get_values()); - curand::destroy(gen); + randlib::destroy(gen); } } @@ -145,9 +146,8 @@ void update_g_and_u(std::shared_ptr exec, as_device_type(alpha->get_values()), stop_status->get_const_data()); } else { - cublas::dot(exec->get_cublas_handle(), size, p_i, 1, - g_k->get_values(), g_k->get_stride(), - alpha->get_values()); + blas::dot(exec->get_blas_handle(), size, p_i, 1, g_k->get_values(), + g_k->get_stride(), alpha->get_values()); } update_g_k_and_u_kernel <<get_stride(), default_block_size), @@ -196,8 +196,8 @@ void update_m(std::shared_ptr exec, const size_type nrhs, as_device_type(g_k->get_const_values()), g_k->get_stride(), as_device_type(m_i), stop_status->get_const_data()); } else { - cublas::dot(exec->get_cublas_handle(), size, p_i, 1, - g_k->get_const_values(), g_k->get_stride(), m_i); + blas::dot(exec->get_blas_handle(), size, p_i, 1, + g_k->get_const_values(), g_k->get_stride(), m_i); } } } diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu index 46b4cb4c2e4..002cc0140cb 100644 --- a/cuda/solver/lower_trs_kernels.cu +++ b/cuda/solver/lower_trs_kernels.cu 
@@ -17,9 +17,9 @@ #include -#include "cuda/base/cusparse_bindings.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" #include "cuda/solver/common_trs_kernels.cuh" diff --git a/cuda/solver/multigrid_kernels.cu b/cuda/solver/multigrid_kernels.cu index 4eea02883b2..1d31130623a 100644 --- a/cuda/solver/multigrid_kernels.cu +++ b/cuda/solver/multigrid_kernels.cu @@ -11,9 +11,10 @@ #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" #include "core/base/array_access.hpp" #include "core/components/fill_array_kernels.hpp" -#include "cuda/base/types.hpp" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/solver/upper_trs_kernels.cu b/cuda/solver/upper_trs_kernels.cu index a8ee5f77cca..e1e01538f79 100644 --- a/cuda/solver/upper_trs_kernels.cu +++ b/cuda/solver/upper_trs_kernels.cu @@ -17,9 +17,9 @@ #include -#include "cuda/base/cusparse_bindings.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" #include "cuda/solver/common_trs_kernels.cuh" diff --git a/cuda/stop/criterion_kernels.cu b/cuda/stop/criterion_kernels.cu index 17bcbbc1567..e54b5d140f2 100644 --- a/cuda/stop/criterion_kernels.cu +++ b/cuda/stop/criterion_kernels.cu @@ -10,8 +10,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/stop/residual_norm_kernels.cu b/cuda/stop/residual_norm_kernels.cu index 18102d91ec5..7146d0cbf04 100644 --- a/cuda/stop/residual_norm_kernels.cu +++ b/cuda/stop/residual_norm_kernels.cu @@ -10,9 +10,9 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "core/base/array_access.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" #include 
"cuda/components/thread_ids.cuh" diff --git a/cuda/test/base/math.cu b/cuda/test/base/math.cu index c7f70fe3011..944e7642223 100644 --- a/cuda/test/base/math.cu +++ b/cuda/test/base/math.cu @@ -17,8 +17,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" #include "cuda/test/utils.hpp" diff --git a/cuda/test/components/cooperative_groups.cu b/cuda/test/components/cooperative_groups.cu index 1b514842e84..c9d9e6bf124 100644 --- a/cuda/test/components/cooperative_groups.cu +++ b/cuda/test/components/cooperative_groups.cu @@ -2,9 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "cuda/components/cooperative_groups.cuh" - - #include @@ -15,7 +12,8 @@ #include -#include "cuda/base/config.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "cuda/test/utils.hpp" diff --git a/cuda/test/components/merging.cu b/cuda/test/components/merging.cu index 6ef7d3ab3c4..37b032eb794 100644 --- a/cuda/test/components/merging.cu +++ b/cuda/test/components/merging.cu @@ -18,7 +18,7 @@ #include -#include "cuda/components/cooperative_groups.cuh" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "cuda/test/utils.hpp" diff --git a/cuda/test/components/searching.cu b/cuda/test/components/searching.cu index 0eeb383c05c..ffe00c247c0 100644 --- a/cuda/test/components/searching.cu +++ b/cuda/test/components/searching.cu @@ -17,7 +17,7 @@ #include -#include "cuda/components/cooperative_groups.cuh" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "cuda/test/utils.hpp" diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 8c68efae046..ee373243842 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -93,7 +93,7 @@ string(REPLACE ";" "," GKO_DPCPP_JACOBI_BLOCK_SIZES_CODE "${GKO_DPCPP_JACOBI_BLO configure_file(preconditioner/jacobi_common.hpp.in preconditioner/jacobi_common.hpp) 
ginkgo_compile_features(ginkgo_dpcpp) -target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP _ONEDPL_COMPILE_KERNEL=0) +target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP GKO_DEVICE_NAMESPACE=dpcpp _ONEDPL_COMPILE_KERNEL=0) set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") @@ -126,7 +126,7 @@ ginkgo_default_includes(ginkgo_dpcpp) ginkgo_install_library(ginkgo_dpcpp) if (GINKGO_CHECK_CIRCULAR_DEPS) - ginkgo_check_headers(ginkgo_dpcpp GKO_COMPILING_DPCPP) + ginkgo_check_headers(ginkgo_dpcpp "GKO_COMPILING_DPCPP;GKO_DEVICE_NAMESPACE=dpcpp") endif() if(GINKGO_BUILD_TESTS) diff --git a/dpcpp/test/base/CMakeLists.txt b/dpcpp/test/base/CMakeLists.txt index bb9c8a75050..38ecad08271 100644 --- a/dpcpp/test/base/CMakeLists.txt +++ b/dpcpp/test/base/CMakeLists.txt @@ -2,4 +2,4 @@ ginkgo_create_dpcpp_test(executor) ginkgo_create_dpcpp_test(dim3) ginkgo_create_dpcpp_test(kernel_launch) # set correct flags for kernel_launch.hpp -target_compile_definitions(dpcpp_test_base_kernel_launch PRIVATE GKO_COMPILING_DPCPP) +target_compile_definitions(dpcpp_test_base_kernel_launch PRIVATE GKO_COMPILING_DPCPP GKO_DEVICE_NAMESPACE=dpcpp) diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 046fd1e4d7a..bf2d6a6cf58 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -31,12 +31,12 @@ set(GINKGO_HIP_SOURCES factorization/par_ic_kernels.hip.cpp factorization/par_ict_kernels.hip.cpp factorization/par_ilu_kernels.hip.cpp - factorization/par_ilut_approx_filter_kernel.hip.cpp - factorization/par_ilut_filter_kernel.hip.cpp + factorization/par_ilut_approx_filter_kernels.hip.cpp + factorization/par_ilut_filter_kernels.hip.cpp factorization/par_ilut_select_common.hip.cpp - factorization/par_ilut_select_kernel.hip.cpp - factorization/par_ilut_spgeam_kernel.hip.cpp - factorization/par_ilut_sweep_kernel.hip.cpp + factorization/par_ilut_select_kernels.hip.cpp + 
factorization/par_ilut_spgeam_kernels.hip.cpp + factorization/par_ilut_sweep_kernels.hip.cpp matrix/batch_csr_kernels.hip.cpp matrix/batch_dense_kernels.hip.cpp matrix/batch_ell_kernels.hip.cpp @@ -51,10 +51,10 @@ set(GINKGO_HIP_SOURCES multigrid/pgm_kernels.hip.cpp preconditioner/batch_jacobi_kernels.hip.cpp preconditioner/isai_kernels.hip.cpp - preconditioner/jacobi_advanced_apply_kernel.hip.cpp - preconditioner/jacobi_generate_kernel.hip.cpp + preconditioner/jacobi_advanced_apply_kernels.hip.cpp + preconditioner/jacobi_generate_kernels.hip.cpp preconditioner/jacobi_kernels.hip.cpp - preconditioner/jacobi_simple_apply_kernel.hip.cpp + preconditioner/jacobi_simple_apply_kernels.hip.cpp reorder/rcm_kernels.hip.cpp solver/batch_bicgstab_kernels.hip.cpp solver/batch_cg_kernels.hip.cpp @@ -86,28 +86,28 @@ else() endif() foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_HIP_JACOBI_BLOCK_SIZES) configure_file( - preconditioner/jacobi_generate_instantiate.inc.hip.cpp - preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) + preconditioner/jacobi_generate_kernels.instantiate.hip.cpp + preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) configure_file( - preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp - preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) + preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp + preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) configure_file( - preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp - preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) + preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp + preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) # The 3D indexing used in Jacobi kernel triggers an instruction selection bug in Debug builds # Probably the same as 
https://github.com/llvm/llvm-project/issues/67574 # Fixed in ROCm 6.0 https://github.com/ROCm/llvm-project/commit/cd7f574a1fd1d3f3e8b9c1cae61fa8133a51de5f # and in LLVM trunk https://github.com/llvm/llvm-project/commit/cc3d2533cc2e4ea06981b86ede5087fbf801e789 set_source_files_properties( - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp PROPERTIES COMPILE_OPTIONS $<$:-O2>) list(APPEND GINKGO_HIP_SOURCES - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp - ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) endforeach() string(REPLACE ";" "," GKO_HIP_JACOBI_BLOCK_SIZES_CODE "${GKO_HIP_JACOBI_BLOCK_SIZES}") configure_file(preconditioner/jacobi_common.hip.hpp.in preconditioner/jacobi_common.hip.hpp) @@ -119,7 +119,7 @@ target_include_directories(ginkgo_hip PRIVATE 
${CMAKE_CURRENT_BINARY_DIR}/.. # for generated headers like jacobi_common.hip.hpp ) -target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP) +target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip) target_link_libraries(ginkgo_hip PUBLIC ginkgo_device) target_link_libraries(ginkgo_hip PRIVATE hip::host roc::hipblas roc::hipsparse hip::hiprand roc::rocrand) @@ -138,7 +138,7 @@ ginkgo_default_includes(ginkgo_hip) ginkgo_install_library(ginkgo_hip) if (GINKGO_CHECK_CIRCULAR_DEPS) - ginkgo_check_headers(ginkgo_hip GKO_COMPILING_HIP) + ginkgo_check_headers(ginkgo_hip "GKO_COMPILING_HIP;GKO_DEVICE_NAMESPACE=hip") endif() if(GINKGO_BUILD_TESTS) diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp index f5a1dba3977..74e6c34dc5d 100644 --- a/hip/base/batch_multi_vector_kernels.hip.cpp +++ b/hip/base/batch_multi_vector_kernels.hip.cpp @@ -5,7 +5,6 @@ #include "core/base/batch_multi_vector_kernels.hpp" -#include #include #include @@ -14,13 +13,14 @@ #include +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipblas_bindings.hip.hpp" -#include "hip/base/pointer_mode_guard.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp index fa44a22b554..4f09ec66bb8 100644 --- a/hip/base/batch_struct.hip.hpp +++ b/hip/base/batch_struct.hip.hpp @@ -10,9 +10,9 @@ #include +#include 
"common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" #include "core/base/batch_struct.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/types.hip.hpp" namespace gko { diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp index fbad841fd0f..89dc67255fc 100644 --- a/hip/base/config.hip.hpp +++ b/hip/base/config.hip.hpp @@ -6,15 +6,10 @@ #define GKO_HIP_BASE_CONFIG_HIP_HPP_ -#include - - -#include - - #include +#include "common/cuda_hip/base/runtime.hpp" #include "hip/base/math.hip.hpp" diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp index 58376c2175b..be897510056 100644 --- a/hip/base/device.hip.cpp +++ b/hip/base/device.hip.cpp @@ -5,14 +5,12 @@ #include -#include - - #include #include #include +#include "common/cuda_hip/base/runtime.hpp" #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/base/device_matrix_data_kernels.hip.cpp b/hip/base/device_matrix_data_kernels.hip.cpp index 745ba955014..5a0b762ea57 100644 --- a/hip/base/device_matrix_data_kernels.hip.cpp +++ b/hip/base/device_matrix_data_kernels.hip.cpp @@ -14,8 +14,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" namespace gko { diff --git a/hip/base/exception.hip.cpp b/hip/base/exception.hip.cpp index aed5e803d60..f0e17f4e873 100644 --- a/hip/base/exception.hip.cpp +++ b/hip/base/exception.hip.cpp @@ -8,7 +8,6 @@ #include -#include #if HIP_VERSION >= 50200000 #include #include @@ -23,6 +22,9 @@ #include +#include "common/cuda_hip/base/runtime.hpp" + + namespace gko { diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp index 2694ce4177f..4b5ce7afa7b 100644 --- a/hip/base/executor.hip.cpp +++ b/hip/base/executor.hip.cpp @@ -8,15 +8,13 @@ #include -#include - - #include #include #include -#include "hip/base/config.hip.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" #include 
"hip/base/hipblas_bindings.hip.hpp" #include "hip/base/hipsparse_bindings.hip.hpp" #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/base/hipblas_bindings.hip.hpp b/hip/base/hipblas_bindings.hip.hpp index f4dd3f1a1e8..d5dc94d6138 100644 --- a/hip/base/hipblas_bindings.hip.hpp +++ b/hip/base/hipblas_bindings.hip.hpp @@ -6,7 +6,6 @@ #define GKO_HIP_BASE_HIPBLAS_BINDINGS_HIP_HPP_ -#include #if HIP_VERSION >= 50200000 #include #else @@ -18,8 +17,9 @@ #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" namespace gko { @@ -260,6 +260,20 @@ inline void destroy_hipblas_handle(hipblasContext* handle) } // namespace hipblas + + +namespace blas { + + +using namespace hipblas; + + +#define BLAS_OP_N HIPBLAS_OP_N +#define BLAS_OP_T HIPBLAS_OP_T +#define BLAS_OP_C HIPBLAS_OP_C + + +} // namespace blas } // namespace hip } // namespace kernels } // namespace gko diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp index 471abb3ccd5..9fd7ade8231 100644 --- a/hip/base/hiprand_bindings.hip.hpp +++ b/hip/base/hiprand_bindings.hip.hpp @@ -6,7 +6,6 @@ #define GKO_HIP_BASE_HIPRAND_BINDINGS_HIP_HPP_ -#include #if HIP_VERSION >= 50200000 #include #else @@ -17,8 +16,9 @@ #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" namespace gko { @@ -90,6 +90,18 @@ GKO_BIND_HIPRAND_RANDOM_VECTOR(std::complex, } // namespace hiprand + + +namespace randlib { + + +using namespace hiprand; + + +#define RANDLIB_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT + + +} // namespace randlib } // namespace hip } // namespace kernels } // namespace gko diff --git a/hip/base/hipsparse_bindings.hip.hpp b/hip/base/hipsparse_bindings.hip.hpp index 62c7e60995e..0337f0a03c6 100644 --- a/hip/base/hipsparse_bindings.hip.hpp +++ 
b/hip/base/hipsparse_bindings.hip.hpp @@ -6,7 +6,6 @@ #define GKO_HIP_BASE_HIPSPARSE_BINDINGS_HIP_HPP_ -#include #if HIP_VERSION >= 50200000 #include #else @@ -18,7 +17,8 @@ #include -#include "hip/base/types.hip.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" namespace gko { @@ -955,6 +955,20 @@ GKO_BIND_HIPSPARSE_IC0(std::complex, hipsparseZcsric02); } // namespace hipsparse + + +namespace sparselib { + + +using namespace hipsparse; + + +#define SPARSELIB_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE +#define SPARSELIB_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE +#define SPARSELIB_SOLVE_POLICY_USE_LEVEL HIPSPARSE_SOLVE_POLICY_USE_LEVEL + + +} // namespace sparselib } // namespace hip } // namespace kernels } // namespace gko diff --git a/hip/base/hipsparse_block_bindings.hip.hpp b/hip/base/hipsparse_block_bindings.hip.hpp index eb9e8a31481..6fb70c4571c 100644 --- a/hip/base/hipsparse_block_bindings.hip.hpp +++ b/hip/base/hipsparse_block_bindings.hip.hpp @@ -6,7 +6,6 @@ #define GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_ -#include #if HIP_VERSION >= 50200000 #include #else @@ -17,8 +16,9 @@ #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" #include "hip/base/hipsparse_bindings.hip.hpp" -#include "hip/base/types.hip.hpp" namespace gko { diff --git a/hip/base/kernel_launch.hip.hpp b/hip/base/kernel_launch.hip.hpp index 1a00e99cac7..890b9922a4c 100644 --- a/hip/base/kernel_launch.hip.hpp +++ b/hip/base/kernel_launch.hip.hpp @@ -8,12 +8,12 @@ #endif -#include #include -#include "accessor/hip_helper.hpp" -#include "hip/base/types.hip.hpp" +#include "accessor/cuda_hip_helper.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" #include "hip/components/thread_ids.hip.hpp" @@ -24,21 +24,21 @@ namespace hip { template struct to_device_type_impl&> { - using type = std::decay_t>()))>; static type 
map_to_device(gko::acc::range& range) { - return gko::acc::as_hip_range(range); + return gko::acc::as_device_range(range); } }; template struct to_device_type_impl&> { - using type = std::decay_t>()))>; static type map_to_device(const gko::acc::range& range) { - return gko::acc::as_hip_range(range); + return gko::acc::as_device_range(range); } }; diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index 7c5d0c01c9c..c32fb592de0 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -8,9 +8,9 @@ #endif +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/base/kernel_launch_solver.hip.hpp b/hip/base/kernel_launch_solver.hip.hpp index 18532c9754c..eda18f35eab 100644 --- a/hip/base/kernel_launch_solver.hip.hpp +++ b/hip/base/kernel_launch_solver.hip.hpp @@ -8,7 +8,7 @@ #endif -#include +#include "common/cuda_hip/base/runtime.hpp" namespace gko { diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp index 0e14bf9f511..5fde8f518c6 100644 --- a/hip/base/memory.hip.cpp +++ b/hip/base/memory.hip.cpp @@ -5,12 +5,10 @@ #include -#include - - #include +#include "common/cuda_hip/base/runtime.hpp" #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/base/pointer_mode_guard.hip.hpp b/hip/base/pointer_mode_guard.hip.hpp index 2908164cccd..5cd4b3ec58f 100644 --- a/hip/base/pointer_mode_guard.hip.hpp +++ b/hip/base/pointer_mode_guard.hip.hpp @@ -9,7 +9,6 @@ #include -#include #if HIP_VERSION >= 50200000 #include #include @@ -24,6 +23,9 @@ #include +#include "common/cuda_hip/base/runtime.hpp" + + namespace gko { namespace kernels { namespace hip { diff --git 
a/hip/base/roctx.hip.cpp b/hip/base/roctx.hip.cpp index 0ed12a54786..46dad3be816 100644 --- a/hip/base/roctx.hip.cpp +++ b/hip/base/roctx.hip.cpp @@ -2,10 +2,10 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include +#include -#include +#include "common/cuda_hip/base/runtime.hpp" #if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX diff --git a/hip/base/scoped_device_id.hip.cpp b/hip/base/scoped_device_id.hip.cpp index ab6ed703da8..1fd7211b106 100644 --- a/hip/base/scoped_device_id.hip.cpp +++ b/hip/base/scoped_device_id.hip.cpp @@ -6,12 +6,10 @@ #include -#include - - #include +#include "common/cuda_hip/base/runtime.hpp" #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp index 93c1fc008d9..b56c5104428 100644 --- a/hip/base/stream.hip.cpp +++ b/hip/base/stream.hip.cpp @@ -5,14 +5,12 @@ #include -#include - - #include #include #include +#include "common/cuda_hip/base/runtime.hpp" #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/base/timer.hip.cpp b/hip/base/timer.hip.cpp index 44fe5b7cbeb..bd81d9f3be5 100644 --- a/hip/base/timer.hip.cpp +++ b/hip/base/timer.hip.cpp @@ -5,12 +5,10 @@ #include -#include - - #include +#include "common/cuda_hip/base/runtime.hpp" #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index 8827b2bea41..9ae2224c064 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -14,7 +14,8 @@ #include #include -#include + + #if HIP_VERSION >= 50200000 #include #else @@ -26,6 +27,9 @@ #include +#include "common/cuda_hip/base/runtime.hpp" + + namespace gko { @@ -430,6 +434,10 @@ GKO_INLINE GKO_ATTRIBUTES constexpr } +using deviceComplex = hipComplex; +using deviceDoubleComplex = hipDoubleComplex; + + } // namespace hip } // namespace kernels } // namespace gko diff --git a/hip/components/atomic.hip.hpp b/hip/components/atomic.hip.hpp index f57705ff408..0dc8d7a3b46 100644 --- a/hip/components/atomic.hip.hpp +++ 
b/hip/components/atomic.hip.hpp @@ -9,8 +9,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" namespace gko { @@ -21,38 +21,6 @@ namespace hip { #include "common/cuda_hip/components/atomic.hpp.inc" -/** - * @internal - * - * @note It is not 'real' complex atomic add operation - */ -__forceinline__ __device__ thrust::complex atomic_add( - thrust::complex* __restrict__ address, thrust::complex val) -{ - hipComplex* addr = reinterpret_cast(address); - // Separate to real part and imag part - auto real = atomic_add(static_cast(&(addr->x)), val.real()); - auto imag = atomic_add(static_cast(&(addr->y)), val.imag()); - return {real, imag}; -} - - -/** - * @internal - * - * @note It is not 'real' complex atomic add operation - */ -__forceinline__ __device__ thrust::complex atomic_add( - thrust::complex* __restrict__ address, thrust::complex val) -{ - hipDoubleComplex* addr = reinterpret_cast(address); - // Separate to real part and imag part - auto real = atomic_add(static_cast(&(addr->x)), val.real()); - auto imag = atomic_add(static_cast(&(addr->y)), val.imag()); - return {real, imag}; -} - - } // namespace hip } // namespace kernels } // namespace gko diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp index 247218a1457..e81441a092b 100644 --- a/hip/components/cooperative_groups.hip.hpp +++ b/hip/components/cooperative_groups.hip.hpp @@ -9,8 +9,8 @@ #include -#include "hip/base/config.hip.hpp" -#include "hip/base/types.hip.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" namespace gko { diff --git a/hip/components/diagonal_block_manipulation.hip.hpp b/hip/components/diagonal_block_manipulation.hip.hpp index 0261c7549c5..290511e7583 100644 --- a/hip/components/diagonal_block_manipulation.hip.hpp +++ b/hip/components/diagonal_block_manipulation.hip.hpp @@ -9,9 +9,9 @@ #include -#include 
"hip/base/config.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" namespace gko { diff --git a/hip/components/format_conversion.hip.hpp b/hip/components/format_conversion.hip.hpp index 59c0405a874..07daf486d84 100644 --- a/hip/components/format_conversion.hip.hpp +++ b/hip/components/format_conversion.hip.hpp @@ -6,14 +6,12 @@ #define GKO_HIP_COMPONENTS_FORMAT_CONVERSION_HIP_HPP_ -#include - - #include #include -#include "hip/components/cooperative_groups.hip.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/components/memory.hip.hpp b/hip/components/memory.hip.hpp index fd4fbb8ce11..4bb6fa19ec0 100644 --- a/hip/components/memory.hip.hpp +++ b/hip/components/memory.hip.hpp @@ -13,7 +13,7 @@ #include -#include "hip/base/types.hip.hpp" +#include "common/cuda_hip/base/types.hpp" namespace gko { diff --git a/hip/components/prefix_sum.hip.hpp b/hip/components/prefix_sum.hip.hpp index b5065589d8e..5acde03cbec 100644 --- a/hip/components/prefix_sum.hip.hpp +++ b/hip/components/prefix_sum.hip.hpp @@ -9,8 +9,8 @@ #include -#include "hip/base/hipblas_bindings.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/components/reduction.hip.hpp b/hip/components/reduction.hip.hpp index c8fa5e58b4f..fb0539952ff 100644 --- a/hip/components/reduction.hip.hpp +++ b/hip/components/reduction.hip.hpp @@ -9,16 +9,15 @@ #include -#include - - #include #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" 
+#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/array_access.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" @@ -57,7 +56,6 @@ __host__ ValueType reduce_add_array(std::shared_ptr exec, block_results.resize_and_reset(grid_dim); - reduce_add_array<<get_stream()>>>( size, as_device_type(source), diff --git a/hip/components/searching.hip.hpp b/hip/components/searching.hip.hpp index 2a6be767c2c..9222de9e1d6 100644 --- a/hip/components/searching.hip.hpp +++ b/hip/components/searching.hip.hpp @@ -6,7 +6,7 @@ #define GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_ -#include "hip/base/config.hip.hpp" +#include "common/cuda_hip/base/config.hpp" #include "hip/components/intrinsics.hip.hpp" diff --git a/hip/components/segment_scan.hip.hpp b/hip/components/segment_scan.hip.hpp index 7f98d08cf69..93ebb35833a 100644 --- a/hip/components/segment_scan.hip.hpp +++ b/hip/components/segment_scan.hip.hpp @@ -6,7 +6,7 @@ #define GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_ -#include "hip/components/cooperative_groups.hip.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" namespace gko { diff --git a/hip/components/sorting.hip.hpp b/hip/components/sorting.hip.hpp index 730c3c56401..4a664aee453 100644 --- a/hip/components/sorting.hip.hpp +++ b/hip/components/sorting.hip.hpp @@ -6,8 +6,8 @@ #define GKO_HIP_COMPONENTS_SORTING_HIP_HPP_ -#include "hip/base/config.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" namespace gko { diff --git a/hip/components/syncfree.hip.hpp b/hip/components/syncfree.hip.hpp index 9fe48944b56..7627a0a2781 100644 --- a/hip/components/syncfree.hip.hpp +++ b/hip/components/syncfree.hip.hpp @@ -9,11 +9,11 @@ #include +#include 
"common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "core/components/fill_array_kernels.hpp" -#include "hip/base/config.hip.hpp" #include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/memory.hip.hpp" namespace gko { diff --git a/hip/components/thread_ids.hip.hpp b/hip/components/thread_ids.hip.hpp index 03761983e02..6f0bd44ba9c 100644 --- a/hip/components/thread_ids.hip.hpp +++ b/hip/components/thread_ids.hip.hpp @@ -6,17 +6,12 @@ #define GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_ -#include "hip/base/config.hip.hpp" +#include "common/cuda_hip/base/config.hpp" namespace gko { namespace kernels { namespace hip { -/** - * @brief The HIP thread namespace. - * - * @ingroup hip_thread - */ namespace thread { diff --git a/hip/distributed/vector_kernels.hip.cpp b/hip/distributed/vector_kernels.hip.cpp index 320d847ed85..fc6718dec0d 100644 --- a/hip/distributed/vector_kernels.hip.cpp +++ b/hip/distributed/vector_kernels.hip.cpp @@ -5,9 +5,6 @@ #include "core/distributed/vector_kernels.hpp" -#include - - #include #include #include diff --git a/hip/factorization/cholesky_kernels.hip.cpp b/hip/factorization/cholesky_kernels.hip.cpp index 1dd94bb05d0..419db21b811 100644 --- a/hip/factorization/cholesky_kernels.hip.cpp +++ b/hip/factorization/cholesky_kernels.hip.cpp @@ -20,15 +20,15 @@ #include +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" #include "core/factorization/elimination_forest.hpp" #include "core/factorization/lu_kernels.hpp" #include "core/matrix/csr_lookup.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" #include "hip/base/math.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" 
#include "hip/components/intrinsics.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/syncfree.hip.hpp" @@ -80,19 +80,19 @@ void symbolic_count(std::shared_ptr exec, } // sort postorder_cols inside rows { - const auto handle = exec->get_hipsparse_handle(); - auto descr = hipsparse::create_mat_descr(); + const auto handle = exec->get_sparselib_handle(); + auto descr = sparselib::create_mat_descr(); array permutation_array(exec, mtx_nnz); auto permutation = permutation_array.get_data(); components::fill_seq_array(exec, permutation, mtx_nnz); size_type buffer_size{}; - hipsparse::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz, + sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz, row_ptrs, postorder_cols, buffer_size); array buffer_array{exec, buffer_size}; auto buffer = buffer_array.get_data(); - hipsparse::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs, + sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs, postorder_cols, permutation, buffer); - hipsparse::destroy(descr); + sparselib::destroy(descr); } // count nonzeros per row of L { diff --git a/hip/factorization/factorization_kernels.hip.cpp b/hip/factorization/factorization_kernels.hip.cpp index a2de4912fdb..4080768bc07 100644 --- a/hip/factorization/factorization_kernels.hip.cpp +++ b/hip/factorization/factorization_kernels.hip.cpp @@ -5,17 +5,16 @@ #include "core/factorization/factorization_kernels.hpp" -#include - - #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/array_access.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/searching.hip.hpp" #include 
"hip/components/thread_ids.hip.hpp" diff --git a/hip/factorization/ic_kernels.hip.cpp b/hip/factorization/ic_kernels.hip.cpp index 7a845547d0d..edda974fd36 100644 --- a/hip/factorization/ic_kernels.hip.cpp +++ b/hip/factorization/ic_kernels.hip.cpp @@ -5,13 +5,11 @@ #include "core/factorization/ic_kernels.hpp" -#include - - #include -#include "hip/base/hipsparse_bindings.hip.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" namespace gko { @@ -30,32 +28,32 @@ void compute(std::shared_ptr exec, matrix::Csr* m) { const auto id = exec->get_device_id(); - auto handle = exec->get_hipsparse_handle(); - auto desc = hipsparse::create_mat_descr(); - auto info = hipsparse::create_ic0_info(); + auto handle = exec->get_sparselib_handle(); + auto desc = sparselib::create_mat_descr(); + auto info = sparselib::create_ic0_info(); // get buffer size for IC IndexType num_rows = m->get_size()[0]; IndexType nnz = m->get_num_stored_elements(); size_type buffer_size{}; - hipsparse::ic0_buffer_size(handle, num_rows, nnz, desc, + sparselib::ic0_buffer_size(handle, num_rows, nnz, desc, m->get_const_values(), m->get_const_row_ptrs(), m->get_const_col_idxs(), info, buffer_size); array buffer{exec, buffer_size}; // set up IC(0) - hipsparse::ic0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), + sparselib::ic0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), m->get_const_row_ptrs(), m->get_const_col_idxs(), - info, HIPSPARSE_SOLVE_POLICY_USE_LEVEL, + info, SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); - hipsparse::ic0(handle, num_rows, nnz, desc, m->get_values(), + sparselib::ic0(handle, num_rows, nnz, desc, m->get_values(), m->get_const_row_ptrs(), m->get_const_col_idxs(), info, - HIPSPARSE_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); + SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); - hipsparse::destroy_ic0_info(info); - hipsparse::destroy(desc); + sparselib::destroy_ic0_info(info); + 
sparselib::destroy(desc); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL); diff --git a/hip/factorization/ilu_kernels.hip.cpp b/hip/factorization/ilu_kernels.hip.cpp index 071d3721536..f50df5ca75b 100644 --- a/hip/factorization/ilu_kernels.hip.cpp +++ b/hip/factorization/ilu_kernels.hip.cpp @@ -5,13 +5,11 @@ #include "core/factorization/ilu_kernels.hpp" -#include - - #include -#include "hip/base/hipsparse_bindings.hip.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" namespace gko { @@ -30,32 +28,32 @@ void compute_lu(std::shared_ptr exec, matrix::Csr* m) { const auto id = exec->get_device_id(); - auto handle = exec->get_hipsparse_handle(); - auto desc = hipsparse::create_mat_descr(); - auto info = hipsparse::create_ilu0_info(); + auto handle = exec->get_sparselib_handle(); + auto desc = sparselib::create_mat_descr(); + auto info = sparselib::create_ilu0_info(); // get buffer size for ILU IndexType num_rows = m->get_size()[0]; IndexType nnz = m->get_num_stored_elements(); size_type buffer_size{}; - hipsparse::ilu0_buffer_size(handle, num_rows, nnz, desc, + sparselib::ilu0_buffer_size(handle, num_rows, nnz, desc, m->get_const_values(), m->get_const_row_ptrs(), m->get_const_col_idxs(), info, buffer_size); array buffer{exec, buffer_size}; // set up ILU(0) - hipsparse::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), + sparselib::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(), m->get_const_row_ptrs(), m->get_const_col_idxs(), - info, HIPSPARSE_SOLVE_POLICY_USE_LEVEL, + info, SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); - hipsparse::ilu0(handle, num_rows, nnz, desc, m->get_values(), + sparselib::ilu0(handle, num_rows, nnz, desc, m->get_values(), m->get_const_row_ptrs(), m->get_const_col_idxs(), info, - HIPSPARSE_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); + SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data()); - 
hipsparse::destroy_ilu0_info(info); - hipsparse::destroy(desc); + sparselib::destroy_ilu0_info(info); + sparselib::destroy(desc); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/factorization/lu_kernels.hip.cpp b/hip/factorization/lu_kernels.hip.cpp index e1c60103dd3..ec3e771134e 100644 --- a/hip/factorization/lu_kernels.hip.cpp +++ b/hip/factorization/lu_kernels.hip.cpp @@ -17,11 +17,11 @@ #include +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/allocator.hpp" #include "core/matrix/csr_lookup.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/syncfree.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/factorization/par_ic_kernels.hip.cpp b/hip/factorization/par_ic_kernels.hip.cpp index dd91ac27339..e4cd0b2470b 100644 --- a/hip/factorization/par_ic_kernels.hip.cpp +++ b/hip/factorization/par_ic_kernels.hip.cpp @@ -10,9 +10,9 @@ #include +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/memory.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp index 4b27383bff5..7f5dba82eba 100644 --- a/hip/factorization/par_ict_kernels.hip.cpp +++ b/hip/factorization/par_ict_kernels.hip.cpp @@ -5,9 +5,6 @@ #include "core/factorization/par_ict_kernels.hpp" -#include - - #include #include #include @@ -15,6 +12,8 @@ #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" @@ -22,7 +21,6 @@ #include 
"core/synthesizer/implementation_selection.hpp" #include "hip/base/math.hip.hpp" #include "hip/components/intrinsics.hip.hpp" -#include "hip/components/memory.hip.hpp" #include "hip/components/merging.hip.hpp" #include "hip/components/prefix_sum.hip.hpp" #include "hip/components/reduction.hip.hpp" @@ -49,8 +47,7 @@ using compiled_kernels = syn::value_list; -#include "common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc" -#include "common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc" +#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc" namespace { diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp index b10941d44f1..49608d6801f 100644 --- a/hip/factorization/par_ilu_kernels.hip.cpp +++ b/hip/factorization/par_ilu_kernels.hip.cpp @@ -5,16 +5,13 @@ #include "core/factorization/par_ilu_kernels.hpp" -#include - - -#include #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/memory.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp similarity index 97% rename from hip/factorization/par_ilut_approx_filter_kernel.hip.cpp rename to hip/factorization/par_ilut_approx_filter_kernels.hip.cpp index d730e33e418..b5612ea29c6 100644 --- a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp @@ -8,9 +8,6 @@ #include -#include - - #include #include #include @@ -18,16 +15,17 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" 
#include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/prefix_sum.hip.hpp" #include "hip/components/sorting.hip.hpp" diff --git a/hip/factorization/par_ilut_filter_kernel.hip.cpp b/hip/factorization/par_ilut_filter_kernels.hip.cpp similarity index 96% rename from hip/factorization/par_ilut_filter_kernel.hip.cpp rename to hip/factorization/par_ilut_filter_kernels.hip.cpp index eef1044878e..e6d0a6348cc 100644 --- a/hip/factorization/par_ilut_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_filter_kernels.hip.cpp @@ -5,9 +5,6 @@ #include "core/factorization/par_ilut_kernels.hpp" -#include - - #include #include #include @@ -15,15 +12,16 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/hip/factorization/par_ilut_select_common.hip.cpp index 85c2eaa7036..ddad307dc62 100644 --- a/hip/factorization/par_ilut_select_common.hip.cpp +++ b/hip/factorization/par_ilut_select_common.hip.cpp @@ -4,7 +4,7 @@ // 
force-top: on // prevent compilation failure related to disappearing assert(...) statements -#include +#include "common/cuda_hip/base/runtime.hpp" // force-top: off diff --git a/hip/factorization/par_ilut_select_kernel.hip.cpp b/hip/factorization/par_ilut_select_kernels.hip.cpp similarity index 99% rename from hip/factorization/par_ilut_select_kernel.hip.cpp rename to hip/factorization/par_ilut_select_kernels.hip.cpp index b6d93e65b24..b259133b95d 100644 --- a/hip/factorization/par_ilut_select_kernel.hip.cpp +++ b/hip/factorization/par_ilut_select_kernels.hip.cpp @@ -8,14 +8,12 @@ #include -#include - - #include #include #include +#include "common/cuda_hip/base/runtime.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "hip/base/math.hip.hpp" #include "hip/components/atomic.hip.hpp" diff --git a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp similarity index 98% rename from hip/factorization/par_ilut_spgeam_kernel.hip.cpp rename to hip/factorization/par_ilut_spgeam_kernels.hip.cpp index ad102e49488..df77b1ba7a2 100644 --- a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp +++ b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp @@ -5,9 +5,6 @@ #include "core/factorization/par_ilut_kernels.hpp" -#include - - #include #include #include @@ -15,13 +12,14 @@ #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/math.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/merging.hip.hpp" #include "hip/components/prefix_sum.hip.hpp" diff --git a/hip/factorization/par_ilut_sweep_kernel.hip.cpp 
b/hip/factorization/par_ilut_sweep_kernels.hip.cpp similarity index 97% rename from hip/factorization/par_ilut_sweep_kernel.hip.cpp rename to hip/factorization/par_ilut_sweep_kernels.hip.cpp index bdcecc609d5..0f1e6455812 100644 --- a/hip/factorization/par_ilut_sweep_kernel.hip.cpp +++ b/hip/factorization/par_ilut_sweep_kernels.hip.cpp @@ -5,9 +5,6 @@ #include "core/factorization/par_ilut_kernels.hpp" -#include - - #include #include #include @@ -15,6 +12,8 @@ #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" @@ -22,7 +21,6 @@ #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/math.hip.hpp" #include "hip/components/intrinsics.hip.hpp" -#include "hip/components/memory.hip.hpp" #include "hip/components/merging.hip.hpp" #include "hip/components/prefix_sum.hip.hpp" #include "hip/components/reduction.hip.hpp" @@ -85,7 +83,6 @@ void compute_l_u_factors(syn::value_list, } } - GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors, compute_l_u_factors); diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp index 432213f3083..de73576ffed 100644 --- a/hip/matrix/batch_csr_kernels.hip.cpp +++ b/hip/matrix/batch_csr_kernels.hip.cpp @@ -5,7 +5,6 @@ #include "core/matrix/batch_csr_kernels.hpp" -#include #include @@ -14,12 +13,13 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include 
"hip/components/uninitialized_array.hip.hpp" diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp index 0d03d4ea10b..5d3b9d8cef9 100644 --- a/hip/matrix/batch_dense_kernels.hip.cpp +++ b/hip/matrix/batch_dense_kernels.hip.cpp @@ -5,19 +5,21 @@ #include "core/matrix/batch_dense_kernels.hpp" -#include #include +#include #include +#include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp index 221a3ec65dd..d415f114c3b 100644 --- a/hip/matrix/batch_ell_kernels.hip.cpp +++ b/hip/matrix/batch_ell_kernels.hip.cpp @@ -5,7 +5,6 @@ #include "core/matrix/batch_ell_kernels.hpp" -#include #include @@ -14,12 +13,13 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp index 6c98146161e..16a267d95b6 100644 --- a/hip/matrix/batch_struct.hip.hpp +++ b/hip/matrix/batch_struct.hip.hpp @@ -13,8 +13,8 @@ #include 
+#include "common/cuda_hip/base/types.hpp" #include "core/base/batch_struct.hpp" -#include "hip/base/types.hip.hpp" namespace gko { diff --git a/hip/matrix/coo_kernels.hip.cpp b/hip/matrix/coo_kernels.hip.cpp index 5e32e1d8502..8f7a050ef87 100644 --- a/hip/matrix/coo_kernels.hip.cpp +++ b/hip/matrix/coo_kernels.hip.cpp @@ -5,9 +5,6 @@ #include "core/matrix/coo_kernels.hpp" -#include - - #include #include #include @@ -15,25 +12,21 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" #include "core/matrix/dense_kernels.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/format_conversion.hip.hpp" #include "hip/components/segment_scan.hip.hpp" #include "hip/components/thread_ids.hip.hpp" namespace gko { namespace kernels { -/** - * @brief The HIP namespace. - * - * @ingroup hip - */ namespace hip { /** * @brief The Coordinate matrix format namespace. 
diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp index 599a2df3669..8b3579f049c 100644 --- a/hip/matrix/csr_kernels.template.hip.cpp +++ b/hip/matrix/csr_kernels.template.hip.cpp @@ -8,7 +8,6 @@ #include -#include #include #include #include @@ -28,7 +27,13 @@ #include -#include "accessor/hip_helper.hpp" +#include "accessor/cuda_hip_helper.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/array_access.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" @@ -39,14 +44,9 @@ #include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/pointer_mode_guard.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/merging.hip.hpp" #include "hip/components/prefix_sum.hip.hpp" @@ -133,10 +133,11 @@ void merge_path_spmv(syn::value_list, kernel::abstract_merge_path_spmv <<get_stream()>>>( static_cast(a->get_size()[0]), - acc::as_hip_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), as_device_type(a->get_const_srow()), - acc::as_hip_range(b_vals), acc::as_hip_range(c_vals), + acc::as_device_range(b_vals), + acc::as_device_range(c_vals), as_device_type(row_out.get_data()), as_device_type(val_out.get_data())); } @@ -144,7 +145,7 @@ 
void merge_path_spmv(syn::value_list, abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>( grid_num, as_device_type(val_out.get_data()), as_device_type(row_out.get_data()), - acc::as_hip_range(c_vals)); + acc::as_device_range(c_vals)); } else if (alpha != nullptr && beta != nullptr) { if (grid_num > 0) { @@ -152,12 +153,12 @@ void merge_path_spmv(syn::value_list, <<get_stream()>>>( static_cast(a->get_size()[0]), as_device_type(alpha->get_const_values()), - acc::as_hip_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), as_device_type(a->get_const_srow()), - acc::as_hip_range(b_vals), + acc::as_device_range(b_vals), as_device_type(beta->get_const_values()), - acc::as_hip_range(c_vals), + acc::as_device_range(c_vals), as_device_type(row_out.get_data()), as_device_type(val_out.get_data())); } @@ -166,7 +167,7 @@ void merge_path_spmv(syn::value_list, grid_num, as_device_type(val_out.get_data()), as_device_type(row_out.get_data()), as_device_type(alpha->get_const_values()), - acc::as_hip_range(c_vals)); + acc::as_device_range(c_vals)); } else { GKO_KERNEL_NOT_FOUND; } @@ -262,21 +263,21 @@ void classical_spmv(syn::value_list, if (grid.x > 0 && grid.y > 0) { kernel::abstract_classical_spmv <<get_stream()>>>( - a->get_size()[0], acc::as_hip_range(a_vals), + a->get_size()[0], acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), - acc::as_hip_range(b_vals), acc::as_hip_range(c_vals)); + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); } } else if (alpha != nullptr && beta != nullptr) { if (grid.x > 0 && grid.y > 0) { kernel::abstract_classical_spmv <<get_stream()>>>( a->get_size()[0], as_device_type(alpha->get_const_values()), - acc::as_hip_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), - acc::as_hip_range(b_vals), + 
acc::as_device_range(b_vals), as_device_type(beta->get_const_values()), - acc::as_hip_range(c_vals)); + acc::as_device_range(c_vals)); } } else { GKO_KERNEL_NOT_FOUND; @@ -318,20 +319,20 @@ void load_balance_spmv(std::shared_ptr exec, exec->get_stream()>>>( nwarps, static_cast(a->get_size()[0]), as_device_type(alpha->get_const_values()), - acc::as_hip_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), as_device_type(a->get_const_srow()), - acc::as_hip_range(b_vals), acc::as_hip_range(c_vals)); + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); } } else { if (csr_grid.x > 0 && csr_grid.y > 0) { kernel::abstract_spmv<<get_stream()>>>( nwarps, static_cast(a->get_size()[0]), - acc::as_hip_range(a_vals), a->get_const_col_idxs(), + acc::as_device_range(a_vals), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), as_device_type(a->get_const_srow()), - acc::as_hip_range(b_vals), acc::as_hip_range(c_vals)); + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); } } } @@ -346,24 +347,24 @@ bool try_general_sparselib_spmv(std::shared_ptr exec, const ValueType* beta, matrix::Dense* c) { - bool try_sparselib = hipsparse::is_supported::value; + bool try_sparselib = sparselib::is_supported::value; try_sparselib = try_sparselib && b->get_stride() == 1 && c->get_stride() == 1; // rocSPARSE has issues with zero matrices try_sparselib = try_sparselib && a->get_num_stored_elements() > 0; if (try_sparselib) { - auto descr = hipsparse::create_mat_descr(); + auto descr = sparselib::create_mat_descr(); auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - hipsparse::spmv(exec->get_hipsparse_handle(), - HIPSPARSE_OPERATION_NON_TRANSPOSE, a->get_size()[0], + sparselib::spmv(exec->get_sparselib_handle(), + SPARSELIB_OPERATION_NON_TRANSPOSE, a->get_size()[0], a->get_size()[1], a->get_num_stored_elements(), alpha, descr, a->get_const_values(), 
row_ptrs, col_idxs, b->get_const_values(), beta, c->get_values()); - hipsparse::destroy(descr); + sparselib::destroy(descr); } return try_sparselib; } @@ -397,8 +398,8 @@ bool try_sparselib_spmv(std::shared_ptr exec, return try_general_sparselib_spmv(exec, alpha->get_const_values(), a, b, beta->get_const_values(), c); } else { - auto handle = exec->get_hipsparse_handle(); - hipsparse::pointer_mode_guard pm_guard(handle); + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); const auto valpha = one(); const auto vbeta = zero(); return try_general_sparselib_spmv(exec, &valpha, a, b, &vbeta, c); @@ -535,14 +536,14 @@ void spgemm(std::shared_ptr exec, const matrix::Csr* b, matrix::Csr* c) { - if (hipsparse::is_supported::value) { - auto handle = exec->get_hipsparse_handle(); - hipsparse::pointer_mode_guard pm_guard(handle); - auto a_descr = hipsparse::create_mat_descr(); - auto b_descr = hipsparse::create_mat_descr(); - auto c_descr = hipsparse::create_mat_descr(); - auto d_descr = hipsparse::create_mat_descr(); - auto info = hipsparse::create_spgemm_info(); + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); + auto a_descr = sparselib::create_mat_descr(); + auto b_descr = sparselib::create_mat_descr(); + auto c_descr = sparselib::create_mat_descr(); + auto d_descr = sparselib::create_mat_descr(); + auto info = sparselib::create_spgemm_info(); auto alpha = one(); auto a_nnz = static_cast(a->get_num_stored_elements()); @@ -566,7 +567,7 @@ void spgemm(std::shared_ptr exec, // allocate buffer size_type buffer_size{}; - hipsparse::spgemm_buffer_size( + sparselib::spgemm_buffer_size( handle, m, n, k, &alpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz, null_index, null_index, info, buffer_size); @@ -575,7 +576,7 @@ void spgemm(std::shared_ptr exec, // count nnz IndexType c_nnz{}; - 
hipsparse::spgemm_nnz( + sparselib::spgemm_nnz( handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, d_descr, zero_nnz, null_index, null_index, c_descr, c_row_ptrs, &c_nnz, info, buffer); @@ -585,17 +586,17 @@ void spgemm(std::shared_ptr exec, c_vals_array.resize_and_reset(c_nnz); auto c_col_idxs = c_col_idxs_array.get_data(); auto c_vals = c_vals_array.get_data(); - hipsparse::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals, + sparselib::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals, a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz, null_value, null_index, null_index, c_descr, c_vals, c_row_ptrs, c_col_idxs, info, buffer); - hipsparse::destroy_spgemm_info(info); - hipsparse::destroy(d_descr); - hipsparse::destroy(c_descr); - hipsparse::destroy(b_descr); - hipsparse::destroy(a_descr); + sparselib::destroy_spgemm_info(info); + sparselib::destroy(d_descr); + sparselib::destroy(c_descr); + sparselib::destroy(b_descr); + sparselib::destroy(a_descr); } else { GKO_NOT_IMPLEMENTED; } @@ -611,14 +612,14 @@ void advanced_spgemm(std::shared_ptr exec, const matrix::Csr* d, matrix::Csr* c) { - if (hipsparse::is_supported::value) { - auto handle = exec->get_hipsparse_handle(); - hipsparse::pointer_mode_guard pm_guard(handle); - auto a_descr = hipsparse::create_mat_descr(); - auto b_descr = hipsparse::create_mat_descr(); - auto c_descr = hipsparse::create_mat_descr(); - auto d_descr = hipsparse::create_mat_descr(); - auto info = hipsparse::create_spgemm_info(); + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); + auto a_descr = sparselib::create_mat_descr(); + auto b_descr = sparselib::create_mat_descr(); + auto c_descr = sparselib::create_mat_descr(); + auto d_descr = sparselib::create_mat_descr(); + auto info = sparselib::create_spgemm_info(); auto a_nnz = 
static_cast(a->get_num_stored_elements()); auto a_vals = a->get_const_values(); @@ -640,7 +641,7 @@ void advanced_spgemm(std::shared_ptr exec, // allocate buffer size_type buffer_size{}; - hipsparse::spgemm_buffer_size( + sparselib::spgemm_buffer_size( handle, m, n, k, &one_value, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr, IndexType{}, null_index, null_index, info, buffer_size); @@ -651,7 +652,7 @@ void advanced_spgemm(std::shared_ptr exec, array c_tmp_row_ptrs_array(exec, m + 1); auto c_tmp_row_ptrs = c_tmp_row_ptrs_array.get_data(); IndexType c_nnz{}; - hipsparse::spgemm_nnz( + sparselib::spgemm_nnz( handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs, d_descr, IndexType{}, null_index, null_index, c_descr, c_tmp_row_ptrs, &c_nnz, info, buffer); @@ -661,7 +662,7 @@ void advanced_spgemm(std::shared_ptr exec, array c_tmp_vals_array(exec, c_nnz); auto c_tmp_col_idxs = c_tmp_col_idxs_array.get_data(); auto c_tmp_vals = c_tmp_vals_array.get_data(); - hipsparse::spgemm(handle, m, n, k, &one_value, a_descr, a_nnz, a_vals, + sparselib::spgemm(handle, m, n, k, &one_value, a_descr, a_nnz, a_vals, a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, b_row_ptrs, b_col_idxs, null_value, d_descr, IndexType{}, null_value, null_index, null_index, @@ -669,11 +670,11 @@ void advanced_spgemm(std::shared_ptr exec, info, buffer); // destroy hipsparse context - hipsparse::destroy_spgemm_info(info); - hipsparse::destroy(d_descr); - hipsparse::destroy(c_descr); - hipsparse::destroy(b_descr); - hipsparse::destroy(a_descr); + sparselib::destroy_spgemm_info(info); + sparselib::destroy(d_descr); + sparselib::destroy(c_descr); + sparselib::destroy(b_descr); + sparselib::destroy(a_descr); auto total_nnz = c_nnz + d->get_num_stored_elements(); auto nnz_per_row = total_nnz / m; @@ -701,12 +702,12 @@ void transpose(std::shared_ptr exec, if (orig->get_size()[0] == 0) { return; } - if 
(hipsparse::is_supported::value) { + if (sparselib::is_supported::value) { hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC; hipsparseIndexBase_t idxBase = HIPSPARSE_INDEX_BASE_ZERO; - hipsparse::transpose( - exec->get_hipsparse_handle(), orig->get_size()[0], + sparselib::transpose( + exec->get_sparselib_handle(), orig->get_size()[0], orig->get_size()[1], orig->get_num_stored_elements(), orig->get_const_values(), orig->get_const_row_ptrs(), orig->get_const_col_idxs(), trans->get_values(), @@ -728,12 +729,12 @@ void conj_transpose(std::shared_ptr exec, const auto block_size = default_block_size; const auto grid_size = ceildiv(trans->get_num_stored_elements(), block_size); - if (hipsparse::is_supported::value) { + if (sparselib::is_supported::value) { hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC; hipsparseIndexBase_t idxBase = HIPSPARSE_INDEX_BASE_ZERO; - hipsparse::transpose( - exec->get_hipsparse_handle(), orig->get_size()[0], + sparselib::transpose( + exec->get_sparselib_handle(), orig->get_size()[0], orig->get_size()[1], orig->get_num_stored_elements(), orig->get_const_values(), orig->get_const_row_ptrs(), orig->get_const_col_idxs(), trans->get_values(), @@ -753,9 +754,9 @@ template void sort_by_column_index(std::shared_ptr exec, matrix::Csr* to_sort) { - if (hipsparse::is_supported::value) { - auto handle = exec->get_hipsparse_handle(); - auto descr = hipsparse::create_mat_descr(); + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + auto descr = sparselib::create_mat_descr(); auto m = IndexType(to_sort->get_size()[0]); auto n = IndexType(to_sort->get_size()[1]); auto nnz = IndexType(to_sort->get_num_stored_elements()); @@ -771,23 +772,23 @@ void sort_by_column_index(std::shared_ptr exec, // init identity permutation array permutation_array(exec, nnz); auto permutation = permutation_array.get_data(); - hipsparse::create_identity_permutation(handle, nnz, permutation); + components::fill_seq_array(exec, 
permutation, nnz); // allocate buffer size_type buffer_size{}; - hipsparse::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs, + sparselib::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs, buffer_size); array buffer_array{exec, buffer_size}; auto buffer = buffer_array.get_data(); // sort column indices - hipsparse::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs, + sparselib::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs, permutation, buffer); // sort values - hipsparse::gather(handle, nnz, tmp_vals, vals, permutation); + sparselib::gather(handle, nnz, tmp_vals, vals, permutation); - hipsparse::destroy(descr); + sparselib::destroy(descr); } else { fallback_sort(exec, to_sort); } diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index 36e581049e0..8fed3c97c1b 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -5,9 +5,6 @@ #include "core/matrix/dense_kernels.hpp" -#include - - #include #include #include @@ -20,12 +17,13 @@ #include +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/utils.hpp" #include "core/components/prefix_sum_kernels.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipblas_bindings.hip.hpp" -#include "hip/base/pointer_mode_guard.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" @@ -56,11 +54,11 @@ void compute_dot_dispatch(std::shared_ptr exec, matrix::Dense* result, array& tmp) { if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); - hipblas::dot(handle, x->get_size()[0], 
x->get_const_values(), - x->get_stride(), y->get_const_values(), - y->get_stride(), result->get_values()); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + blas::dot(handle, x->get_size()[0], x->get_const_values(), + x->get_stride(), y->get_const_values(), y->get_stride(), + result->get_values()); } else { compute_dot(exec, x, y, result, tmp); } @@ -81,11 +79,11 @@ void compute_conj_dot_dispatch(std::shared_ptr exec, array& tmp) { if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); - hipblas::conj_dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), - y->get_stride(), result->get_values()); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + blas::conj_dot(handle, x->get_size()[0], x->get_const_values(), + x->get_stride(), y->get_const_values(), + y->get_stride(), result->get_values()); } else { compute_conj_dot(exec, x, y, result, tmp); } @@ -105,10 +103,10 @@ void compute_norm2_dispatch(std::shared_ptr exec, array& tmp) { if (x->get_size()[1] == 1) { - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); - hipblas::norm2(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), result->get_values()); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + blas::norm2(handle, x->get_size()[0], x->get_const_values(), + x->get_stride(), result->get_values()); } else { compute_norm2(exec, x, result, tmp); } @@ -127,19 +125,18 @@ void simple_apply(std::shared_ptr exec, const matrix::Dense* b, matrix::Dense* c) { - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { if (a->get_size()[1] > 0) { - hipblas::pointer_mode_guard pm_guard(handle); + blas::pointer_mode_guard 
pm_guard(handle); auto alpha = one(); auto beta = zero(); - hipblas::gemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, - c->get_size()[1], c->get_size()[0], - a->get_size()[1], &alpha, b->get_const_values(), - b->get_stride(), a->get_const_values(), - a->get_stride(), &beta, c->get_values(), - c->get_stride()); + blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1], + c->get_size()[0], a->get_size()[1], &alpha, + b->get_const_values(), b->get_stride(), + a->get_const_values(), a->get_stride(), &beta, + c->get_values(), c->get_stride()); } else { dense::fill(exec, c, zero()); } @@ -158,15 +155,15 @@ void apply(std::shared_ptr exec, const matrix::Dense* a, const matrix::Dense* b, const matrix::Dense* beta, matrix::Dense* c) { - if (hipblas::is_supported::value) { + if (blas::is_supported::value) { if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { if (a->get_size()[1] > 0) { - hipblas::gemm( - exec->get_hipblas_handle(), HIPBLAS_OP_N, HIPBLAS_OP_N, - c->get_size()[1], c->get_size()[0], a->get_size()[1], - alpha->get_const_values(), b->get_const_values(), - b->get_stride(), a->get_const_values(), a->get_stride(), - beta->get_const_values(), c->get_values(), c->get_stride()); + blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N, + c->get_size()[1], c->get_size()[0], a->get_size()[1], + alpha->get_const_values(), b->get_const_values(), + b->get_stride(), a->get_const_values(), + a->get_stride(), beta->get_const_values(), + c->get_values(), c->get_stride()); } else { dense::scale(exec, beta, c); } @@ -184,17 +181,17 @@ void transpose(std::shared_ptr exec, const matrix::Dense* orig, matrix::Dense* trans) { - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { - hipblas::pointer_mode_guard pm_guard(handle); + blas::pointer_mode_guard pm_guard(handle); auto alpha = one(); auto beta = zero(); - 
hipblas::geam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N, - orig->get_size()[0], orig->get_size()[1], &alpha, - orig->get_const_values(), orig->get_stride(), &beta, - trans->get_const_values(), trans->get_stride(), - trans->get_values(), trans->get_stride()); + blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0], + orig->get_size()[1], &alpha, orig->get_const_values(), + orig->get_stride(), &beta, trans->get_const_values(), + trans->get_stride(), trans->get_values(), + trans->get_stride()); } } else { GKO_NOT_IMPLEMENTED; @@ -209,17 +206,17 @@ void conj_transpose(std::shared_ptr exec, const matrix::Dense* orig, matrix::Dense* trans) { - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { - hipblas::pointer_mode_guard pm_guard(handle); + blas::pointer_mode_guard pm_guard(handle); auto alpha = one(); auto beta = zero(); - hipblas::geam(handle, HIPBLAS_OP_C, HIPBLAS_OP_N, - orig->get_size()[0], orig->get_size()[1], &alpha, - orig->get_const_values(), orig->get_stride(), &beta, - trans->get_values(), trans->get_stride(), - trans->get_values(), trans->get_stride()); + blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0], + orig->get_size()[1], &alpha, orig->get_const_values(), + orig->get_stride(), &beta, trans->get_const_values(), + trans->get_stride(), trans->get_values(), + trans->get_stride()); } } else { GKO_NOT_IMPLEMENTED; diff --git a/hip/matrix/diagonal_kernels.hip.cpp b/hip/matrix/diagonal_kernels.hip.cpp index deedb9543ec..01033004c6b 100644 --- a/hip/matrix/diagonal_kernels.hip.cpp +++ b/hip/matrix/diagonal_kernels.hip.cpp @@ -5,16 +5,14 @@ #include "core/matrix/diagonal_kernels.hpp" -#include - - #include #include -#include "hip/base/hipsparse_bindings.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" +#include 
"common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index 51c34430f5c..4f1ff6a3539 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -8,9 +8,6 @@ #include -#include - - #include #include #include @@ -18,19 +15,20 @@ #include -#include "accessor/hip_helper.hpp" +#include "accessor/cuda_hip_helper.hpp" #include "accessor/reduced_row_major.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/format_conversion.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" @@ -133,20 +131,21 @@ void abstract_spmv(syn::value_list, if (grid_size.x > 0 && grid_size.y > 0) { kernel::spmv <<get_stream()>>>( - nrows, num_worker_per_row, acc::as_hip_range(a_vals), + nrows, num_worker_per_row, acc::as_device_range(a_vals), a->get_const_col_idxs(), stride, - num_stored_elements_per_row, acc::as_hip_range(b_vals), + num_stored_elements_per_row, acc::as_device_range(b_vals), as_device_type(c->get_values()), c->get_stride()); } } 
else if (alpha != nullptr && beta != nullptr) { + const auto alpha_val = acc::range( + std::array{1}, alpha->get_const_values()); if (grid_size.x > 0 && grid_size.y > 0) { - const auto alpha_val = acc::range( - std::array{1}, alpha->get_const_values()); kernel::spmv <<get_stream()>>>( - nrows, num_worker_per_row, acc::as_hip_range(alpha_val), - acc::as_hip_range(a_vals), a->get_const_col_idxs(), stride, - num_stored_elements_per_row, acc::as_hip_range(b_vals), + nrows, num_worker_per_row, acc::as_device_range(alpha_val), + acc::as_device_range(a_vals), a->get_const_col_idxs(), + stride, num_stored_elements_per_row, + acc::as_device_range(b_vals), as_device_type(beta->get_const_values()), as_device_type(c->get_values()), c->get_stride()); } @@ -215,7 +214,7 @@ void spmv(std::shared_ptr exec, const int num_worker_per_row = std::get<2>(data); /** - * info is the parameter for selecting the hip kernel. + * info is the parameter for selecting the device kernel. * for info == 0, it uses the kernel by warp_size threads with atomic * operation for other value, it uses the kernel without atomic_add */ @@ -249,7 +248,7 @@ void advanced_spmv(std::shared_ptr exec, const int num_worker_per_row = std::get<2>(data); /** - * info is the parameter for selecting the hip kernel. + * info is the parameter for selecting the device kernel. 
* for info == 0, it uses the kernel by warp_size threads with atomic * operation for other value, it uses the kernel without atomic_add */ diff --git a/hip/matrix/fbcsr_kernels.template.hip.cpp b/hip/matrix/fbcsr_kernels.template.hip.cpp index b84e7644e80..0286aff0bba 100644 --- a/hip/matrix/fbcsr_kernels.template.hip.cpp +++ b/hip/matrix/fbcsr_kernels.template.hip.cpp @@ -8,7 +8,6 @@ #include -#include #include #include #include @@ -25,6 +24,13 @@ #include +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/unified/base/kernel_launch.hpp" #include "core/base/array_access.hpp" #include "core/base/block_sizes.hpp" @@ -34,22 +40,17 @@ #include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipblas_bindings.hip.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" #include "hip/base/hipsparse_block_bindings.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/pointer_mode_guard.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/merging.hip.hpp" #include "hip/components/prefix_sum.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" + namespace gko { namespace kernels { namespace hip { @@ -82,15 +83,15 @@ void dense_transpose(std::shared_ptr exec, if (nrows == 0) { return; } - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); + if 
(blas::is_supported::value) { + auto handle = exec->get_blas_handle(); { - hipblas::pointer_mode_guard pm_guard(handle); + blas::pointer_mode_guard pm_guard(handle); auto alpha = one(); auto beta = zero(); - hipblas::geam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N, nrows, ncols, - &alpha, orig, orig_stride, &beta, trans, trans_stride, - trans, trans_stride); + blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig, + orig_stride, &beta, trans, trans_stride, trans, + trans_stride); } } else { GKO_NOT_IMPLEMENTED; @@ -116,12 +117,12 @@ void spmv(std::shared_ptr exec, dense::fill(exec, c, zero()); return; } - if (hipsparse::is_supported::value) { - auto handle = exec->get_hipsparse_handle(); - hipsparse::pointer_mode_guard pm_guard(handle); + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); const auto alpha = one(); const auto beta = zero(); - auto descr = hipsparse::create_mat_descr(); + auto descr = sparselib::create_mat_descr(); const auto row_ptrs = a->get_const_row_ptrs(); const auto col_idxs = a->get_const_col_idxs(); const auto values = a->get_const_values(); @@ -135,21 +136,21 @@ void spmv(std::shared_ptr exec, const auto in_stride = b->get_stride(); const auto out_stride = c->get_stride(); if (nrhs == 1 && in_stride == 1 && out_stride == 1) { - hipsparse::bsrmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, + sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, nnzb, &alpha, descr, values, row_ptrs, col_idxs, bs, b->get_const_values(), &beta, c->get_values()); } else { const auto trans_stride = nrows; auto trans_c = array(exec, nrows * nrhs); - hipsparse::bsrmm(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, - HIPSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, &alpha, descr, values, row_ptrs, col_idxs, bs, b->get_const_values(), in_stride, 
&beta, trans_c.get_data(), trans_stride); dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), out_stride, c->get_values()); } - hipsparse::destroy(descr); + sparselib::destroy(descr); } else { GKO_NOT_IMPLEMENTED; } @@ -173,11 +174,11 @@ void advanced_spmv(std::shared_ptr exec, dense::scale(exec, beta, c); return; } - if (hipsparse::is_supported::value) { - auto handle = exec->get_hipsparse_handle(); + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); const auto alphp = alpha->get_const_values(); const auto betap = beta->get_const_values(); - auto descr = hipsparse::create_mat_descr(); + auto descr = sparselib::create_mat_descr(); const auto row_ptrs = a->get_const_row_ptrs(); const auto col_idxs = a->get_const_col_idxs(); const auto values = a->get_const_values(); @@ -191,7 +192,7 @@ void advanced_spmv(std::shared_ptr exec, const auto in_stride = b->get_stride(); const auto out_stride = c->get_stride(); if (nrhs == 1 && in_stride == 1 && out_stride == 1) { - hipsparse::bsrmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, + sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, b->get_const_values(), betap, c->get_values()); } else { @@ -199,27 +200,83 @@ void advanced_spmv(std::shared_ptr exec, auto trans_c = array(exec, nrows * nrhs); dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(), trans_stride, trans_c.get_data()); - hipsparse::bsrmm(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, - HIPSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, b->get_const_values(), in_stride, betap, trans_c.get_data(), trans_stride); dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), out_stride, c->get_values()); } - hipsparse::destroy(descr); + sparselib::destroy(descr); } else { 
GKO_NOT_IMPLEMENTED; } } +namespace { + + +template +void transpose_blocks_impl(syn::value_list, + std::shared_ptr exec, + matrix::Fbcsr* const mat) +{ + constexpr int subwarp_size = config::warp_size; + const auto nbnz = mat->get_num_stored_blocks(); + const auto numthreads = nbnz * subwarp_size; + const auto block_size = default_block_size; + const auto grid_dim = ceildiv(numthreads, block_size); + if (grid_dim > 0) { + kernel::transpose_blocks + <<get_stream()>>>( + nbnz, mat->get_values()); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks, + transpose_blocks_impl); + + +} // namespace + + template void transpose(const std::shared_ptr exec, - const matrix::Fbcsr* const input, - matrix::Fbcsr* const output) + const matrix::Fbcsr* const orig, + matrix::Fbcsr* const trans) { - fallback_transpose(exec, input, output); +#ifdef GKO_COMPILING_CUDA + if (sparselib::is_supported::value) { + const int bs = orig->get_block_size(); + const IndexType nnzb = + static_cast(orig->get_num_stored_blocks()); + cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; + cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; + const IndexType buffer_size = sparselib::bsr_transpose_buffersize( + exec->get_sparselib_handle(), orig->get_num_block_rows(), + orig->get_num_block_cols(), nnzb, orig->get_const_values(), + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs); + array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + sparselib::bsr_transpose( + exec->get_sparselib_handle(), orig->get_num_block_rows(), + orig->get_num_block_cols(), nnzb, orig->get_const_values(), + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs, + trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(), + copyValues, idxBase, buffer); + + // transpose blocks + select_transpose_blocks( + fixedblock::compiled_kernels(), + [bs](int compiled_block_size) { return bs == compiled_block_size; }, + syn::value_list(), syn::type_list<>(), 
exec, trans); + } else +#endif + { + fallback_transpose(exec, orig, trans); + } } diff --git a/hip/matrix/fft_kernels.hip.cpp b/hip/matrix/fft_kernels.hip.cpp index dc397b20892..31e180b4414 100644 --- a/hip/matrix/fft_kernels.hip.cpp +++ b/hip/matrix/fft_kernels.hip.cpp @@ -8,7 +8,6 @@ #include -#include #if HIP_VERSION >= 50200000 #include #else @@ -21,6 +20,9 @@ #include +#include "common/cuda_hip/base/runtime.hpp" + + namespace gko { diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp index 8028dd0777f..f1e15c946e0 100644 --- a/hip/matrix/sellp_kernels.hip.cpp +++ b/hip/matrix/sellp_kernels.hip.cpp @@ -5,9 +5,6 @@ #include "core/matrix/sellp_kernels.hpp" -#include - - #include #include #include @@ -15,10 +12,11 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" #include "core/components/prefix_sum_kernels.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/hip/matrix/sparsity_csr_kernels.hip.cpp index e5a6900cdfe..487b134d28a 100644 --- a/hip/matrix/sparsity_csr_kernels.hip.cpp +++ b/hip/matrix/sparsity_csr_kernels.hip.cpp @@ -5,25 +5,25 @@ #include "core/matrix/sparsity_csr_kernels.hpp" -#include #include #include -#include "accessor/hip_helper.hpp" +#include "accessor/cuda_hip_helper.hpp" #include "accessor/reduced_row_major.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" 
#include "core/components/format_conversion_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" #include "hip/base/math.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" @@ -42,7 +42,11 @@ namespace sparsity_csr { constexpr int classical_oversubscription = 32; constexpr int default_block_size = 512; +#ifdef GKO_COMPILING_HIP constexpr int spmv_block_size = 256; +#else +constexpr int spmv_block_size = 128; +#endif constexpr int warps_in_block = 4; @@ -106,16 +110,16 @@ void classical_spmv(syn::value_list, a->get_size()[0], as_device_type(a->get_const_value()), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), - acc::as_hip_range(b_vals), acc::as_hip_range(c_vals)); + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); } else if (alpha != nullptr && beta != nullptr) { kernel::abstract_classical_spmv <<get_stream()>>>( a->get_size()[0], as_device_type(alpha->get_const_values()), as_device_type(a->get_const_value()), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), - acc::as_hip_range(b_vals), + acc::as_device_range(b_vals), as_device_type(beta->get_const_values()), - acc::as_hip_range(c_vals)); + acc::as_device_range(c_vals)); } else { GKO_KERNEL_NOT_FOUND; } @@ -169,21 +173,21 @@ void sort_by_column_index(std::shared_ptr exec, const auto num_cols = static_cast(to_sort->get_size()[1]); const auto row_ptrs = to_sort->get_const_row_ptrs(); const auto col_idxs = to_sort->get_col_idxs(); - if (hipsparse::is_supported::value) { - const auto handle = exec->get_hipsparse_handle(); - auto descr = hipsparse::create_mat_descr(); + if (sparselib::is_supported::value) { + const auto handle = 
exec->get_sparselib_handle(); + auto descr = sparselib::create_mat_descr(); array permutation_array(exec, to_sort->get_num_nonzeros()); auto permutation = permutation_array.get_data(); components::fill_seq_array(exec, permutation, to_sort->get_num_nonzeros()); size_type buffer_size{}; - hipsparse::csrsort_buffer_size(handle, num_rows, num_cols, nnz, + sparselib::csrsort_buffer_size(handle, num_rows, num_cols, nnz, row_ptrs, col_idxs, buffer_size); array buffer_array{exec, buffer_size}; auto buffer = buffer_array.get_data(); - hipsparse::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs, + sparselib::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs, col_idxs, permutation, buffer); - hipsparse::destroy(descr); + sparselib::destroy(descr); } else { fallback_sort(exec, to_sort); } diff --git a/hip/multigrid/pgm_kernels.hip.cpp b/hip/multigrid/pgm_kernels.hip.cpp index ed81d1c66dc..18c1f0957c4 100644 --- a/hip/multigrid/pgm_kernels.hip.cpp +++ b/hip/multigrid/pgm_kernels.hip.cpp @@ -19,8 +19,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" namespace gko { diff --git a/hip/preconditioner/batch_preconditioners.hip.hpp b/hip/preconditioner/batch_preconditioners.hip.hpp index 6d58244a41a..f3969c16b81 100644 --- a/hip/preconditioner/batch_preconditioners.hip.hpp +++ b/hip/preconditioner/batch_preconditioners.hip.hpp @@ -6,9 +6,9 @@ #define GKO_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HIP_HPP_ +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/matrix/batch_struct.hpp" #include "core/preconditioner/batch_jacobi_helpers.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" diff --git a/hip/preconditioner/isai_kernels.hip.cpp b/hip/preconditioner/isai_kernels.hip.cpp index 7339bd0a754..4eaf65cc438 100644 --- a/hip/preconditioner/isai_kernels.hip.cpp +++ b/hip/preconditioner/isai_kernels.hip.cpp @@ -5,21 +5,18 @@ 
#include "core/preconditioner/isai_kernels.hpp" -#include - - #include #include #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/merging.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp similarity index 100% rename from hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp rename to hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp diff --git a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp similarity index 94% rename from hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp rename to hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp index 326b9f6b720..358c6f3b337 100644 --- a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp @@ -5,20 +5,18 @@ #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include 
"hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/warp_blas.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" @@ -35,7 +33,7 @@ namespace hip { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc" +#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc" // clang-format off diff --git a/hip/preconditioner/jacobi_common.hip.hpp.in b/hip/preconditioner/jacobi_common.hip.hpp.in index 6e9c279a46f..2185e124db6 100644 --- a/hip/preconditioner/jacobi_common.hip.hpp.in +++ b/hip/preconditioner/jacobi_common.hip.hpp.in @@ -6,7 +6,7 @@ #include -#include "hip/base/config.hip.hpp" +#include "common/cuda_hip/base/config.hpp" namespace gko { diff --git a/hip/preconditioner/jacobi_generate_kernel.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.hip.cpp similarity index 91% rename from hip/preconditioner/jacobi_generate_kernel.hip.cpp rename to hip/preconditioner/jacobi_generate_kernels.hip.cpp index 713be193250..6365f6c132e 100644 --- a/hip/preconditioner/jacobi_generate_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_generate_kernels.hip.cpp @@ -5,21 +5,19 @@ #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include 
"hip/components/diagonal_block_manipulation.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" @@ -38,7 +36,7 @@ namespace hip { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc" +#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc" template +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/diagonal_block_manipulation.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" @@ -35,7 +35,7 @@ namespace hip { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc" +#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc" // clang-format off diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp index 1646a7fb376..a3b2b7e5412 100644 --- a/hip/preconditioner/jacobi_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_kernels.hip.cpp @@ -5,19 +5,17 @@ #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include 
"hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" @@ -34,9 +32,9 @@ namespace jacobi { // a total of 32/16 warps (1024 threads) -#if GINKGO_HIP_PLATFORM_HCC +#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC constexpr int default_num_warps = 16; -#else // GINKGO_HIP_PLATFORM_NVCC +#else // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC constexpr int default_num_warps = 32; #endif // with current architectures, at most 32 warps can be scheduled per SM (and diff --git a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp similarity index 93% rename from hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp rename to hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp index 0763e986d41..37b78f17469 100644 --- a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp @@ -5,20 +5,18 @@ #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/warp_blas.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" @@ -35,7 +33,7 @@ namespace hip { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc" +#include 
"common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc" template +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/warp_blas.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" @@ -32,7 +32,7 @@ namespace hip { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc" +#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc" // clang-format off diff --git a/hip/reorder/rcm_kernels.hip.cpp b/hip/reorder/rcm_kernels.hip.cpp index 0c83c728e79..9a5739064eb 100644 --- a/hip/reorder/rcm_kernels.hip.cpp +++ b/hip/reorder/rcm_kernels.hip.cpp @@ -25,9 +25,9 @@ #include +#include "common/cuda_hip/components/memory.hpp" #include "core/base/array_access.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/components/memory.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index c62c11405a5..fdeb0580931 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -5,7 +5,6 @@ #include "core/solver/batch_bicgstab_kernels.hpp" -#include #include #include @@ -14,15 +13,16 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" 
#include "core/solver/batch_dispatch.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index d61eead6fab..47c2bc498eb 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -5,7 +5,6 @@ #include "core/solver/batch_cg_kernels.hpp" -#include #include #include @@ -14,15 +13,16 @@ #include +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "hip/base/batch_struct.hip.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" #include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp index 794ac9fd8a6..2f2df4ddf84 100644 --- a/hip/solver/cb_gmres_kernels.hip.cpp +++ b/hip/solver/cb_gmres_kernels.hip.cpp @@ -14,19 +14,19 @@ #include -#include "accessor/hip_helper.hpp" +#include "accessor/cuda_hip_helper.hpp" #include "accessor/range.hpp" #include "accessor/reduced_row_major.hpp" #include "accessor/scaled_reduced_row_major.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include 
"common/cuda_hip/components/cooperative_groups.hpp" #include "core/base/array_access.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/solver/cb_gmres_accessor.hpp" -#include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" @@ -118,7 +118,7 @@ void restart(std::shared_ptr exec, restart_1_kernel <<get_stream()>>>( residual->get_size()[0], residual->get_size()[1], krylov_dim, - acc::as_hip_range(krylov_bases), + acc::as_device_range(krylov_bases), as_device_type(residual_norm_collection->get_values()), residual_norm_collection->get_stride()); kernels::hip::dense::compute_norm2_dispatch(exec, residual, residual_norm, @@ -147,7 +147,7 @@ void restart(std::shared_ptr exec, residual_norm->get_stride(), as_device_type(arnoldi_norm->get_const_values() + 2 * stride_arnoldi), - stride_arnoldi, acc::as_hip_range(krylov_bases)); + stride_arnoldi, acc::as_device_range(krylov_bases)); } const auto grid_dim_2 = @@ -160,7 +160,7 @@ void restart(std::shared_ptr exec, residual->get_stride(), as_device_type(residual_norm->get_const_values()), as_device_type(residual_norm_collection->get_values()), - acc::as_hip_range(krylov_bases), + acc::as_device_range(krylov_bases), as_device_type(next_krylov_basis->get_values()), next_krylov_basis->get_stride(), as_device_type(final_iter_nums->get_data())); @@ -214,7 +214,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, as_device_type(next_krylov_basis->get_const_values()), stride_next_krylov, as_device_type(arnoldi_norm->get_values()), as_device_type(stop_status)); - // nrmP = norm(next_krylov_basis + // nrmP = norm(next_krylov_basis) zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg, 
hessenberg_iter->get_values()); if (dim_size[1] > 1) { @@ -222,7 +222,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, <<get_stream()>>>( dim_size[0], dim_size[1], as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_hip_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_values()), stride_hessenberg, as_device_type(stop_status)); } else { @@ -231,7 +231,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, exec->get_stream()>>>( dim_size[0], as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_hip_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_values()), stride_hessenberg, as_device_type(stop_status)); } @@ -243,7 +243,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, default_block_size, 0, exec->get_stream()>>>( iter + 1, dim_size[0], dim_size[1], as_device_type(next_krylov_basis->get_values()), stride_next_krylov, - acc::as_hip_range(krylov_bases), + acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_const_values()), stride_hessenberg, as_device_type(stop_status)); @@ -272,7 +272,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, exec->get_stream()>>>( dim_size[1], as_device_type(arnoldi_norm->get_values()), stride_arnoldi, as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, iter + 1, acc::as_hip_range(krylov_bases), + stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases), as_device_type(stop_status), as_device_type(reorth_status), as_device_type(num_reorth->get_data())); num_reorth_host = get_element(*num_reorth, 0); @@ -285,7 +285,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, <<get_stream()>>>( dim_size[0], dim_size[1], as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_hip_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), 
as_device_type(buffer_iter->get_values()), stride_buffer, as_device_type(stop_status)); } else { @@ -294,7 +294,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, exec->get_stream()>>>( dim_size[0], as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_hip_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(buffer_iter->get_values()), stride_buffer, as_device_type(stop_status)); } @@ -306,7 +306,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, default_block_size, 0, exec->get_stream()>>>( iter + 1, dim_size[0], dim_size[1], as_device_type(next_krylov_basis->get_values()), - stride_next_krylov, acc::as_hip_range(krylov_bases), + stride_next_krylov, acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_values()), stride_hessenberg, as_device_type(buffer_iter->get_const_values()), stride_buffer, @@ -338,7 +338,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, exec->get_stream()>>>( dim_size[1], as_device_type(arnoldi_norm->get_values()), stride_arnoldi, as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, iter + 1, acc::as_hip_range(krylov_bases), + stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases), as_device_type(stop_status), as_device_type(reorth_status), num_reorth->get_data()); num_reorth_host = get_element(*num_reorth, 0); @@ -350,7 +350,7 @@ void finish_arnoldi_CGS(std::shared_ptr exec, default_block_size, 0, exec->get_stream()>>>( iter, dim_size[0], dim_size[1], as_device_type(next_krylov_basis->get_values()), stride_next_krylov, - acc::as_hip_range(krylov_bases), + acc::as_device_range(krylov_bases), as_device_type(hessenberg_iter->get_const_values()), stride_hessenberg, as_device_type(stop_status)); // next_krylov_basis /= hessenberg(iter, iter + 1) @@ -464,7 +464,7 @@ void calculate_qy(std::shared_ptr exec, calculate_Qy_kernel <<get_stream()>>>( - num_rows, num_cols, acc::as_hip_range(krylov_bases), + num_rows, num_cols, 
acc::as_device_range(krylov_bases), as_device_type(y->get_const_values()), y->get_stride(), as_device_type(before_preconditioner->get_values()), stride_before_preconditioner, diff --git a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp index d05bc1a9f6f..9fac4be8547 100644 --- a/hip/solver/common_trs_kernels.hip.hpp +++ b/hip/solver/common_trs_kernels.hip.hpp @@ -10,7 +10,6 @@ #include -#include #if HIP_VERSION >= 50200000 #include #else @@ -22,12 +21,13 @@ #include +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/pointer_mode_guard.hip.hpp" -#include "hip/base/types.hip.hpp" namespace gko { @@ -63,7 +63,7 @@ struct SolveStruct : gko::solver::SolveStruct { factor_descr, unit_diag ? 
HIPSPARSE_DIAG_TYPE_UNIT : HIPSPARSE_DIAG_TYPE_NON_UNIT)); GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateCsrsv2Info(&solve_info)); - policy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL; + policy = SPARSELIB_SOLVE_POLICY_USE_LEVEL; } SolveStruct(const SolveStruct&) = delete; @@ -114,18 +114,18 @@ void generate_kernel(std::shared_ptr exec, if (matrix->get_size()[0] == 0) { return; } - if (hipsparse::is_supported::value) { + if (sparselib::is_supported::value) { solve_struct = std::make_shared(is_upper, unit_diag); if (auto hip_solve_struct = std::dynamic_pointer_cast( solve_struct)) { - auto handle = exec->get_hipsparse_handle(); + auto handle = exec->get_sparselib_handle(); { - hipsparse::pointer_mode_guard pm_guard(handle); - hipsparse::csrsv2_buffer_size( - handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + sparselib::pointer_mode_guard pm_guard(handle); + sparselib::csrsv2_buffer_size( + handle, SPARSELIB_OPERATION_NON_TRANSPOSE, matrix->get_size()[0], matrix->get_num_stored_elements(), hip_solve_struct->factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), @@ -139,8 +139,8 @@ void generate_kernel(std::shared_ptr exec, hip_solve_struct->factor_work_vec = exec->alloc(hip_solve_struct->factor_work_size); - hipsparse::csrsv2_analysis( - handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + sparselib::csrsv2_analysis( + handle, SPARSELIB_OPERATION_NON_TRANSPOSE, matrix->get_size()[0], matrix->get_num_stored_elements(), hip_solve_struct->factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), @@ -170,17 +170,17 @@ void solve_kernel(std::shared_ptr exec, } using vec = matrix::Dense; - if (hipsparse::is_supported::value) { + if (sparselib::is_supported::value) { if (auto hip_solve_struct = dynamic_cast(solve_struct)) { ValueType one = 1.0; - auto handle = exec->get_hipsparse_handle(); + auto handle = exec->get_sparselib_handle(); { - hipsparse::pointer_mode_guard pm_guard(handle); + 
sparselib::pointer_mode_guard pm_guard(handle); if (b->get_stride() == 1) { - hipsparse::csrsv2_solve( - handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + sparselib::csrsv2_solve( + handle, SPARSELIB_OPERATION_NON_TRANSPOSE, matrix->get_size()[0], matrix->get_num_stored_elements(), &one, hip_solve_struct->factor_descr, @@ -194,8 +194,8 @@ void solve_kernel(std::shared_ptr exec, dense::transpose(exec, b, trans_b); dense::transpose(exec, x, trans_x); for (IndexType i = 0; i < trans_b->get_size()[0]; i++) { - hipsparse::csrsv2_solve( - handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + sparselib::csrsv2_solve( + handle, SPARSELIB_OPERATION_NON_TRANSPOSE, matrix->get_size()[0], matrix->get_num_stored_elements(), &one, hip_solve_struct->factor_descr, diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp index 83dbfe61f48..b1ef414c091 100644 --- a/hip/solver/idr_kernels.hip.cpp +++ b/hip/solver/idr_kernels.hip.cpp @@ -9,20 +9,19 @@ #include -#include - - #include #include +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/randlib_bindings.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "core/components/fill_array_kernels.hpp" -#include "hip/base/hipblas_bindings.hip.hpp" -#include "hip/base/hiprand_bindings.hip.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" @@ -71,14 +70,14 @@ void initialize_subspace_vectors(std::shared_ptr exec, bool deterministic) { if (!deterministic) { - auto gen = hiprand::rand_generator(std::random_device{}(), - HIPRAND_RNG_PSEUDO_DEFAULT, + auto gen = randlib::rand_generator(std::random_device{}(), + RANDLIB_RNG_PSEUDO_DEFAULT, 
exec->get_stream()); - hiprand::rand_vector( + randlib::rand_vector( gen, subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), 0.0, 1.0, subspace_vectors->get_values()); - hiprand::destroy(gen); + randlib::destroy(gen); } } @@ -147,9 +146,8 @@ void update_g_and_u(std::shared_ptr exec, as_device_type(alpha->get_values()), stop_status->get_const_data()); } else { - hipblas::dot(exec->get_hipblas_handle(), size, p_i, 1, - g_k->get_values(), g_k->get_stride(), - alpha->get_values()); + blas::dot(exec->get_blas_handle(), size, p_i, 1, g_k->get_values(), + g_k->get_stride(), alpha->get_values()); } update_g_k_and_u_kernel <<get_stride(), default_block_size), @@ -198,8 +196,8 @@ void update_m(std::shared_ptr exec, const size_type nrhs, as_device_type(g_k->get_const_values()), g_k->get_stride(), as_device_type(m_i), stop_status->get_const_data()); } else { - hipblas::dot(exec->get_hipblas_handle(), size, p_i, 1, - g_k->get_const_values(), g_k->get_stride(), m_i); + blas::dot(exec->get_blas_handle(), size, p_i, 1, + g_k->get_const_values(), g_k->get_stride(), m_i); } } } diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp index 08f35d3d674..d355940a487 100644 --- a/hip/solver/lower_trs_kernels.hip.cpp +++ b/hip/solver/lower_trs_kernels.hip.cpp @@ -8,7 +8,6 @@ #include -#include #if HIP_VERSION >= 50200000 #include #else @@ -21,9 +20,10 @@ #include -#include "hip/base/hipsparse_bindings.hip.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/solver/common_trs_kernels.hip.hpp" diff --git a/hip/solver/multigrid_kernels.hip.cpp b/hip/solver/multigrid_kernels.hip.cpp index 41aab8003bd..f68105ba6d8 100644 --- a/hip/solver/multigrid_kernels.hip.cpp +++ b/hip/solver/multigrid_kernels.hip.cpp @@ -5,18 +5,16 @@ #include "core/solver/multigrid_kernels.hpp" 
-#include - - #include #include #include #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" #include "core/base/array_access.hpp" #include "core/components/fill_array_kernels.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp index cd6b0719844..2a31e450d27 100644 --- a/hip/solver/upper_trs_kernels.hip.cpp +++ b/hip/solver/upper_trs_kernels.hip.cpp @@ -8,7 +8,6 @@ #include -#include #if HIP_VERSION >= 50200000 #include #else @@ -21,9 +20,10 @@ #include -#include "hip/base/hipsparse_bindings.hip.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/solver/common_trs_kernels.hip.hpp" diff --git a/hip/stop/criterion_kernels.hip.cpp b/hip/stop/criterion_kernels.hip.cpp index 8c7caeb32b8..3d24daa5bd5 100644 --- a/hip/stop/criterion_kernels.hip.cpp +++ b/hip/stop/criterion_kernels.hip.cpp @@ -10,8 +10,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp index d790dd652f0..7f2b0646ea2 100644 --- a/hip/stop/residual_norm_kernels.hip.cpp +++ b/hip/stop/residual_norm_kernels.hip.cpp @@ -5,17 +5,15 @@ #include "core/stop/residual_norm_kernels.hpp" -#include - - #include #include #include +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" #include "core/base/array_access.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp index 2c25f5b3a7a..8462cbe5716 100644 --- 
a/hip/test/base/math.hip.cpp +++ b/hip/test/base/math.hip.cpp @@ -23,8 +23,8 @@ #include +#include "common/cuda_hip/base/types.hpp" #include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" #include "hip/test/utils.hip.hpp" diff --git a/hip/test/components/cooperative_groups.hip.cpp b/hip/test/components/cooperative_groups.hip.cpp index d22dfeca0b6..53f4b9a72a0 100644 --- a/hip/test/components/cooperative_groups.hip.cpp +++ b/hip/test/components/cooperative_groups.hip.cpp @@ -8,9 +8,6 @@ // force-top: off -#include "hip/components/cooperative_groups.hip.hpp" - - #include #include @@ -22,7 +19,8 @@ #include -#include "hip/base/types.hip.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "hip/test/utils.hip.hpp" diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp index 7bfab76f795..b8ee2f03d29 100644 --- a/hip/test/components/merging.hip.cpp +++ b/hip/test/components/merging.hip.cpp @@ -24,7 +24,7 @@ #include -#include "hip/components/cooperative_groups.hip.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "hip/test/utils.hip.hpp" diff --git a/hip/test/components/searching.hip.cpp b/hip/test/components/searching.hip.cpp index 1db0c6e9562..2662d367f4d 100644 --- a/hip/test/components/searching.hip.cpp +++ b/hip/test/components/searching.hip.cpp @@ -23,7 +23,7 @@ #include -#include "hip/components/cooperative_groups.hip.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" #include "hip/test/utils.hip.hpp" diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index c1e3f54a720..761405c0b3d 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -1600,14 +1600,29 @@ class CudaExecutor : public detail::ExecutorBase, * * @return the cublas handle (cublasContext*) for this executor */ - cublasContext* get_cublas_handle() const { return 
cublas_handle_.get(); } + GKO_DEPRECATED("use get_blas_handle() instead") + cublasContext* get_cublas_handle() const { return get_blas_handle(); } + + /** + * @copydoc get_cublas_handle() + */ + cublasContext* get_blas_handle() const { return cublas_handle_.get(); } /** * Get the cusparse handle for this executor * * @return the cusparse handle (cusparseContext*) for this executor */ + GKO_DEPRECATED("use get_sparselib_handle() instead") cusparseContext* get_cusparse_handle() const + { + return get_sparselib_handle(); + } + + /** + * @copydoc get_cusparse_handle() + */ + cusparseContext* get_sparselib_handle() const { return cusparse_handle_.get(); } @@ -1805,14 +1820,29 @@ class HipExecutor : public detail::ExecutorBase, * * @return the hipblas handle (hipblasContext*) for this executor */ - hipblasContext* get_hipblas_handle() const { return hipblas_handle_.get(); } + GKO_DEPRECATED("use get_blas_handle() instead") + hipblasContext* get_hipblas_handle() const { return get_blas_handle(); } + + /** + * @copydoc get_hipblas_handle() + */ + hipblasContext* get_blas_handle() const { return hipblas_handle_.get(); } /** * Get the hipsparse handle for this executor * * @return the hipsparse handle (hipsparseContext*) for this executor */ + GKO_DEPRECATED("use get_sparselib_handle() instead") hipsparseContext* get_hipsparse_handle() const + { + return get_sparselib_handle(); + } + + /** + * @copydoc get_hipsparse_handle() + */ + hipsparseContext* get_sparselib_handle() const { return hipsparse_handle_.get(); } diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 59d49e44140..41bec80673f 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -54,7 +54,7 @@ target_sources(ginkgo_omp ) ginkgo_compile_features(ginkgo_omp) -target_compile_definitions(ginkgo_omp PRIVATE GKO_COMPILING_OMP) +target_compile_definitions(ginkgo_omp PRIVATE GKO_COMPILING_OMP GKO_DEVICE_NAMESPACE=omp) # TODO FIXME: Currently nvhpc 22.7+ optimizations break the omp jacobi's custom # 
precision implementation (mantissa segmentation) @@ -94,7 +94,7 @@ ginkgo_default_includes(ginkgo_omp) ginkgo_install_library(ginkgo_omp) if (GINKGO_CHECK_CIRCULAR_DEPS) - ginkgo_check_headers(ginkgo_omp GKO_COMPILING_OMP) + ginkgo_check_headers(ginkgo_omp "GKO_COMPILING_OMP;GKO_DEVICE_NAMESPACE=omp") endif() if(GINKGO_BUILD_TESTS) diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp index ab15e1a99a3..07749d9bed2 100644 --- a/test/base/batch_multi_vector_kernels.cpp +++ b/test/base/batch_multi_vector_kernels.cpp @@ -312,8 +312,8 @@ TEST_F(MultiVector, CopySingleIsEquivalentToRef) gko::kernels::reference::batch_multi_vector::copy(this->ref, x.get(), y.get()); - gko::kernels::EXEC_NAMESPACE::batch_multi_vector::copy(this->exec, dx.get(), - dy.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_multi_vector::copy( + this->exec, dx.get(), dy.get()); GKO_ASSERT_BATCH_MTX_NEAR(dy, y, 0.0); } @@ -325,8 +325,8 @@ TEST_F(MultiVector, CopyIsEquivalentToRef) gko::kernels::reference::batch_multi_vector::copy(this->ref, x.get(), y.get()); - gko::kernels::EXEC_NAMESPACE::batch_multi_vector::copy(this->exec, dx.get(), - dy.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::batch_multi_vector::copy( + this->exec, dx.get(), dy.get()); GKO_ASSERT_BATCH_MTX_NEAR(dy, y, 0.0); } diff --git a/test/base/executor.cpp b/test/base/executor.cpp index 8ea3b01fb24..541360d01d4 100644 --- a/test/base/executor.cpp +++ b/test/base/executor.cpp @@ -72,7 +72,7 @@ TEST_F(Executor, RunsCorrectOperation) exec->run(ExampleOperation(value)); - ASSERT_EQ(EXEC_NAMESPACE::value, value); + ASSERT_EQ(GKO_DEVICE_NAMESPACE::value, value); } @@ -104,7 +104,7 @@ TEST_F(Executor, RunsCorrectLambdaOperation) exec->run(omp_lambda, cuda_lambda, hip_lambda, dpcpp_lambda); - ASSERT_EQ(EXEC_NAMESPACE::value, value); + ASSERT_EQ(GKO_DEVICE_NAMESPACE::value, value); } diff --git a/test/base/index_range.cpp b/test/base/index_range.cpp index 044202fd8e2..b16b5fb9046 100644 --- 
a/test/base/index_range.cpp +++ b/test/base/index_range.cpp @@ -30,7 +30,7 @@ class IndexRange : public CommonTestFixture { void run_range_for(std::shared_ptr exec, gko::array& result_array) { - gko::kernels::EXEC_NAMESPACE::run_kernel( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( exec, [] GKO_KERNEL(auto i, auto result, auto size) { for (auto i : gko::irange{size}) { diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp index 55e1268a77a..c746a5b3461 100644 --- a/test/base/kernel_launch_generic.cpp +++ b/test/base/kernel_launch_generic.cpp @@ -46,7 +46,7 @@ move_only_type move_only_val{}; namespace gko { namespace kernels { -namespace EXEC_NAMESPACE { +namespace GKO_DEVICE_NAMESPACE { template <> @@ -57,7 +57,7 @@ struct to_device_type_impl { }; -} // namespace EXEC_NAMESPACE +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko @@ -108,7 +108,7 @@ class KernelLaunch : public CommonTestFixture { // nvcc doesn't like device lambdas declared in complex classes, move it out void run1d(std::shared_ptr exec, size_type dim, int* data) { - gko::kernels::EXEC_NAMESPACE::run_kernel( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto dummy) { static_assert(is_same::value, "index"); @@ -129,7 +129,7 @@ TEST_F(KernelLaunch, Runs1D) void run1d(std::shared_ptr exec, gko::array& data) { - gko::kernels::EXEC_NAMESPACE::run_kernel( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); @@ -155,7 +155,7 @@ TEST_F(KernelLaunch, Runs1DArray) void run1d(std::shared_ptr exec, Mtx* m) { - gko::kernels::EXEC_NAMESPACE::run_kernel( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); @@ -193,7 +193,7 @@ TEST_F(KernelLaunch, Runs1DDense) void run2d(std::shared_ptr exec, int* data) { - 
gko::kernels::EXEC_NAMESPACE::run_kernel( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto dummy) { static_assert(is_same::value, "index"); @@ -215,7 +215,7 @@ TEST_F(KernelLaunch, Runs2D) void run2d(std::shared_ptr exec, gko::array& data) { - gko::kernels::EXEC_NAMESPACE::run_kernel( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); @@ -242,7 +242,7 @@ TEST_F(KernelLaunch, Runs2DArray) void run2d(std::shared_ptr exec, Mtx* m1, Mtx* m2, Mtx* m3) { - gko::kernels::EXEC_NAMESPACE::run_kernel_solver( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_solver( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3, auto d4, auto d2_ptr, auto d3_ptr, auto dummy) { @@ -280,8 +280,8 @@ void run2d(std::shared_ptr exec, Mtx* m1, Mtx* m2, Mtx* m3) }, dim<2>{4, 4}, m2->get_stride(), m1, static_cast(m1), m1->get_const_values(), - gko::kernels::EXEC_NAMESPACE::default_stride(m2), - gko::kernels::EXEC_NAMESPACE::row_vector(m3), m2->get_values(), + gko::kernels::GKO_DEVICE_NAMESPACE::default_stride(m2), + gko::kernels::GKO_DEVICE_NAMESPACE::row_vector(m3), m2->get_values(), m3->get_values(), move_only_val); } @@ -297,7 +297,7 @@ void run1d_reduction(std::shared_ptr exec) { gko::array output{exec, {-1l}}; auto run_reduction = [&](int64 init, size_type size) { - gko::kernels::EXEC_NAMESPACE::run_kernel_reduction( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_reduction( exec, [] GKO_KERNEL(auto i, auto a, auto dummy) { static_assert(is_same::value, "index"); @@ -343,7 +343,7 @@ void run1d_reduction_cached(std::shared_ptr exec, gko::array temp(exec); for (const auto& size : sizes) { temp.clear(); - gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_reduction_cached( exec, [] GKO_KERNEL(auto i) { return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return 
std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), @@ -366,7 +366,7 @@ void run2d_reduction(std::shared_ptr exec) { gko::array output{exec, {-1l}}; auto run_reduction = [&](int64 init, gko::dim<2> size) { - gko::kernels::EXEC_NAMESPACE::run_kernel_reduction( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_reduction( exec, [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); @@ -435,7 +435,7 @@ void run2d_reduction_cached(std::shared_ptr exec, gko::array temp(exec); for (const auto& dim : dims) { temp.clear(); - gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_reduction_cached( exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), @@ -482,7 +482,7 @@ void run2d_row_reduction(std::shared_ptr exec) static_cast(num_cols) * (num_cols + 1) * (i + 1); } - gko::kernels::EXEC_NAMESPACE::run_kernel_row_reduction( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_row_reduction( exec, [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); @@ -527,7 +527,7 @@ void run2d_row_reduction_cached(std::shared_ptr exec, host_ref.get_data()[i] = dim[1] + i + 1; } - gko::kernels::EXEC_NAMESPACE::run_kernel_row_reduction_cached( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_row_reduction_cached( exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), @@ -576,7 +576,7 @@ void run2d_col_reduction(std::shared_ptr exec) static_cast(num_rows) * (num_rows + 1) * (i + 1); } - gko::kernels::EXEC_NAMESPACE::run_kernel_col_reduction( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_col_reduction( exec, [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { 
static_assert(is_same::value, "index"); @@ -620,7 +620,7 @@ void run2d_col_reduction_cached(std::shared_ptr exec, host_ref.get_data()[i] = dim[0] + i + 1; } - gko::kernels::EXEC_NAMESPACE::run_kernel_col_reduction_cached( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_col_reduction_cached( exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), diff --git a/test/components/absolute_array_kernels.cpp b/test/components/absolute_array_kernels.cpp index 6e00ad6e185..08dd52f35e3 100644 --- a/test/components/absolute_array_kernels.cpp +++ b/test/components/absolute_array_kernels.cpp @@ -46,7 +46,7 @@ class AbsoluteArray : public CommonTestFixture { TEST_F(AbsoluteArray, InplaceEqualsReference) { - gko::kernels::EXEC_NAMESPACE::components::inplace_absolute_array( + gko::kernels::GKO_DEVICE_NAMESPACE::components::inplace_absolute_array( exec, dvals.get_data(), total_size); gko::kernels::reference::components::inplace_absolute_array( ref, vals.get_data(), total_size); @@ -57,7 +57,7 @@ TEST_F(AbsoluteArray, InplaceEqualsReference) TEST_F(AbsoluteArray, InplaceComplexEqualsReference) { - gko::kernels::EXEC_NAMESPACE::components::inplace_absolute_array( + gko::kernels::GKO_DEVICE_NAMESPACE::components::inplace_absolute_array( exec, dcomplex_vals.get_data(), total_size); gko::kernels::reference::components::inplace_absolute_array( ref, complex_vals.get_data(), total_size); @@ -71,7 +71,7 @@ TEST_F(AbsoluteArray, OutplaceEqualsReference) gko::array abs_vals(ref, total_size); gko::array dabs_vals(exec, total_size); - gko::kernels::EXEC_NAMESPACE::components::outplace_absolute_array( + gko::kernels::GKO_DEVICE_NAMESPACE::components::outplace_absolute_array( exec, dvals.get_const_data(), total_size, dabs_vals.get_data()); gko::kernels::reference::components::outplace_absolute_array( ref, vals.get_const_data(), total_size, abs_vals.get_data()); @@ -85,7 
+85,7 @@ TEST_F(AbsoluteArray, OutplaceComplexEqualsReference) gko::array abs_vals(ref, total_size); gko::array dabs_vals(exec, total_size); - gko::kernels::EXEC_NAMESPACE::components::outplace_absolute_array( + gko::kernels::GKO_DEVICE_NAMESPACE::components::outplace_absolute_array( exec, dcomplex_vals.get_const_data(), total_size, dabs_vals.get_data()); gko::kernels::reference::components::outplace_absolute_array( ref, complex_vals.get_const_data(), total_size, abs_vals.get_data()); diff --git a/test/components/fill_array_kernels.cpp b/test/components/fill_array_kernels.cpp index 9ccf63e5c88..3997c5830ea 100644 --- a/test/components/fill_array_kernels.cpp +++ b/test/components/fill_array_kernels.cpp @@ -47,7 +47,7 @@ TYPED_TEST_SUITE(FillArray, gko::test::ValueAndIndexTypes, TYPED_TEST(FillArray, EqualsReference) { using T = typename TestFixture::value_type; - gko::kernels::EXEC_NAMESPACE::components::fill_array( + gko::kernels::GKO_DEVICE_NAMESPACE::components::fill_array( this->exec, this->dvals.get_data(), this->total_size, T(1523)); GKO_ASSERT_ARRAY_EQ(this->vals, this->dvals); @@ -57,7 +57,7 @@ TYPED_TEST(FillArray, EqualsReference) TYPED_TEST(FillArray, FillSeqEqualsReference) { using T = typename TestFixture::value_type; - gko::kernels::EXEC_NAMESPACE::components::fill_seq_array( + gko::kernels::GKO_DEVICE_NAMESPACE::components::fill_seq_array( this->exec, this->dvals.get_data(), this->total_size); GKO_ASSERT_ARRAY_EQ(this->seqs, this->dvals); diff --git a/test/components/format_conversion_kernels.cpp b/test/components/format_conversion_kernels.cpp index fee77ea5986..053171ffbe2 100644 --- a/test/components/format_conversion_kernels.cpp +++ b/test/components/format_conversion_kernels.cpp @@ -63,7 +63,7 @@ TYPED_TEST(FormatConversion, ConvertsEmptyPtrsToIdxs) ptrs.fill(0); TypeParam* output = nullptr; - gko::kernels::EXEC_NAMESPACE::components::convert_ptrs_to_idxs( + gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_ptrs_to_idxs( this->exec, 
ptrs.get_const_data(), this->size, output); // mustn't segfault @@ -75,7 +75,7 @@ TYPED_TEST(FormatConversion, ConvertPtrsToIdxs) auto ref_idxs = this->idxs; this->idxs.fill(-1); - gko::kernels::EXEC_NAMESPACE::components::convert_ptrs_to_idxs( + gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_ptrs_to_idxs( this->exec, this->ptrs.get_const_data(), this->size, this->idxs.get_data()); @@ -90,7 +90,7 @@ TYPED_TEST(FormatConversion, ConvertsEmptyIdxsToPtrs) this->ptrs.fill(-1); TypeParam* input = nullptr; - gko::kernels::EXEC_NAMESPACE::components::convert_idxs_to_ptrs( + gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_idxs_to_ptrs( this->exec, input, 0, this->size, this->ptrs.get_data()); GKO_ASSERT_ARRAY_EQ(this->ptrs, ref_ptrs); @@ -102,7 +102,7 @@ TYPED_TEST(FormatConversion, ConvertIdxsToPtrsIsEquivalentToRef) auto ref_ptrs = this->ptrs; this->ptrs.fill(-1); - gko::kernels::EXEC_NAMESPACE::components::convert_idxs_to_ptrs( + gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_idxs_to_ptrs( this->exec, this->idxs.get_const_data(), this->idxs.get_size(), this->size, this->ptrs.get_data()); @@ -115,7 +115,7 @@ TYPED_TEST(FormatConversion, ConvertPtrsToSizesIsEquivalentToRef) auto ref_sizes = this->sizes; this->sizes.fill(12345); - gko::kernels::EXEC_NAMESPACE::components::convert_ptrs_to_sizes( + gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_ptrs_to_sizes( this->exec, this->ptrs.get_const_data(), this->size, this->sizes.get_data()); diff --git a/test/components/prefix_sum_kernels.cpp b/test/components/prefix_sum_kernels.cpp index cf1777bb6ae..73cb0c7874e 100644 --- a/test/components/prefix_sum_kernels.cpp +++ b/test/components/prefix_sum_kernels.cpp @@ -57,7 +57,7 @@ TYPED_TEST(PrefixSum, EqualsReference) SCOPED_TRACE(size); gko::kernels::reference::components::prefix_sum_nonnegative( this->ref, this->vals.get_data(), size); - gko::kernels::EXEC_NAMESPACE::components::prefix_sum_nonnegative( + 
gko::kernels::GKO_DEVICE_NAMESPACE::components::prefix_sum_nonnegative( this->exec, this->dvals.get_data(), size); GKO_ASSERT_ARRAY_EQ(this->vals, this->dvals); @@ -74,7 +74,7 @@ TYPED_TEST(PrefixSum, WorksCloseToOverflow) std::is_unsigned::value; gko::array data{this->exec, I({max - 1, 1, 0})}; - gko::kernels::EXEC_NAMESPACE::components::prefix_sum_nonnegative( + gko::kernels::GKO_DEVICE_NAMESPACE::components::prefix_sum_nonnegative( this->exec, data.get_data(), data.get_size()); GKO_ASSERT_ARRAY_EQ(data, I({0, max - 1, max})); @@ -86,7 +86,7 @@ TYPED_TEST(PrefixSum, DoesntOverflowFromLastElement) const auto max = std::numeric_limits::max(); gko::array data{this->exec, I({2, max - 1})}; - gko::kernels::EXEC_NAMESPACE::components::prefix_sum_nonnegative( + gko::kernels::GKO_DEVICE_NAMESPACE::components::prefix_sum_nonnegative( this->exec, data.get_data(), data.get_size()); GKO_ASSERT_ARRAY_EQ(data, I({0, 2})); @@ -103,7 +103,7 @@ TYPED_TEST(PrefixSum, ThrowsOnOverflow) {max / 3, max / 2, max / 4, max / 3, max / 4}}; ASSERT_THROW( - gko::kernels::EXEC_NAMESPACE::components::prefix_sum_nonnegative( + gko::kernels::GKO_DEVICE_NAMESPACE::components::prefix_sum_nonnegative( this->exec, data.get_data(), data.get_size()), gko::OverflowError); } diff --git a/test/components/reduce_array_kernels.cpp b/test/components/reduce_array_kernels.cpp index cd6c2a8d7bf..dfc2e046c84 100644 --- a/test/components/reduce_array_kernels.cpp +++ b/test/components/reduce_array_kernels.cpp @@ -50,7 +50,7 @@ TYPED_TEST(ReduceArray, EqualsReference) { gko::kernels::reference::components::reduce_add_array(this->ref, this->vals, this->out); - gko::kernels::EXEC_NAMESPACE::components::reduce_add_array( + gko::kernels::GKO_DEVICE_NAMESPACE::components::reduce_add_array( this->exec, this->dvals, this->dout); GKO_ASSERT_ARRAY_EQ(this->out, this->dout); diff --git a/test/distributed/index_map_kernels.cpp b/test/distributed/index_map_kernels.cpp index 458ca594a56..cafd7b4da35 100644 --- 
a/test/distributed/index_map_kernels.cpp +++ b/test/distributed/index_map_kernels.cpp @@ -97,7 +97,7 @@ TEST_F(IndexMapBuildMapping, BuildMappingSameAsRef) gko::kernels::reference::index_map::build_mapping( ref, part.get(), query, target_ids, remote_local_idxs, remote_global_idxs, remote_sizes); - gko::kernels::EXEC_NAMESPACE::index_map::build_mapping( + gko::kernels::GKO_DEVICE_NAMESPACE::index_map::build_mapping( exec, dpart.get(), dquery, dtarget_ids, dremote_local_idxs, dremote_global_idxs, dremote_sizes); @@ -136,7 +136,7 @@ class IndexMap : public CommonTestFixture { gko::kernels::reference::index_map::build_mapping( ref, part.get(), connections, target_ids, flat_remote_local_idxs, flat_remote_global_idxs, remote_sizes); - gko::kernels::EXEC_NAMESPACE::index_map::build_mapping( + gko::kernels::GKO_DEVICE_NAMESPACE::index_map::build_mapping( exec, dpart.get(), dconnections, dtarget_ids, dflat_remote_local_idxs, dflat_remote_global_idxs, dremote_sizes); @@ -247,7 +247,7 @@ TEST_F(IndexMap, GetLocalWithLocalIndexSpaceSameAsRef) ref, part.get(), target_ids, to_device_const(remote_global_idxs), this_rank, query, gko::experimental::distributed::index_space::local, result); - gko::kernels::EXEC_NAMESPACE::index_map::map_to_local( + gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local( exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs), this_rank, dquery, gko::experimental::distributed::index_space::local, dresult); @@ -275,7 +275,7 @@ TEST_F(IndexMap, GetLocalWithLocalIndexSpaceWithInvalidIndexSameAsRef) ref, part.get(), target_ids, to_device_const(remote_global_idxs), this_rank, query, gko::experimental::distributed::index_space::local, result); - gko::kernels::EXEC_NAMESPACE::index_map::map_to_local( + gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local( exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs), this_rank, dquery, gko::experimental::distributed::index_space::local, dresult); @@ -304,7 +304,7 @@ 
TEST_F(IndexMap, GetLocalWithNonLocalIndexSpaceSameAsRef) ref, part.get(), target_ids, to_device_const(remote_global_idxs), this_rank, query, gko::experimental::distributed::index_space::non_local, result); - gko::kernels::EXEC_NAMESPACE::index_map::map_to_local( + gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local( exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs), this_rank, dquery, gko::experimental::distributed::index_space::non_local, dresult); @@ -330,7 +330,7 @@ TEST_F(IndexMap, GetLocalWithNonLocalIndexSpaceWithInvalidIndexSameAsRef) ref, part.get(), target_ids, to_device_const(remote_global_idxs), this_rank, query, gko::experimental::distributed::index_space::non_local, result); - gko::kernels::EXEC_NAMESPACE::index_map::map_to_local( + gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local( exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs), this_rank, dquery, gko::experimental::distributed::index_space::non_local, dresult); @@ -355,7 +355,7 @@ TEST_F(IndexMap, GetLocalWithCombinedIndexSpaceSameAsRef) ref, part.get(), target_ids, to_device_const(remote_global_idxs), this_rank, query, gko::experimental::distributed::index_space::combined, result); - gko::kernels::EXEC_NAMESPACE::index_map::map_to_local( + gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local( exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs), this_rank, dquery, gko::experimental::distributed::index_space::combined, dresult); @@ -385,7 +385,7 @@ TEST_F(IndexMap, GetLocalWithCombinedIndexSpaceWithInvalidIndexSameAsRef) ref, part.get(), target_ids, to_device_const(remote_global_idxs), this_rank, query, gko::experimental::distributed::index_space::non_local, result); - gko::kernels::EXEC_NAMESPACE::index_map::map_to_local( + gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local( exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs), this_rank, dquery, 
gko::experimental::distributed::index_space::non_local, dresult); diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp index 5e3677db2f4..8445aee6a0e 100644 --- a/test/distributed/matrix_kernels.cpp +++ b/test/distributed/matrix_kernels.cpp @@ -72,7 +72,7 @@ class Matrix : public CommonTestFixture { ref, input, row_partition.get(), col_partition.get(), part, local_row_idxs, local_col_idxs, local_values, non_local_row_idxs, non_local_col_idxs, non_local_values); - gko::kernels::EXEC_NAMESPACE::distributed_matrix:: + gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix:: separate_local_nonlocal( exec, d_input, d_row_partition.get(), d_col_partition.get(), part, d_local_row_idxs, d_local_col_idxs, d_local_values, diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp index 8121a720908..9e985ffec9e 100644 --- a/test/distributed/partition_helper_kernels.cpp +++ b/test/distributed/partition_helper_kernels.cpp @@ -147,8 +147,8 @@ TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRanges) auto offsets = make_array(this->exec, create_ranges(100)); bool result = false; - gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( - this->exec, offsets, result); + gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers:: + check_consecutive_ranges(this->exec, offsets, result); ASSERT_TRUE(result); } @@ -163,8 +163,8 @@ TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges) make_array(this->exec, remove_indices(full_range_ends, removal_idxs)); bool result = true; - gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( - this->exec, start_ends, result); + gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers:: + check_consecutive_ranges(this->exec, start_ends, result); ASSERT_FALSE(result); } @@ -176,8 +176,8 @@ TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleRange) auto start_ends = make_array(this->ref, create_ranges(1)); bool result = 
false; - gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( - this->exec, start_ends, result); + gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers:: + check_consecutive_ranges(this->exec, start_ends, result); ASSERT_TRUE(result); } @@ -189,8 +189,8 @@ TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleElement) auto start_ends = gko::array(this->exec, {1}); bool result = false; - gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges( - this->exec, start_ends, result); + gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers:: + check_consecutive_ranges(this->exec, start_ends, result); ASSERT_TRUE(result); } @@ -206,7 +206,7 @@ TYPED_TEST(PartitionHelpers, CanSortConsecutiveRanges) auto expected_start_ends = start_ends; auto expected_part_ids = part_ids_arr; - gko::kernels::EXEC_NAMESPACE::partition_helpers::sort_by_range_start( + gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::sort_by_range_start( this->exec, start_ends, part_ids_arr); GKO_ASSERT_ARRAY_EQ(expected_start_ends, start_ends); @@ -227,7 +227,7 @@ TYPED_TEST(PartitionHelpers, CanSortNonConsecutiveRanges) auto part_ids_arr = gko::array( this->exec, shuffled.second.begin(), shuffled.second.end()); - gko::kernels::EXEC_NAMESPACE::partition_helpers::sort_by_range_start( + gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::sort_by_range_start( this->exec, start_ends, part_ids_arr); GKO_ASSERT_ARRAY_EQ(expected_start_ends, start_ends); @@ -242,7 +242,7 @@ TYPED_TEST(PartitionHelpers, CanCompressRanges) auto ranges = make_array(this->exec, create_ranges(expected_offsets)); gko::array offsets{this->exec, expected_offsets.size()}; - gko::kernels::EXEC_NAMESPACE::partition_helpers::compress_ranges( + gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::compress_ranges( this->exec, ranges, offsets); GKO_ASSERT_ARRAY_EQ(offsets, make_array(this->exec, expected_offsets)); diff --git a/test/distributed/vector_kernels.cpp 
b/test/distributed/vector_kernels.cpp index e8e3d6a7e7b..86faca6b2b2 100644 --- a/test/distributed/vector_kernels.cpp +++ b/test/distributed/vector_kernels.cpp @@ -61,7 +61,7 @@ class Vector : public CommonTestFixture { gko::kernels::reference::distributed_vector::build_local( ref, input, partition.get(), part, output.get()); - gko::kernels::EXEC_NAMESPACE::distributed_vector::build_local( + gko::kernels::GKO_DEVICE_NAMESPACE::distributed_vector::build_local( exec, d_input, d_partition.get(), part, d_output.get()); GKO_ASSERT_MTX_NEAR(output, d_output, 0); diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index 82c59477fd8..c1d0a6c7336 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -150,7 +150,7 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicCount) gko::kernels::reference::cholesky::symbolic_count( this->ref, mtx.get(), *forest, row_nnz.get_data(), this->tmp); - gko::kernels::EXEC_NAMESPACE::cholesky::symbolic_count( + gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::symbolic_count( this->exec, dmtx.get(), *dforest, drow_nnz.get_data(), this->dtmp); GKO_ASSERT_ARRAY_EQ(drow_nnz, row_nnz); @@ -189,12 +189,12 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicFactorize) std::unique_ptr dforest; gko::factorization::compute_elim_forest(dmtx.get(), dforest); gko::array dtmp_ptrs{this->exec, num_rows + 1}; - gko::kernels::EXEC_NAMESPACE::cholesky::symbolic_count( + gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::symbolic_count( this->exec, dmtx.get(), *dforest, dtmp_ptrs.get_data(), this->dtmp); gko::kernels::reference::cholesky::symbolic_factorize( this->ref, mtx.get(), *forest, l_factor.get(), this->tmp); - gko::kernels::EXEC_NAMESPACE::cholesky::symbolic_factorize( + gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::symbolic_factorize( this->exec, dmtx.get(), *dforest, dl_factor.get(), this->dtmp); GKO_ASSERT_MTX_EQ_SPARSITY(dl_factor, l_factor); @@ -239,7 +239,7 @@ 
TYPED_TEST(CholeskySymbolic, KernelForestFromFactorWorks) elimination_forest dforest{this->exec, static_cast(mtx->get_size()[0])}; - gko::kernels::EXEC_NAMESPACE::cholesky::forest_from_factor( + gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::forest_from_factor( this->exec, dfactors.get(), dforest); this->assert_equal_forests(*forest, dforest); @@ -367,7 +367,7 @@ TYPED_TEST(Cholesky, KernelInitializeIsEquivalentToRef) this->forall_matrices([this] { const auto nnz = this->mtx_chol->get_num_stored_elements(); std::fill_n(this->mtx_chol->get_values(), nnz, gko::zero()); - gko::kernels::EXEC_NAMESPACE::components::fill_array( + gko::kernels::GKO_DEVICE_NAMESPACE::components::fill_array( this->exec, this->dmtx_chol->get_values(), nnz, gko::zero()); gko::array diag_idxs{this->ref, this->num_rows}; @@ -380,7 +380,7 @@ TYPED_TEST(Cholesky, KernelInitializeIsEquivalentToRef) this->row_descs.get_const_data(), this->storage.get_const_data(), diag_idxs.get_data(), transpose_idxs.get_data(), this->mtx_chol.get()); - gko::kernels::EXEC_NAMESPACE::cholesky::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::initialize( this->exec, this->dmtx.get(), this->dstorage_offsets.get_const_data(), this->drow_descs.get_const_data(), this->dstorage.get_const_data(), @@ -410,7 +410,7 @@ TYPED_TEST(Cholesky, KernelFactorizeIsEquivalentToRef) this->row_descs.get_const_data(), this->storage.get_const_data(), diag_idxs.get_data(), transpose_idxs.get_data(), this->mtx_chol.get()); - gko::kernels::EXEC_NAMESPACE::cholesky::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::initialize( this->exec, this->dmtx.get(), this->dstorage_offsets.get_const_data(), this->drow_descs.get_const_data(), this->dstorage.get_const_data(), @@ -422,7 +422,7 @@ TYPED_TEST(Cholesky, KernelFactorizeIsEquivalentToRef) this->row_descs.get_const_data(), this->storage.get_const_data(), diag_idxs.get_const_data(), transpose_idxs.get_const_data(), *this->forest, this->mtx_chol.get(), tmp); - 
gko::kernels::EXEC_NAMESPACE::cholesky::factorize( + gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::factorize( this->exec, this->dstorage_offsets.get_const_data(), this->drow_descs.get_const_data(), this->dstorage.get_const_data(), ddiag_idxs.get_const_data(), dtranspose_idxs.get_const_data(), diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp index fdcaa0cfad0..0ea06bed506 100644 --- a/test/factorization/lu_kernels.cpp +++ b/test/factorization/lu_kernels.cpp @@ -156,7 +156,7 @@ TYPED_TEST(Lu, KernelInitializeIsEquivalentToRef) std::fill_n(this->mtx_lu->get_values(), this->mtx_lu->get_num_stored_elements(), gko::zero()); - gko::kernels::EXEC_NAMESPACE::components::fill_array( + gko::kernels::GKO_DEVICE_NAMESPACE::components::fill_array( this->exec, this->dmtx_lu->get_values(), this->dmtx_lu->get_num_stored_elements(), gko::zero()); gko::array diag_idxs{this->ref, this->num_rows}; @@ -166,7 +166,7 @@ TYPED_TEST(Lu, KernelInitializeIsEquivalentToRef) this->ref, this->mtx.get(), this->storage_offsets.get_const_data(), this->row_descs.get_const_data(), this->storage.get_const_data(), diag_idxs.get_data(), this->mtx_lu.get()); - gko::kernels::EXEC_NAMESPACE::lu_factorization::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::lu_factorization::initialize( this->exec, this->dmtx.get(), this->dstorage_offsets.get_const_data(), this->drow_descs.get_const_data(), this->dstorage.get_const_data(), @@ -191,7 +191,7 @@ TYPED_TEST(Lu, KernelFactorizeIsEquivalentToRef) this->ref, this->mtx.get(), this->storage_offsets.get_const_data(), this->row_descs.get_const_data(), this->storage.get_const_data(), diag_idxs.get_data(), this->mtx_lu.get()); - gko::kernels::EXEC_NAMESPACE::lu_factorization::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::lu_factorization::initialize( this->exec, this->dmtx.get(), this->dstorage_offsets.get_const_data(), this->drow_descs.get_const_data(), this->dstorage.get_const_data(), @@ -201,7 +201,7 @@ TYPED_TEST(Lu, 
KernelFactorizeIsEquivalentToRef) this->ref, this->storage_offsets.get_const_data(), this->row_descs.get_const_data(), this->storage.get_const_data(), diag_idxs.get_const_data(), this->mtx_lu.get(), tmp); - gko::kernels::EXEC_NAMESPACE::lu_factorization::factorize( + gko::kernels::GKO_DEVICE_NAMESPACE::lu_factorization::factorize( this->exec, this->dstorage_offsets.get_const_data(), this->drow_descs.get_const_data(), this->dstorage.get_const_data(), ddiag_idxs.get_const_data(), this->dmtx_lu.get(), dtmp); diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp index 57086a1550d..40a40b5acf5 100644 --- a/test/factorization/par_ic_kernels.cpp +++ b/test/factorization/par_ic_kernels.cpp @@ -100,7 +100,7 @@ TYPED_TEST(ParIc, KernelInitFactorIsEquivalentToRef) gko::kernels::reference::par_ic_factorization::init_factor( this->ref, this->mtx_l.get()); - gko::kernels::EXEC_NAMESPACE::par_ic_factorization::init_factor( + gko::kernels::GKO_DEVICE_NAMESPACE::par_ic_factorization::init_factor( this->exec, this->dmtx_l.get()); GKO_ASSERT_MTX_NEAR(this->mtx_l, this->dmtx_l, r::value); @@ -118,7 +118,7 @@ TYPED_TEST(ParIc, KernelComputeFactorIsEquivalentToRef) gko::kernels::reference::par_ic_factorization::compute_factor( this->ref, 1, mtx_l_coo.get(), this->mtx_l_ani_init.get()); - gko::kernels::EXEC_NAMESPACE::par_ic_factorization::compute_factor( + gko::kernels::GKO_DEVICE_NAMESPACE::par_ic_factorization::compute_factor( this->exec, 100, dmtx_l_coo.get(), this->dmtx_l_ani_init.get()); GKO_ASSERT_MTX_NEAR(this->mtx_l_ani_init, this->dmtx_l_ani_init, 1e-4); diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp index 254c2e4a40e..81d1dd83ffb 100644 --- a/test/factorization/par_ict_kernels.cpp +++ b/test/factorization/par_ict_kernels.cpp @@ -118,7 +118,7 @@ TYPED_TEST(ParIct, KernelAddCandidatesIsEquivalentToRef) gko::kernels::reference::par_ict_factorization::add_candidates( this->ref, mtx_llh.get(), 
this->mtx.get(), this->mtx_l.get(), res_mtx_l.get()); - gko::kernels::EXEC_NAMESPACE::par_ict_factorization::add_candidates( + gko::kernels::GKO_DEVICE_NAMESPACE::par_ict_factorization::add_candidates( this->exec, dmtx_llh.get(), this->dmtx.get(), this->dmtx_l.get(), dres_mtx_l.get()); @@ -140,9 +140,9 @@ TYPED_TEST(ParIct, KernelComputeFactorIsEquivalentToRef) gko::kernels::reference::par_ict_factorization::compute_factor( this->ref, this->mtx_ani.get(), this->mtx_l_ani.get(), mtx_l_coo.get()); for (int i = 0; i < 20; ++i) { - gko::kernels::EXEC_NAMESPACE::par_ict_factorization::compute_factor( - this->exec, this->dmtx_ani.get(), this->dmtx_l_ani.get(), - dmtx_l_coo.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::par_ict_factorization:: + compute_factor(this->exec, this->dmtx_ani.get(), + this->dmtx_l_ani.get(), dmtx_l_coo.get()); } GKO_ASSERT_MTX_NEAR(this->mtx_l_ani, this->dmtx_l_ani, 1e-2); diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp index 94e2eb6512f..0d853af0745 100644 --- a/test/factorization/par_ilu_kernels.cpp +++ b/test/factorization/par_ilu_kernels.cpp @@ -89,8 +89,8 @@ class ParIlu : public CommonTestFixture { { gko::kernels::reference::factorization::initialize_row_ptrs_l_u( ref, mtx.get(), l_row_ptrs, u_row_ptrs); - gko::kernels::EXEC_NAMESPACE::factorization::initialize_row_ptrs_l_u( - exec, dmtx.get(), dl_row_ptrs, du_row_ptrs); + gko::kernels::GKO_DEVICE_NAMESPACE::factorization:: + initialize_row_ptrs_l_u(exec, dmtx.get(), dl_row_ptrs, du_row_ptrs); } void initialize_lu(std::unique_ptr& l, std::unique_ptr& u, @@ -121,7 +121,7 @@ class ParIlu : public CommonTestFixture { gko::kernels::reference::factorization::initialize_l_u( ref, mtx.get(), l.get(), u.get()); - gko::kernels::EXEC_NAMESPACE::factorization::initialize_l_u( + gko::kernels::GKO_DEVICE_NAMESPACE::factorization::initialize_l_u( exec, dmtx.get(), dl.get(), du.get()); } @@ -139,7 +139,7 @@ class ParIlu : public CommonTestFixture { 
gko::kernels::reference::par_ilu_factorization::compute_l_u_factors( ref, iterations, coo.get(), l.get(), u_transpose_mtx.get()); - gko::kernels::EXEC_NAMESPACE::par_ilu_factorization:: + gko::kernels::GKO_DEVICE_NAMESPACE::par_ilu_factorization:: compute_l_u_factors(exec, iterations, dcoo.get(), dl.get(), u_transpose_dmtx.get()); auto u_lin_op = u_transpose_mtx->transpose(); @@ -160,7 +160,7 @@ TYPED_TEST(ParIlu, KernelAddDiagonalElementsSortedEquivalentToRef) gko::kernels::reference::factorization::add_diagonal_elements( this->ref, mtx.get(), true); - gko::kernels::EXEC_NAMESPACE::factorization::add_diagonal_elements( + gko::kernels::GKO_DEVICE_NAMESPACE::factorization::add_diagonal_elements( this->exec, dmtx.get(), true); ASSERT_TRUE(mtx->is_sorted_by_column_index()); @@ -176,7 +176,7 @@ TYPED_TEST(ParIlu, KernelAddDiagonalElementsUnsortedEquivalentToRef) gko::kernels::reference::factorization::add_diagonal_elements( this->ref, mtx.get(), false); - gko::kernels::EXEC_NAMESPACE::factorization::add_diagonal_elements( + gko::kernels::GKO_DEVICE_NAMESPACE::factorization::add_diagonal_elements( this->exec, dmtx.get(), false); ASSERT_FALSE(mtx->is_sorted_by_column_index()); @@ -193,7 +193,7 @@ TYPED_TEST(ParIlu, KernelAddDiagonalElementsNonSquareEquivalentToRef) gko::kernels::reference::factorization::add_diagonal_elements( this->ref, mtx.get(), true); - gko::kernels::EXEC_NAMESPACE::factorization::add_diagonal_elements( + gko::kernels::GKO_DEVICE_NAMESPACE::factorization::add_diagonal_elements( this->exec, dmtx.get(), true); ASSERT_TRUE(mtx->is_sorted_by_column_index()); diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp index c4ad7fe412a..7d46f7979ac 100644 --- a/test/factorization/par_ilut_kernels.cpp +++ b/test/factorization/par_ilut_kernels.cpp @@ -151,8 +151,8 @@ class ParIlut : public CommonTestFixture { gko::kernels::reference::par_ilut_factorization::threshold_select( ref, mtx.get(), rank, tmp, tmp2, res); - 
gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::threshold_select( - exec, dmtx.get(), rank, dtmp, dtmp2, dres); + gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization:: + threshold_select(exec, dmtx.get(), rank, dtmp, dtmp2, dres); ASSERT_NEAR(res, dres, tolerance); } @@ -174,9 +174,9 @@ class ParIlut : public CommonTestFixture { gko::kernels::reference::par_ilut_factorization::threshold_filter( ref, local_mtx.get(), threshold, res.get(), res_coo.get(), lower); - gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::threshold_filter( - exec, local_dmtx.get(), threshold, dres.get(), dres_coo.get(), - lower); + gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization:: + threshold_filter(exec, local_dmtx.get(), threshold, dres.get(), + dres_coo.get(), lower); GKO_ASSERT_MTX_NEAR(res, dres, 0); GKO_ASSERT_MTX_EQ_SPARSITY(res, dres); @@ -208,7 +208,7 @@ class ParIlut : public CommonTestFixture { gko::kernels::reference::par_ilut_factorization:: threshold_filter_approx(ref, mtx.get(), rank, tmp, threshold, res.get(), res_coo.get()); - gko::kernels::EXEC_NAMESPACE::par_ilut_factorization:: + gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization:: threshold_filter_approx(exec, dmtx.get(), rank, dtmp, dthreshold, dres.get(), dres_coo.get()); @@ -283,8 +283,9 @@ TYPED_TEST(ParIlut, KernelThresholdFilterNullptrCooIsEquivalentToRef) gko::kernels::reference::par_ilut_factorization::threshold_filter( this->ref, this->mtx_l.get(), 0.5, res.get(), null_coo, true); - gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::threshold_filter( - this->exec, this->dmtx_l.get(), 0.5, dres.get(), null_coo, true); + gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization:: + threshold_filter(this->exec, this->dmtx_l.get(), 0.5, dres.get(), + null_coo, true); GKO_ASSERT_MTX_NEAR(res, dres, 0); GKO_ASSERT_MTX_EQ_SPARSITY(res, dres); @@ -346,7 +347,7 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef) 
gko::kernels::reference::par_ilut_factorization::threshold_filter_approx( this->ref, this->mtx_l.get(), rank, tmp, threshold, res.get(), null_coo); - gko::kernels::EXEC_NAMESPACE::par_ilut_factorization:: + gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization:: threshold_filter_approx(this->exec, this->dmtx_l.get(), rank, dtmp, dthreshold, dres.get(), null_coo); @@ -393,7 +394,7 @@ TYPED_TEST(ParIlut, KernelAddCandidatesIsEquivalentToRef) gko::kernels::reference::par_ilut_factorization::add_candidates( this->ref, mtx_lu.get(), this->mtx_square.get(), this->mtx_l2.get(), this->mtx_u.get(), res_mtx_l.get(), res_mtx_u.get()); - gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::add_candidates( + gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization::add_candidates( this->exec, dmtx_lu.get(), this->dmtx_square.get(), this->dmtx_l2.get(), this->dmtx_u.get(), dres_mtx_l.get(), dres_mtx_u.get()); @@ -422,7 +423,7 @@ TYPED_TEST(ParIlut, KernelComputeLUIsEquivalentToRef) this->ref, this->mtx_ani.get(), this->mtx_l_ani.get(), mtx_l_coo.get(), this->mtx_u_ani.get(), mtx_u_coo.get(), this->mtx_ut_ani.get()); for (int i = 0; i < 20; ++i) { - gko::kernels::EXEC_NAMESPACE::par_ilut_factorization:: + gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization:: compute_l_u_factors(this->exec, this->dmtx_ani.get(), this->dmtx_l_ani.get(), dmtx_l_coo.get(), this->dmtx_u_ani.get(), dmtx_u_coo.get(), diff --git a/test/matrix/csr_kernels.cpp b/test/matrix/csr_kernels.cpp index 347425175bb..d3a7bb8f8e5 100644 --- a/test/matrix/csr_kernels.cpp +++ b/test/matrix/csr_kernels.cpp @@ -149,7 +149,7 @@ void assert_lookup_correct(std::shared_ptr exec, const auto row_ptrs = mtx->get_const_row_ptrs(); const auto col_idxs = mtx->get_const_col_idxs(); gko::array correct{exec, {true}}; - gko::kernels::EXEC_NAMESPACE::run_kernel( + gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel( exec, [] GKO_KERNEL(auto row, auto num_cols, auto row_ptrs, auto col_idxs, auto storage_offsets, auto storage, 
auto row_descs, @@ -215,7 +215,7 @@ TYPED_TEST(CsrLookup, BuildLookupWorks) // otherwise things might crash gko::kernels::reference::csr::build_lookup_offsets( this->ref, row_ptrs, col_idxs, num_rows, allowed, storage_offsets); - gko::kernels::EXEC_NAMESPACE::csr::build_lookup_offsets( + gko::kernels::GKO_DEVICE_NAMESPACE::csr::build_lookup_offsets( this->exec, drow_ptrs, dcol_idxs, num_rows, allowed, dstorage_offsets); @@ -238,7 +238,7 @@ TYPED_TEST(CsrLookup, BuildLookupWorks) gko::kernels::reference::csr::build_lookup( this->ref, row_ptrs, col_idxs, num_rows, allowed, storage_offsets, row_descs, storage); - gko::kernels::EXEC_NAMESPACE::csr::build_lookup( + gko::kernels::GKO_DEVICE_NAMESPACE::csr::build_lookup( this->exec, drow_ptrs, dcol_idxs, num_rows, allowed, dstorage_offsets, drow_descs, dstorage); diff --git a/test/matrix/csr_kernels2.cpp b/test/matrix/csr_kernels2.cpp index 713593b4ae5..4ff8e749766 100644 --- a/test/matrix/csr_kernels2.cpp +++ b/test/matrix/csr_kernels2.cpp @@ -1346,7 +1346,7 @@ TEST_F(Csr, CalculateNnzPerRowInSpanIsEquivalentToRef) gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span( this->ref, this->mtx2.get(), rspan, cspan, &row_nnz); - gko::kernels::EXEC_NAMESPACE::csr::calculate_nonzeros_per_row_in_span( + gko::kernels::GKO_DEVICE_NAMESPACE::csr::calculate_nonzeros_per_row_in_span( this->exec, this->dmtx2.get(), rspan, cspan, &drow_nnz); GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz); @@ -1382,7 +1382,7 @@ TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef) gko::kernels::reference::csr::compute_submatrix(this->ref, this->mtx2.get(), rspan, cspan, smat1.get()); - gko::kernels::EXEC_NAMESPACE::csr::compute_submatrix( + gko::kernels::GKO_DEVICE_NAMESPACE::csr::compute_submatrix( this->exec, this->dmtx2.get(), rspan, cspan, sdmat1.get()); GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0); @@ -1408,8 +1408,9 @@ TEST_F(Csr, CalculateNnzPerRowInIndexSetIsEquivalentToRef) gko::kernels::reference::csr::calculate_nonzeros_per_row_in_index_set( 
this->ref, this->mtx2.get(), rset, cset, row_nnz.get_data()); - gko::kernels::EXEC_NAMESPACE::csr::calculate_nonzeros_per_row_in_index_set( - this->exec, this->dmtx2.get(), drset, dcset, drow_nnz.get_data()); + gko::kernels::GKO_DEVICE_NAMESPACE::csr:: + calculate_nonzeros_per_row_in_index_set( + this->exec, this->dmtx2.get(), drset, dcset, drow_nnz.get_data()); GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz); } @@ -1446,7 +1447,7 @@ TEST_F(Csr, ComputeSubmatrixFromIndexSetIsEquivalentToRef) gko::kernels::reference::csr::compute_submatrix_from_index_set( this->ref, this->mtx2.get(), rset, cset, smat1.get()); - gko::kernels::EXEC_NAMESPACE::csr::compute_submatrix_from_index_set( + gko::kernels::GKO_DEVICE_NAMESPACE::csr::compute_submatrix_from_index_set( this->exec, this->dmtx2.get(), drset, dcset, sdmat1.get()); GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0); @@ -1501,7 +1502,7 @@ TEST_F(Csr, CanDetectMissingDiagonalEntry) auto mtx = gko::clone(exec, ref_mtx); bool has_diags = true; - gko::kernels::EXEC_NAMESPACE::csr::check_diagonal_entries_exist( + gko::kernels::GKO_DEVICE_NAMESPACE::csr::check_diagonal_entries_exist( exec, mtx.get(), has_diags); ASSERT_FALSE(has_diags); @@ -1516,7 +1517,7 @@ TEST_F(Csr, CanDetectWhenAllDiagonalEntriesArePresent) auto mtx = gko::clone(exec, ref_mtx); bool has_diags = true; - gko::kernels::EXEC_NAMESPACE::csr::check_diagonal_entries_exist( + gko::kernels::GKO_DEVICE_NAMESPACE::csr::check_diagonal_entries_exist( exec, mtx.get(), has_diags); ASSERT_TRUE(has_diags); diff --git a/test/matrix/dense_kernels.cpp b/test/matrix/dense_kernels.cpp index 25b82215dcd..56ca536187e 100644 --- a/test/matrix/dense_kernels.cpp +++ b/test/matrix/dense_kernels.cpp @@ -603,7 +603,7 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) gko::kernels::reference::dense::count_nonzeros_per_row( ref, x.get(), nnz_per_row.get_data()); - gko::kernels::EXEC_NAMESPACE::dense::count_nonzeros_per_row( + gko::kernels::GKO_DEVICE_NAMESPACE::dense::count_nonzeros_per_row( exec, 
dx.get(), dnnz_per_row.get_data()); auto tmp = gko::array(ref, dnnz_per_row); @@ -621,8 +621,8 @@ TEST_F(Dense, ComputeMaxNNZPerRowIsEquivalentToRef) gko::kernels::reference::dense::compute_max_nnz_per_row(ref, x.get(), max_nnz); - gko::kernels::EXEC_NAMESPACE::dense::compute_max_nnz_per_row(exec, dx.get(), - dmax_nnz); + gko::kernels::GKO_DEVICE_NAMESPACE::dense::compute_max_nnz_per_row( + exec, dx.get(), dmax_nnz); ASSERT_EQ(max_nnz, dmax_nnz); } @@ -2017,7 +2017,7 @@ TEST_F(Dense, ComputeNorm2SquaredIsEquivalentToRef) gko::kernels::reference::dense::compute_squared_norm2( ref, x.get(), norm_expected.get(), tmp); - gko::kernels::EXEC_NAMESPACE::dense::compute_squared_norm2( + gko::kernels::GKO_DEVICE_NAMESPACE::dense::compute_squared_norm2( exec, dx.get(), dnorm.get(), dtmp); GKO_ASSERT_MTX_NEAR(dnorm, norm_expected, r::value); @@ -2033,7 +2033,7 @@ TEST_F(Dense, ComputesSqrt) auto dmtx = gko::clone(exec, mtx); gko::kernels::reference::dense::compute_sqrt(ref, mtx.get()); - gko::kernels::EXEC_NAMESPACE::dense::compute_sqrt(exec, dmtx.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::dense::compute_sqrt(exec, dmtx.get()); GKO_ASSERT_MTX_NEAR(mtx, dmtx, r::value); } diff --git a/test/matrix/ell_kernels.cpp b/test/matrix/ell_kernels.cpp index f6b9a9d1edb..b61d97a0a7a 100644 --- a/test/matrix/ell_kernels.cpp +++ b/test/matrix/ell_kernels.cpp @@ -533,7 +533,7 @@ TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef) gko::kernels::reference::ell::count_nonzeros_per_row( ref, mtx.get(), nnz_per_row.get_data()); - gko::kernels::EXEC_NAMESPACE::ell::count_nonzeros_per_row( + gko::kernels::GKO_DEVICE_NAMESPACE::ell::count_nonzeros_per_row( exec, dmtx.get(), dnnz_per_row.get_data()); GKO_ASSERT_ARRAY_EQ(nnz_per_row, dnnz_per_row); diff --git a/test/matrix/sparsity_csr_kernels.cpp b/test/matrix/sparsity_csr_kernels.cpp index 6fc3caf60ad..010bd7faa86 100644 --- a/test/matrix/sparsity_csr_kernels.cpp +++ b/test/matrix/sparsity_csr_kernels.cpp @@ -64,8 +64,8 @@ TEST_F(SparsityCsr, 
KernelDiagonalElementPrefixSumIsEquivalentToRef) gko::kernels::reference::sparsity_csr::diagonal_element_prefix_sum( ref, mtx.get(), prefix_sum.get_data()); - gko::kernels::EXEC_NAMESPACE::sparsity_csr::diagonal_element_prefix_sum( - exec, dmtx.get(), dprefix_sum.get_data()); + gko::kernels::GKO_DEVICE_NAMESPACE::sparsity_csr:: + diagonal_element_prefix_sum(exec, dmtx.get(), dprefix_sum.get_data()); GKO_ASSERT_ARRAY_EQ(prefix_sum, dprefix_sum); } @@ -88,7 +88,7 @@ TEST_F(SparsityCsr, KernelRemoveDiagonalElementsIsEquivalentToRef) gko::kernels::reference::sparsity_csr::remove_diagonal_elements( ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), prefix_sum.get_const_data(), out_mtx.get()); - gko::kernels::EXEC_NAMESPACE::sparsity_csr::remove_diagonal_elements( + gko::kernels::GKO_DEVICE_NAMESPACE::sparsity_csr::remove_diagonal_elements( exec, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), dprefix_sum.get_const_data(), dout_mtx.get()); diff --git a/test/multigrid/pgm_kernels.cpp b/test/multigrid/pgm_kernels.cpp index a5f2d32fe32..10e5cf01a7a 100644 --- a/test/multigrid/pgm_kernels.cpp +++ b/test/multigrid/pgm_kernels.cpp @@ -159,8 +159,8 @@ TEST_F(Pgm, MatchEdgeIsEquivalentToRef) auto d_x = d_unfinished_agg; gko::kernels::reference::pgm::match_edge(ref, strongest_neighbor, x); - gko::kernels::EXEC_NAMESPACE::pgm::match_edge(exec, d_strongest_neighbor, - d_x); + gko::kernels::GKO_DEVICE_NAMESPACE::pgm::match_edge( + exec, d_strongest_neighbor, d_x); GKO_ASSERT_ARRAY_EQ(d_x, x); } @@ -173,8 +173,8 @@ TEST_F(Pgm, CountUnaggIsEquivalentToRef) index_type d_num_unagg; gko::kernels::reference::pgm::count_unagg(ref, unfinished_agg, &num_unagg); - gko::kernels::EXEC_NAMESPACE::pgm::count_unagg(exec, d_unfinished_agg, - &d_num_unagg); + gko::kernels::GKO_DEVICE_NAMESPACE::pgm::count_unagg(exec, d_unfinished_agg, + &d_num_unagg); ASSERT_EQ(d_num_unagg, num_unagg); } @@ -187,7 +187,7 @@ TEST_F(Pgm, RenumberIsEquivalentToRef) index_type d_num_agg; 
gko::kernels::reference::pgm::renumber(ref, agg, &num_agg); - gko::kernels::EXEC_NAMESPACE::pgm::renumber(exec, d_agg, &d_num_agg); + gko::kernels::GKO_DEVICE_NAMESPACE::pgm::renumber(exec, d_agg, &d_num_agg); ASSERT_EQ(d_num_agg, num_agg); GKO_ASSERT_ARRAY_EQ(d_agg, agg); @@ -203,7 +203,7 @@ TEST_F(Pgm, FindStrongestNeighborIsEquivalentToRef) gko::kernels::reference::pgm::find_strongest_neighbor( ref, weight_csr.get(), weight_diag.get(), agg, snb); - gko::kernels::EXEC_NAMESPACE::pgm::find_strongest_neighbor( + gko::kernels::GKO_DEVICE_NAMESPACE::pgm::find_strongest_neighbor( exec, d_weight_csr.get(), d_weight_diag.get(), d_agg, d_snb); GKO_ASSERT_ARRAY_EQ(d_snb, snb); @@ -220,7 +220,7 @@ TEST_F(Pgm, AssignToExistAggIsEquivalentToRef) gko::kernels::reference::pgm::assign_to_exist_agg( ref, weight_csr.get(), weight_diag.get(), x, intermediate_agg); - gko::kernels::EXEC_NAMESPACE::pgm::assign_to_exist_agg( + gko::kernels::GKO_DEVICE_NAMESPACE::pgm::assign_to_exist_agg( exec, d_weight_csr.get(), d_weight_diag.get(), d_x, d_intermediate_agg); GKO_ASSERT_ARRAY_EQ(d_x, x); @@ -234,9 +234,10 @@ TEST_F(Pgm, AssignToExistAggUnderteminsticIsEquivalentToRef) auto d_intermediate_agg = gko::array(exec, 0); index_type d_num_unagg; - gko::kernels::EXEC_NAMESPACE::pgm::assign_to_exist_agg( + gko::kernels::GKO_DEVICE_NAMESPACE::pgm::assign_to_exist_agg( exec, d_weight_csr.get(), d_weight_diag.get(), d_x, d_intermediate_agg); - gko::kernels::EXEC_NAMESPACE::pgm::count_unagg(exec, d_agg, &d_num_unagg); + gko::kernels::GKO_DEVICE_NAMESPACE::pgm::count_unagg(exec, d_agg, + &d_num_unagg); // only test whether all elements are aggregated. 
GKO_ASSERT_EQ(d_num_unagg, 0); @@ -257,7 +258,7 @@ TEST_F(Pgm, GatherIndexIsEquivalentToRef) gko::kernels::reference::pgm::gather_index(ref, num, orig.get_const_data(), map.get_const_data(), result.get_data()); - gko::kernels::EXEC_NAMESPACE::pgm::gather_index( + gko::kernels::GKO_DEVICE_NAMESPACE::pgm::gather_index( exec, num, d_orig.get_const_data(), d_map.get_const_data(), d_result.get_data()); diff --git a/test/preconditioner/batch_jacobi_kernels.cpp b/test/preconditioner/batch_jacobi_kernels.cpp index 30dbfa271ee..f8a1bd015ef 100644 --- a/test/preconditioner/batch_jacobi_kernels.cpp +++ b/test/preconditioner/batch_jacobi_kernels.cpp @@ -117,7 +117,7 @@ class BatchJacobi : public CommonTestFixture { const gko::batch::BatchLinOp* prec, const Mtx* mtx, const MVec* b, MVec* x, LogData& log_data) { - gko::kernels::EXEC_NAMESPACE::batch_bicgstab::apply< + gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::apply< typename Mtx::value_type>(executor, settings, mtx, prec, b, x, log_data); }; diff --git a/test/preconditioner/isai_kernels.cpp b/test/preconditioner/isai_kernels.cpp index 57f8c14ac27..6e737d31790 100644 --- a/test/preconditioner/isai_kernels.cpp +++ b/test/preconditioner/isai_kernels.cpp @@ -122,7 +122,7 @@ TEST_F(Isai, IsaiGenerateLinverseShortIsEquivalentToRef) gko::kernels::reference::isai::generate_tri_inverse( ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); - gko::kernels::EXEC_NAMESPACE::isai::generate_tri_inverse( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_tri_inverse( exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), true); @@ -145,7 +145,7 @@ TEST_F(Isai, IsaiGenerateUinverseShortIsEquivalentToRef) gko::kernels::reference::isai::generate_tri_inverse( ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); - gko::kernels::EXEC_NAMESPACE::isai::generate_tri_inverse( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_tri_inverse( exec, d_mtx.get(), d_inverse.get(), da1.get_data(), 
da2.get_data(), false); @@ -168,7 +168,7 @@ TEST_F(Isai, IsaiGenerateAinverseShortIsEquivalentToRef) gko::kernels::reference::isai::generate_general_inverse( ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); - gko::kernels::EXEC_NAMESPACE::isai::generate_general_inverse( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_general_inverse( exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), false); @@ -191,7 +191,7 @@ TEST_F(Isai, IsaiGenerateSpdinverseShortIsEquivalentToRef) gko::kernels::reference::isai::generate_general_inverse( ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); - gko::kernels::EXEC_NAMESPACE::isai::generate_general_inverse( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_general_inverse( exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), true); @@ -214,7 +214,7 @@ TEST_F(Isai, IsaiGenerateLinverseLongIsEquivalentToRef) gko::kernels::reference::isai::generate_tri_inverse( ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); - gko::kernels::EXEC_NAMESPACE::isai::generate_tri_inverse( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_tri_inverse( exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), true); @@ -237,7 +237,7 @@ TEST_F(Isai, IsaiGenerateUinverseLongIsEquivalentToRef) gko::kernels::reference::isai::generate_tri_inverse( ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); - gko::kernels::EXEC_NAMESPACE::isai::generate_tri_inverse( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_tri_inverse( exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), false); @@ -260,7 +260,7 @@ TEST_F(Isai, IsaiGenerateAinverseLongIsEquivalentToRef) gko::kernels::reference::isai::generate_general_inverse( ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); - gko::kernels::EXEC_NAMESPACE::isai::generate_general_inverse( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_general_inverse( exec, 
d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), false); @@ -283,7 +283,7 @@ TEST_F(Isai, IsaiGenerateSpdinverseLongIsEquivalentToRef) gko::kernels::reference::isai::generate_general_inverse( ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); - gko::kernels::EXEC_NAMESPACE::isai::generate_general_inverse( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_general_inverse( exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), false); @@ -315,7 +315,7 @@ TEST_F(Isai, IsaiGenerateExcessLinverseLongIsEquivalentToRef) gko::kernels::reference::isai::generate_excess_system( ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), excess.get(), e_rhs.get(), 0, num_rows); - gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system( exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(), da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows); @@ -346,7 +346,7 @@ TEST_F(Isai, IsaiGenerateExcessUinverseLongIsEquivalentToRef) gko::kernels::reference::isai::generate_excess_system( ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), excess.get(), e_rhs.get(), 0, num_rows); - gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system( exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(), da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows); @@ -377,7 +377,7 @@ TEST_F(Isai, IsaiGenerateExcessAinverseLongIsEquivalentToRef) gko::kernels::reference::isai::generate_excess_system( ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), excess.get(), e_rhs.get(), 0, num_rows); - gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system( exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(), da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows); @@ 
-408,7 +408,7 @@ TEST_F(Isai, IsaiGenerateExcessSpdinverseLongIsEquivalentToRef) gko::kernels::reference::isai::generate_excess_system( ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), excess.get(), e_rhs.get(), 0, num_rows); - gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system( exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(), da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows); @@ -439,7 +439,7 @@ TEST_F(Isai, IsaiGeneratePartialExcessIsEquivalentToRef) gko::kernels::reference::isai::generate_excess_system( ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), excess.get(), e_rhs.get(), 5u, 10u); - gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system( exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(), da2.get_const_data(), dexcess.get(), de_rhs.get(), 5u, 10u); @@ -467,7 +467,7 @@ TEST_F(Isai, IsaiScaleExcessSolutionIsEquivalentToRef) gko::kernels::reference::isai::scale_excess_solution( ref, a1.get_const_data(), e_rhs.get(), 0, num_rows); - gko::kernels::EXEC_NAMESPACE::isai::scale_excess_solution( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::scale_excess_solution( exec, da1.get_const_data(), de_rhs.get(), 0, num_rows); GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); @@ -490,7 +490,7 @@ TEST_F(Isai, IsaiScalePartialExcessSolutionIsEquivalentToRef) gko::kernels::reference::isai::scale_excess_solution( ref, a1.get_const_data(), e_rhs.get(), 5u, 10u); - gko::kernels::EXEC_NAMESPACE::isai::scale_excess_solution( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::scale_excess_solution( exec, da1.get_const_data(), de_rhs.get(), 5u, 10u); GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); @@ -514,7 +514,7 @@ TEST_F(Isai, IsaiScatterExcessSolutionLIsEquivalentToRef) gko::kernels::reference::isai::scatter_excess_solution( ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, 
num_rows); - gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution( exec, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, num_rows); GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); @@ -540,7 +540,7 @@ TEST_F(Isai, IsaiScatterExcessSolutionUIsEquivalentToRef) gko::kernels::reference::isai::scatter_excess_solution( ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows); - gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution( exec, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, num_rows); GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); @@ -566,7 +566,7 @@ TEST_F(Isai, IsaiScatterExcessSolutionAIsEquivalentToRef) gko::kernels::reference::isai::scatter_excess_solution( ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows); - gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution( exec, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, num_rows); GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); @@ -592,7 +592,7 @@ TEST_F(Isai, IsaiScatterExcessSolutionSpdIsEquivalentToRef) gko::kernels::reference::isai::scatter_excess_solution( ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows); - gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution( exec, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, num_rows); GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); @@ -618,7 +618,7 @@ TEST_F(Isai, IsaiScatterPartialExcessSolutionIsEquivalentToRef) gko::kernels::reference::isai::scatter_excess_solution( ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 5u, 10u); - gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution( + gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution( exec, da1.get_const_data(), de_rhs.get(), 
d_inverse.get(), 5u, 10u); GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); diff --git a/test/solver/batch_bicgstab_kernels.cpp b/test/solver/batch_bicgstab_kernels.cpp index 821a8a6d29c..14bca65e41f 100644 --- a/test/solver/batch_bicgstab_kernels.cpp +++ b/test/solver/batch_bicgstab_kernels.cpp @@ -52,7 +52,7 @@ class BatchBicgstab : public CommonTestFixture { const gko::batch::BatchLinOp* prec, const Mtx* mtx, const MVec* b, MVec* x, LogData& log_data) { - gko::kernels::EXEC_NAMESPACE::batch_bicgstab::apply< + gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::apply< typename Mtx::value_type>(executor, settings, mtx, prec, b, x, log_data); }; diff --git a/test/solver/batch_cg_kernels.cpp b/test/solver/batch_cg_kernels.cpp index 49f0db2a09b..7c013020686 100644 --- a/test/solver/batch_cg_kernels.cpp +++ b/test/solver/batch_cg_kernels.cpp @@ -50,7 +50,7 @@ class BatchCg : public CommonTestFixture { const gko::batch::BatchLinOp* prec, const Mtx* mtx, const MVec* b, MVec* x, LogData& log_data) { - gko::kernels::EXEC_NAMESPACE::batch_cg::apply< + gko::kernels::GKO_DEVICE_NAMESPACE::batch_cg::apply< typename Mtx::value_type>(executor, settings, mtx, prec, b, x, log_data); }; diff --git a/test/solver/bicg_kernels.cpp b/test/solver/bicg_kernels.cpp index 616f7eff096..ab63b01f9cc 100644 --- a/test/solver/bicg_kernels.cpp +++ b/test/solver/bicg_kernels.cpp @@ -139,7 +139,7 @@ TEST_F(Bicg, BicgInitializeIsEquivalentToRef) gko::kernels::reference::bicg::initialize( ref, b.get(), r.get(), z.get(), p.get(), q.get(), prev_rho.get(), rho.get(), r2.get(), z2.get(), p2.get(), q2.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::bicg::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::bicg::initialize( exec, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(), d_prev_rho.get(), d_rho.get(), d_r2.get(), d_z2.get(), d_p2.get(), d_q2.get(), d_stop_status.get()); @@ -165,7 +165,7 @@ TEST_F(Bicg, BicgStep1IsEquivalentToRef) gko::kernels::reference::bicg::step_1(ref, p.get(), 
z.get(), p2.get(), z2.get(), rho.get(), prev_rho.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::bicg::step_1( + gko::kernels::GKO_DEVICE_NAMESPACE::bicg::step_1( exec, d_p.get(), d_z.get(), d_p2.get(), d_z2.get(), d_rho.get(), d_prev_rho.get(), d_stop_status.get()); @@ -183,7 +183,7 @@ TEST_F(Bicg, BicgStep2IsEquivalentToRef) gko::kernels::reference::bicg::step_2( ref, x.get(), r.get(), r2.get(), p.get(), q.get(), q2.get(), beta.get(), rho.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::bicg::step_2( + gko::kernels::GKO_DEVICE_NAMESPACE::bicg::step_2( exec, d_x.get(), d_r.get(), d_r2.get(), d_p.get(), d_q.get(), d_q2.get(), d_beta.get(), d_rho.get(), d_stop_status.get()); diff --git a/test/solver/bicgstab_kernels.cpp b/test/solver/bicgstab_kernels.cpp index a63ff7f39f4..4f68edd6a8e 100644 --- a/test/solver/bicgstab_kernels.cpp +++ b/test/solver/bicgstab_kernels.cpp @@ -176,7 +176,7 @@ TEST_F(Bicgstab, BicgstabInitializeIsEquivalentToRef) ref, b.get(), r.get(), rr.get(), y.get(), s.get(), t.get(), z.get(), v.get(), p.get(), prev_rho.get(), rho.get(), alpha.get(), beta.get(), gamma.get(), omega.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::bicgstab::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::initialize( exec, d_b.get(), d_r.get(), d_rr.get(), d_y.get(), d_s.get(), d_t.get(), d_z.get(), d_v.get(), d_p.get(), d_prev_rho.get(), d_rho.get(), d_alpha.get(), d_beta.get(), d_gamma.get(), d_omega.get(), @@ -207,7 +207,7 @@ TEST_F(Bicgstab, BicgstabStep1IsEquivalentToRef) gko::kernels::reference::bicgstab::step_1( ref, r.get(), p.get(), v.get(), rho.get(), prev_rho.get(), alpha.get(), omega.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::bicgstab::step_1( + gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::step_1( exec, d_r.get(), d_p.get(), d_v.get(), d_rho.get(), d_prev_rho.get(), d_alpha.get(), d_omega.get(), d_stop_status.get()); @@ -222,7 +222,7 @@ TEST_F(Bicgstab, BicgstabStep2IsEquivalentToRef) 
gko::kernels::reference::bicgstab::step_2(ref, r.get(), s.get(), v.get(), rho.get(), alpha.get(), beta.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::bicgstab::step_2( + gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::step_2( exec, d_r.get(), d_s.get(), d_v.get(), d_rho.get(), d_alpha.get(), d_beta.get(), d_stop_status.get()); @@ -238,7 +238,7 @@ TEST_F(Bicgstab, BicgstabStep3IsEquivalentToRef) gko::kernels::reference::bicgstab::step_3( ref, x.get(), r.get(), s.get(), t.get(), y.get(), z.get(), alpha.get(), beta.get(), gamma.get(), omega.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::bicgstab::step_3( + gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::step_3( exec, d_x.get(), d_r.get(), d_s.get(), d_t.get(), d_y.get(), d_z.get(), d_alpha.get(), d_beta.get(), d_gamma.get(), d_omega.get(), d_stop_status.get()); diff --git a/test/solver/cb_gmres_kernels.cpp b/test/solver/cb_gmres_kernels.cpp index 4f854a26180..3b5f5956c2e 100644 --- a/test/solver/cb_gmres_kernels.cpp +++ b/test/solver/cb_gmres_kernels.cpp @@ -209,7 +209,7 @@ TEST_F(CbGmres, CbGmresInitialize1IsEquivalentToRef) gko::kernels::reference::cb_gmres::initialize( ref, b.get(), residual.get(), givens_sin.get(), givens_cos.get(), stop_status.get(), default_krylov_dim_mixed); - gko::kernels::EXEC_NAMESPACE::cb_gmres::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::cb_gmres::initialize( exec, d_b.get(), d_residual.get(), d_givens_sin.get(), d_givens_cos.get(), d_stop_status.get(), default_krylov_dim_mixed); @@ -230,7 +230,7 @@ TEST_F(CbGmres, CbGmresInitialize2IsEquivalentToRef) residual_norm_collection.get(), arnoldi_norm.get(), range_helper.get_range(), next_krylov_basis.get(), final_iter_nums.get(), tmp, default_krylov_dim_mixed); - gko::kernels::EXEC_NAMESPACE::cb_gmres::restart( + gko::kernels::GKO_DEVICE_NAMESPACE::cb_gmres::restart( exec, d_residual.get(), d_residual_norm.get(), d_residual_norm_collection.get(), d_arnoldi_norm.get(), d_range_helper.get_range(), 
d_next_krylov_basis.get(), @@ -255,7 +255,7 @@ TEST_F(CbGmres, CbGmresStep1IsEquivalentToRef) range_helper.get_range(), hessenberg_iter.get(), buffer_iter.get(), arnoldi_norm.get(), iter, final_iter_nums.get(), stop_status.get(), reorth_status.get(), num_reorth.get()); - gko::kernels::EXEC_NAMESPACE::cb_gmres::arnoldi( + gko::kernels::GKO_DEVICE_NAMESPACE::cb_gmres::arnoldi( exec, d_next_krylov_basis.get(), d_givens_sin.get(), d_givens_cos.get(), d_residual_norm.get(), d_residual_norm_collection.get(), d_range_helper.get_range(), d_hessenberg_iter.get(), @@ -285,7 +285,7 @@ TEST_F(CbGmres, CbGmresStep2IsEquivalentToRef) ref, residual_norm_collection.get(), range_helper.get_range().get_accessor().to_const(), hessenberg.get(), y.get(), before_preconditioner.get(), final_iter_nums.get()); - gko::kernels::EXEC_NAMESPACE::cb_gmres::solve_krylov( + gko::kernels::GKO_DEVICE_NAMESPACE::cb_gmres::solve_krylov( exec, d_residual_norm_collection.get(), d_range_helper.get_range().get_accessor().to_const(), d_hessenberg.get(), d_y.get(), d_before_preconditioner.get(), diff --git a/test/solver/cg_kernels.cpp b/test/solver/cg_kernels.cpp index 41af16489a2..be9dc052314 100644 --- a/test/solver/cg_kernels.cpp +++ b/test/solver/cg_kernels.cpp @@ -114,7 +114,7 @@ TEST_F(Cg, CgInitializeIsEquivalentToRef) gko::kernels::reference::cg::initialize(ref, b.get(), r.get(), z.get(), p.get(), q.get(), prev_rho.get(), rho.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::cg::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::cg::initialize( exec, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(), d_prev_rho.get(), d_rho.get(), d_stop_status.get()); @@ -134,9 +134,9 @@ TEST_F(Cg, CgStep1IsEquivalentToRef) gko::kernels::reference::cg::step_1(ref, p.get(), z.get(), rho.get(), prev_rho.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::cg::step_1(exec, d_p.get(), d_z.get(), - d_rho.get(), d_prev_rho.get(), - d_stop_status.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::cg::step_1( 
+ exec, d_p.get(), d_z.get(), d_rho.get(), d_prev_rho.get(), + d_stop_status.get()); GKO_ASSERT_MTX_NEAR(d_p, p, ::r::value); GKO_ASSERT_MTX_NEAR(d_z, z, ::r::value); @@ -149,9 +149,9 @@ TEST_F(Cg, CgStep2IsEquivalentToRef) gko::kernels::reference::cg::step_2(ref, x.get(), r.get(), p.get(), q.get(), beta.get(), rho.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::cg::step_2(exec, d_x.get(), d_r.get(), - d_p.get(), d_q.get(), d_beta.get(), - d_rho.get(), d_stop_status.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::cg::step_2( + exec, d_x.get(), d_r.get(), d_p.get(), d_q.get(), d_beta.get(), + d_rho.get(), d_stop_status.get()); GKO_ASSERT_MTX_NEAR(d_x, x, ::r::value); GKO_ASSERT_MTX_NEAR(d_r, r, ::r::value); diff --git a/test/solver/cgs_kernels.cpp b/test/solver/cgs_kernels.cpp index 123f76727b5..6c2bab293e3 100644 --- a/test/solver/cgs_kernels.cpp +++ b/test/solver/cgs_kernels.cpp @@ -167,7 +167,7 @@ TEST_F(Cgs, CgsInitializeIsEquivalentToRef) ref, b.get(), r.get(), r_tld.get(), p.get(), q.get(), u.get(), u_hat.get(), v_hat.get(), t.get(), alpha.get(), beta.get(), gamma.get(), rho_prev.get(), rho.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::cgs::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::cgs::initialize( exec, d_b.get(), d_r.get(), d_r_tld.get(), d_p.get(), d_q.get(), d_u.get(), d_u_hat.get(), d_v_hat.get(), d_t.get(), d_alpha.get(), d_beta.get(), d_gamma.get(), d_rho_prev.get(), d_rho.get(), @@ -197,7 +197,7 @@ TEST_F(Cgs, CgsStep1IsEquivalentToRef) gko::kernels::reference::cgs::step_1(ref, r.get(), u.get(), p.get(), q.get(), beta.get(), rho.get(), rho_prev.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::cgs::step_1( + gko::kernels::GKO_DEVICE_NAMESPACE::cgs::step_1( exec, d_r.get(), d_u.get(), d_p.get(), d_q.get(), d_beta.get(), d_rho.get(), d_rho_prev.get(), d_stop_status.get()); @@ -214,7 +214,7 @@ TEST_F(Cgs, CgsStep2IsEquivalentToRef) gko::kernels::reference::cgs::step_2(ref, u.get(), v_hat.get(), q.get(), t.get(), 
alpha.get(), rho.get(), gamma.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::cgs::step_2( + gko::kernels::GKO_DEVICE_NAMESPACE::cgs::step_2( exec, d_u.get(), d_v_hat.get(), d_q.get(), d_t.get(), d_alpha.get(), d_rho.get(), d_gamma.get(), d_stop_status.get()); @@ -231,7 +231,7 @@ TEST_F(Cgs, CgsStep3IsEquivalentToRef) gko::kernels::reference::cgs::step_3(ref, t.get(), u_hat.get(), r.get(), x.get(), alpha.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::cgs::step_3( + gko::kernels::GKO_DEVICE_NAMESPACE::cgs::step_3( exec, d_t.get(), d_u_hat.get(), d_r.get(), d_x.get(), d_alpha.get(), d_stop_status.get()); diff --git a/test/solver/fcg_kernels.cpp b/test/solver/fcg_kernels.cpp index faf7225c883..f1f09f759bc 100644 --- a/test/solver/fcg_kernels.cpp +++ b/test/solver/fcg_kernels.cpp @@ -122,7 +122,7 @@ TEST_F(Fcg, FcgInitializeIsEquivalentToRef) gko::kernels::reference::fcg::initialize( ref, b.get(), r.get(), z.get(), p.get(), q.get(), t.get(), prev_rho.get(), rho.get(), rho_t.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::fcg::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::fcg::initialize( exec, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(), d_t.get(), d_prev_rho.get(), d_rho.get(), d_rho_t.get(), d_stop_status.get()); @@ -144,9 +144,9 @@ TEST_F(Fcg, FcgStep1IsEquivalentToRef) gko::kernels::reference::fcg::step_1(ref, p.get(), z.get(), rho_t.get(), prev_rho.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::fcg::step_1(exec, d_p.get(), d_z.get(), - d_rho_t.get(), d_prev_rho.get(), - d_stop_status.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::fcg::step_1( + exec, d_p.get(), d_z.get(), d_rho_t.get(), d_prev_rho.get(), + d_stop_status.get()); GKO_ASSERT_MTX_NEAR(d_p, p, ::r::value); GKO_ASSERT_MTX_NEAR(d_z, z, ::r::value); @@ -159,7 +159,7 @@ TEST_F(Fcg, FcgStep2IsEquivalentToRef) gko::kernels::reference::fcg::step_2(ref, x.get(), r.get(), t.get(), p.get(), q.get(), beta.get(), rho.get(), stop_status.get()); - 
gko::kernels::EXEC_NAMESPACE::fcg::step_2( + gko::kernels::GKO_DEVICE_NAMESPACE::fcg::step_2( exec, d_x.get(), d_r.get(), d_t.get(), d_p.get(), d_q.get(), d_beta.get(), d_rho.get(), d_stop_status.get()); diff --git a/test/solver/gcr_kernels.cpp b/test/solver/gcr_kernels.cpp index 575d55ded87..7a00b3fed30 100644 --- a/test/solver/gcr_kernels.cpp +++ b/test/solver/gcr_kernels.cpp @@ -153,7 +153,7 @@ TEST_F(Gcr, GcrKernelInitializeIsEquivalentToRef) gko::kernels::reference::gcr::initialize(ref, b.get(), residual.get(), stop_status.get_data()); - gko::kernels::EXEC_NAMESPACE::gcr::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::gcr::initialize( exec, d_b.get(), d_residual.get(), d_stop_status.get_data()); GKO_ASSERT_MTX_NEAR(d_residual, residual, r::value); @@ -168,7 +168,7 @@ TEST_F(Gcr, GcrKernelRestartIsEquivalentToRef) gko::kernels::reference::gcr::restart(ref, residual.get(), A_residual.get(), p_bases.get(), Ap_bases.get(), final_iter_nums.get_data()); - gko::kernels::EXEC_NAMESPACE::gcr::restart( + gko::kernels::GKO_DEVICE_NAMESPACE::gcr::restart( exec, d_residual.get(), d_A_residual.get(), d_p_bases.get(), d_Ap_bases.get(), d_final_iter_nums.get_data()); @@ -186,7 +186,7 @@ TEST_F(Gcr, GcrStep1IsEquivalentToRef) gko::kernels::reference::gcr::step_1(ref, x.get(), residual.get(), p.get(), Ap.get(), Ap_norm.get(), rAp.get(), stop_status.get_data()); - gko::kernels::EXEC_NAMESPACE::gcr::step_1( + gko::kernels::GKO_DEVICE_NAMESPACE::gcr::step_1( exec, d_x.get(), d_residual.get(), d_p.get(), d_Ap.get(), d_Ap_norm.get(), d_rAp.get(), d_stop_status.get_data()); diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp index ac9139d81aa..08259c91ce0 100644 --- a/test/solver/gmres_kernels.cpp +++ b/test/solver/gmres_kernels.cpp @@ -159,7 +159,7 @@ TEST_F(Gmres, GmresKernelInitializeIsEquivalentToRef) gko::kernels::reference::common_gmres::initialize( ref, b.get(), residual.get(), givens_sin.get(), givens_cos.get(), stop_status.get_data()); - 
gko::kernels::EXEC_NAMESPACE::common_gmres::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::common_gmres::initialize( exec, d_b.get(), d_residual.get(), d_givens_sin.get(), d_givens_cos.get(), d_stop_status.get_data()); @@ -180,7 +180,7 @@ TEST_F(Gmres, GmresKernelRestartIsEquivalentToRef) ref, residual.get(), residual_norm.get(), residual_norm_collection.get(), krylov_bases.get(), final_iter_nums.get_data()); - gko::kernels::EXEC_NAMESPACE::gmres::restart( + gko::kernels::GKO_DEVICE_NAMESPACE::gmres::restart( exec, d_residual.get(), d_residual_norm.get(), d_residual_norm_collection.get(), d_krylov_bases.get(), d_final_iter_nums.get_data()); @@ -202,7 +202,7 @@ TEST_F(Gmres, GmresKernelHessenbergQRIsEquivalentToRef) ref, givens_sin.get(), givens_cos.get(), residual_norm.get(), residual_norm_collection.get(), hessenberg_iter.get(), iter, final_iter_nums.get_data(), stop_status.get_const_data()); - gko::kernels::EXEC_NAMESPACE::common_gmres::hessenberg_qr( + gko::kernels::GKO_DEVICE_NAMESPACE::common_gmres::hessenberg_qr( exec, d_givens_sin.get(), d_givens_cos.get(), d_residual_norm.get(), d_residual_norm_collection.get(), d_hessenberg_iter.get(), iter, d_final_iter_nums.get_data(), d_stop_status.get_const_data()); @@ -228,7 +228,7 @@ TEST_F(Gmres, GmresKernelHessenbergQROnSingleRHSIsEquivalentToRef) ref, givens_sin.get(), givens_cos.get(), residual_norm.get(), residual_norm_collection.get(), hessenberg_iter.get(), iter, final_iter_nums.get_data(), stop_status.get_const_data()); - gko::kernels::EXEC_NAMESPACE::common_gmres::hessenberg_qr( + gko::kernels::GKO_DEVICE_NAMESPACE::common_gmres::hessenberg_qr( exec, d_givens_sin.get(), d_givens_cos.get(), d_residual_norm.get(), d_residual_norm_collection.get(), d_hessenberg_iter.get(), iter, d_final_iter_nums.get_data(), d_stop_status.get_const_data()); @@ -252,7 +252,7 @@ TEST_F(Gmres, GmresKernelSolveKrylovIsEquivalentToRef) gko::kernels::reference::common_gmres::solve_krylov( ref, residual_norm_collection.get(), 
hessenberg.get(), y.get(), final_iter_nums.get_const_data(), stop_status.get_const_data()); - gko::kernels::EXEC_NAMESPACE::common_gmres::solve_krylov( + gko::kernels::GKO_DEVICE_NAMESPACE::common_gmres::solve_krylov( exec, d_residual_norm_collection.get(), d_hessenberg.get(), d_y.get(), d_final_iter_nums.get_const_data(), d_stop_status.get_const_data()); @@ -267,7 +267,7 @@ TEST_F(Gmres, GmresKernelMultiAxpyIsEquivalentToRef) gko::kernels::reference::gmres::multi_axpy( ref, krylov_bases.get(), y.get(), before_preconditioner.get(), final_iter_nums.get_const_data(), stop_status.get_data()); - gko::kernels::EXEC_NAMESPACE::gmres::multi_axpy( + gko::kernels::GKO_DEVICE_NAMESPACE::gmres::multi_axpy( exec, d_krylov_bases.get(), d_y.get(), d_before_preconditioner.get(), d_final_iter_nums.get_const_data(), d_stop_status.get_data()); diff --git a/test/solver/idr_kernels.cpp b/test/solver/idr_kernels.cpp index 31c7df99168..b165824dbe0 100644 --- a/test/solver/idr_kernels.cpp +++ b/test/solver/idr_kernels.cpp @@ -160,7 +160,7 @@ TEST_F(Idr, IdrInitializeIsEquivalentToRef) gko::kernels::reference::idr::initialize(ref, nrhs, m.get(), p.get(), true, stop_status.get()); - gko::kernels::EXEC_NAMESPACE::idr::initialize( + gko::kernels::GKO_DEVICE_NAMESPACE::idr::initialize( exec, nrhs, d_m.get(), d_p.get(), true, d_stop_status.get()); GKO_ASSERT_MTX_NEAR(m, d_m, rr::value); @@ -176,7 +176,7 @@ TEST_F(Idr, IdrStep1IsEquivalentToRef) gko::kernels::reference::idr::step_1(ref, nrhs, k, m.get(), f.get(), r.get(), g.get(), c.get(), v.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::idr::step_1( + gko::kernels::GKO_DEVICE_NAMESPACE::idr::step_1( exec, nrhs, k, d_m.get(), d_f.get(), d_r.get(), d_g.get(), d_c.get(), d_v.get(), d_stop_status.get()); @@ -192,9 +192,9 @@ TEST_F(Idr, IdrStep2IsEquivalentToRef) gko::size_type k = 2; gko::kernels::reference::idr::step_2(ref, nrhs, k, omega.get(), v.get(), c.get(), u.get(), stop_status.get()); - 
gko::kernels::EXEC_NAMESPACE::idr::step_2(exec, nrhs, k, d_omega.get(), - d_v.get(), d_c.get(), d_u.get(), - d_stop_status.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::idr::step_2( + exec, nrhs, k, d_omega.get(), d_v.get(), d_c.get(), d_u.get(), + d_stop_status.get()); GKO_ASSERT_MTX_NEAR(u, d_u, rr::value); } @@ -208,7 +208,7 @@ TEST_F(Idr, IdrStep3IsEquivalentToRef) gko::kernels::reference::idr::step_3( ref, nrhs, k, p.get(), g.get(), v.get(), u.get(), m.get(), f.get(), alpha.get(), r.get(), x.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::idr::step_3( + gko::kernels::GKO_DEVICE_NAMESPACE::idr::step_3( exec, nrhs, k, d_p.get(), d_g.get(), d_v.get(), d_u.get(), d_m.get(), d_f.get(), d_alpha.get(), d_r.get(), d_x.get(), d_stop_status.get()); @@ -230,7 +230,7 @@ TEST_F(Idr, IdrComputeOmegaIsEquivalentToRef) gko::kernels::reference::idr::compute_omega(ref, nrhs, kappa, tht.get(), residual_norm.get(), omega.get(), stop_status.get()); - gko::kernels::EXEC_NAMESPACE::idr::compute_omega( + gko::kernels::GKO_DEVICE_NAMESPACE::idr::compute_omega( exec, nrhs, kappa, d_tht.get(), d_residual_norm.get(), d_omega.get(), d_stop_status.get()); diff --git a/test/solver/ir_kernels.cpp b/test/solver/ir_kernels.cpp index 99550dfd99f..7a8e84324bd 100644 --- a/test/solver/ir_kernels.cpp +++ b/test/solver/ir_kernels.cpp @@ -55,7 +55,7 @@ TEST_F(Ir, InitializeIsEquivalentToRef) auto d_stop_status = gko::array(exec, stop_status); gko::kernels::reference::ir::initialize(ref, &stop_status); - gko::kernels::EXEC_NAMESPACE::ir::initialize(exec, &d_stop_status); + gko::kernels::GKO_DEVICE_NAMESPACE::ir::initialize(exec, &d_stop_status); auto tmp = gko::array(ref, d_stop_status); for (int i = 0; i < stop_status.get_size(); ++i) { diff --git a/test/solver/multigrid_kernels.cpp b/test/solver/multigrid_kernels.cpp index 139cb1a4647..4b4b0157df5 100644 --- a/test/solver/multigrid_kernels.cpp +++ b/test/solver/multigrid_kernels.cpp @@ -144,7 +144,7 @@ TEST_F(Multigrid, 
MultigridKCycleStep1IsEquivalentToRef) gko::kernels::reference::multigrid::kcycle_step_1( ref, alpha.get(), rho.get(), v.get(), g.get(), d.get(), e.get()); - gko::kernels::EXEC_NAMESPACE::multigrid::kcycle_step_1( + gko::kernels::GKO_DEVICE_NAMESPACE::multigrid::kcycle_step_1( exec, d_alpha.get(), d_rho.get(), d_v.get(), d_g.get(), d_d.get(), d_e.get()); @@ -161,7 +161,7 @@ TEST_F(Multigrid, MultigridKCycleStep2IsEquivalentToRef) gko::kernels::reference::multigrid::kcycle_step_2( ref, alpha.get(), rho.get(), gamma.get(), beta.get(), zeta.get(), d.get(), e.get()); - gko::kernels::EXEC_NAMESPACE::multigrid::kcycle_step_2( + gko::kernels::GKO_DEVICE_NAMESPACE::multigrid::kcycle_step_2( exec, d_alpha.get(), d_rho.get(), d_gamma.get(), d_beta.get(), d_zeta.get(), d_d.get(), d_e.get()); @@ -179,11 +179,11 @@ TEST_F(Multigrid, MultigridKCycleCheckStopIsEquivalentToRef) gko::kernels::reference::multigrid::kcycle_check_stop( ref, old_norm.get(), new_norm.get(), 1.0, is_stop_10); - gko::kernels::EXEC_NAMESPACE::multigrid::kcycle_check_stop( + gko::kernels::GKO_DEVICE_NAMESPACE::multigrid::kcycle_check_stop( exec, d_old_norm.get(), d_new_norm.get(), 1.0, d_is_stop_10); gko::kernels::reference::multigrid::kcycle_check_stop( ref, old_norm.get(), new_norm.get(), 0.5, is_stop_5); - gko::kernels::EXEC_NAMESPACE::multigrid::kcycle_check_stop( + gko::kernels::GKO_DEVICE_NAMESPACE::multigrid::kcycle_check_stop( exec, d_old_norm.get(), d_new_norm.get(), 0.5, d_is_stop_5); GKO_ASSERT_EQ(d_is_stop_10, is_stop_10);