diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d2cae1ddf5f..709f2b4f53a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -93,19 +93,6 @@ trigger_pipeline: # Build jobs # Job with example runs. -# cuda 9.2 and friends -build/cuda92/nompi/gcc/all/release/shared: - extends: - - .build_and_test_template - - .default_variables - - .quick_test_condition - - .use_gko-cuda92-mvapich2-gnu7-llvm50-intel2017 - variables: - BUILD_OMP: "ON" - BUILD_CUDA: "ON" - BUILD_HIP: "ON" - BUILD_TYPE: "Release" - # cuda 10.1 and friends # Build CUDA NVIDIA without omp # Make sure that our jobs run when HWLOC is diff --git a/.gitlab/image.yml b/.gitlab/image.yml index 50dfbe9d2f8..cad06674aee 100644 --- a/.gitlab/image.yml +++ b/.gitlab/image.yml @@ -24,12 +24,6 @@ - cpu - controller -.use_gko-cuda92-mvapich2-gnu7-llvm50-intel2017: - image: ginkgohub/cuda:92-mvapich2-gnu7-llvm50-intel2017 - tags: - - private_ci - - nvidia-gpu - .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019: image: ginkgohub/cuda:101-openmpi-gnu8-llvm7-intel2019 tags: diff --git a/README.md b/README.md index be7572c19ea..21948853c01 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ For Ginkgo core library: The Ginkgo CUDA module has the following __additional__ requirements: -* _CUDA 9.2+_ or _NVHPC Package 22.7+_ +* _CUDA 10.1+_ or _NVHPC Package 22.7+_ * Any host compiler restrictions your version of CUDA may impose also apply here. For the newest CUDA version, this information can be found in the [CUDA installation guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) @@ -58,7 +58,7 @@ The Ginkgo HIP module has the following __additional__ requirements: * _ROCm 4.5+_ * the HIP, hipBLAS, hipSPARSE, hip/rocRAND and rocThrust packages compiled with either: * _AMD_ backend (using the `clang` compiler) - * _9.2 <= CUDA < 11_ backend + * _10.1 <= CUDA < 11_ backend * if the hipFFT package is available, it is used to implement the FFT LinOps. The Ginkgo DPC++ module has the following __additional__ requirements: @@ -90,7 +90,7 @@ following: The Ginkgo CUDA module has the following __additional__ requirements: -* _CUDA 9.2+_ +* _CUDA 10.1+_ * _Microsoft Visual Studio_ * Any host compiler restrictions your version of CUDA may impose also apply here. For the newest CUDA version, this information can be found in the diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index c5ba334e983..88a1b4e777a 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -85,12 +85,3 @@ if(CMAKE_CUDA_HOST_COMPILER AND NOT CMAKE_CXX_COMPILER STREQUAL CMAKE_CUDA_HOST_ "The CUDA host compiler is ${CMAKE_CUDA_HOST_COMPILER}.") endif() -if (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION - MATCHES "9.2" AND CMAKE_CUDA_HOST_COMPILER MATCHES ".*clang.*" ) - ginkgo_extract_clang_version(${CMAKE_CUDA_HOST_COMPILER} GINKGO_CUDA_HOST_CLANG_VERSION) - - if (GINKGO_CUDA_HOST_CLANG_VERSION MATCHES "5\.0.*") - message(FATAL_ERROR "There is a bug between nvcc 9.2 and clang 5.0 which create a compiling issue." - "Consider using a different CUDA host compiler or CUDA version.") - endif() -endif() diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 5b7a268c7b6..bb141450b25 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -22,11 +22,6 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.21) set(CMAKE_HIP_ARCHITECTURES OFF) endif() -if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}" - AND GINKGO_BUILD_CUDA AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.2) - message(FATAL_ERROR "Ginkgo HIP backend requires CUDA >= 9.2.") -endif() - if(NOT DEFINED ROCM_PATH) if(DEFINED ENV{ROCM_PATH}) set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCM has been installed") @@ -197,16 +192,6 @@ if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") # Remove false positive CUDA warnings when calling one() and zero() list(APPEND GINKGO_HIP_NVCC_ADDITIONAL_FLAGS --expt-relaxed-constexpr --expt-extended-lambda) - if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}" - AND CMAKE_CUDA_COMPILER_VERSION MATCHES "9.2" - AND CMAKE_CUDA_HOST_COMPILER MATCHES ".*clang.*" ) - ginkgo_extract_clang_version(${CMAKE_CUDA_HOST_COMPILER} GINKGO_CUDA_HOST_CLANG_VERSION) - - if (GINKGO_CUDA_HOST_CLANG_VERSION MATCHES "5\.0.*") - message(FATAL_ERROR "There is a bug between nvcc 9.2 and clang 5.0 which create a compiling issue." - "Consider using a different CUDA host compiler or CUDA version.") - endif() - endif() # select GPU architecture include(cmake/Modules/CudaArchitectureSelector.cmake) cas_variable_cuda_architectures(GINKGO_HIP_NVCC_ARCH diff --git a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc b/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc index 5930902ed37..faf0ad15146 100644 --- a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc +++ b/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc @@ -35,19 +35,13 @@ void remove_zeros(std::shared_ptr exec, array& values, array& row_idxs, array& col_idxs) { - // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken - // due to overly generic assignment operator and constructor leading to - // ambiguities. So we need to use our own fake_complex type - using device_value_type = device_member_type; - auto value_ptr = - reinterpret_cast(values.get_const_data()); + using device_value_type = device_type; + auto value_ptr = as_device_type(values.get_const_data()); auto size = values.get_num_elems(); // count nonzeros - auto nnz = - thrust::count_if(thrust_policy(exec), value_ptr, value_ptr + size, - [] __device__(device_value_type value) { - return is_nonzero(fake_complex_unpack(value)); - }); + auto nnz = thrust::count_if( + thrust_policy(exec), value_ptr, value_ptr + size, + [] __device__(device_value_type value) { return is_nonzero(value); }); if (nnz < size) { using tuple_type = thrust::tuple; @@ -58,14 +52,13 @@ void remove_zeros(std::shared_ptr exec, // copy nonzeros auto it = thrust::make_zip_iterator(thrust::make_tuple( row_idxs.get_const_data(), col_idxs.get_const_data(), value_ptr)); - auto out_it = thrust::make_zip_iterator(thrust::make_tuple( - new_row_idxs.get_data(), new_col_idxs.get_data(), - reinterpret_cast(new_values.get_data()))); - thrust::copy_if( - thrust_policy(exec), it, it + size, out_it, - [] __device__(tuple_type entry) { - return is_nonzero(fake_complex_unpack(thrust::get<2>(entry))); - }); + auto out_it = thrust::make_zip_iterator( + thrust::make_tuple(new_row_idxs.get_data(), new_col_idxs.get_data(), + as_device_type(new_values.get_data()))); + thrust::copy_if(thrust_policy(exec), it, it + size, out_it, + [] __device__(tuple_type entry) { + return is_nonzero(thrust::get<2>(entry)); + }); // swap out storage values = std::move(new_values); row_idxs = std::move(new_row_idxs); @@ -82,7 +75,6 @@ void sum_duplicates(std::shared_ptr exec, size_type, array& values, array& row_idxs, array& col_idxs) { - using device_value_type = device_member_type; const auto size = values.get_num_elems(); const auto rows = row_idxs.get_const_data(); const auto cols = col_idxs.get_const_data(); @@ -104,12 +96,10 @@ void sum_duplicates(std::shared_ptr exec, size_type, // reduce duplicates auto in_locs = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); - auto in_vals = - reinterpret_cast(values.get_const_data()); + auto in_vals = as_device_type(values.get_const_data()); auto out_locs = thrust::make_zip_iterator(thrust::make_tuple( new_row_idxs.get_data(), new_col_idxs.get_data())); - auto out_vals = - reinterpret_cast(new_values.get_data()); + auto out_vals = as_device_type(new_values.get_data()); thrust::reduce_by_key(thrust_policy(exec), in_locs, in_locs + size, in_vals, out_locs, out_vals); // swap out storage @@ -127,13 +117,9 @@ template void sort_row_major(std::shared_ptr exec, device_matrix_data& data) { - // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken - // due to overly generic assignment operator and constructor leading to - // ambiguities. So we need to use our own fake_complex type - using device_value_type = device_member_type; auto it = thrust::make_zip_iterator( thrust::make_tuple(data.get_row_idxs(), data.get_col_idxs())); - auto vals = reinterpret_cast(data.get_values()); + auto vals = as_device_type(data.get_values()); thrust::sort_by_key(thrust_policy(exec), it, it + data.get_num_elems(), vals); } diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index c370075c8a8..3f02337747e 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -872,11 +872,7 @@ void convert_to_fbcsr(std::shared_ptr exec, } auto in_rows = in_row_idxs.get_data(); auto in_cols = in_col_idxs.get_data(); - // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken - // due to overly generic assignment operator and constructor leading to - // ambiguities. So we need to use our own fake_complex type - auto in_vals = - reinterpret_cast*>(in_values.get_data()); + auto in_vals = as_device_type(in_values.get_data()); auto in_loc_it = thrust::make_zip_iterator(thrust::make_tuple(in_rows, in_cols)); thrust::sort_by_key(thrust_policy(exec), in_loc_it, in_loc_it + nnz, @@ -924,17 +920,17 @@ void convert_to_fbcsr(std::shared_ptr exec, // fill in values components::fill_array(exec, block_value_array.get_data(), num_blocks * bs * bs, zero()); - thrust::for_each_n( - thrust_policy(exec), iota, num_blocks, - [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, in_vals, - values] __device__(size_type i) { - const auto block_begin = block_ptrs[i]; - const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz; - for (auto nz = block_begin; nz < block_end; nz++) { - values[i * bs * bs + (in_cols[nz] % bs) * bs + - (in_rows[nz] % bs)] = fake_complex_unpack(in_vals[nz]); - } - }); + thrust::for_each_n(thrust_policy(exec), iota, num_blocks, + [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, + in_vals, values] __device__(size_type i) { + const auto block_begin = block_ptrs[i]; + const auto block_end = + i < num_blocks - 1 ? block_ptrs[i + 1] : nnz; + for (auto nz = block_begin; nz < block_end; nz++) { + values[i * bs * bs + (in_cols[nz] % bs) * bs + + (in_rows[nz] % bs)] = in_vals[nz]; + } + }); } @@ -1130,13 +1126,10 @@ void fallback_transpose(std::shared_ptr exec, const auto nnz = output->get_num_stored_elements(); const auto in_row_ptrs = input->get_const_row_ptrs(); const auto in_col_idxs = input->get_const_col_idxs(); - // workaround for CUDA 9.2 Thrust unconstrained constructor issues - const auto in_vals = reinterpret_cast*>( - input->get_const_values()); + const auto in_vals = as_device_type(input->get_const_values()); const auto out_row_ptrs = output->get_row_ptrs(); const auto out_col_idxs = output->get_col_idxs(); - const auto out_vals = - reinterpret_cast*>(output->get_values()); + const auto out_vals = as_device_type(output->get_values()); array out_row_idxs{exec, nnz}; components::convert_ptrs_to_idxs(exec, in_row_ptrs, in_num_rows, out_col_idxs); @@ -1156,8 +1149,7 @@ void fallback_sort(std::shared_ptr exec, { const auto row_ptrs = to_sort->get_const_row_ptrs(); const auto col_idxs = to_sort->get_col_idxs(); - const auto vals = - reinterpret_cast*>(to_sort->get_values()); + const auto vals = as_device_type(to_sort->get_values()); const auto nnz = to_sort->get_num_stored_elements(); const auto num_rows = to_sort->get_size()[0]; array row_idx_array(exec, nnz); diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc index d71d593b0a2..607ec5046ea 100644 --- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc @@ -172,11 +172,7 @@ void fill_in_matrix_data(std::shared_ptr exec, } auto in_rows = data.get_row_idxs(); auto in_cols = data.get_col_idxs(); - // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken - // due to overly generic assignment operator and constructor leading to - // ambiguities. So we need to use our own fake_complex type - auto in_vals = - reinterpret_cast*>(data.get_values()); + auto in_vals = as_device_type(data.get_values()); auto in_loc_it = thrust::make_zip_iterator(thrust::make_tuple(in_rows, in_cols)); thrust::sort_by_key(thrust_policy(exec), in_loc_it, in_loc_it + nnz, @@ -232,8 +228,7 @@ void fill_in_matrix_data(std::shared_ptr exec, const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz; for (auto nz = block_begin; nz < block_end; nz++) { block_values[i * bs * bs + (in_cols[nz] % bs) * bs + - (in_rows[nz] % bs)] = - fake_complex_unpack(in_vals[nz]); + (in_rows[nz] % bs)] = in_vals[nz]; } }); } diff --git a/common/cuda_hip/multigrid/pgm_kernels.hpp.inc b/common/cuda_hip/multigrid/pgm_kernels.hpp.inc index d8b6c4786b0..b08e86efaaa 100644 --- a/common/cuda_hip/multigrid/pgm_kernels.hpp.inc +++ b/common/cuda_hip/multigrid/pgm_kernels.hpp.inc @@ -45,11 +45,7 @@ template void sort_row_major(std::shared_ptr exec, size_type nnz, IndexType* row_idxs, IndexType* col_idxs, ValueType* vals) { - // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken - // due to overly generic assignment operator and constructor leading to - // ambiguities. So we need to use our own fake_complex type - using device_value_type = device_member_type; - auto vals_it = reinterpret_cast(vals); + auto vals_it = as_device_type(vals); auto it = thrust::make_zip_iterator(thrust::make_tuple(row_idxs, col_idxs)); // Because reduce_by_key is not deterministic, so we do not need // stable_sort_by_key @@ -67,16 +63,11 @@ void compute_coarse_coo(std::shared_ptr exec, const IndexType* col_idxs, const ValueType* vals, matrix::Coo* coarse_coo) { - // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken - // due to overly generic assignment operator and constructor leading to - // ambiguities. So we need to use our own fake_complex type - using device_value_type = device_member_type; - auto vals_it = reinterpret_cast(vals); + auto vals_it = as_device_type(vals); auto key_it = thrust::make_zip_iterator(thrust::make_tuple(row_idxs, col_idxs)); - auto coarse_vals_it = - reinterpret_cast(coarse_coo->get_values()); + auto coarse_vals_it = as_device_type(coarse_coo->get_values()); auto coarse_key_it = thrust::make_zip_iterator(thrust::make_tuple( coarse_coo->get_row_idxs(), coarse_coo->get_col_idxs())); diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp index f4e034998bd..1704fdd1f9c 100644 --- a/common/unified/matrix/csr_kernels.cpp +++ b/common/unified/matrix/csr_kernels.cpp @@ -154,8 +154,8 @@ void convert_to_sellp(std::shared_ptr exec, for (auto i = row_begin; i < row_begin + slice_length; i++) { cols[out_idx] = i < row_end ? in_cols[i] : invalid_index(); - values[out_idx] = i < row_end ? unpack_member(in_values[i]) - : zero(values[out_idx]); + values[out_idx] = + i < row_end ? in_values[i] : zero(values[out_idx]); out_idx += slice_size; } }, @@ -185,8 +185,8 @@ void convert_to_ell(std::shared_ptr exec, for (auto i = row_begin; i < row_begin + ell_length; i++) { cols[out_idx] = i < row_end ? in_cols[i] : invalid_index(); - values[out_idx] = i < row_end ? unpack_member(in_values[i]) - : zero(values[out_idx]); + values[out_idx] = + i < row_end ? in_values[i] : zero(values[out_idx]); out_idx += ell_stride; } }, diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index bfdb4a5f854..f42b11f510d 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -198,7 +198,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct { }; -#elif (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020)) +#else template struct CudaSolveStruct : gko::solver::SolveStruct { diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 1573169527d..5ec1718ca4d 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -142,7 +142,7 @@ if(GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}") endif() target_link_libraries(ginkgo_hip PUBLIC ${HIP_LIBAMDHIP64_LIBRARIES}) elseif(GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") - find_package(CUDA 9.2 REQUIRED) + find_package(CUDA 10.1 REQUIRED) target_link_libraries(ginkgo_hip PUBLIC ${CUDA_LIBRARIES}) endif() diff --git a/third_party/identify_stream_usage/identify_stream_usage.cpp b/third_party/identify_stream_usage/identify_stream_usage.cpp index a88de4ee427..5cdd4d30b09 100644 --- a/third_party/identify_stream_usage/identify_stream_usage.cpp +++ b/third_party/identify_stream_usage/identify_stream_usage.cpp @@ -124,14 +124,10 @@ DEFINE_OVERLOAD(cudaLaunchCooperativeKernel, size_t sharedMem, cudaStream_t stream), ARG(func, gridDim, blockDim, args, sharedMem, stream)); -#if CUDA_VERSION >= 10000 - DEFINE_OVERLOAD(cudaLaunchHostFunc, ARG(cudaStream_t stream, cudaHostFn_t fn, void* userData), ARG(stream, fn, userData)); -#endif - // Memory transfer APIS: // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY DEFINE_OVERLOAD(cudaMemPrefetchAsync,