Remove CUDA 9.2 support
This removes CUDA 9.2 CI pipelines and required workarounds from Ginkgo.

Related PR: #1382
upsj authored Aug 7, 2023
2 parents 1882753 + a2112b9 commit cd1d2a2
Showing 13 changed files with 44 additions and 127 deletions.
13 changes: 0 additions & 13 deletions .gitlab-ci.yml
@@ -93,19 +93,6 @@ trigger_pipeline:
 # Build jobs
 # Job with example runs.
-# cuda 9.2 and friends
-build/cuda92/nompi/gcc/all/release/shared:
-  extends:
-    - .build_and_test_template
-    - .default_variables
-    - .quick_test_condition
-    - .use_gko-cuda92-mvapich2-gnu7-llvm50-intel2017
-  variables:
-    BUILD_OMP: "ON"
-    BUILD_CUDA: "ON"
-    BUILD_HIP: "ON"
-    BUILD_TYPE: "Release"
-
 # cuda 10.1 and friends
 # Build CUDA NVIDIA without omp
 # Make sure that our jobs run when HWLOC is
6 changes: 0 additions & 6 deletions .gitlab/image.yml
@@ -24,12 +24,6 @@
     - cpu
     - controller
 
-.use_gko-cuda92-mvapich2-gnu7-llvm50-intel2017:
-  image: ginkgohub/cuda:92-mvapich2-gnu7-llvm50-intel2017
-  tags:
-    - private_ci
-    - nvidia-gpu
-
 .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019:
   image: ginkgohub/cuda:101-openmpi-gnu8-llvm7-intel2019
   tags:
6 changes: 3 additions & 3 deletions README.md
@@ -47,7 +47,7 @@ For Ginkgo core library:
 
 The Ginkgo CUDA module has the following __additional__ requirements:
 
-* _CUDA 9.2+_ or _NVHPC Package 22.7+_
+* _CUDA 10.1+_ or _NVHPC Package 22.7+_
 * Any host compiler restrictions your version of CUDA may impose also apply
   here. For the newest CUDA version, this information can be found in the
   [CUDA installation guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)
@@ -58,7 +58,7 @@ The Ginkgo HIP module has the following __additional__ requirements:
 * _ROCm 4.5+_
 * the HIP, hipBLAS, hipSPARSE, hip/rocRAND and rocThrust packages compiled with either:
   * _AMD_ backend (using the `clang` compiler)
-  * _9.2 <= CUDA < 11_ backend
+  * _10.1 <= CUDA < 11_ backend
 * if the hipFFT package is available, it is used to implement the FFT LinOps.
 
 The Ginkgo DPC++ module has the following __additional__ requirements:
@@ -90,7 +90,7 @@ following:
 
 The Ginkgo CUDA module has the following __additional__ requirements:
 
-* _CUDA 9.2+_
+* _CUDA 10.1+_
 * _Microsoft Visual Studio_
 * Any host compiler restrictions your version of CUDA may impose also apply
   here. For the newest CUDA version, this information can be found in the
9 changes: 0 additions & 9 deletions cmake/cuda.cmake
@@ -85,12 +85,3 @@ if(CMAKE_CUDA_HOST_COMPILER AND NOT CMAKE_CXX_COMPILER STREQUAL CMAKE_CUDA_HOST_
             "The CUDA host compiler is ${CMAKE_CUDA_HOST_COMPILER}.")
 endif()
 
-if (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION
-    MATCHES "9.2" AND CMAKE_CUDA_HOST_COMPILER MATCHES ".*clang.*" )
-    ginkgo_extract_clang_version(${CMAKE_CUDA_HOST_COMPILER} GINKGO_CUDA_HOST_CLANG_VERSION)
-
-    if (GINKGO_CUDA_HOST_CLANG_VERSION MATCHES "5\.0.*")
-        message(FATAL_ERROR "There is a bug between nvcc 9.2 and clang 5.0 which create a compiling issue."
-                "Consider using a different CUDA host compiler or CUDA version.")
-    endif()
-endif()
15 changes: 0 additions & 15 deletions cmake/hip.cmake
@@ -22,11 +22,6 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.21)
     set(CMAKE_HIP_ARCHITECTURES OFF)
 endif()
 
-if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}"
-    AND GINKGO_BUILD_CUDA AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.2)
-    message(FATAL_ERROR "Ginkgo HIP backend requires CUDA >= 9.2.")
-endif()
-
 if(NOT DEFINED ROCM_PATH)
     if(DEFINED ENV{ROCM_PATH})
         set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCM has been installed")
@@ -197,16 +192,6 @@ if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}")
     # Remove false positive CUDA warnings when calling one<T>() and zero<T>()
     list(APPEND GINKGO_HIP_NVCC_ADDITIONAL_FLAGS --expt-relaxed-constexpr --expt-extended-lambda)
 
-    if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}"
-        AND CMAKE_CUDA_COMPILER_VERSION MATCHES "9.2"
-        AND CMAKE_CUDA_HOST_COMPILER MATCHES ".*clang.*" )
-        ginkgo_extract_clang_version(${CMAKE_CUDA_HOST_COMPILER} GINKGO_CUDA_HOST_CLANG_VERSION)
-
-        if (GINKGO_CUDA_HOST_CLANG_VERSION MATCHES "5\.0.*")
-            message(FATAL_ERROR "There is a bug between nvcc 9.2 and clang 5.0 which create a compiling issue."
-                    "Consider using a different CUDA host compiler or CUDA version.")
-        endif()
-    endif()
     # select GPU architecture
     include(cmake/Modules/CudaArchitectureSelector.cmake)
     cas_variable_cuda_architectures(GINKGO_HIP_NVCC_ARCH
44 changes: 15 additions & 29 deletions common/cuda_hip/base/device_matrix_data_kernels.hpp.inc
@@ -35,19 +35,13 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
                   array<ValueType>& values, array<IndexType>& row_idxs,
                   array<IndexType>& col_idxs)
 {
-    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
-    // due to overly generic assignment operator and constructor leading to
-    // ambiguities. So we need to use our own fake_complex type
-    using device_value_type = device_member_type<ValueType>;
-    auto value_ptr =
-        reinterpret_cast<const device_value_type*>(values.get_const_data());
+    using device_value_type = device_type<ValueType>;
+    auto value_ptr = as_device_type(values.get_const_data());
     auto size = values.get_num_elems();
     // count nonzeros
-    auto nnz =
-        thrust::count_if(thrust_policy(exec), value_ptr, value_ptr + size,
-                         [] __device__(device_value_type value) {
-                             return is_nonzero(fake_complex_unpack(value));
-                         });
+    auto nnz = thrust::count_if(
+        thrust_policy(exec), value_ptr, value_ptr + size,
+        [] __device__(device_value_type value) { return is_nonzero(value); });
     if (nnz < size) {
         using tuple_type =
             thrust::tuple<IndexType, IndexType, device_value_type>;
@@ -58,14 +52,13 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
         // copy nonzeros
         auto it = thrust::make_zip_iterator(thrust::make_tuple(
             row_idxs.get_const_data(), col_idxs.get_const_data(), value_ptr));
-        auto out_it = thrust::make_zip_iterator(thrust::make_tuple(
-            new_row_idxs.get_data(), new_col_idxs.get_data(),
-            reinterpret_cast<device_value_type*>(new_values.get_data())));
-        thrust::copy_if(
-            thrust_policy(exec), it, it + size, out_it,
-            [] __device__(tuple_type entry) {
-                return is_nonzero(fake_complex_unpack(thrust::get<2>(entry)));
-            });
+        auto out_it = thrust::make_zip_iterator(
+            thrust::make_tuple(new_row_idxs.get_data(), new_col_idxs.get_data(),
+                               as_device_type(new_values.get_data())));
+        thrust::copy_if(thrust_policy(exec), it, it + size, out_it,
+                        [] __device__(tuple_type entry) {
+                            return is_nonzero(thrust::get<2>(entry));
+                        });
         // swap out storage
         values = std::move(new_values);
         row_idxs = std::move(new_row_idxs);
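Note: `remove_zeros` above follows the classic two-pass stream-compaction idiom: count the surviving entries, allocate exact-size outputs, then `copy_if` the (row, col, value) triples through zip iterators. A minimal standalone sketch of the same idiom, with simplified types and default execution policies instead of Ginkgo's `thrust_policy(exec)` helper:

```cpp
#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
#include <vector>

// predicate over zipped COO triples, usable on host and device
struct entry_is_nonzero {
    __host__ __device__ bool operator()(
        const thrust::tuple<int, int, double>& t) const
    {
        return thrust::get<2>(t) != 0.0;
    }
};

int main()
{
    std::vector<int> h_rows{0, 0, 1, 2}, h_cols{0, 1, 1, 2};
    std::vector<double> h_vals{1.0, 0.0, 2.0, 0.0};
    thrust::device_vector<int> rows(h_rows), cols(h_cols);
    thrust::device_vector<double> vals(h_vals);

    auto in = thrust::make_zip_iterator(
        thrust::make_tuple(rows.begin(), cols.begin(), vals.begin()));
    // pass 1: count how many entries survive
    auto nnz = thrust::count_if(in, in + vals.size(), entry_is_nonzero{});
    // pass 2: compact surviving triples into exact-size outputs
    thrust::device_vector<int> out_rows(nnz), out_cols(nnz);
    thrust::device_vector<double> out_vals(nnz);
    auto out = thrust::make_zip_iterator(thrust::make_tuple(
        out_rows.begin(), out_cols.begin(), out_vals.begin()));
    thrust::copy_if(in, in + vals.size(), out, entry_is_nonzero{});
}
```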
@@ -82,7 +75,6 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec, size_type,
                     array<ValueType>& values, array<IndexType>& row_idxs,
                     array<IndexType>& col_idxs)
 {
-    using device_value_type = device_member_type<ValueType>;
     const auto size = values.get_num_elems();
     const auto rows = row_idxs.get_const_data();
     const auto cols = col_idxs.get_const_data();
@@ -104,12 +96,10 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec, size_type,
         // reduce duplicates
         auto in_locs =
             thrust::make_zip_iterator(thrust::make_tuple(rows, cols));
-        auto in_vals =
-            reinterpret_cast<const device_value_type*>(values.get_const_data());
+        auto in_vals = as_device_type(values.get_const_data());
         auto out_locs = thrust::make_zip_iterator(thrust::make_tuple(
             new_row_idxs.get_data(), new_col_idxs.get_data()));
-        auto out_vals =
-            reinterpret_cast<device_value_type*>(new_values.get_data());
+        auto out_vals = as_device_type(new_values.get_data());
         thrust::reduce_by_key(thrust_policy(exec), in_locs, in_locs + size,
                               in_vals, out_locs, out_vals);
         // swap out storage
@@ -127,13 +117,9 @@ template <typename ValueType, typename IndexType>
 void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
                     device_matrix_data<ValueType, IndexType>& data)
 {
-    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
-    // due to overly generic assignment operator and constructor leading to
-    // ambiguities. So we need to use our own fake_complex type
-    using device_value_type = device_member_type<ValueType>;
     auto it = thrust::make_zip_iterator(
         thrust::make_tuple(data.get_row_idxs(), data.get_col_idxs()));
-    auto vals = reinterpret_cast<device_value_type*>(data.get_values());
+    auto vals = as_device_type(data.get_values());
     thrust::sort_by_key(thrust_policy(exec), it, it + data.get_num_elems(),
                         vals);
 }
38 changes: 15 additions & 23 deletions common/cuda_hip/matrix/csr_kernels.hpp.inc
@@ -872,11 +872,7 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
     }
     auto in_rows = in_row_idxs.get_data();
     auto in_cols = in_col_idxs.get_data();
-    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
-    // due to overly generic assignment operator and constructor leading to
-    // ambiguities. So we need to use our own fake_complex type
-    auto in_vals =
-        reinterpret_cast<device_member_type<ValueType>*>(in_values.get_data());
+    auto in_vals = as_device_type(in_values.get_data());
     auto in_loc_it =
         thrust::make_zip_iterator(thrust::make_tuple(in_rows, in_cols));
     thrust::sort_by_key(thrust_policy(exec), in_loc_it, in_loc_it + nnz,
@@ -924,17 +920,17 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
     // fill in values
     components::fill_array(exec, block_value_array.get_data(),
                            num_blocks * bs * bs, zero<ValueType>());
-    thrust::for_each_n(
-        thrust_policy(exec), iota, num_blocks,
-        [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, in_vals,
-         values] __device__(size_type i) {
-            const auto block_begin = block_ptrs[i];
-            const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz;
-            for (auto nz = block_begin; nz < block_end; nz++) {
-                values[i * bs * bs + (in_cols[nz] % bs) * bs +
-                       (in_rows[nz] % bs)] = fake_complex_unpack(in_vals[nz]);
-            }
-        });
+    thrust::for_each_n(thrust_policy(exec), iota, num_blocks,
+                       [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols,
+                        in_vals, values] __device__(size_type i) {
+                           const auto block_begin = block_ptrs[i];
+                           const auto block_end =
+                               i < num_blocks - 1 ? block_ptrs[i + 1] : nnz;
+                           for (auto nz = block_begin; nz < block_end; nz++) {
+                               values[i * bs * bs + (in_cols[nz] % bs) * bs +
+                                      (in_rows[nz] % bs)] = in_vals[nz];
+                           }
+                       });
 }
 
 
@@ -1130,13 +1126,10 @@ void fallback_transpose(std::shared_ptr<const DefaultExecutor> exec,
     const auto nnz = output->get_num_stored_elements();
     const auto in_row_ptrs = input->get_const_row_ptrs();
     const auto in_col_idxs = input->get_const_col_idxs();
-    // workaround for CUDA 9.2 Thrust unconstrained constructor issues
-    const auto in_vals = reinterpret_cast<const device_member_type<ValueType>*>(
-        input->get_const_values());
+    const auto in_vals = as_device_type(input->get_const_values());
     const auto out_row_ptrs = output->get_row_ptrs();
     const auto out_col_idxs = output->get_col_idxs();
-    const auto out_vals =
-        reinterpret_cast<device_member_type<ValueType>*>(output->get_values());
+    const auto out_vals = as_device_type(output->get_values());
     array<IndexType> out_row_idxs{exec, nnz};
     components::convert_ptrs_to_idxs(exec, in_row_ptrs, in_num_rows,
                                      out_col_idxs);
@@ -1156,8 +1149,7 @@ void fallback_sort(std::shared_ptr<const DefaultExecutor> exec,
 {
     const auto row_ptrs = to_sort->get_const_row_ptrs();
     const auto col_idxs = to_sort->get_col_idxs();
-    const auto vals =
-        reinterpret_cast<device_member_type<ValueType>*>(to_sort->get_values());
+    const auto vals = as_device_type(to_sort->get_values());
     const auto nnz = to_sort->get_num_stored_elements();
     const auto num_rows = to_sort->get_size()[0];
     array<IndexType> row_idx_array(exec, nnz);
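Note: both fallback kernels above rely on `components::convert_ptrs_to_idxs` to expand CSR row pointers into one explicit row index per stored entry, so the matrix can then be permuted or sorted like COO data. A serial sketch of that expansion, using hypothetical standalone types rather than Ginkgo's kernel interface:

```cpp
#include <cstddef>
#include <vector>

// expand CSR row pointers into one row index per stored entry,
// the inverse of building row_ptrs from sorted COO row indices
std::vector<int> ptrs_to_idxs(const std::vector<int>& row_ptrs)
{
    std::vector<int> row_idxs(row_ptrs.back());
    for (std::size_t row = 0; row + 1 < row_ptrs.size(); ++row) {
        for (int nz = row_ptrs[row]; nz < row_ptrs[row + 1]; ++nz) {
            row_idxs[nz] = static_cast<int>(row);
        }
    }
    return row_idxs;
}

int main()
{
    // 3 rows with 2, 0, 1 entries -> row_ptrs {0, 2, 2, 3}
    std::vector<int> row_ptrs{0, 2, 2, 3};
    auto row_idxs = ptrs_to_idxs(row_ptrs);  // yields {0, 0, 2}
}
```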
9 changes: 2 additions & 7 deletions common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
@@ -172,11 +172,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     }
     auto in_rows = data.get_row_idxs();
     auto in_cols = data.get_col_idxs();
-    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
-    // due to overly generic assignment operator and constructor leading to
-    // ambiguities. So we need to use our own fake_complex type
-    auto in_vals =
-        reinterpret_cast<device_member_type<ValueType>*>(data.get_values());
+    auto in_vals = as_device_type(data.get_values());
     auto in_loc_it =
         thrust::make_zip_iterator(thrust::make_tuple(in_rows, in_cols));
     thrust::sort_by_key(thrust_policy(exec), in_loc_it, in_loc_it + nnz,
@@ -232,8 +228,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
             const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz;
             for (auto nz = block_begin; nz < block_end; nz++) {
                 block_values[i * bs * bs + (in_cols[nz] % bs) * bs +
-                             (in_rows[nz] % bs)] =
-                    fake_complex_unpack(in_vals[nz]);
+                             (in_rows[nz] % bs)] = in_vals[nz];
             }
         });
 }
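Note: the scatter in this hunk (and in `convert_to_fbcsr` above) stores each `bs x bs` block in column-major order, so entry `(r, c)` of block `i` lands at offset `i * bs * bs + (c % bs) * bs + (r % bs)`. A small worked check of that indexing, as a hypothetical standalone snippet with `bs = 2`:

```cpp
#include <cassert>
#include <cstddef>

// column-major offset of entry (row, col) inside its bs x bs block
std::size_t block_offset(std::size_t block, std::size_t row,
                         std::size_t col, std::size_t bs)
{
    return block * bs * bs + (col % bs) * bs + (row % bs);
}

int main()
{
    const std::size_t bs = 2;
    // global entry (3, 2) sits at local position (1, 0) of its block;
    // for block index 1 that is offset 1*4 + 0*2 + 1 = 5
    assert(block_offset(1, 3, 2, bs) == 5);
}
```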
15 changes: 3 additions & 12 deletions common/cuda_hip/multigrid/pgm_kernels.hpp.inc
@@ -45,11 +45,7 @@ template <typename ValueType, typename IndexType>
 void sort_row_major(std::shared_ptr<const DefaultExecutor> exec, size_type nnz,
                     IndexType* row_idxs, IndexType* col_idxs, ValueType* vals)
 {
-    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
-    // due to overly generic assignment operator and constructor leading to
-    // ambiguities. So we need to use our own fake_complex type
-    using device_value_type = device_member_type<ValueType>;
-    auto vals_it = reinterpret_cast<device_value_type*>(vals);
+    auto vals_it = as_device_type(vals);
     auto it = thrust::make_zip_iterator(thrust::make_tuple(row_idxs, col_idxs));
     // Because reduce_by_key is not deterministic, so we do not need
     // stable_sort_by_key
@@ -67,16 +63,11 @@ void compute_coarse_coo(std::shared_ptr<const DefaultExecutor> exec,
                         const IndexType* col_idxs, const ValueType* vals,
                         matrix::Coo<ValueType, IndexType>* coarse_coo)
 {
-    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
-    // due to overly generic assignment operator and constructor leading to
-    // ambiguities. So we need to use our own fake_complex type
-    using device_value_type = device_member_type<ValueType>;
-    auto vals_it = reinterpret_cast<const device_value_type*>(vals);
+    auto vals_it = as_device_type(vals);
     auto key_it =
         thrust::make_zip_iterator(thrust::make_tuple(row_idxs, col_idxs));
 
-    auto coarse_vals_it =
-        reinterpret_cast<device_value_type*>(coarse_coo->get_values());
+    auto coarse_vals_it = as_device_type(coarse_coo->get_values());
     auto coarse_key_it = thrust::make_zip_iterator(thrust::make_tuple(
         coarse_coo->get_row_idxs(), coarse_coo->get_col_idxs()));
 
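Note: the two kernels in this file implement the standard COO duplicate-summing idiom: sort entries by their zipped (row, col) key, then reduce runs of equal keys. A minimal standalone sketch of the same two-step pattern, with simplified types and default execution policies in place of Ginkgo's helpers:

```cpp
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/tuple.h>
#include <vector>

int main()
{
    // COO entries with a duplicate at (0, 1)
    std::vector<int> h_rows{0, 1, 0}, h_cols{1, 0, 1};
    std::vector<double> h_vals{1.0, 3.0, 2.0};
    thrust::device_vector<int> rows(h_rows), cols(h_cols);
    thrust::device_vector<double> vals(h_vals);

    auto keys = thrust::make_zip_iterator(
        thrust::make_tuple(rows.begin(), cols.begin()));
    // step 1: bring duplicate (row, col) keys next to each other
    thrust::sort_by_key(keys, keys + vals.size(), vals.begin());

    // step 2: sum runs of equal keys into compacted outputs
    thrust::device_vector<int> out_rows(vals.size()), out_cols(vals.size());
    thrust::device_vector<double> out_vals(vals.size());
    auto out_keys = thrust::make_zip_iterator(
        thrust::make_tuple(out_rows.begin(), out_cols.begin()));
    auto ends = thrust::reduce_by_key(keys, keys + vals.size(), vals.begin(),
                                      out_keys, out_vals.begin());
    // number of unique entries: here 2, with (0, 1) summed to 3.0
    auto num_unique = ends.second - out_vals.begin();
    (void)num_unique;
}
```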
8 changes: 4 additions & 4 deletions common/unified/matrix/csr_kernels.cpp
@@ -154,8 +154,8 @@ void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
             for (auto i = row_begin; i < row_begin + slice_length; i++) {
                 cols[out_idx] =
                     i < row_end ? in_cols[i] : invalid_index<IndexType>();
-                values[out_idx] = i < row_end ? unpack_member(in_values[i])
-                                              : zero(values[out_idx]);
+                values[out_idx] =
+                    i < row_end ? in_values[i] : zero(values[out_idx]);
                 out_idx += slice_size;
             }
         },
@@ -185,8 +185,8 @@ void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
             for (auto i = row_begin; i < row_begin + ell_length; i++) {
                 cols[out_idx] =
                     i < row_end ? in_cols[i] : invalid_index<IndexType>();
-                values[out_idx] = i < row_end ? unpack_member(in_values[i])
-                                              : zero(values[out_idx]);
+                values[out_idx] =
+                    i < row_end ? in_values[i] : zero(values[out_idx]);
                 out_idx += ell_stride;
             }
         },
2 changes: 1 addition & 1 deletion cuda/solver/common_trs_kernels.cuh
@@ -198,7 +198,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
 };
 
 
-#elif (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020))
+#else
 
 template <typename ValueType, typename IndexType>
 struct CudaSolveStruct : gko::solver::SolveStruct {
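Note: with CUDA 9.2 as the minimum version gone, the `#elif` guard collapses to a plain `#else`. For reference, `CUDA_VERSION` encodes the release as `major * 1000 + minor * 10`, which is why the removed check against `9020` meant "CUDA 9.2 or newer". A hedged sketch of what an equivalent guard against the new 10.1 floor would look like:

```cpp
#include <cuda.h>  // defines CUDA_VERSION, e.g. 10010 for CUDA 10.1

// CUDA_VERSION = major * 1000 + minor * 10:
//   9020 == CUDA 9.2, 10010 == CUDA 10.1, 11000 == CUDA 11.0
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)
// code path that may rely on CUDA 10.1+ runtime/library features
#endif
```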
2 changes: 1 addition & 1 deletion hip/CMakeLists.txt
@@ -142,7 +142,7 @@ if(GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}")
     endif()
     target_link_libraries(ginkgo_hip PUBLIC ${HIP_LIBAMDHIP64_LIBRARIES})
 elseif(GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}")
-    find_package(CUDA 9.2 REQUIRED)
+    find_package(CUDA 10.1 REQUIRED)
     target_link_libraries(ginkgo_hip PUBLIC ${CUDA_LIBRARIES})
 endif()
 
4 changes: 0 additions & 4 deletions third_party/identify_stream_usage/identify_stream_usage.cpp
@@ -124,14 +124,10 @@ DEFINE_OVERLOAD(cudaLaunchCooperativeKernel,
                      size_t sharedMem, cudaStream_t stream),
                 ARG(func, gridDim, blockDim, args, sharedMem, stream));
 
-#if CUDA_VERSION >= 10000
-
 DEFINE_OVERLOAD(cudaLaunchHostFunc,
                 ARG(cudaStream_t stream, cudaHostFn_t fn, void* userData),
                 ARG(stream, fn, userData));
 
-#endif
-
 // Memory transfer APIS:
 // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY
 DEFINE_OVERLOAD(cudaMemPrefetchAsync,
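Note: `cudaLaunchHostFunc` was introduced in CUDA 10.0, so with the minimum raised to 10.1 the version guard around its overload is dead code. For context, this helper builds an LD_PRELOAD-style shim: each `DEFINE_OVERLOAD` emits a function that inspects the stream argument before forwarding to the real CUDA runtime symbol. A hedged sketch of the general shape of one such overload (not the literal macro expansion):

```cpp
#include <dlfcn.h>
#include <cassert>
#include <cstddef>
#include <cuda_runtime.h>

// Build as a shared library (link with -ldl) and inject via LD_PRELOAD;
// the shim validates the stream, then calls the real runtime function
// located with RTLD_NEXT.
extern "C" cudaError_t cudaMemPrefetchAsync(const void* ptr, size_t count,
                                            int device, cudaStream_t stream)
{
    // flag unintended work on the default stream (stream 0); the real
    // tool would report or abort more gracefully than a bare assert
    assert(stream != nullptr);
    using fn_t = cudaError_t (*)(const void*, size_t, int, cudaStream_t);
    static const auto real_fn = reinterpret_cast<fn_t>(
        dlsym(RTLD_NEXT, "cudaMemPrefetchAsync"));
    return real_fn(ptr, count, device, stream);
}
```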
