Remove CUDA 9.2 support
This removes CUDA 9.2 CI pipelines and required workarounds from Ginkgo.

Related PR: #1382
upsj authored Aug 7, 2023
2 parents 1882753 + a2112b9 commit cd1d2a2
Showing 13 changed files with 44 additions and 127 deletions.
13 changes: 0 additions & 13 deletions .gitlab-ci.yml
@@ -93,19 +93,6 @@ trigger_pipeline:
 # Build jobs
 # Job with example runs.
-# cuda 9.2 and friends
-build/cuda92/nompi/gcc/all/release/shared:
-  extends:
-    - .build_and_test_template
-    - .default_variables
-    - .quick_test_condition
-    - .use_gko-cuda92-mvapich2-gnu7-llvm50-intel2017
-  variables:
-    BUILD_OMP: "ON"
-    BUILD_CUDA: "ON"
-    BUILD_HIP: "ON"
-    BUILD_TYPE: "Release"
-
 # cuda 10.1 and friends
 # Build CUDA NVIDIA without omp
 # Make sure that our jobs run when HWLOC is
6 changes: 0 additions & 6 deletions .gitlab/image.yml
@@ -24,12 +24,6 @@
     - cpu
     - controller
 
-.use_gko-cuda92-mvapich2-gnu7-llvm50-intel2017:
-  image: ginkgohub/cuda:92-mvapich2-gnu7-llvm50-intel2017
-  tags:
-    - private_ci
-    - nvidia-gpu
-
 .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019:
   image: ginkgohub/cuda:101-openmpi-gnu8-llvm7-intel2019
   tags:
6 changes: 3 additions & 3 deletions README.md
@@ -47,7 +47,7 @@ For Ginkgo core library:
 
 The Ginkgo CUDA module has the following __additional__ requirements:
 
-* _CUDA 9.2+_ or _NVHPC Package 22.7+_
+* _CUDA 10.1+_ or _NVHPC Package 22.7+_
 * Any host compiler restrictions your version of CUDA may impose also apply
   here. For the newest CUDA version, this information can be found in the
   [CUDA installation guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)
@@ -58,7 +58,7 @@ The Ginkgo HIP module has the following __additional__ requirements:
 * _ROCm 4.5+_
 * the HIP, hipBLAS, hipSPARSE, hip/rocRAND and rocThrust packages compiled with either:
   * _AMD_ backend (using the `clang` compiler)
-  * _9.2 <= CUDA < 11_ backend
+  * _10.1 <= CUDA < 11_ backend
 * if the hipFFT package is available, it is used to implement the FFT LinOps.
 
 The Ginkgo DPC++ module has the following __additional__ requirements:
@@ -90,7 +90,7 @@ following:
 
 The Ginkgo CUDA module has the following __additional__ requirements:
 
-* _CUDA 9.2+_
+* _CUDA 10.1+_
 * _Microsoft Visual Studio_
 * Any host compiler restrictions your version of CUDA may impose also apply
   here. For the newest CUDA version, this information can be found in the
9 changes: 0 additions & 9 deletions cmake/cuda.cmake
@@ -85,12 +85,3 @@ if(CMAKE_CUDA_HOST_COMPILER AND NOT CMAKE_CXX_COMPILER STREQUAL CMAKE_CUDA_HOST_
             "The CUDA host compiler is ${CMAKE_CUDA_HOST_COMPILER}.")
 endif()
 
-if (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION
-    MATCHES "9.2" AND CMAKE_CUDA_HOST_COMPILER MATCHES ".*clang.*" )
-    ginkgo_extract_clang_version(${CMAKE_CUDA_HOST_COMPILER} GINKGO_CUDA_HOST_CLANG_VERSION)
-
-    if (GINKGO_CUDA_HOST_CLANG_VERSION MATCHES "5\.0.*")
-        message(FATAL_ERROR "There is a bug between nvcc 9.2 and clang 5.0 which create a compiling issue."
-                "Consider using a different CUDA host compiler or CUDA version.")
-    endif()
-endif()
15 changes: 0 additions & 15 deletions cmake/hip.cmake
@@ -22,11 +22,6 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.21)
     set(CMAKE_HIP_ARCHITECTURES OFF)
 endif()
 
-if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}"
-    AND GINKGO_BUILD_CUDA AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.2)
-    message(FATAL_ERROR "Ginkgo HIP backend requires CUDA >= 9.2.")
-endif()
-
 if(NOT DEFINED ROCM_PATH)
     if(DEFINED ENV{ROCM_PATH})
         set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCM has been installed")
@@ -197,16 +192,6 @@ if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}")
     # Remove false positive CUDA warnings when calling one<T>() and zero<T>()
     list(APPEND GINKGO_HIP_NVCC_ADDITIONAL_FLAGS --expt-relaxed-constexpr --expt-extended-lambda)
 
-    if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}"
-        AND CMAKE_CUDA_COMPILER_VERSION MATCHES "9.2"
-        AND CMAKE_CUDA_HOST_COMPILER MATCHES ".*clang.*" )
-        ginkgo_extract_clang_version(${CMAKE_CUDA_HOST_COMPILER} GINKGO_CUDA_HOST_CLANG_VERSION)
-
-        if (GINKGO_CUDA_HOST_CLANG_VERSION MATCHES "5\.0.*")
-            message(FATAL_ERROR "There is a bug between nvcc 9.2 and clang 5.0 which create a compiling issue."
-                    "Consider using a different CUDA host compiler or CUDA version.")
-        endif()
-    endif()
     # select GPU architecture
     include(cmake/Modules/CudaArchitectureSelector.cmake)
     cas_variable_cuda_architectures(GINKGO_HIP_NVCC_ARCH
44 changes: 15 additions & 29 deletions common/cuda_hip/base/device_matrix_data_kernels.hpp.inc
@@ -35,19 +35,13 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
                   array<ValueType>& values, array<IndexType>& row_idxs,
                   array<IndexType>& col_idxs)
 {
-    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
-    // due to overly generic assignment operator and constructor leading to
-    // ambiguities. So we need to use our own fake_complex type
-    using device_value_type = device_member_type<ValueType>;
-    auto value_ptr =
-        reinterpret_cast<const device_value_type*>(values.get_const_data());
+    using device_value_type = device_type<ValueType>;
+    auto value_ptr = as_device_type(values.get_const_data());
     auto size = values.get_num_elems();
     // count nonzeros
-    auto nnz =
-        thrust::count_if(thrust_policy(exec), value_ptr, value_ptr + size,
-                         [] __device__(device_value_type value) {
-                             return is_nonzero(fake_complex_unpack(value));
-                         });
+    auto nnz = thrust::count_if(
+        thrust_policy(exec), value_ptr, value_ptr + size,
+        [] __device__(device_value_type value) { return is_nonzero(value); });
     if (nnz < size) {
         using tuple_type =
             thrust::tuple<IndexType, IndexType, device_value_type>;
@@ -58,14 +52,13 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
         // copy nonzeros
         auto it = thrust::make_zip_iterator(thrust::make_tuple(
             row_idxs.get_const_data(), col_idxs.get_const_data(), value_ptr));
-        auto out_it = thrust::make_zip_iterator(thrust::make_tuple(
-            new_row_idxs.get_data(), new_col_idxs.get_data(),
-            reinterpret_cast<device_value_type*>(new_values.get_data())));
-        thrust::copy_if(
-            thrust_policy(exec), it, it + size, out_it,
-            [] __device__(tuple_type entry) {
-                return is_nonzero(fake_complex_unpack(thrust::get<2>(entry)));
-            });
+        auto out_it = thrust::make_zip_iterator(
+            thrust::make_tuple(new_row_idxs.get_data(), new_col_idxs.get_data(),
+                               as_device_type(new_values.get_data())));
+        thrust::copy_if(thrust_policy(exec), it, it + size, out_it,
+                        [] __device__(tuple_type entry) {
+                            return is_nonzero(thrust::get<2>(entry));
+                        });
         // swap out storage
         values = std::move(new_values);
         row_idxs = std::move(new_row_idxs);
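Note: `remove_zeros` above follows the classic two-pass stream-compaction idiom: count the surviving entries, allocate exact-size outputs, then `copy_if` the (row, col, value) triples through zip iterators. A minimal standalone sketch of the same idiom, with simplified types and default execution policies instead of Ginkgo's `thrust_policy(exec)` helper:

```cpp
#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
#include <vector>

// predicate over zipped COO triples, usable on host and device
struct entry_is_nonzero {
    __host__ __device__ bool operator()(
        const thrust::tuple<int, int, double>& t) const
    {
        return thrust::get<2>(t) != 0.0;
    }
};

int main()
{
    std::vector<int> h_rows{0, 0, 1, 2}, h_cols{0, 1, 1, 2};
    std::vector<double> h_vals{1.0, 0.0, 2.0, 0.0};
    thrust::device_vector<int> rows(h_rows), cols(h_cols);
    thrust::device_vector<double> vals(h_vals);

    auto in = thrust::make_zip_iterator(
        thrust::make_tuple(rows.begin(), cols.begin(), vals.begin()));
    // pass 1: count how many entries survive
    auto nnz = thrust::count_if(in, in + vals.size(), entry_is_nonzero{});
    // pass 2: compact surviving triples into exact-size outputs
    thrust::device_vector<int> out_rows(nnz), out_cols(nnz);
    thrust::device_vector<double> out_vals(nnz);
    auto out = thrust::make_zip_iterator(thrust::make_tuple(
        out_rows.begin(), out_cols.begin(), out_vals.begin()));
    thrust::copy_if(in, in + vals.size(), out, entry_is_nonzero{});
}
```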
@@ -82,7 +75,6 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec, size_type,
                     array<ValueType>& values, array<IndexType>& row_idxs,
                     array<IndexType>& col_idxs)
 {
-    using device_value_type = device_member_type<ValueType>;
     const auto size = values.get_num_elems();
     const auto rows = row_idxs.get_const_data();
     const auto cols = col_idxs.get_const_data();
@@ -104,12 +96,10 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec, size_type,
         // reduce duplicates
         auto in_locs =
             thrust::make_zip_iterator(thrust::make_tuple(rows, cols));
-        auto in_vals =
-            reinterpret_cast<const device_value_type*>(values.get_const_data());
+        auto in_vals = as_device_type(values.get_const_data());
         auto out_locs = thrust::make_zip_iterator(thrust::make_tuple(
             new_row_idxs.get_data(), new_col_idxs.get_data()));
-        auto out_vals =
-            reinterpret_cast<device_value_type*>(new_values.get_data());
+        auto out_vals = as_device_type(new_values.get_data());
         thrust::reduce_by_key(thrust_policy(exec), in_locs, in_locs + size,
                               in_vals, out_locs, out_vals);
         // swap out storage
@@ -127,13 +117,9 @@ template <typename ValueType, typename IndexType>
 void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
                     device_matrix_data<ValueType, IndexType>& data)
 {
-    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
-    // due to overly generic assignment operator and constructor leading to
-    // ambiguities. So we need to use our own fake_complex type
-    using device_value_type = device_member_type<ValueType>;
     auto it = thrust::make_zip_iterator(
         thrust::make_tuple(data.get_row_idxs(), data.get_col_idxs()));
-    auto vals = reinterpret_cast<device_value_type*>(data.get_values());
+    auto vals = as_device_type(data.get_values());
     thrust::sort_by_key(thrust_policy(exec), it, it + data.get_num_elems(),
                         vals);
 }
38 changes: 15 additions & 23 deletions common/cuda_hip/matrix/csr_kernels.hpp.inc
@@ -872,11 +872,7 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
     }
     auto in_rows = in_row_idxs.get_data();
     auto in_cols = in_col_idxs.get_data();
-    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
-    // due to overly generic assignment operator and constructor leading to
-    // ambiguities. So we need to use our own fake_complex type
-    auto in_vals =
-        reinterpret_cast<device_member_type<ValueType>*>(in_values.get_data());
+    auto in_vals = as_device_type(in_values.get_data());
     auto in_loc_it =
         thrust::make_zip_iterator(thrust::make_tuple(in_rows, in_cols));
     thrust::sort_by_key(thrust_policy(exec), in_loc_it, in_loc_it + nnz,
@@ -924,17 +920,17 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
     // fill in values
     components::fill_array(exec, block_value_array.get_data(),
                            num_blocks * bs * bs, zero<ValueType>());
-    thrust::for_each_n(
-        thrust_policy(exec), iota, num_blocks,
-        [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, in_vals,
-         values] __device__(size_type i) {
-            const auto block_begin = block_ptrs[i];
-            const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz;
-            for (auto nz = block_begin; nz < block_end; nz++) {
-                values[i * bs * bs + (in_cols[nz] % bs) * bs +
-                       (in_rows[nz] % bs)] = fake_complex_unpack(in_vals[nz]);
-            }
-        });
+    thrust::for_each_n(thrust_policy(exec), iota, num_blocks,
+                       [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols,
+                        in_vals, values] __device__(size_type i) {
+                           const auto block_begin = block_ptrs[i];
+                           const auto block_end =
+                               i < num_blocks - 1 ? block_ptrs[i + 1] : nnz;
+                           for (auto nz = block_begin; nz < block_end; nz++) {
+                               values[i * bs * bs + (in_cols[nz] % bs) * bs +
+                                      (in_rows[nz] % bs)] = in_vals[nz];
+                           }
+                       });
 }
 
 
@@ -1130,13 +1126,10 @@ void fallback_transpose(std::shared_ptr<const DefaultExecutor> exec,
     const auto nnz = output->get_num_stored_elements();
     const auto in_row_ptrs = input->get_const_row_ptrs();
     const auto in_col_idxs = input->get_const_col_idxs();
-    // workaround for CUDA 9.2 Thrust unconstrained constructor issues
-    const auto in_vals = reinterpret_cast<const device_member_type<ValueType>*>(
-        input->get_const_values());
+    const auto in_vals = as_device_type(input->get_const_values());
     const auto out_row_ptrs = output->get_row_ptrs();
     const auto out_col_idxs = output->get_col_idxs();
-    const auto out_vals =
-        reinterpret_cast<device_member_type<ValueType>*>(output->get_values());
+    const auto out_vals = as_device_type(output->get_values());
     array<IndexType> out_row_idxs{exec, nnz};
     components::convert_ptrs_to_idxs(exec, in_row_ptrs, in_num_rows,
                                      out_col_idxs);
@@ -1156,8 +1149,7 @@ void fallback_sort(std::shared_ptr<const DefaultExecutor> exec,
 {
     const auto row_ptrs = to_sort->get_const_row_ptrs();
     const auto col_idxs = to_sort->get_col_idxs();
-    const auto vals =
-        reinterpret_cast<device_member_type<ValueType>*>(to_sort->get_values());
+    const auto vals = as_device_type(to_sort->get_values());
     const auto nnz = to_sort->get_num_stored_elements();
     const auto num_rows = to_sort->get_size()[0];
     array<IndexType> row_idx_array(exec, nnz);
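Note: both fallback kernels above rely on `components::convert_ptrs_to_idxs` to expand CSR row pointers into one explicit row index per stored entry, so the matrix can then be permuted or sorted like COO data. A serial sketch of that expansion, using hypothetical standalone types rather than Ginkgo's kernel interface:

```cpp
#include <cstddef>
#include <vector>

// expand CSR row pointers into one row index per stored entry,
// the inverse of building row_ptrs from sorted COO row indices
std::vector<int> ptrs_to_idxs(const std::vector<int>& row_ptrs)
{
    std::vector<int> row_idxs(row_ptrs.back());
    for (std::size_t row = 0; row + 1 < row_ptrs.size(); ++row) {
        for (int nz = row_ptrs[row]; nz < row_ptrs[row + 1]; ++nz) {
            row_idxs[nz] = static_cast<int>(row);
        }
    }
    return row_idxs;
}

int main()
{
    // 3 rows with 2, 0, 1 entries -> row_ptrs {0, 2, 2, 3}
    std::vector<int> row_ptrs{0, 2, 2, 3};
    auto row_idxs = ptrs_to_idxs(row_ptrs);  // yields {0, 0, 2}
}
```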
9 changes: 2 additions & 7 deletions common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
@@ -172,11 +172,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     }
     auto in_rows = data.get_row_idxs();
     auto in_cols = data.get_col_idxs();
-    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
-    // due to overly generic assignment operator and constructor leading to
-    // ambiguities. So we need to use our own fake_complex type
-    auto in_vals =
-        reinterpret_cast<device_member_type<ValueType>*>(data.get_values());
+    auto in_vals = as_device_type(data.get_values());
     auto in_loc_it =
         thrust::make_zip_iterator(thrust::make_tuple(in_rows, in_cols));
     thrust::sort_by_key(thrust_policy(exec), in_loc_it, in_loc_it + nnz,
@@ -232,8 +228,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
             const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz;
             for (auto nz = block_begin; nz < block_end; nz++) {
                 block_values[i * bs * bs + (in_cols[nz] % bs) * bs +
-                             (in_rows[nz] % bs)] =
-                    fake_complex_unpack(in_vals[nz]);
+                             (in_rows[nz] % bs)] = in_vals[nz];
             }
         });
 }
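Note: the scatter in this hunk (and in `convert_to_fbcsr` above) stores each `bs x bs` block in column-major order, so entry `(r, c)` of block `i` lands at offset `i * bs * bs + (c % bs) * bs + (r % bs)`. A small worked check of that indexing, as a hypothetical standalone snippet with `bs = 2`:

```cpp
#include <cassert>
#include <cstddef>

// column-major offset of entry (row, col) inside its bs x bs block
std::size_t block_offset(std::size_t block, std::size_t row,
                         std::size_t col, std::size_t bs)
{
    return block * bs * bs + (col % bs) * bs + (row % bs);
}

int main()
{
    const std::size_t bs = 2;
    // global entry (3, 2) sits at local position (1, 0) of its block;
    // for block index 1 that is offset 1*4 + 0*2 + 1 = 5
    assert(block_offset(1, 3, 2, bs) == 5);
}
```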
15 changes: 3 additions & 12 deletions common/cuda_hip/multigrid/pgm_kernels.hpp.inc
@@ -45,11 +45,7 @@ template <typename ValueType, typename IndexType>
 void sort_row_major(std::shared_ptr<const DefaultExecutor> exec, size_type nnz,
                     IndexType* row_idxs, IndexType* col_idxs, ValueType* vals)
 {
-    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
-    // due to overly generic assignment operator and constructor leading to
-    // ambiguities. So we need to use our own fake_complex type
-    using device_value_type = device_member_type<ValueType>;
-    auto vals_it = reinterpret_cast<device_value_type*>(vals);
+    auto vals_it = as_device_type(vals);
     auto it = thrust::make_zip_iterator(thrust::make_tuple(row_idxs, col_idxs));
     // Because reduce_by_key is not deterministic, so we do not need
     // stable_sort_by_key
@@ -67,16 +63,11 @@ void compute_coarse_coo(std::shared_ptr<const DefaultExecutor> exec,
                         const IndexType* col_idxs, const ValueType* vals,
                         matrix::Coo<ValueType, IndexType>* coarse_coo)
 {
-    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
-    // due to overly generic assignment operator and constructor leading to
-    // ambiguities. So we need to use our own fake_complex type
-    using device_value_type = device_member_type<ValueType>;
-    auto vals_it = reinterpret_cast<const device_value_type*>(vals);
+    auto vals_it = as_device_type(vals);
     auto key_it =
         thrust::make_zip_iterator(thrust::make_tuple(row_idxs, col_idxs));
 
-    auto coarse_vals_it =
-        reinterpret_cast<device_value_type*>(coarse_coo->get_values());
+    auto coarse_vals_it = as_device_type(coarse_coo->get_values());
     auto coarse_key_it = thrust::make_zip_iterator(thrust::make_tuple(
         coarse_coo->get_row_idxs(), coarse_coo->get_col_idxs()));
 
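Note: the two kernels in this file implement the standard COO duplicate-summing idiom: sort entries by their zipped (row, col) key, then reduce runs of equal keys. A minimal standalone sketch of the same two-step pattern, with simplified types and default execution policies in place of Ginkgo's helpers:

```cpp
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/tuple.h>
#include <vector>

int main()
{
    // COO entries with a duplicate at (0, 1)
    std::vector<int> h_rows{0, 1, 0}, h_cols{1, 0, 1};
    std::vector<double> h_vals{1.0, 3.0, 2.0};
    thrust::device_vector<int> rows(h_rows), cols(h_cols);
    thrust::device_vector<double> vals(h_vals);

    auto keys = thrust::make_zip_iterator(
        thrust::make_tuple(rows.begin(), cols.begin()));
    // step 1: bring duplicate (row, col) keys next to each other
    thrust::sort_by_key(keys, keys + vals.size(), vals.begin());

    // step 2: sum runs of equal keys into compacted outputs
    thrust::device_vector<int> out_rows(vals.size()), out_cols(vals.size());
    thrust::device_vector<double> out_vals(vals.size());
    auto out_keys = thrust::make_zip_iterator(
        thrust::make_tuple(out_rows.begin(), out_cols.begin()));
    auto ends = thrust::reduce_by_key(keys, keys + vals.size(), vals.begin(),
                                      out_keys, out_vals.begin());
    // number of unique entries: here 2, with (0, 1) summed to 3.0
    auto num_unique = ends.second - out_vals.begin();
    (void)num_unique;
}
```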
8 changes: 4 additions & 4 deletions common/unified/matrix/csr_kernels.cpp
@@ -154,8 +154,8 @@ void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
             for (auto i = row_begin; i < row_begin + slice_length; i++) {
                 cols[out_idx] =
                     i < row_end ? in_cols[i] : invalid_index<IndexType>();
-                values[out_idx] = i < row_end ? unpack_member(in_values[i])
-                                              : zero(values[out_idx]);
+                values[out_idx] =
+                    i < row_end ? in_values[i] : zero(values[out_idx]);
                 out_idx += slice_size;
             }
         },
@@ -185,8 +185,8 @@ void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
             for (auto i = row_begin; i < row_begin + ell_length; i++) {
                 cols[out_idx] =
                     i < row_end ? in_cols[i] : invalid_index<IndexType>();
-                values[out_idx] = i < row_end ? unpack_member(in_values[i])
-                                              : zero(values[out_idx]);
+                values[out_idx] =
+                    i < row_end ? in_values[i] : zero(values[out_idx]);
                 out_idx += ell_stride;
             }
         },
2 changes: 1 addition & 1 deletion cuda/solver/common_trs_kernels.cuh
@@ -198,7 +198,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
 };
 
 
-#elif (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020))
+#else
 
 template <typename ValueType, typename IndexType>
 struct CudaSolveStruct : gko::solver::SolveStruct {
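Note: with CUDA 9.2 as the minimum version gone, the `#elif` guard collapses to a plain `#else`. For reference, `CUDA_VERSION` encodes the release as `major * 1000 + minor * 10`, which is why the removed check against `9020` meant "CUDA 9.2 or newer". A hedged sketch of what an equivalent guard against the new 10.1 floor would look like:

```cpp
#include <cuda.h>  // defines CUDA_VERSION, e.g. 10010 for CUDA 10.1

// CUDA_VERSION = major * 1000 + minor * 10:
//   9020 == CUDA 9.2, 10010 == CUDA 10.1, 11000 == CUDA 11.0
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)
// code path that may rely on CUDA 10.1+ runtime/library features
#endif
```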
2 changes: 1 addition & 1 deletion hip/CMakeLists.txt
@@ -142,7 +142,7 @@ if(GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}")
     endif()
     target_link_libraries(ginkgo_hip PUBLIC ${HIP_LIBAMDHIP64_LIBRARIES})
 elseif(GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}")
-    find_package(CUDA 9.2 REQUIRED)
+    find_package(CUDA 10.1 REQUIRED)
     target_link_libraries(ginkgo_hip PUBLIC ${CUDA_LIBRARIES})
 endif()
 
4 changes: 0 additions & 4 deletions third_party/identify_stream_usage/identify_stream_usage.cpp
@@ -124,14 +124,10 @@ DEFINE_OVERLOAD(cudaLaunchCooperativeKernel,
                      size_t sharedMem, cudaStream_t stream),
                 ARG(func, gridDim, blockDim, args, sharedMem, stream));
 
-#if CUDA_VERSION >= 10000
-
 DEFINE_OVERLOAD(cudaLaunchHostFunc,
                 ARG(cudaStream_t stream, cudaHostFn_t fn, void* userData),
                 ARG(stream, fn, userData));
 
-#endif
-
 // Memory transfer APIS:
 // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY
 DEFINE_OVERLOAD(cudaMemPrefetchAsync,
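Note: `cudaLaunchHostFunc` was introduced in CUDA 10.0, so with the minimum raised to 10.1 the version guard around its overload is dead code. For context, this helper builds an LD_PRELOAD-style shim: each `DEFINE_OVERLOAD` emits a function that inspects the stream argument before forwarding to the real CUDA runtime symbol. A hedged sketch of the general shape of one such overload (not the literal macro expansion):

```cpp
#include <dlfcn.h>
#include <cassert>
#include <cstddef>
#include <cuda_runtime.h>

// Build as a shared library (link with -ldl) and inject via LD_PRELOAD;
// the shim validates the stream, then calls the real runtime function
// located with RTLD_NEXT.
extern "C" cudaError_t cudaMemPrefetchAsync(const void* ptr, size_t count,
                                            int device, cudaStream_t stream)
{
    // flag unintended work on the default stream (stream 0); the real
    // tool would report or abort more gracefully than a bare assert
    assert(stream != nullptr);
    using fn_t = cudaError_t (*)(const void*, size_t, int, cudaStream_t);
    static const auto real_fn = reinterpret_cast<fn_t>(
        dlsym(RTLD_NEXT, "cudaMemPrefetchAsync"));
    return real_fn(ptr, count, device, stream);
}
```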
