Skip to content

Commit

Permalink
remove CUDA 9.2 support
Browse files Browse the repository at this point in the history
  • Loading branch information
upsj committed Aug 7, 2023
1 parent 1882753 commit a2112b9
Show file tree
Hide file tree
Showing 13 changed files with 44 additions and 127 deletions.
13 changes: 0 additions & 13 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,19 +93,6 @@ trigger_pipeline:
# Build jobs
# Job with example runs.
# cuda 9.2 and friends
build/cuda92/nompi/gcc/all/release/shared:
extends:
- .build_and_test_template
- .default_variables
- .quick_test_condition
- .use_gko-cuda92-mvapich2-gnu7-llvm50-intel2017
variables:
BUILD_OMP: "ON"
BUILD_CUDA: "ON"
BUILD_HIP: "ON"
BUILD_TYPE: "Release"

# cuda 10.1 and friends
# Build CUDA NVIDIA without omp
# Make sure that our jobs run when HWLOC is
Expand Down
6 changes: 0 additions & 6 deletions .gitlab/image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,6 @@
- cpu
- controller

.use_gko-cuda92-mvapich2-gnu7-llvm50-intel2017:
image: ginkgohub/cuda:92-mvapich2-gnu7-llvm50-intel2017
tags:
- private_ci
- nvidia-gpu

.use_gko-cuda101-openmpi-gnu8-llvm7-intel2019:
image: ginkgohub/cuda:101-openmpi-gnu8-llvm7-intel2019
tags:
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ For Ginkgo core library:

The Ginkgo CUDA module has the following __additional__ requirements:

* _CUDA 9.2+_ or _NVHPC Package 22.7+_
* _CUDA 10.1+_ or _NVHPC Package 22.7+_
* Any host compiler restrictions your version of CUDA may impose also apply
here. For the newest CUDA version, this information can be found in the
[CUDA installation guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)
Expand All @@ -58,7 +58,7 @@ The Ginkgo HIP module has the following __additional__ requirements:
* _ROCm 4.5+_
* the HIP, hipBLAS, hipSPARSE, hip/rocRAND and rocThrust packages compiled with either:
* _AMD_ backend (using the `clang` compiler)
* _9.2 <= CUDA < 11_ backend
* _10.1 <= CUDA < 11_ backend
* if the hipFFT package is available, it is used to implement the FFT LinOps.

The Ginkgo DPC++ module has the following __additional__ requirements:
Expand Down Expand Up @@ -90,7 +90,7 @@ following:

The Ginkgo CUDA module has the following __additional__ requirements:

* _CUDA 9.2+_
* _CUDA 10.1+_
* _Microsoft Visual Studio_
* Any host compiler restrictions your version of CUDA may impose also apply
here. For the newest CUDA version, this information can be found in the
Expand Down
9 changes: 0 additions & 9 deletions cmake/cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,3 @@ if(CMAKE_CUDA_HOST_COMPILER AND NOT CMAKE_CXX_COMPILER STREQUAL CMAKE_CUDA_HOST_
"The CUDA host compiler is ${CMAKE_CUDA_HOST_COMPILER}.")
endif()

if (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION
MATCHES "9.2" AND CMAKE_CUDA_HOST_COMPILER MATCHES ".*clang.*" )
ginkgo_extract_clang_version(${CMAKE_CUDA_HOST_COMPILER} GINKGO_CUDA_HOST_CLANG_VERSION)

if (GINKGO_CUDA_HOST_CLANG_VERSION MATCHES "5\.0.*")
message(FATAL_ERROR "There is a bug between nvcc 9.2 and clang 5.0 which create a compiling issue."
"Consider using a different CUDA host compiler or CUDA version.")
endif()
endif()
15 changes: 0 additions & 15 deletions cmake/hip.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,6 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.21)
set(CMAKE_HIP_ARCHITECTURES OFF)
endif()

if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}"
AND GINKGO_BUILD_CUDA AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.2)
message(FATAL_ERROR "Ginkgo HIP backend requires CUDA >= 9.2.")
endif()

if(NOT DEFINED ROCM_PATH)
if(DEFINED ENV{ROCM_PATH})
set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCM has been installed")
Expand Down Expand Up @@ -197,16 +192,6 @@ if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}")
# Remove false positive CUDA warnings when calling one<T>() and zero<T>()
list(APPEND GINKGO_HIP_NVCC_ADDITIONAL_FLAGS --expt-relaxed-constexpr --expt-extended-lambda)

if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}"
AND CMAKE_CUDA_COMPILER_VERSION MATCHES "9.2"
AND CMAKE_CUDA_HOST_COMPILER MATCHES ".*clang.*" )
ginkgo_extract_clang_version(${CMAKE_CUDA_HOST_COMPILER} GINKGO_CUDA_HOST_CLANG_VERSION)

if (GINKGO_CUDA_HOST_CLANG_VERSION MATCHES "5\.0.*")
message(FATAL_ERROR "There is a bug between nvcc 9.2 and clang 5.0 which create a compiling issue."
"Consider using a different CUDA host compiler or CUDA version.")
endif()
endif()
# select GPU architecture
include(cmake/Modules/CudaArchitectureSelector.cmake)
cas_variable_cuda_architectures(GINKGO_HIP_NVCC_ARCH
Expand Down
44 changes: 15 additions & 29 deletions common/cuda_hip/base/device_matrix_data_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,13 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
array<ValueType>& values, array<IndexType>& row_idxs,
array<IndexType>& col_idxs)
{
// workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
// due to overly generic assignment operator and constructor leading to
// ambiguities. So we need to use our own fake_complex type
using device_value_type = device_member_type<ValueType>;
auto value_ptr =
reinterpret_cast<const device_value_type*>(values.get_const_data());
using device_value_type = device_type<ValueType>;
auto value_ptr = as_device_type(values.get_const_data());
auto size = values.get_num_elems();
// count nonzeros
auto nnz =
thrust::count_if(thrust_policy(exec), value_ptr, value_ptr + size,
[] __device__(device_value_type value) {
return is_nonzero(fake_complex_unpack(value));
});
auto nnz = thrust::count_if(
thrust_policy(exec), value_ptr, value_ptr + size,
[] __device__(device_value_type value) { return is_nonzero(value); });
if (nnz < size) {
using tuple_type =
thrust::tuple<IndexType, IndexType, device_value_type>;
Expand All @@ -58,14 +52,13 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
// copy nonzeros
auto it = thrust::make_zip_iterator(thrust::make_tuple(
row_idxs.get_const_data(), col_idxs.get_const_data(), value_ptr));
auto out_it = thrust::make_zip_iterator(thrust::make_tuple(
new_row_idxs.get_data(), new_col_idxs.get_data(),
reinterpret_cast<device_value_type*>(new_values.get_data())));
thrust::copy_if(
thrust_policy(exec), it, it + size, out_it,
[] __device__(tuple_type entry) {
return is_nonzero(fake_complex_unpack(thrust::get<2>(entry)));
});
auto out_it = thrust::make_zip_iterator(
thrust::make_tuple(new_row_idxs.get_data(), new_col_idxs.get_data(),
as_device_type(new_values.get_data())));
thrust::copy_if(thrust_policy(exec), it, it + size, out_it,
[] __device__(tuple_type entry) {
return is_nonzero(thrust::get<2>(entry));
});
// swap out storage
values = std::move(new_values);
row_idxs = std::move(new_row_idxs);
Expand All @@ -82,7 +75,6 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec, size_type,
array<ValueType>& values, array<IndexType>& row_idxs,
array<IndexType>& col_idxs)
{
using device_value_type = device_member_type<ValueType>;
const auto size = values.get_num_elems();
const auto rows = row_idxs.get_const_data();
const auto cols = col_idxs.get_const_data();
Expand All @@ -104,12 +96,10 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec, size_type,
// reduce duplicates
auto in_locs =
thrust::make_zip_iterator(thrust::make_tuple(rows, cols));
auto in_vals =
reinterpret_cast<const device_value_type*>(values.get_const_data());
auto in_vals = as_device_type(values.get_const_data());
auto out_locs = thrust::make_zip_iterator(thrust::make_tuple(
new_row_idxs.get_data(), new_col_idxs.get_data()));
auto out_vals =
reinterpret_cast<device_value_type*>(new_values.get_data());
auto out_vals = as_device_type(new_values.get_data());
thrust::reduce_by_key(thrust_policy(exec), in_locs, in_locs + size,
in_vals, out_locs, out_vals);
// swap out storage
Expand All @@ -127,13 +117,9 @@ template <typename ValueType, typename IndexType>
void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
device_matrix_data<ValueType, IndexType>& data)
{
// workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
// due to overly generic assignment operator and constructor leading to
// ambiguities. So we need to use our own fake_complex type
using device_value_type = device_member_type<ValueType>;
auto it = thrust::make_zip_iterator(
thrust::make_tuple(data.get_row_idxs(), data.get_col_idxs()));
auto vals = reinterpret_cast<device_value_type*>(data.get_values());
auto vals = as_device_type(data.get_values());
thrust::sort_by_key(thrust_policy(exec), it, it + data.get_num_elems(),
vals);
}
Expand Down
38 changes: 15 additions & 23 deletions common/cuda_hip/matrix/csr_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -872,11 +872,7 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
}
auto in_rows = in_row_idxs.get_data();
auto in_cols = in_col_idxs.get_data();
// workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
// due to overly generic assignment operator and constructor leading to
// ambiguities. So we need to use our own fake_complex type
auto in_vals =
reinterpret_cast<device_member_type<ValueType>*>(in_values.get_data());
auto in_vals = as_device_type(in_values.get_data());
auto in_loc_it =
thrust::make_zip_iterator(thrust::make_tuple(in_rows, in_cols));
thrust::sort_by_key(thrust_policy(exec), in_loc_it, in_loc_it + nnz,
Expand Down Expand Up @@ -924,17 +920,17 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
// fill in values
components::fill_array(exec, block_value_array.get_data(),
num_blocks * bs * bs, zero<ValueType>());
thrust::for_each_n(
thrust_policy(exec), iota, num_blocks,
[block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, in_vals,
values] __device__(size_type i) {
const auto block_begin = block_ptrs[i];
const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz;
for (auto nz = block_begin; nz < block_end; nz++) {
values[i * bs * bs + (in_cols[nz] % bs) * bs +
(in_rows[nz] % bs)] = fake_complex_unpack(in_vals[nz]);
}
});
thrust::for_each_n(thrust_policy(exec), iota, num_blocks,
[block_ptrs, nnz, num_blocks, bs, in_rows, in_cols,
in_vals, values] __device__(size_type i) {
const auto block_begin = block_ptrs[i];
const auto block_end =
i < num_blocks - 1 ? block_ptrs[i + 1] : nnz;
for (auto nz = block_begin; nz < block_end; nz++) {
values[i * bs * bs + (in_cols[nz] % bs) * bs +
(in_rows[nz] % bs)] = in_vals[nz];
}
});
}


Expand Down Expand Up @@ -1130,13 +1126,10 @@ void fallback_transpose(std::shared_ptr<const DefaultExecutor> exec,
const auto nnz = output->get_num_stored_elements();
const auto in_row_ptrs = input->get_const_row_ptrs();
const auto in_col_idxs = input->get_const_col_idxs();
// workaround for CUDA 9.2 Thrust unconstrained constructor issues
const auto in_vals = reinterpret_cast<const device_member_type<ValueType>*>(
input->get_const_values());
const auto in_vals = as_device_type(input->get_const_values());
const auto out_row_ptrs = output->get_row_ptrs();
const auto out_col_idxs = output->get_col_idxs();
const auto out_vals =
reinterpret_cast<device_member_type<ValueType>*>(output->get_values());
const auto out_vals = as_device_type(output->get_values());
array<IndexType> out_row_idxs{exec, nnz};
components::convert_ptrs_to_idxs(exec, in_row_ptrs, in_num_rows,
out_col_idxs);
Expand All @@ -1156,8 +1149,7 @@ void fallback_sort(std::shared_ptr<const DefaultExecutor> exec,
{
const auto row_ptrs = to_sort->get_const_row_ptrs();
const auto col_idxs = to_sort->get_col_idxs();
const auto vals =
reinterpret_cast<device_member_type<ValueType>*>(to_sort->get_values());
const auto vals = as_device_type(to_sort->get_values());
const auto nnz = to_sort->get_num_stored_elements();
const auto num_rows = to_sort->get_size()[0];
array<IndexType> row_idx_array(exec, nnz);
Expand Down
9 changes: 2 additions & 7 deletions common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -172,11 +172,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
}
auto in_rows = data.get_row_idxs();
auto in_cols = data.get_col_idxs();
// workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
// due to overly generic assignment operator and constructor leading to
// ambiguities. So we need to use our own fake_complex type
auto in_vals =
reinterpret_cast<device_member_type<ValueType>*>(data.get_values());
auto in_vals = as_device_type(data.get_values());
auto in_loc_it =
thrust::make_zip_iterator(thrust::make_tuple(in_rows, in_cols));
thrust::sort_by_key(thrust_policy(exec), in_loc_it, in_loc_it + nnz,
Expand Down Expand Up @@ -232,8 +228,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz;
for (auto nz = block_begin; nz < block_end; nz++) {
block_values[i * bs * bs + (in_cols[nz] % bs) * bs +
(in_rows[nz] % bs)] =
fake_complex_unpack(in_vals[nz]);
(in_rows[nz] % bs)] = in_vals[nz];
}
});
}
Expand Down
15 changes: 3 additions & 12 deletions common/cuda_hip/multigrid/pgm_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,7 @@ template <typename ValueType, typename IndexType>
void sort_row_major(std::shared_ptr<const DefaultExecutor> exec, size_type nnz,
IndexType* row_idxs, IndexType* col_idxs, ValueType* vals)
{
// workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
// due to overly generic assignment operator and constructor leading to
// ambiguities. So we need to use our own fake_complex type
using device_value_type = device_member_type<ValueType>;
auto vals_it = reinterpret_cast<device_value_type*>(vals);
auto vals_it = as_device_type(vals);
auto it = thrust::make_zip_iterator(thrust::make_tuple(row_idxs, col_idxs));
// Because reduce_by_key is not deterministic, so we do not need
// stable_sort_by_key
Expand All @@ -67,16 +63,11 @@ void compute_coarse_coo(std::shared_ptr<const DefaultExecutor> exec,
const IndexType* col_idxs, const ValueType* vals,
matrix::Coo<ValueType, IndexType>* coarse_coo)
{
// workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
// due to overly generic assignment operator and constructor leading to
// ambiguities. So we need to use our own fake_complex type
using device_value_type = device_member_type<ValueType>;
auto vals_it = reinterpret_cast<const device_value_type*>(vals);
auto vals_it = as_device_type(vals);
auto key_it =
thrust::make_zip_iterator(thrust::make_tuple(row_idxs, col_idxs));

auto coarse_vals_it =
reinterpret_cast<device_value_type*>(coarse_coo->get_values());
auto coarse_vals_it = as_device_type(coarse_coo->get_values());
auto coarse_key_it = thrust::make_zip_iterator(thrust::make_tuple(
coarse_coo->get_row_idxs(), coarse_coo->get_col_idxs()));

Expand Down
8 changes: 4 additions & 4 deletions common/unified/matrix/csr_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,8 @@ void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
for (auto i = row_begin; i < row_begin + slice_length; i++) {
cols[out_idx] =
i < row_end ? in_cols[i] : invalid_index<IndexType>();
values[out_idx] = i < row_end ? unpack_member(in_values[i])
: zero(values[out_idx]);
values[out_idx] =
i < row_end ? in_values[i] : zero(values[out_idx]);
out_idx += slice_size;
}
},
Expand Down Expand Up @@ -185,8 +185,8 @@ void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
for (auto i = row_begin; i < row_begin + ell_length; i++) {
cols[out_idx] =
i < row_end ? in_cols[i] : invalid_index<IndexType>();
values[out_idx] = i < row_end ? unpack_member(in_values[i])
: zero(values[out_idx]);
values[out_idx] =
i < row_end ? in_values[i] : zero(values[out_idx]);
out_idx += ell_stride;
}
},
Expand Down
2 changes: 1 addition & 1 deletion cuda/solver/common_trs_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
};


#elif (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020))
#else

template <typename ValueType, typename IndexType>
struct CudaSolveStruct : gko::solver::SolveStruct {
Expand Down
2 changes: 1 addition & 1 deletion hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ if(GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}")
endif()
target_link_libraries(ginkgo_hip PUBLIC ${HIP_LIBAMDHIP64_LIBRARIES})
elseif(GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}")
find_package(CUDA 9.2 REQUIRED)
find_package(CUDA 10.1 REQUIRED)
target_link_libraries(ginkgo_hip PUBLIC ${CUDA_LIBRARIES})
endif()

Expand Down
4 changes: 0 additions & 4 deletions third_party/identify_stream_usage/identify_stream_usage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,14 +124,10 @@ DEFINE_OVERLOAD(cudaLaunchCooperativeKernel,
size_t sharedMem, cudaStream_t stream),
ARG(func, gridDim, blockDim, args, sharedMem, stream));

#if CUDA_VERSION >= 10000

DEFINE_OVERLOAD(cudaLaunchHostFunc,
ARG(cudaStream_t stream, cudaHostFn_t fn, void* userData),
ARG(stream, fn, userData));

#endif

// Memory transfer APIS:
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY
DEFINE_OVERLOAD(cudaMemPrefetchAsync,
Expand Down

0 comments on commit a2112b9

Please sign in to comment.