Skip to content

Commit

Permalink
Merge fix for triangular solvers on Windows CUDA
Browse files Browse the repository at this point in the history
This fixes the incorrect usage of isnan in triangular solvers for CUDA with MSVC.
Additionally, it fixes remaining test execution issues, and prevents future issues
by fixing the alignment of uninitialized_array.

Related PR: #1665
  • Loading branch information
upsj authored Aug 14, 2024
2 parents ceee174 + 95fe3f4 commit 2c06c8a
Show file tree
Hide file tree
Showing 10 changed files with 122 additions and 25 deletions.
3 changes: 1 addition & 2 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -486,8 +486,7 @@ build/windows-cuda/release/shared:
- mkdir install
- cmake -B build -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_CUDA=ON "-DCMAKE_INSTALL_PREFIX=$pwd\install" .
- cmake --build build --config Release -j16
# we disable these tests until the triangular solver issues are resolved
# - ctest --test-dir build -C Release --no-tests=error --output-on-failure
- ctest --test-dir build -C Release --no-tests=error --output-on-failure
- $env:PATH+=";$pwd/install/bin"
- cmake --install build --config Release
- cmake --build build --target test_install --config Release
Expand Down
7 changes: 7 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ function(ginkgo_benchmark_cusparse_linops type def)
target_compile_definitions(cusparse_linops_${type} PUBLIC ${def})
target_compile_definitions(cusparse_linops_${type} PRIVATE GKO_COMPILING_CUDA)
target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse)
ginkgo_compile_features(cusparse_linops_${type})
endfunction()

function(ginkgo_benchmark_hipsparse_linops type def)
Expand All @@ -31,13 +32,15 @@ function(ginkgo_benchmark_hipsparse_linops type def)
target_compile_definitions(hipsparse_linops_${type} PRIVATE GKO_COMPILING_HIP)
target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS})
target_link_libraries(hipsparse_linops_${type} Ginkgo::ginkgo ${HIPSPARSE_LIBRARIES})
ginkgo_compile_features(hipsparse_linops_${type})
endfunction()

function(ginkgo_benchmark_onemkl_linops type def)
add_library(onemkl_linops_${type} utils/dpcpp_linops.dp.cpp)
# make the dependency public to catch issues
target_compile_definitions(onemkl_linops_${type} PUBLIC ${def})
target_link_libraries(onemkl_linops_${type} PRIVATE Ginkgo::ginkgo MKL::MKL_DPCPP)
ginkgo_compile_features(onemkl_linops_${type})
endfunction()


Expand Down Expand Up @@ -116,6 +119,7 @@ if (GINKGO_BUILD_CUDA)
ginkgo_benchmark_cusparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION)
add_library(cuda_timer utils/cuda_timer.cpp)
target_link_libraries(cuda_timer ginkgo CUDA::cudart)
ginkgo_compile_features(cuda_timer)
endif()
if (GINKGO_BUILD_HIP)
ginkgo_benchmark_hipsparse_linops(d GKO_BENCHMARK_USE_DOUBLE_PRECISION)
Expand All @@ -125,6 +129,7 @@ if (GINKGO_BUILD_HIP)
set_source_files_properties(utils/hip_timer.hip.cpp PROPERTIES LANGUAGE HIP)
add_library(hip_timer utils/hip_timer.hip.cpp)
target_link_libraries(hip_timer ginkgo)
ginkgo_compile_features(hip_timer)
endif()

if (GINKGO_BUILD_SYCL)
Expand All @@ -136,11 +141,13 @@ if (GINKGO_BUILD_SYCL)
target_compile_options(dpcpp_timer PRIVATE ${GINKGO_DPCPP_FLAGS})
gko_add_sycl_to_target(TARGET dpcpp_timer SOURCES utils/dpcpp_timer.dp.cpp)
target_link_libraries(dpcpp_timer ginkgo)
ginkgo_compile_features(dpcpp_timer)
endif()

if (GINKGO_BUILD_MPI)
add_library(mpi_timer ${Ginkgo_SOURCE_DIR}/benchmark/utils/mpi_timer.cpp)
target_link_libraries(mpi_timer ginkgo)
ginkgo_compile_features(mpi_timer)
endif()

add_subdirectory(blas)
Expand Down
9 changes: 6 additions & 3 deletions benchmark/test/test_framework.py.in
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,12 @@ def sanitize_json_text(input: str) -> List[str]:
and pretty-printed to replace the original JSON input.
"""

result = json.dumps(sanitize_json(json.loads(input)), indent=4)
# json.dumps doesn't add a trailing newline
return result.splitlines() + [""]
try:
result = json.dumps(sanitize_json(json.loads(input)), indent=4)
# json.dumps doesn't add a trailing newline
return result.splitlines() + [""]
except Exception as e:
return f"Error: {str(e)}"


def sanitize_text(
Expand Down
43 changes: 38 additions & 5 deletions common/cuda_hip/components/uninitialized_array.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

#include <ginkgo/core/base/types.hpp>

#include "common/cuda_hip/base/thrust.hpp"


namespace gko {
namespace kernels {
Expand All @@ -34,7 +36,7 @@ class uninitialized_array {
*/
constexpr GKO_ATTRIBUTES operator const ValueType*() const noexcept
{
return &(*this)[0];
return data_;
}

/**
Expand All @@ -43,7 +45,7 @@ class uninitialized_array {
*
* @return the non-const pointer to the first entry of the array.
*/
GKO_ATTRIBUTES operator ValueType*() noexcept { return &(*this)[0]; }
GKO_ATTRIBUTES operator ValueType*() noexcept { return data_; }

/**
* constexpr array access operator.
Expand All @@ -56,7 +58,7 @@ class uninitialized_array {
constexpr GKO_ATTRIBUTES const ValueType& operator[](
size_type pos) const noexcept
{
return reinterpret_cast<const ValueType*>(data_)[pos];
return data_[pos];
}

/**
Expand All @@ -69,11 +71,42 @@ class uninitialized_array {
*/
GKO_ATTRIBUTES ValueType& operator[](size_type pos) noexcept
{
return reinterpret_cast<ValueType*>(data_)[pos];
return data_[pos];
}

private:
ValueType data_[size];
};


template <typename ValueType, size_type size>
class uninitialized_array<thrust::complex<ValueType>, size> {
public:
constexpr GKO_ATTRIBUTES operator const thrust::complex<ValueType>*()
const noexcept
{
return &(*this)[0];
}

GKO_ATTRIBUTES operator thrust::complex<ValueType>*() noexcept
{
return &(*this)[0];
}

constexpr GKO_ATTRIBUTES const thrust::complex<ValueType>& operator[](
size_type pos) const noexcept
{
return reinterpret_cast<const thrust::complex<ValueType>*>(data_)[pos];
}

GKO_ATTRIBUTES thrust::complex<ValueType>& operator[](
size_type pos) noexcept
{
return reinterpret_cast<thrust::complex<ValueType>*>(data_)[pos];
}

private:
unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size];
ValueType data_[2 * size];
};


Expand Down
2 changes: 1 addition & 1 deletion common/cuda_hip/matrix/csr_kernels.template.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ __device__ void merge_path_reduce(const IndexType nwarps,
}
}
}
__shared__ uninitialized_array<IndexType, spmv_block_size> tmp_ind;
__shared__ IndexType tmp_ind[spmv_block_size];
__shared__ uninitialized_array<arithmetic_type, spmv_block_size> tmp_val;
tmp_val[threadIdx.x] = value;
tmp_ind[threadIdx.x] = row;
Expand Down
60 changes: 54 additions & 6 deletions cuda/solver/common_trs_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#define GKO_CUDA_SOLVER_COMMON_TRS_KERNELS_CUH_


#include <cstring>
#include <functional>
#include <iostream>
#include <memory>
Expand Down Expand Up @@ -342,6 +343,52 @@ constexpr int default_block_size = 512;
constexpr int fallback_block_size = 32;


/** Returns an unsigned type matching the size of the given float type. */
template <typename T>
struct float_to_unsigned_impl {};

template <>
struct float_to_unsigned_impl<double> {
using type = uint64;
};

template <>
struct float_to_unsigned_impl<float> {
using type = uint32;
};


/**
* Checks if a floating point number representation matches the representation
* of the quiet NaN with value gko::nan() exactly.
*/
template <typename T>
GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<!is_complex_s<T>::value, bool>
is_nan_exact(const T& value)
{
using type = typename float_to_unsigned_impl<T>::type;
type value_bytes{};
type nan_bytes{};
auto nan_value = nan<T>();
using std::memcpy;
memcpy(&value_bytes, &value, sizeof(value));
memcpy(&nan_bytes, &nan_value, sizeof(value));
return value_bytes == nan_bytes;
}


/**
* Checks if any component of the complex value matches the quiet NaN with
* value gko::nan() exactly.
*/
template <typename T>
GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<is_complex_s<T>::value, bool>
is_nan_exact(const T& value)
{
return is_nan_exact(value.real()) || is_nan_exact(value.imag());
}


template <bool is_upper, typename ValueType, typename IndexType>
__global__ void sptrsv_naive_caching_kernel(
const IndexType* const rowptrs, const IndexType* const colidxs,
Expand Down Expand Up @@ -399,11 +446,12 @@ __global__ void sptrsv_naive_caching_kernel(
ValueType val{};
if (shmem_possible) {
const auto dependency_shid = dependency_gid % default_block_size;
while (is_nan(val = load_relaxed_shared(x_s + dependency_shid))) {
while (is_nan_exact(
val = load_relaxed_shared(x_s + dependency_shid))) {
}
} else {
while (
is_nan(val = load_relaxed(x + dependency * x_stride + rhs))) {
while (is_nan_exact(
val = load_relaxed(x + dependency * x_stride + rhs))) {
}
}

Expand All @@ -418,7 +466,7 @@ __global__ void sptrsv_naive_caching_kernel(
store_relaxed(x + row * x_stride + rhs, r);

// This check to ensure no infinite loops happen.
if (is_nan(r)) {
if (is_nan_exact(r)) {
store_relaxed_shared(x_s + self_shid, zero<ValueType>());
store_relaxed(x + row * x_stride + rhs, zero<ValueType>());
*nan_produced = true;
Expand Down Expand Up @@ -460,7 +508,7 @@ __global__ void sptrsv_naive_legacy_kernel(
auto col = colidxs[j];
while (j != row_end) {
auto x_val = load_relaxed(x + col * x_stride + rhs);
while (!is_nan(x_val)) {
while (!is_nan_exact(x_val)) {
sum += vals[j] * x_val;
j += row_step;
col = colidxs[j];
Expand All @@ -478,7 +526,7 @@ __global__ void sptrsv_naive_legacy_kernel(
// after we encountered the diagonal, we are done
// this also skips entries outside the triangle
j = row_end;
if (is_nan(r)) {
if (is_nan_exact(r)) {
store_relaxed(x + row * x_stride + rhs, zero<ValueType>());
*nan_produced = true;
}
Expand Down
15 changes: 11 additions & 4 deletions include/ginkgo/core/base/math.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1220,10 +1220,14 @@ GKO_INLINE GKO_ATTRIBUTES T safe_divide(T a, T b)
* @return `true` if the value is NaN.
*/
template <typename T>
GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<!is_complex_s<T>::value, bool>
is_nan(const T& value)
GKO_DEPRECATED(
"is_nan can't be used safely on the device (MSVC+CUDA), and will thus be "
"removed in a future release, without replacement")
GKO_INLINE GKO_ATTRIBUTES
std::enable_if_t<!is_complex_s<T>::value, bool> is_nan(const T& value)
{
return std::isnan(value);
using std::isnan;
return isnan(value);
}


Expand All @@ -1237,10 +1241,13 @@ is_nan(const T& value)
* @return `true` if any component of the given value is NaN.
*/
template <typename T>
GKO_DEPRECATED(
"is_nan can't be used safely on the device (MSVC+CUDA), and will thus be "
"removed in a future release, without replacement")
GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<is_complex_s<T>::value, bool> is_nan(
const T& value)
{
return std::isnan(value.real()) || std::isnan(value.imag());
return is_nan(value.real()) || is_nan(value.imag());
}


Expand Down
2 changes: 1 addition & 1 deletion test/solver/gcr_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ TEST_F(Gcr, GcrApplyOneRHSIsEquivalentToRef)
exec_solver->apply(d_b.get(), d_x.get());

GKO_ASSERT_MTX_NEAR(d_b, b, 0);
GKO_ASSERT_MTX_NEAR(d_x, x, r<value_type>::value * 1e2);
GKO_ASSERT_MTX_NEAR(d_x, x, r<value_type>::value * 1e3);
}


Expand Down
4 changes: 2 additions & 2 deletions test/solver/lower_trs_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ TEST_F(LowerTrs, ApplyTriangularDenseMtxIsEquivalentToRef)
solver->apply(b, x);
d_solver->apply(db, dx);

GKO_ASSERT_MTX_NEAR(dx, x, 1e-14);
GKO_ASSERT_MTX_NEAR(dx, x, 1e-13);
}


Expand Down Expand Up @@ -417,7 +417,7 @@ TEST_F(LowerTrs, ClassicalApplyTriangularDenseMtxIsEquivalentToRef)
solver->apply(b, x);
d_solver->apply(db, dx);

GKO_ASSERT_MTX_NEAR(dx, x, 1e-14);
GKO_ASSERT_MTX_NEAR(dx, x, 1e-13);
}


Expand Down
2 changes: 1 addition & 1 deletion test/solver/upper_trs_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ TEST_F(UpperTrs, ApplyTriangularDenseMtxIsEquivalentToRef)
solver->apply(b, x);
d_solver->apply(db, dx);

GKO_ASSERT_MTX_NEAR(dx, x, 1e-14);
GKO_ASSERT_MTX_NEAR(dx, x, 1e-13);
}


Expand Down

0 comments on commit 2c06c8a

Please sign in to comment.