Merge fix for triangular solvers on Windows CUDA

This fixes the incorrect usage of isnan in triangular solvers for CUDA with MSVC. Additionally, it fixes remaining test execution issues, and prevents future issues by fixing the alignment of uninitialized_array. Related PR: #1665
ginkgo-project · Aug 14, 2024 · 2c06c8a · 2c06c8a
2 parents ceee174 + 95fe3f4
commit 2c06c8a
Show file tree

Hide file tree

Showing 10 changed files with 122 additions and 25 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -486,8 +486,7 @@ build/windows-cuda/release/shared:
     - mkdir install
     - cmake -B build -DBUILD_SHARED_LIBS=ON -DGINKGO_BUILD_CUDA=ON "-DCMAKE_INSTALL_PREFIX=$pwd\install" .
     - cmake --build build --config Release -j16
-# we disable these tests until the triangular solver issues are resolved
-#   - ctest --test-dir build -C Release --no-tests=error --output-on-failure
+    - ctest --test-dir build -C Release --no-tests=error --output-on-failure
     - $env:PATH+=";$pwd/install/bin"
     - cmake --install build --config Release
     - cmake --build build --target test_install --config Release

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
@@ -22,6 +22,7 @@ function(ginkgo_benchmark_cusparse_linops type def)
     target_compile_definitions(cusparse_linops_${type} PUBLIC ${def})
     target_compile_definitions(cusparse_linops_${type} PRIVATE GKO_COMPILING_CUDA)
     target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse)
+    ginkgo_compile_features(cusparse_linops_${type})
 endfunction()
 
 function(ginkgo_benchmark_hipsparse_linops type def)
@@ -31,13 +32,15 @@ function(ginkgo_benchmark_hipsparse_linops type def)
     target_compile_definitions(hipsparse_linops_${type} PRIVATE GKO_COMPILING_HIP)
     target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS})
     target_link_libraries(hipsparse_linops_${type} Ginkgo::ginkgo ${HIPSPARSE_LIBRARIES})
+    ginkgo_compile_features(hipsparse_linops_${type})
 endfunction()
 
 function(ginkgo_benchmark_onemkl_linops type def)
     add_library(onemkl_linops_${type} utils/dpcpp_linops.dp.cpp)
     # make the dependency public to catch issues
     target_compile_definitions(onemkl_linops_${type} PUBLIC ${def})
     target_link_libraries(onemkl_linops_${type} PRIVATE Ginkgo::ginkgo MKL::MKL_DPCPP)
+    ginkgo_compile_features(onemkl_linops_${type})
 endfunction()
 
 
@@ -116,6 +119,7 @@ if (GINKGO_BUILD_CUDA)
     ginkgo_benchmark_cusparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION)
     add_library(cuda_timer utils/cuda_timer.cpp)
     target_link_libraries(cuda_timer ginkgo CUDA::cudart)
+    ginkgo_compile_features(cuda_timer)
 endif()
 if (GINKGO_BUILD_HIP)
     ginkgo_benchmark_hipsparse_linops(d GKO_BENCHMARK_USE_DOUBLE_PRECISION)
@@ -125,6 +129,7 @@ if (GINKGO_BUILD_HIP)
     set_source_files_properties(utils/hip_timer.hip.cpp PROPERTIES LANGUAGE HIP)
     add_library(hip_timer utils/hip_timer.hip.cpp)
     target_link_libraries(hip_timer ginkgo)
+    ginkgo_compile_features(hip_timer)
 endif()
 
 if (GINKGO_BUILD_SYCL)
@@ -136,11 +141,13 @@ if (GINKGO_BUILD_SYCL)
     target_compile_options(dpcpp_timer PRIVATE ${GINKGO_DPCPP_FLAGS})
     gko_add_sycl_to_target(TARGET dpcpp_timer SOURCES utils/dpcpp_timer.dp.cpp)
     target_link_libraries(dpcpp_timer ginkgo)
+    ginkgo_compile_features(dpcpp_timer)
 endif()
 
 if (GINKGO_BUILD_MPI)
     add_library(mpi_timer ${Ginkgo_SOURCE_DIR}/benchmark/utils/mpi_timer.cpp)
     target_link_libraries(mpi_timer ginkgo)
+    ginkgo_compile_features(mpi_timer)
 endif()
 
 add_subdirectory(blas)

diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in
@@ -90,9 +90,12 @@ def sanitize_json_text(input: str) -> List[str]:
     and pretty-printed to replace the original JSON input.
     """
 
-    result = json.dumps(sanitize_json(json.loads(input)), indent=4)
-    # json.dumps doesn't add a trailing newline
-    return result.splitlines() + [""]
+    try:
+        result = json.dumps(sanitize_json(json.loads(input)), indent=4)
+        # json.dumps doesn't add a trailing newline
+        return result.splitlines() + [""]
+    except Exception as e:
+        return f"Error: {str(e)}"
 
 
 def sanitize_text(

diff --git a/common/cuda_hip/components/uninitialized_array.hpp b/common/cuda_hip/components/uninitialized_array.hpp
@@ -8,6 +8,8 @@
 
 #include <ginkgo/core/base/types.hpp>
 
+#include "common/cuda_hip/base/thrust.hpp"
+
 
 namespace gko {
 namespace kernels {
@@ -34,7 +36,7 @@ class uninitialized_array {
      */
     constexpr GKO_ATTRIBUTES operator const ValueType*() const noexcept
     {
-        return &(*this)[0];
+        return data_;
     }
 
     /**
@@ -43,7 +45,7 @@ class uninitialized_array {
      *
      * @return the non-const pointer to the first entry of the array.
      */
-    GKO_ATTRIBUTES operator ValueType*() noexcept { return &(*this)[0]; }
+    GKO_ATTRIBUTES operator ValueType*() noexcept { return data_; }
 
     /**
      * constexpr array access operator.
@@ -56,7 +58,7 @@ class uninitialized_array {
     constexpr GKO_ATTRIBUTES const ValueType& operator[](
         size_type pos) const noexcept
     {
-        return reinterpret_cast<const ValueType*>(data_)[pos];
+        return data_[pos];
     }
 
     /**
@@ -69,11 +71,42 @@ class uninitialized_array {
      */
     GKO_ATTRIBUTES ValueType& operator[](size_type pos) noexcept
     {
-        return reinterpret_cast<ValueType*>(data_)[pos];
+        return data_[pos];
+    }
+
+private:
+    ValueType data_[size];
+};
+
+
+template <typename ValueType, size_type size>
+class uninitialized_array<thrust::complex<ValueType>, size> {
+public:
+    constexpr GKO_ATTRIBUTES operator const thrust::complex<ValueType>*()
+        const noexcept
+    {
+        return &(*this)[0];
+    }
+
+    GKO_ATTRIBUTES operator thrust::complex<ValueType>*() noexcept
+    {
+        return &(*this)[0];
+    }
+
+    constexpr GKO_ATTRIBUTES const thrust::complex<ValueType>& operator[](
+        size_type pos) const noexcept
+    {
+        return reinterpret_cast<const thrust::complex<ValueType>*>(data_)[pos];
+    }
+
+    GKO_ATTRIBUTES thrust::complex<ValueType>& operator[](
+        size_type pos) noexcept
+    {
+        return reinterpret_cast<thrust::complex<ValueType>*>(data_)[pos];
     }
 
 private:
-    unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size];
+    ValueType data_[2 * size];
 };
 
 

diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp
@@ -335,7 +335,7 @@ __device__ void merge_path_reduce(const IndexType nwarps,
             }
         }
     }
-    __shared__ uninitialized_array<IndexType, spmv_block_size> tmp_ind;
+    __shared__ IndexType tmp_ind[spmv_block_size];
     __shared__ uninitialized_array<arithmetic_type, spmv_block_size> tmp_val;
     tmp_val[threadIdx.x] = value;
     tmp_ind[threadIdx.x] = row;

diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh
@@ -6,6 +6,7 @@
 #define GKO_CUDA_SOLVER_COMMON_TRS_KERNELS_CUH_
 
 
+#include <cstring>
 #include <functional>
 #include <iostream>
 #include <memory>
@@ -342,6 +343,52 @@ constexpr int default_block_size = 512;
 constexpr int fallback_block_size = 32;
 
 
+/** Returns an unsigned type matching the size of the given float type. */
+template <typename T>
+struct float_to_unsigned_impl {};
+
+template <>
+struct float_to_unsigned_impl<double> {
+    using type = uint64;
+};
+
+template <>
+struct float_to_unsigned_impl<float> {
+    using type = uint32;
+};
+
+
+/**
+ * Checks if a floating point number representation matches the representation
+ * of the quiet NaN with value gko::nan() exactly.
+ */
+template <typename T>
+GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<!is_complex_s<T>::value, bool>
+is_nan_exact(const T& value)
+{
+    using type = typename float_to_unsigned_impl<T>::type;
+    type value_bytes{};
+    type nan_bytes{};
+    auto nan_value = nan<T>();
+    using std::memcpy;
+    memcpy(&value_bytes, &value, sizeof(value));
+    memcpy(&nan_bytes, &nan_value, sizeof(value));
+    return value_bytes == nan_bytes;
+}
+
+
+/**
+ * Checks if any component of the complex value matches the quiet NaN with
+ * value gko::nan() exactly.
+ */
+template <typename T>
+GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<is_complex_s<T>::value, bool>
+is_nan_exact(const T& value)
+{
+    return is_nan_exact(value.real()) || is_nan_exact(value.imag());
+}
+
+
 template <bool is_upper, typename ValueType, typename IndexType>
 __global__ void sptrsv_naive_caching_kernel(
     const IndexType* const rowptrs, const IndexType* const colidxs,
@@ -399,11 +446,12 @@ __global__ void sptrsv_naive_caching_kernel(
         ValueType val{};
         if (shmem_possible) {
             const auto dependency_shid = dependency_gid % default_block_size;
-            while (is_nan(val = load_relaxed_shared(x_s + dependency_shid))) {
+            while (is_nan_exact(
+                val = load_relaxed_shared(x_s + dependency_shid))) {
             }
         } else {
-            while (
-                is_nan(val = load_relaxed(x + dependency * x_stride + rhs))) {
+            while (is_nan_exact(
+                val = load_relaxed(x + dependency * x_stride + rhs))) {
             }
         }
 
@@ -418,7 +466,7 @@ __global__ void sptrsv_naive_caching_kernel(
     store_relaxed(x + row * x_stride + rhs, r);
 
     // This check to ensure no infinite loops happen.
-    if (is_nan(r)) {
+    if (is_nan_exact(r)) {
         store_relaxed_shared(x_s + self_shid, zero<ValueType>());
         store_relaxed(x + row * x_stride + rhs, zero<ValueType>());
         *nan_produced = true;
@@ -460,7 +508,7 @@ __global__ void sptrsv_naive_legacy_kernel(
     auto col = colidxs[j];
     while (j != row_end) {
         auto x_val = load_relaxed(x + col * x_stride + rhs);
-        while (!is_nan(x_val)) {
+        while (!is_nan_exact(x_val)) {
             sum += vals[j] * x_val;
             j += row_step;
             col = colidxs[j];
@@ -478,7 +526,7 @@ __global__ void sptrsv_naive_legacy_kernel(
             // after we encountered the diagonal, we are done
             // this also skips entries outside the triangle
             j = row_end;
-            if (is_nan(r)) {
+            if (is_nan_exact(r)) {
                 store_relaxed(x + row * x_stride + rhs, zero<ValueType>());
                 *nan_produced = true;
             }

diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp
@@ -1220,10 +1220,14 @@ GKO_INLINE GKO_ATTRIBUTES T safe_divide(T a, T b)
  * @return `true` if the value is NaN.
  */
 template <typename T>
-GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<!is_complex_s<T>::value, bool>
-is_nan(const T& value)
+GKO_DEPRECATED(
+    "is_nan can't be used safely on the device (MSVC+CUDA), and will thus be "
+    "removed in a future release, without replacement")
+GKO_INLINE GKO_ATTRIBUTES
+    std::enable_if_t<!is_complex_s<T>::value, bool> is_nan(const T& value)
 {
-    return std::isnan(value);
+    using std::isnan;
+    return isnan(value);
 }
 
 
@@ -1237,10 +1241,13 @@ is_nan(const T& value)
  * @return `true` if any component of the given value is NaN.
  */
 template <typename T>
+GKO_DEPRECATED(
+    "is_nan can't be used safely on the device (MSVC+CUDA), and will thus be "
+    "removed in a future release, without replacement")
 GKO_INLINE GKO_ATTRIBUTES std::enable_if_t<is_complex_s<T>::value, bool> is_nan(
     const T& value)
 {
-    return std::isnan(value.real()) || std::isnan(value.imag());
+    return is_nan(value.real()) || is_nan(value.imag());
 }
 
 

diff --git a/test/solver/gcr_kernels.cpp b/test/solver/gcr_kernels.cpp
@@ -222,7 +222,7 @@ TEST_F(Gcr, GcrApplyOneRHSIsEquivalentToRef)
     exec_solver->apply(d_b.get(), d_x.get());
 
     GKO_ASSERT_MTX_NEAR(d_b, b, 0);
-    GKO_ASSERT_MTX_NEAR(d_x, x, r<value_type>::value * 1e2);
+    GKO_ASSERT_MTX_NEAR(d_x, x, r<value_type>::value * 1e3);
 }
 
 

diff --git a/test/solver/lower_trs_kernels.cpp b/test/solver/lower_trs_kernels.cpp
@@ -152,7 +152,7 @@ TEST_F(LowerTrs, ApplyTriangularDenseMtxIsEquivalentToRef)
     solver->apply(b, x);
     d_solver->apply(db, dx);
 
-    GKO_ASSERT_MTX_NEAR(dx, x, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dx, x, 1e-13);
 }
 
 
@@ -417,7 +417,7 @@ TEST_F(LowerTrs, ClassicalApplyTriangularDenseMtxIsEquivalentToRef)
     solver->apply(b, x);
     d_solver->apply(db, dx);
 
-    GKO_ASSERT_MTX_NEAR(dx, x, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dx, x, 1e-13);
 }
 
 

diff --git a/test/solver/upper_trs_kernels.cpp b/test/solver/upper_trs_kernels.cpp
@@ -152,7 +152,7 @@ TEST_F(UpperTrs, ApplyTriangularDenseMtxIsEquivalentToRef)
     solver->apply(b, x);
     d_solver->apply(db, dx);
 
-    GKO_ASSERT_MTX_NEAR(dx, x, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dx, x, 1e-13);
 }