Merge unified factorization kernels
This merges the code bases for the CUDA and HIP factorization kernels completely.

Related PR: #1647
upsj authored Jul 15, 2024
2 parents ded657c + 669028f commit 6ea8133
Showing 31 changed files with 548 additions and 1,815 deletions.
10 changes: 10 additions & 0 deletions common/cuda_hip/CMakeLists.txt
@@ -1,6 +1,7 @@
include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
set(CUDA_HIP_SOURCES
base/device_matrix_data_kernels.cpp
base/index_set_kernels.cpp
components/prefix_sum_kernels.cpp
distributed/index_map_kernels.cpp
distributed/matrix_kernels.cpp
@@ -9,9 +10,18 @@ set(CUDA_HIP_SOURCES
distributed/vector_kernels.cpp
factorization/cholesky_kernels.cpp
factorization/factorization_kernels.cpp
factorization/ic_kernels.cpp
factorization/ilu_kernels.cpp
factorization/lu_kernels.cpp
factorization/par_ic_kernels.cpp
factorization/par_ict_kernels.cpp
factorization/par_ilu_kernels.cpp
factorization/par_ilut_approx_filter_kernels.cpp
factorization/par_ilut_filter_kernels.cpp
factorization/par_ilut_select_common.cpp
factorization/par_ilut_select_kernels.cpp
factorization/par_ilut_spgeam_kernels.cpp
factorization/par_ilut_sweep_kernels.cpp
matrix/coo_kernels.cpp
matrix/dense_kernels.cpp
matrix/diagonal_kernels.cpp
@@ -13,17 +13,7 @@

namespace gko {
namespace kernels {
/**
* @brief The Cuda namespace.
*
* @ingroup cuda
*/
namespace cuda {
/**
* @brief The index_set namespace.
*
* @ingroup index_set
*/
namespace GKO_DEVICE_NAMESPACE {
namespace idx_set {


@@ -78,6 +68,6 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(


} // namespace idx_set
} // namespace cuda
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
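
Note: the rename above (cuda → GKO_DEVICE_NAMESPACE) only works because each backend defines that macro to its own namespace before the shared sources in common/cuda_hip are compiled. Below is a minimal sketch of that mapping, assuming per-backend definitions along these lines; the actual definition lives in Ginkgo's backend headers and may differ in detail.

// Hedged sketch: the exact header and spelling are assumptions, but shared
// sources rely on a backend-to-namespace mapping of this shape.
#if defined(GKO_COMPILING_CUDA)
#define GKO_DEVICE_NAMESPACE cuda
#elif defined(GKO_COMPILING_HIP)
#define GKO_DEVICE_NAMESPACE hip
#endif

namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
// the same translation unit, compiled once per backend, lands in
// gko::kernels::cuda or gko::kernels::hip, matching the old split sources
}  // namespace GKO_DEVICE_NAMESPACE
}  // namespace kernels
}  // namespace gko
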
@@ -6,17 +6,13 @@

#include <ginkgo/core/base/array.hpp>

#include "common/cuda_hip/base/runtime.hpp"
#include "common/cuda_hip/base/sparselib_bindings.hpp"


namespace gko {
namespace kernels {
namespace cuda {
/**
* @brief The ic factorization namespace.
*
* @ingroup factor
*/
namespace GKO_DEVICE_NAMESPACE {
namespace ic_factorization {


@@ -50,7 +46,7 @@ void compute(std::shared_ptr<const DefaultExecutor> exec,
SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());

// CUDA 11.4 has a use-after-free bug on Turing
#if (CUDA_VERSION >= 11040)
#if defined(GKO_COMPILING_CUDA) && (CUDA_VERSION >= 11040)
exec->synchronize();
#endif

@@ -62,6 +58,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);


} // namespace ic_factorization
} // namespace cuda
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
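
Note: the guard change above is what lets this workaround sit in a shared source. A hedged sketch of the idiom follows; the helper function name is hypothetical, only gko::Executor::synchronize() is taken from the diff.

#include <memory>

#include <ginkgo/core/base/executor.hpp>

// Hypothetical helper (not part of the commit): scoping a CUDA-only
// workaround to GKO_COMPILING_CUDA keeps it out of HIP builds even when a
// CUDA toolkit (and therefore CUDA_VERSION) is visible to the preprocessor.
inline void sync_for_cuda_11_4_turing_workaround(
    const std::shared_ptr<const gko::Executor>& exec)
{
#if defined(GKO_COMPILING_CUDA) && (CUDA_VERSION >= 11040)
    exec->synchronize();  // CUDA 11.4 use-after-free bug on Turing
#else
    (void)exec;  // other backends/versions: nothing to do
#endif
}
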
@@ -6,17 +6,13 @@

#include <ginkgo/core/base/array.hpp>

#include "common/cuda_hip/base/runtime.hpp"
#include "common/cuda_hip/base/sparselib_bindings.hpp"


namespace gko {
namespace kernels {
namespace cuda {
/**
* @brief The ilu factorization namespace.
*
* @ingroup factor
*/
namespace GKO_DEVICE_NAMESPACE {
namespace ilu_factorization {


@@ -50,7 +46,7 @@ void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());

// CUDA 11.4 has a use-after-free bug on Turing
#if (CUDA_VERSION >= 11040)
#if defined(GKO_COMPILING_CUDA) && (CUDA_VERSION >= 11040)
exec->synchronize();
#endif

@@ -63,6 +59,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


} // namespace ilu_factorization
} // namespace cuda
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
@@ -2,6 +2,49 @@
//
// SPDX-License-Identifier: BSD-3-Clause

#include "core/factorization/par_ict_kernels.hpp"

#include <ginkgo/core/base/array.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/matrix/coo.hpp>
#include <ginkgo/core/matrix/csr.hpp>
#include <ginkgo/core/matrix/dense.hpp>

#include "common/cuda_hip/base/math.hpp"
#include "common/cuda_hip/base/runtime.hpp"
#include "common/cuda_hip/components/intrinsics.hpp"
#include "common/cuda_hip/components/memory.hpp"
#include "common/cuda_hip/components/merging.hpp"
#include "common/cuda_hip/components/prefix_sum.hpp"
#include "common/cuda_hip/components/reduction.hpp"
#include "common/cuda_hip/components/searching.hpp"
#include "common/cuda_hip/components/thread_ids.hpp"
#include "core/components/prefix_sum_kernels.hpp"
#include "core/matrix/coo_builder.hpp"
#include "core/matrix/csr_builder.hpp"
#include "core/matrix/csr_kernels.hpp"
#include "core/synthesizer/implementation_selection.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
/**
* @brief The parallel ICT factorization namespace.
*
* @ingroup factor
*/
namespace par_ict_factorization {


constexpr int default_block_size = 512;


// subwarp sizes for all warp-parallel kernels (filter, add_candidates)
using compiled_kernels =
syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;


namespace kernel {


@@ -275,3 +318,142 @@ __global__ __launch_bounds__(default_block_size) void ict_sweep(


} // namespace kernel


namespace {


template <int subwarp_size, typename ValueType, typename IndexType>
void add_candidates(syn::value_list<int, subwarp_size>,
std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* llh,
const matrix::Csr<ValueType, IndexType>* a,
const matrix::Csr<ValueType, IndexType>* l,
matrix::Csr<ValueType, IndexType>* l_new)
{
auto num_rows = static_cast<IndexType>(llh->get_size()[0]);
auto subwarps_per_block = default_block_size / subwarp_size;
auto num_blocks = ceildiv(num_rows, subwarps_per_block);
matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
auto llh_row_ptrs = llh->get_const_row_ptrs();
auto llh_col_idxs = llh->get_const_col_idxs();
auto llh_vals = llh->get_const_values();
auto a_row_ptrs = a->get_const_row_ptrs();
auto a_col_idxs = a->get_const_col_idxs();
auto a_vals = a->get_const_values();
auto l_row_ptrs = l->get_const_row_ptrs();
auto l_col_idxs = l->get_const_col_idxs();
auto l_vals = l->get_const_values();
auto l_new_row_ptrs = l_new->get_row_ptrs();
// count non-zeros per row
if (num_blocks > 0) {
kernel::ict_tri_spgeam_nnz<subwarp_size>
<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs,
l_new_row_ptrs, num_rows);
}

// build row ptrs
components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);

// resize output arrays
auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
l_new_builder.get_value_array().resize_and_reset(l_new_nnz);

auto l_new_col_idxs = l_new->get_col_idxs();
auto l_new_vals = l_new->get_values();

// fill columns and values
if (num_blocks > 0) {
kernel::ict_tri_spgeam_init<subwarp_size>
<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals),
a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs,
l_col_idxs, as_device_type(l_vals), l_new_row_ptrs,
l_new_col_idxs, as_device_type(l_new_vals), num_rows);
}
}


GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);


template <int subwarp_size, typename ValueType, typename IndexType>
void compute_factor(syn::value_list<int, subwarp_size>,
std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* a,
matrix::Csr<ValueType, IndexType>* l,
const matrix::Coo<ValueType, IndexType>* l_coo)
{
auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements());
auto block_size = default_block_size / subwarp_size;
auto num_blocks = ceildiv(total_nnz, block_size);
if (num_blocks > 0) {
kernel::ict_sweep<subwarp_size>
<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
a->get_const_row_ptrs(), a->get_const_col_idxs(),
as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
as_device_type(l->get_values()),
static_cast<IndexType>(l->get_num_stored_elements()));
}
}


GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor);


} // namespace


template <typename ValueType, typename IndexType>
void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* llh,
const matrix::Csr<ValueType, IndexType>* a,
const matrix::Csr<ValueType, IndexType>* l,
matrix::Csr<ValueType, IndexType>* l_new)
{
auto num_rows = a->get_size()[0];
auto total_nnz =
llh->get_num_stored_elements() + a->get_num_stored_elements();
auto total_nnz_per_row = total_nnz / num_rows;
select_add_candidates(
compiled_kernels(),
[&](int compiled_subwarp_size) {
return total_nnz_per_row <= compiled_subwarp_size ||
compiled_subwarp_size == config::warp_size;
},
syn::value_list<int>(), syn::type_list<>(), exec, llh, a, l, l_new);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);


template <typename ValueType, typename IndexType>
void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* a,
matrix::Csr<ValueType, IndexType>* l,
const matrix::Coo<ValueType, IndexType>* l_coo)
{
auto num_rows = a->get_size()[0];
auto total_nnz = 2 * l->get_num_stored_elements();
auto total_nnz_per_row = total_nnz / num_rows;
select_compute_factor(
compiled_kernels(),
[&](int compiled_subwarp_size) {
return total_nnz_per_row <= compiled_subwarp_size ||
compiled_subwarp_size == config::warp_size;
},
syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);


} // namespace par_ict_factorization
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
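
Note: add_candidates above follows the usual two-pass CSR assembly, as the inline comments indicate: a counting kernel writes per-row nonzero counts into l_new_row_ptrs, prefix_sum_nonnegative turns the counts into row pointers, and only then are the column/value arrays sized and filled. A minimal host-side sketch of the row-pointer step (standard C++ only, not Ginkgo code):

#include <numeric>
#include <vector>

// Sketch of the "count, scan, fill" pattern: given per-row nonzero counts,
// an exclusive prefix sum produces CSR row pointers, and the last entry is
// the total nnz used to size the column-index and value arrays.
std::vector<int> build_row_ptrs(const std::vector<int>& nnz_per_row)
{
    std::vector<int> row_ptrs(nnz_per_row.size() + 1, 0);
    // row_ptrs[i + 1] = nnz_per_row[0] + ... + nnz_per_row[i]
    std::partial_sum(nnz_per_row.begin(), nnz_per_row.end(),
                     row_ptrs.begin() + 1);
    return row_ptrs;
}

With row_ptrs.back() giving the total nonzero count, the column and value arrays can be allocated in one shot, mirroring the resize_and_reset calls in the diff.
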
@@ -20,18 +20,20 @@
#include "common/cuda_hip/components/prefix_sum.hpp"
#include "common/cuda_hip/components/sorting.hpp"
#include "common/cuda_hip/components/thread_ids.hpp"
#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp"
#include "common/cuda_hip/factorization/par_ilut_select_common.hpp"
#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp"
#include "core/components/prefix_sum_kernels.hpp"
#include "core/factorization/par_ilut_kernels.hpp"
#include "core/matrix/coo_builder.hpp"
#include "core/matrix/csr_builder.hpp"
#include "core/matrix/csr_kernels.hpp"
#include "core/synthesizer/implementation_selection.hpp"
#include "hip/factorization/par_ilut_select_common.hip.hpp"


namespace gko {
namespace kernels {
namespace hip {
namespace GKO_DEVICE_NAMESPACE {
/**
* @brief The parallel ILUT factorization namespace.
*
@@ -45,10 +47,6 @@ using compiled_kernels =
syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;


#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc"
#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc"


template <int subwarp_size, typename ValueType, typename IndexType>
void threshold_filter_approx(syn::value_list<int, subwarp_size>,
std::shared_ptr<const DefaultExecutor> exec,
@@ -175,6 +173,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


} // namespace par_ilut_factorization
} // namespace hip
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
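
Note: this file and the par_ict kernels both dispatch over compiled_kernels with the predicate total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size, i.e. the smallest instantiated subwarp size that covers the average row length, with a full warp as fallback. A stand-alone sketch of that selection; the function name and the warp size of 32 are assumptions for illustration, the real dispatch goes through GKO_ENABLE_IMPLEMENTATION_SELECTION.

#include <initializer_list>

// Hypothetical stand-alone version of the subwarp-size selection predicate.
inline int pick_subwarp_size(long long total_nnz, long long num_rows,
                             int warp_size = 32)
{
    const long long avg_nnz_per_row =
        num_rows > 0 ? total_nnz / num_rows : 0;
    for (int candidate : {1, 2, 4, 8, 16, 32}) {
        // the first compiled size wide enough for an average row wins; the
        // full warp always matches, so the loop cannot fall through
        if (avg_nnz_per_row <= candidate || candidate == warp_size) {
            return candidate;
        }
    }
    return warp_size;
}
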
29 changes: 29 additions & 0 deletions common/cuda_hip/factorization/par_ilut_config.hpp
@@ -0,0 +1,29 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#ifndef GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_CONFIG_HIP_HPP_
#define GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_CONFIG_HIP_HPP_

#include "common/cuda_hip/base/config.hpp"

namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace par_ilut_factorization {


constexpr int default_block_size = 512;


// subwarp sizes for add_candidates kernels
using compiled_kernels =
syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;


} // namespace par_ilut_factorization
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko

#endif // GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_CONFIG_HIP_HPP_
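
Note: the new header centralizes default_block_size and the compiled_kernels list so the par_ilut filter, select, spgeam and sweep sources no longer carry their own copies. A hedged usage sketch follows; the consuming function is hypothetical, only the header path and constants come from the diff.

#include <cstdint>

#include "common/cuda_hip/factorization/par_ilut_config.hpp"

// Hypothetical consumer: a par_ilut kernel file derives its launch grid from
// the shared constants instead of redefining them locally.
namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace par_ilut_factorization {

template <int subwarp_size>
std::int64_t blocks_for(std::int64_t num_rows)
{
    const auto subwarps_per_block = default_block_size / subwarp_size;
    // round up so every row is covered by one subwarp
    return (num_rows + subwarps_per_block - 1) / subwarps_per_block;
}

}  // namespace par_ilut_factorization
}  // namespace GKO_DEVICE_NAMESPACE
}  // namespace kernels
}  // namespace gko
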