Merge unified factorization kernels
This merges the code bases for the CUDA and HIP factorization kernels completely.

Related PR: #1647
upsj authored Jul 15, 2024
2 parents ded657c + 669028f commit 6ea8133
Showing 31 changed files with 548 additions and 1,815 deletions.
10 changes: 10 additions & 0 deletions common/cuda_hip/CMakeLists.txt
@@ -1,6 +1,7 @@
include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
set(CUDA_HIP_SOURCES
base/device_matrix_data_kernels.cpp
base/index_set_kernels.cpp
components/prefix_sum_kernels.cpp
distributed/index_map_kernels.cpp
distributed/matrix_kernels.cpp
@@ -9,9 +10,18 @@ set(CUDA_HIP_SOURCES
distributed/vector_kernels.cpp
factorization/cholesky_kernels.cpp
factorization/factorization_kernels.cpp
factorization/ic_kernels.cpp
factorization/ilu_kernels.cpp
factorization/lu_kernels.cpp
factorization/par_ic_kernels.cpp
factorization/par_ict_kernels.cpp
factorization/par_ilu_kernels.cpp
factorization/par_ilut_approx_filter_kernels.cpp
factorization/par_ilut_filter_kernels.cpp
factorization/par_ilut_select_common.cpp
factorization/par_ilut_select_kernels.cpp
factorization/par_ilut_spgeam_kernels.cpp
factorization/par_ilut_sweep_kernels.cpp
matrix/coo_kernels.cpp
matrix/dense_kernels.cpp
matrix/diagonal_kernels.cpp
@@ -13,17 +13,7 @@

namespace gko {
namespace kernels {
/**
* @brief The Cuda namespace.
*
* @ingroup cuda
*/
namespace cuda {
/**
* @brief The index_set namespace.
*
* @ingroup index_set
*/
namespace GKO_DEVICE_NAMESPACE {
namespace idx_set {


@@ -78,6 +68,6 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(


} // namespace idx_set
} // namespace cuda
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
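
Note: the rename above (cuda → GKO_DEVICE_NAMESPACE) only works because each backend defines that macro to its own namespace before the shared sources in common/cuda_hip are compiled. Below is a minimal sketch of that mapping, assuming per-backend definitions along these lines; the actual definition lives in Ginkgo's backend headers and may differ in detail.

// Hedged sketch: the exact header and spelling are assumptions, but shared
// sources rely on a backend-to-namespace mapping of this shape.
#if defined(GKO_COMPILING_CUDA)
#define GKO_DEVICE_NAMESPACE cuda
#elif defined(GKO_COMPILING_HIP)
#define GKO_DEVICE_NAMESPACE hip
#endif

namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
// the same translation unit, compiled once per backend, lands in
// gko::kernels::cuda or gko::kernels::hip, matching the old split sources
}  // namespace GKO_DEVICE_NAMESPACE
}  // namespace kernels
}  // namespace gko
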
@@ -6,17 +6,13 @@

#include <ginkgo/core/base/array.hpp>

#include "common/cuda_hip/base/runtime.hpp"
#include "common/cuda_hip/base/sparselib_bindings.hpp"


namespace gko {
namespace kernels {
namespace cuda {
/**
* @brief The ic factorization namespace.
*
* @ingroup factor
*/
namespace GKO_DEVICE_NAMESPACE {
namespace ic_factorization {


@@ -50,7 +46,7 @@ void compute(std::shared_ptr<const DefaultExecutor> exec,
SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());

// CUDA 11.4 has a use-after-free bug on Turing
#if (CUDA_VERSION >= 11040)
#if defined(GKO_COMPILING_CUDA) && (CUDA_VERSION >= 11040)
exec->synchronize();
#endif

@@ -62,6 +58,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);


} // namespace ic_factorization
} // namespace cuda
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
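
Note: the guard change above is what lets this workaround sit in a shared source. A hedged sketch of the idiom follows; the helper function name is hypothetical, only gko::Executor::synchronize() is taken from the diff.

#include <memory>

#include <ginkgo/core/base/executor.hpp>

// Hypothetical helper (not part of the commit): scoping a CUDA-only
// workaround to GKO_COMPILING_CUDA keeps it out of HIP builds even when a
// CUDA toolkit (and therefore CUDA_VERSION) is visible to the preprocessor.
inline void sync_for_cuda_11_4_turing_workaround(
    const std::shared_ptr<const gko::Executor>& exec)
{
#if defined(GKO_COMPILING_CUDA) && (CUDA_VERSION >= 11040)
    exec->synchronize();  // CUDA 11.4 use-after-free bug on Turing
#else
    (void)exec;  // other backends/versions: nothing to do
#endif
}
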
@@ -6,17 +6,13 @@

#include <ginkgo/core/base/array.hpp>

#include "common/cuda_hip/base/runtime.hpp"
#include "common/cuda_hip/base/sparselib_bindings.hpp"


namespace gko {
namespace kernels {
namespace cuda {
/**
* @brief The ilu factorization namespace.
*
* @ingroup factor
*/
namespace GKO_DEVICE_NAMESPACE {
namespace ilu_factorization {


@@ -50,7 +46,7 @@ void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());

// CUDA 11.4 has a use-after-free bug on Turing
#if (CUDA_VERSION >= 11040)
#if defined(GKO_COMPILING_CUDA) && (CUDA_VERSION >= 11040)
exec->synchronize();
#endif

@@ -63,6 +59,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


} // namespace ilu_factorization
} // namespace cuda
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
@@ -2,6 +2,49 @@
//
// SPDX-License-Identifier: BSD-3-Clause

#include "core/factorization/par_ict_kernels.hpp"

#include <ginkgo/core/base/array.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/matrix/coo.hpp>
#include <ginkgo/core/matrix/csr.hpp>
#include <ginkgo/core/matrix/dense.hpp>

#include "common/cuda_hip/base/math.hpp"
#include "common/cuda_hip/base/runtime.hpp"
#include "common/cuda_hip/components/intrinsics.hpp"
#include "common/cuda_hip/components/memory.hpp"
#include "common/cuda_hip/components/merging.hpp"
#include "common/cuda_hip/components/prefix_sum.hpp"
#include "common/cuda_hip/components/reduction.hpp"
#include "common/cuda_hip/components/searching.hpp"
#include "common/cuda_hip/components/thread_ids.hpp"
#include "core/components/prefix_sum_kernels.hpp"
#include "core/matrix/coo_builder.hpp"
#include "core/matrix/csr_builder.hpp"
#include "core/matrix/csr_kernels.hpp"
#include "core/synthesizer/implementation_selection.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
/**
* @brief The parallel ICT factorization namespace.
*
* @ingroup factor
*/
namespace par_ict_factorization {


constexpr int default_block_size = 512;


// subwarp sizes for all warp-parallel kernels (filter, add_candidates)
using compiled_kernels =
syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;


namespace kernel {


@@ -275,3 +318,142 @@ __global__ __launch_bounds__(default_block_size) void ict_sweep(


} // namespace kernel


namespace {


template <int subwarp_size, typename ValueType, typename IndexType>
void add_candidates(syn::value_list<int, subwarp_size>,
std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* llh,
const matrix::Csr<ValueType, IndexType>* a,
const matrix::Csr<ValueType, IndexType>* l,
matrix::Csr<ValueType, IndexType>* l_new)
{
auto num_rows = static_cast<IndexType>(llh->get_size()[0]);
auto subwarps_per_block = default_block_size / subwarp_size;
auto num_blocks = ceildiv(num_rows, subwarps_per_block);
matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
auto llh_row_ptrs = llh->get_const_row_ptrs();
auto llh_col_idxs = llh->get_const_col_idxs();
auto llh_vals = llh->get_const_values();
auto a_row_ptrs = a->get_const_row_ptrs();
auto a_col_idxs = a->get_const_col_idxs();
auto a_vals = a->get_const_values();
auto l_row_ptrs = l->get_const_row_ptrs();
auto l_col_idxs = l->get_const_col_idxs();
auto l_vals = l->get_const_values();
auto l_new_row_ptrs = l_new->get_row_ptrs();
// count non-zeros per row
if (num_blocks > 0) {
kernel::ict_tri_spgeam_nnz<subwarp_size>
<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs,
l_new_row_ptrs, num_rows);
}

// build row ptrs
components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);

// resize output arrays
auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
l_new_builder.get_value_array().resize_and_reset(l_new_nnz);

auto l_new_col_idxs = l_new->get_col_idxs();
auto l_new_vals = l_new->get_values();

// fill columns and values
if (num_blocks > 0) {
kernel::ict_tri_spgeam_init<subwarp_size>
<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals),
a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs,
l_col_idxs, as_device_type(l_vals), l_new_row_ptrs,
l_new_col_idxs, as_device_type(l_new_vals), num_rows);
}
}


GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);


template <int subwarp_size, typename ValueType, typename IndexType>
void compute_factor(syn::value_list<int, subwarp_size>,
std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* a,
matrix::Csr<ValueType, IndexType>* l,
const matrix::Coo<ValueType, IndexType>* l_coo)
{
auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements());
auto block_size = default_block_size / subwarp_size;
auto num_blocks = ceildiv(total_nnz, block_size);
if (num_blocks > 0) {
kernel::ict_sweep<subwarp_size>
<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
a->get_const_row_ptrs(), a->get_const_col_idxs(),
as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
as_device_type(l->get_values()),
static_cast<IndexType>(l->get_num_stored_elements()));
}
}


GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor);


} // namespace


template <typename ValueType, typename IndexType>
void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* llh,
const matrix::Csr<ValueType, IndexType>* a,
const matrix::Csr<ValueType, IndexType>* l,
matrix::Csr<ValueType, IndexType>* l_new)
{
auto num_rows = a->get_size()[0];
auto total_nnz =
llh->get_num_stored_elements() + a->get_num_stored_elements();
auto total_nnz_per_row = total_nnz / num_rows;
select_add_candidates(
compiled_kernels(),
[&](int compiled_subwarp_size) {
return total_nnz_per_row <= compiled_subwarp_size ||
compiled_subwarp_size == config::warp_size;
},
syn::value_list<int>(), syn::type_list<>(), exec, llh, a, l, l_new);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);


template <typename ValueType, typename IndexType>
void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* a,
matrix::Csr<ValueType, IndexType>* l,
const matrix::Coo<ValueType, IndexType>* l_coo)
{
auto num_rows = a->get_size()[0];
auto total_nnz = 2 * l->get_num_stored_elements();
auto total_nnz_per_row = total_nnz / num_rows;
select_compute_factor(
compiled_kernels(),
[&](int compiled_subwarp_size) {
return total_nnz_per_row <= compiled_subwarp_size ||
compiled_subwarp_size == config::warp_size;
},
syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);


} // namespace par_ict_factorization
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
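
Note: add_candidates above follows the usual two-pass CSR assembly, as the inline comments indicate: a counting kernel writes per-row nonzero counts into l_new_row_ptrs, prefix_sum_nonnegative turns the counts into row pointers, and only then are the column/value arrays sized and filled. A minimal host-side sketch of the row-pointer step (standard C++ only, not Ginkgo code):

#include <numeric>
#include <vector>

// Sketch of the "count, scan, fill" pattern: given per-row nonzero counts,
// an exclusive prefix sum produces CSR row pointers, and the last entry is
// the total nnz used to size the column-index and value arrays.
std::vector<int> build_row_ptrs(const std::vector<int>& nnz_per_row)
{
    std::vector<int> row_ptrs(nnz_per_row.size() + 1, 0);
    // row_ptrs[i + 1] = nnz_per_row[0] + ... + nnz_per_row[i]
    std::partial_sum(nnz_per_row.begin(), nnz_per_row.end(),
                     row_ptrs.begin() + 1);
    return row_ptrs;
}

With row_ptrs.back() giving the total nonzero count, the column and value arrays can be allocated in one shot, mirroring the resize_and_reset calls in the diff.
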
@@ -20,18 +20,20 @@
#include "common/cuda_hip/components/prefix_sum.hpp"
#include "common/cuda_hip/components/sorting.hpp"
#include "common/cuda_hip/components/thread_ids.hpp"
#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp"
#include "common/cuda_hip/factorization/par_ilut_select_common.hpp"
#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp"
#include "core/components/prefix_sum_kernels.hpp"
#include "core/factorization/par_ilut_kernels.hpp"
#include "core/matrix/coo_builder.hpp"
#include "core/matrix/csr_builder.hpp"
#include "core/matrix/csr_kernels.hpp"
#include "core/synthesizer/implementation_selection.hpp"
#include "hip/factorization/par_ilut_select_common.hip.hpp"


namespace gko {
namespace kernels {
namespace hip {
namespace GKO_DEVICE_NAMESPACE {
/**
* @brief The parallel ILUT factorization namespace.
*
@@ -45,10 +47,6 @@ using compiled_kernels =
syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;


#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc"
#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc"


template <int subwarp_size, typename ValueType, typename IndexType>
void threshold_filter_approx(syn::value_list<int, subwarp_size>,
std::shared_ptr<const DefaultExecutor> exec,
@@ -175,6 +173,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


} // namespace par_ilut_factorization
} // namespace hip
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
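
Note: this file and the par_ict kernels both dispatch over compiled_kernels with the predicate total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size, i.e. the smallest instantiated subwarp size that covers the average row length, with a full warp as fallback. A stand-alone sketch of that selection; the function name and the warp size of 32 are assumptions for illustration, the real dispatch goes through GKO_ENABLE_IMPLEMENTATION_SELECTION.

#include <initializer_list>

// Hypothetical stand-alone version of the subwarp-size selection predicate.
inline int pick_subwarp_size(long long total_nnz, long long num_rows,
                             int warp_size = 32)
{
    const long long avg_nnz_per_row =
        num_rows > 0 ? total_nnz / num_rows : 0;
    for (int candidate : {1, 2, 4, 8, 16, 32}) {
        // the first compiled size wide enough for an average row wins; the
        // full warp always matches, so the loop cannot fall through
        if (avg_nnz_per_row <= candidate || candidate == warp_size) {
            return candidate;
        }
    }
    return warp_size;
}
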
29 changes: 29 additions & 0 deletions common/cuda_hip/factorization/par_ilut_config.hpp
@@ -0,0 +1,29 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#ifndef GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_CONFIG_HIP_HPP_
#define GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_CONFIG_HIP_HPP_

#include "common/cuda_hip/base/config.hpp"

namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace par_ilut_factorization {


constexpr int default_block_size = 512;


// subwarp sizes for add_candidates kernels
using compiled_kernels =
syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;


} // namespace par_ilut_factorization
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko

#endif // GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_CONFIG_HIP_HPP_
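
Note: the new header centralizes default_block_size and the compiled_kernels list so the par_ilut filter, select, spgeam and sweep sources no longer carry their own copies. A hedged usage sketch follows; the consuming function is hypothetical, only the header path and constants come from the diff.

#include <cstdint>

#include "common/cuda_hip/factorization/par_ilut_config.hpp"

// Hypothetical consumer: a par_ilut kernel file derives its launch grid from
// the shared constants instead of redefining them locally.
namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace par_ilut_factorization {

template <int subwarp_size>
std::int64_t blocks_for(std::int64_t num_rows)
{
    const auto subwarps_per_block = default_block_size / subwarp_size;
    // round up so every row is covered by one subwarp
    return (num_rows + subwarps_per_block - 1) / subwarps_per_block;
}

}  // namespace par_ilut_factorization
}  // namespace GKO_DEVICE_NAMESPACE
}  // namespace kernels
}  // namespace gko
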