
Unify factorization kernels #1647

Merged
merged 5 commits on Jul 15, 2024
10 changes: 10 additions & 0 deletions common/cuda_hip/CMakeLists.txt
@@ -1,6 +1,7 @@
include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
set(CUDA_HIP_SOURCES
base/device_matrix_data_kernels.cpp
base/index_set_kernels.cpp
components/prefix_sum_kernels.cpp
distributed/index_map_kernels.cpp
distributed/matrix_kernels.cpp
@@ -9,9 +10,18 @@ set(CUDA_HIP_SOURCES
distributed/vector_kernels.cpp
factorization/cholesky_kernels.cpp
factorization/factorization_kernels.cpp
factorization/ic_kernels.cpp
factorization/ilu_kernels.cpp
factorization/lu_kernels.cpp
factorization/par_ic_kernels.cpp
factorization/par_ict_kernels.cpp
factorization/par_ilu_kernels.cpp
factorization/par_ilut_approx_filter_kernels.cpp
factorization/par_ilut_filter_kernels.cpp
factorization/par_ilut_select_common.cpp
factorization/par_ilut_select_kernels.cpp
factorization/par_ilut_spgeam_kernels.cpp
factorization/par_ilut_sweep_kernels.cpp
matrix/coo_kernels.cpp
matrix/dense_kernels.cpp
matrix/diagonal_kernels.cpp
@@ -13,17 +13,7 @@

namespace gko {
namespace kernels {
/**
* @brief The Cuda namespace.
*
* @ingroup cuda
*/
namespace cuda {
/**
* @brief The index_set namespace.
*
* @ingroup index_set
*/
namespace GKO_DEVICE_NAMESPACE {
namespace idx_set {


@@ -78,6 +68,6 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(


} // namespace idx_set
} // namespace cuda
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
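
The hunk above shows the core pattern of this PR: the backend-specific cuda (or hip) namespace is replaced by the GKO_DEVICE_NAMESPACE macro, so a single source file under common/cuda_hip can be compiled once per backend. A minimal sketch of how such a macro can be wired up follows; the definition shown is illustrative and not copied from Ginkgo's actual headers.

// Illustrative sketch (not Ginkgo's actual header): GKO_DEVICE_NAMESPACE
// expands to the namespace of the backend being compiled, so one translation
// unit serves both the CUDA and the HIP build.
#if defined(GKO_COMPILING_CUDA)
#define GKO_DEVICE_NAMESPACE cuda
#elif defined(GKO_COMPILING_HIP)
#define GKO_DEVICE_NAMESPACE hip
#endif

namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace idx_set {


// kernel definitions go here and are compiled once per backend


}  // namespace idx_set
}  // namespace GKO_DEVICE_NAMESPACE
}  // namespace kernels
}  // namespace gko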
@@ -6,17 +6,13 @@

#include <ginkgo/core/base/array.hpp>

#include "common/cuda_hip/base/runtime.hpp"
#include "common/cuda_hip/base/sparselib_bindings.hpp"


namespace gko {
namespace kernels {
namespace cuda {
/**
* @brief The ic factorization namespace.
*
* @ingroup factor
*/
namespace GKO_DEVICE_NAMESPACE {
namespace ic_factorization {


@@ -50,7 +46,7 @@ void compute(std::shared_ptr<const DefaultExecutor> exec,
SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());

// CUDA 11.4 has a use-after-free bug on Turing
#if (CUDA_VERSION >= 11040)
#if defined(GKO_COMPILING_CUDA) && (CUDA_VERSION >= 11040)
exec->synchronize();
#endif

@@ -62,6 +58,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);


} // namespace ic_factorization
} // namespace cuda
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
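
The added defined(GKO_COMPILING_CUDA) check matters now that this file is compiled for HIP as well: CUDA_VERSION is not defined in a HIP build, and an undefined macro in an #if silently evaluates to 0, so the explicit backend check documents the intent and keeps the workaround CUDA-only. A standalone illustration of the idiom, using a hypothetical helper function rather than code from this PR:

#include <memory>

// Hypothetical helper (not part of the PR) showing how a backend-specific
// workaround is gated inside a source file shared between CUDA and HIP.
template <typename Executor>
void apply_turing_workaround(std::shared_ptr<const Executor> exec)
{
#if defined(GKO_COMPILING_CUDA) && (CUDA_VERSION >= 11040)
    // CUDA 11.4 has a use-after-free bug on Turing; synchronizing avoids it
    exec->synchronize();
#else
    // nothing to do for HIP or for older CUDA toolkits
    (void)exec;
#endif
}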
@@ -6,17 +6,13 @@

#include <ginkgo/core/base/array.hpp>

#include "common/cuda_hip/base/runtime.hpp"
#include "common/cuda_hip/base/sparselib_bindings.hpp"


namespace gko {
namespace kernels {
namespace cuda {
/**
* @brief The ilu factorization namespace.
*
* @ingroup factor
*/
namespace GKO_DEVICE_NAMESPACE {
namespace ilu_factorization {


@@ -50,7 +46,7 @@ void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());

// CUDA 11.4 has a use-after-free bug on Turing
#if (CUDA_VERSION >= 11040)
#if defined(GKO_COMPILING_CUDA) && (CUDA_VERSION >= 11040)
exec->synchronize();
#endif

@@ -63,6 +59,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


} // namespace ilu_factorization
} // namespace cuda
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
@@ -2,6 +2,49 @@
//
// SPDX-License-Identifier: BSD-3-Clause

#include "core/factorization/par_ict_kernels.hpp"

#include <ginkgo/core/base/array.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/matrix/coo.hpp>
#include <ginkgo/core/matrix/csr.hpp>
#include <ginkgo/core/matrix/dense.hpp>

#include "common/cuda_hip/base/math.hpp"
#include "common/cuda_hip/base/runtime.hpp"
#include "common/cuda_hip/components/intrinsics.hpp"
#include "common/cuda_hip/components/memory.hpp"
#include "common/cuda_hip/components/merging.hpp"
#include "common/cuda_hip/components/prefix_sum.hpp"
#include "common/cuda_hip/components/reduction.hpp"
#include "common/cuda_hip/components/searching.hpp"
#include "common/cuda_hip/components/thread_ids.hpp"
#include "core/components/prefix_sum_kernels.hpp"
#include "core/matrix/coo_builder.hpp"
#include "core/matrix/csr_builder.hpp"
#include "core/matrix/csr_kernels.hpp"
#include "core/synthesizer/implementation_selection.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
/**
* @brief The parallel ICT factorization namespace.
*
* @ingroup factor
*/
namespace par_ict_factorization {


constexpr int default_block_size = 512;


// subwarp sizes for all warp-parallel kernels (filter, add_candidates)
using compiled_kernels =
syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;


namespace kernel {


@@ -275,3 +318,142 @@ __global__ __launch_bounds__(default_block_size) void ict_sweep(


} // namespace kernel


namespace {


template <int subwarp_size, typename ValueType, typename IndexType>
void add_candidates(syn::value_list<int, subwarp_size>,
std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* llh,
const matrix::Csr<ValueType, IndexType>* a,
const matrix::Csr<ValueType, IndexType>* l,
matrix::Csr<ValueType, IndexType>* l_new)
{
auto num_rows = static_cast<IndexType>(llh->get_size()[0]);
auto subwarps_per_block = default_block_size / subwarp_size;
auto num_blocks = ceildiv(num_rows, subwarps_per_block);
matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
auto llh_row_ptrs = llh->get_const_row_ptrs();
auto llh_col_idxs = llh->get_const_col_idxs();
auto llh_vals = llh->get_const_values();
auto a_row_ptrs = a->get_const_row_ptrs();
auto a_col_idxs = a->get_const_col_idxs();
auto a_vals = a->get_const_values();
auto l_row_ptrs = l->get_const_row_ptrs();
auto l_col_idxs = l->get_const_col_idxs();
auto l_vals = l->get_const_values();
auto l_new_row_ptrs = l_new->get_row_ptrs();
// count non-zeros per row
if (num_blocks > 0) {
kernel::ict_tri_spgeam_nnz<subwarp_size>
<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs,
l_new_row_ptrs, num_rows);
}

// build row ptrs
components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);

// resize output arrays
auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
l_new_builder.get_value_array().resize_and_reset(l_new_nnz);

auto l_new_col_idxs = l_new->get_col_idxs();
auto l_new_vals = l_new->get_values();

// fill columns and values
if (num_blocks > 0) {
kernel::ict_tri_spgeam_init<subwarp_size>
<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals),
a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs,
l_col_idxs, as_device_type(l_vals), l_new_row_ptrs,
l_new_col_idxs, as_device_type(l_new_vals), num_rows);
}
}


GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);


template <int subwarp_size, typename ValueType, typename IndexType>
void compute_factor(syn::value_list<int, subwarp_size>,
std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* a,
matrix::Csr<ValueType, IndexType>* l,
const matrix::Coo<ValueType, IndexType>* l_coo)
{
auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements());
auto block_size = default_block_size / subwarp_size;
auto num_blocks = ceildiv(total_nnz, block_size);
if (num_blocks > 0) {
kernel::ict_sweep<subwarp_size>
<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
a->get_const_row_ptrs(), a->get_const_col_idxs(),
as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
as_device_type(l->get_values()),
static_cast<IndexType>(l->get_num_stored_elements()));
}
}


GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor);


} // namespace


template <typename ValueType, typename IndexType>
void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* llh,
const matrix::Csr<ValueType, IndexType>* a,
const matrix::Csr<ValueType, IndexType>* l,
matrix::Csr<ValueType, IndexType>* l_new)
{
auto num_rows = a->get_size()[0];
auto total_nnz =
llh->get_num_stored_elements() + a->get_num_stored_elements();
auto total_nnz_per_row = total_nnz / num_rows;
select_add_candidates(
compiled_kernels(),
[&](int compiled_subwarp_size) {
return total_nnz_per_row <= compiled_subwarp_size ||
compiled_subwarp_size == config::warp_size;
},
syn::value_list<int>(), syn::type_list<>(), exec, llh, a, l, l_new);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);


template <typename ValueType, typename IndexType>
void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* a,
matrix::Csr<ValueType, IndexType>* l,
const matrix::Coo<ValueType, IndexType>* l_coo)
{
auto num_rows = a->get_size()[0];
auto total_nnz = 2 * l->get_num_stored_elements();
auto total_nnz_per_row = total_nnz / num_rows;
select_compute_factor(
compiled_kernels(),
[&](int compiled_subwarp_size) {
return total_nnz_per_row <= compiled_subwarp_size ||
compiled_subwarp_size == config::warp_size;
},
syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);


} // namespace par_ict_factorization
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
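
Both host-side wrappers above pick a subwarp size at run time from the compile-time list 1, 2, 4, 8, 16, 32, warp_size, based on the average number of stored elements per row: the predicate passed to select_add_candidates / select_compute_factor accepts the first compiled size that can cover an average row and otherwise falls back to a full warp. A hypothetical standalone helper with the same logic, for illustration only:

#include <cstddef>
#include <initializer_list>

// Hypothetical helper (not Ginkgo code) mirroring the selection predicate
// used above: return the smallest compiled subwarp size that can hold the
// average row, or a full warp for denser matrices.
inline int choose_subwarp_size(std::size_t total_nnz, std::size_t num_rows,
                               int warp_size)
{
    const std::size_t avg_nnz_per_row =
        num_rows > 0 ? total_nnz / num_rows : 0;
    for (int candidate : {1, 2, 4, 8, 16, 32}) {
        if (avg_nnz_per_row <= static_cast<std::size_t>(candidate)) {
            return candidate;
        }
    }
    return warp_size;
}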
@@ -20,18 +20,20 @@
#include "common/cuda_hip/components/prefix_sum.hpp"
#include "common/cuda_hip/components/sorting.hpp"
#include "common/cuda_hip/components/thread_ids.hpp"
#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp"
#include "common/cuda_hip/factorization/par_ilut_select_common.hpp"
#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp"
#include "core/components/prefix_sum_kernels.hpp"
#include "core/factorization/par_ilut_kernels.hpp"
#include "core/matrix/coo_builder.hpp"
#include "core/matrix/csr_builder.hpp"
#include "core/matrix/csr_kernels.hpp"
#include "core/synthesizer/implementation_selection.hpp"
#include "hip/factorization/par_ilut_select_common.hip.hpp"


namespace gko {
namespace kernels {
namespace hip {
namespace GKO_DEVICE_NAMESPACE {
/**
* @brief The parallel ILUT factorization namespace.
*
@@ -45,10 +47,6 @@ using compiled_kernels =
syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;


#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc"
#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc"


template <int subwarp_size, typename ValueType, typename IndexType>
void threshold_filter_approx(syn::value_list<int, subwarp_size>,
std::shared_ptr<const DefaultExecutor> exec,
@@ -175,6 +173,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


} // namespace par_ilut_factorization
} // namespace hip
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
29 changes: 29 additions & 0 deletions common/cuda_hip/factorization/par_ilut_config.hpp
@@ -0,0 +1,29 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#ifndef GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_CONFIG_HIP_HPP_
#define GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_CONFIG_HIP_HPP_

#include "common/cuda_hip/base/config.hpp"

namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace par_ilut_factorization {


constexpr int default_block_size = 512;


// subwarp sizes for add_candidates kernels
using compiled_kernels =
syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;


} // namespace par_ilut_factorization
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko

#endif // GKO_COMMON_CUDA_HIP_FACTORIZATION_PAR_ILUT_CONFIG_HIP_HPP_
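
The new header centralizes the launch configuration (default_block_size and the compiled subwarp-size list) that the split par_ilut translation units previously defined individually. A sketch of how a consuming file might use it; the function and kernel names below are placeholders, not code from this PR, and the function is assumed to live inside gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization.

// Placeholder consumer of par_ilut_config.hpp (illustrative only): each
// par_ilut translation unit that includes the header derives its launch
// geometry from the same default_block_size, keeping the split
// filter/select/spgeam/sweep files consistent.
#include "common/cuda_hip/factorization/par_ilut_config.hpp"

template <int subwarp_size, typename IndexType>
void launch_example(std::shared_ptr<const DefaultExecutor> exec,
                    IndexType num_rows)
{
    const auto subwarps_per_block = default_block_size / subwarp_size;
    const auto num_blocks = ceildiv(num_rows, subwarps_per_block);
    if (num_blocks > 0) {
        // example_kernel<subwarp_size>
        //     <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(...);
    }
}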