Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a CSR batched matrix format, CUDA, HIP and DPCPP kernels #1450

Merged
merged 18 commits into from
Dec 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// SPDX-FileCopyrightText: 2017-2023 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

template <typename ValueType, typename IndexType>
void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
const batch::matrix::Csr<ValueType, IndexType>* mat,
const batch::MultiVector<ValueType>* b,
batch::MultiVector<ValueType>* x)
{
const auto num_blocks = mat->get_num_batch_items();
const auto b_ub = get_batch_struct(b);
const auto x_ub = get_batch_struct(x);
const auto mat_ub = get_batch_struct(mat);
if (b->get_common_size()[1] > 1) {
GKO_NOT_IMPLEMENTED;
}
simple_apply_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(mat_ub, b_ub, x_ub);
}


GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
GKO_DECLARE_BATCH_CSR_SIMPLE_APPLY_KERNEL);


template <typename ValueType, typename IndexType>
void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
const batch::MultiVector<ValueType>* alpha,
const batch::matrix::Csr<ValueType, IndexType>* mat,
const batch::MultiVector<ValueType>* b,
const batch::MultiVector<ValueType>* beta,
batch::MultiVector<ValueType>* x)
{
const auto num_blocks = mat->get_num_batch_items();
const auto b_ub = get_batch_struct(b);
const auto x_ub = get_batch_struct(x);
const auto mat_ub = get_batch_struct(mat);
const auto alpha_ub = get_batch_struct(alpha);
const auto beta_ub = get_batch_struct(beta);
if (b->get_common_size()[1] > 1) {
GKO_NOT_IMPLEMENTED;
}
advanced_apply_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(alpha_ub, mat_ub, b_ub,
beta_ub, x_ub);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
GKO_DECLARE_BATCH_CSR_ADVANCED_APPLY_KERNEL);
113 changes: 113 additions & 0 deletions common/cuda_hip/matrix/batch_csr_kernels.hpp.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
// SPDX-FileCopyrightText: 2017-2023 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

template <typename ValueType, typename IndexType>
__device__ __forceinline__ void simple_apply(
const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& mat,
const ValueType* const __restrict__ b, ValueType* const __restrict__ x)
{
const auto num_rows = mat.num_rows;
const auto val = mat.values;
const auto col = mat.col_idxs;
for (int row = threadIdx.x; row < num_rows; row += blockDim.x) {
auto temp = zero<ValueType>();
for (auto nnz = mat.row_ptrs[row]; nnz < mat.row_ptrs[row + 1]; nnz++) {
const auto col_idx = col[nnz];
temp += val[nnz] * b[col_idx];
}
x[row] = temp;
}
}

template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(
default_block_size,
sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix::
csr::uniform_batch<
const ValueType,
IndexType>
mat,
const gko::batch::
multi_vector::
uniform_batch<
const ValueType>
b,
const gko::batch::
multi_vector::
uniform_batch<
ValueType>
x)
{
for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
batch_id += gridDim.x) {
const auto mat_b =
gko::batch::matrix::extract_batch_item(mat, batch_id);
const auto b_b = gko::batch::extract_batch_item(b, batch_id);
const auto x_b = gko::batch::extract_batch_item(x, batch_id);
simple_apply(mat_b, b_b.values, x_b.values);
}
}


template <typename ValueType, typename IndexType>
__device__ __forceinline__ void advanced_apply(
const ValueType alpha,
const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& mat,
const ValueType* const __restrict__ b, const ValueType beta,
ValueType* const __restrict__ x)
{
const auto num_rows = mat.num_rows;
const auto val = mat.values;
const auto col = mat.col_idxs;
for (int row = threadIdx.x; row < num_rows; row += blockDim.x) {
auto temp = zero<ValueType>();
for (auto nnz = mat.row_ptrs[row]; nnz < mat.row_ptrs[row + 1]; nnz++) {
const auto col_idx = col[nnz];
temp += alpha * val[nnz] * b[col_idx];
}
x[row] = temp + beta * x[row];
}
}

template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(
default_block_size,
sm_oversubscription) void advanced_apply_kernel(const gko::batch::
multi_vector::
uniform_batch<
const ValueType>
alpha,
const gko::batch::matrix::
csr::uniform_batch<
const ValueType,
IndexType>
mat,
const gko::batch::
multi_vector::
uniform_batch<
const ValueType>
b,
const gko::batch::
multi_vector::
uniform_batch<
const ValueType>
beta,
const gko::batch::
multi_vector::
uniform_batch<
ValueType>
x)
{
for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
batch_id += gridDim.x) {
const auto mat_b =
gko::batch::matrix::extract_batch_item(mat, batch_id);
const auto b_b = gko::batch::extract_batch_item(b, batch_id);
const auto x_b = gko::batch::extract_batch_item(x, batch_id);
const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
const auto beta_b = gko::batch::extract_batch_item(beta, batch_id);
advanced_apply(alpha_b.values[0], mat_b, b_b.values, beta_b.values[0],
x_b.values);
}
}
1 change: 1 addition & 0 deletions core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ target_sources(ginkgo
log/vtune.cpp
log/record.cpp
log/stream.cpp
matrix/batch_csr.cpp
matrix/batch_dense.cpp
matrix/batch_ell.cpp
matrix/batch_identity.cpp
Expand Down
11 changes: 11 additions & 0 deletions core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "core/factorization/par_ict_kernels.hpp"
#include "core/factorization/par_ilu_kernels.hpp"
#include "core/factorization/par_ilut_kernels.hpp"
#include "core/matrix/batch_csr_kernels.hpp"
#include "core/matrix/batch_dense_kernels.hpp"
#include "core/matrix/batch_ell_kernels.hpp"
#include "core/matrix/coo_kernels.hpp"
Expand Down Expand Up @@ -281,6 +282,16 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
} // namespace batch_multi_vector


namespace batch_csr {


GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_CSR_SIMPLE_APPLY_KERNEL);
GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_CSR_ADVANCED_APPLY_KERNEL);


} // namespace batch_csr


namespace batch_dense {


Expand Down
Loading