Skip to content

Commit

Permalink
review updates
Browse files Browse the repository at this point in the history
Co-authored-by: Marcel Koch <[email protected]>
Co-authored-by: Yu-Hsiang Tsai <[email protected]>
  • Loading branch information
3 people committed May 5, 2024
1 parent c39d311 commit 5992056
Show file tree
Hide file tree
Showing 14 changed files with 53 additions and 161 deletions.
3 changes: 1 addition & 2 deletions core/solver/batch_cg_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,7 @@ void set_gmem_stride_bytes(storage_config& sconf,
gmem_stride += prec_storage_bytes;
}
// align global memory chunks
sconf.gmem_stride_bytes =
gmem_stride > 0 ? ceildiv(gmem_stride, align_bytes) * align_bytes : 0;
sconf.gmem_stride_bytes = ceildiv(gmem_stride, align_bytes) * align_bytes;
}


Expand Down
14 changes: 7 additions & 7 deletions core/test/solver/batch_cg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,16 +117,16 @@ TYPED_TEST(BatchCg, CanBeMoved)
{
using Mtx = typename TestFixture::Mtx;
using Solver = typename TestFixture::Solver;
auto copy = this->solver_factory->generate(Mtx::create(this->exec));
auto move = this->solver_factory->generate(Mtx::create(this->exec));

copy->move_from(this->solver);
move->move_from(this->solver);

ASSERT_EQ(copy->get_common_size(),
ASSERT_EQ(move->get_common_size(),
gko::dim<2>(this->num_rows, this->num_rows));
ASSERT_EQ(copy->get_num_batch_items(), this->num_batch_items);
auto copy_mtx = gko::as<Solver>(copy.get())->get_system_matrix();
const auto copy_batch_mtx = gko::as<const Mtx>(copy_mtx.get());
GKO_ASSERT_BATCH_MTX_NEAR(this->mtx.get(), copy_batch_mtx, 0.0);
ASSERT_EQ(move->get_num_batch_items(), this->num_batch_items);
auto moved_mtx = gko::as<Solver>(move.get())->get_system_matrix();
const auto moved_batch_mtx = gko::as<const Mtx>(moved_mtx.get());
GKO_ASSERT_BATCH_MTX_NEAR(this->mtx.get(), moved_batch_mtx, 0.0);
ASSERT_EQ(gko::as<Solver>(this->solver.get())->get_system_matrix(),
nullptr);
}
Expand Down
6 changes: 2 additions & 4 deletions cuda/solver/batch_bicgstab_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,8 @@ public:
exec_, mat.num_rows);
GKO_ASSERT(block_size >= 2 * config::warp_size);
const size_t prec_size =
PrecType::dynamic_work_size(padded_num_rows,
mat.get_single_item_num_nnz()) *
sizeof(value_type);
const size_t prec_size = PrecType::dynamic_work_size(
padded_num_rows, mat.get_single_item_num_nnz());
const auto sconf =
gko::kernels::batch_bicgstab::compute_shared_storage<PrecType,
value_type>(
Expand Down
41 changes: 0 additions & 41 deletions cuda/solver/batch_cg_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -57,47 +57,6 @@ template <typename T>
using settings = gko::kernels::batch_cg::settings<T>;


template <typename CuValueType>
class kernel_caller {
public:
using value_type = CuValueType;

kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
const settings<remove_complex<value_type>> settings)
: exec_{std::move(exec)}, settings_{settings}
{}

template <typename StopType, const int n_shared,
const bool prec_shared_bool, typename PrecType, typename LogType,
typename BatchMatrixType>
void launch_apply_kernel(
const gko::kernels::batch_cg::storage_config& sconf, LogType& logger,
PrecType& prec, const BatchMatrixType& mat,
const value_type* const __restrict__ b_values,
value_type* const __restrict__ x_values,
value_type* const __restrict__ workspace_data, const int& block_size,
const size_t& shared_size) const
{
GKO_NOT_IMPLEMENTED;
}


template <typename BatchMatrixType, typename PrecType, typename StopType,
typename LogType>
void call_kernel(
LogType logger, const BatchMatrixType& mat, PrecType prec,
const gko::batch::multi_vector::uniform_batch<const value_type>& b,
const gko::batch::multi_vector::uniform_batch<value_type>& x) const
{
GKO_NOT_IMPLEMENTED;
}

private:
std::shared_ptr<const DefaultExecutor> exec_;
const settings<remove_complex<value_type>> settings_;
};


template <typename ValueType>
void apply(std::shared_ptr<const DefaultExecutor> exec,
const settings<remove_complex<ValueType>>& settings,
Expand Down
38 changes: 0 additions & 38 deletions dpcpp/solver/batch_cg_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,44 +49,6 @@ template <typename T>
using settings = gko::kernels::batch_cg::settings<T>;


template <typename ValueType>
class KernelCaller {
public:
KernelCaller(std::shared_ptr<const DefaultExecutor> exec,
const settings<remove_complex<ValueType>> settings)
: exec_{std::move(exec)}, settings_{settings}
{}

template <typename StopType, const int subgroup_size,
const int n_shared_total, typename PrecType, typename LogType,
typename BatchMatrixType>
__dpct_inline__ void launch_apply_kernel(
const gko::kernels::batch_cg::storage_config& sconf, LogType& logger,
PrecType& prec, const BatchMatrixType mat,
const ValueType* const __restrict__ b_values,
ValueType* const __restrict__ x_values,
ValueType* const __restrict__ workspace, const int& group_size,
const int& shared_size) const
{
GKO_NOT_IMPLEMENTED;
}

template <typename BatchMatrixType, typename PrecType, typename StopType,
typename LogType>
void call_kernel(
LogType logger, const BatchMatrixType& mat, PrecType prec,
const gko::batch::multi_vector::uniform_batch<const ValueType>& b,
const gko::batch::multi_vector::uniform_batch<ValueType>& x) const
{
GKO_NOT_IMPLEMENTED;
}

private:
std::shared_ptr<const DefaultExecutor> exec_;
const settings<remove_complex<ValueType>> settings_;
};


template <typename ValueType>
void apply(std::shared_ptr<const DefaultExecutor> exec,
const settings<remove_complex<ValueType>>& settings,
Expand Down
7 changes: 3 additions & 4 deletions hip/solver/batch_bicgstab_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,9 @@ class kernel_caller {
get_num_threads_per_block<BatchMatrixType>(exec_, mat.num_rows);
GKO_ASSERT(block_size >= 2 * config::warp_size);

const size_t prec_size =
PrecType::dynamic_work_size(padded_num_rows,
mat.get_single_item_num_nnz()) *
sizeof(value_type);
// Returns amount required in bytes
const size_t prec_size = PrecType::dynamic_work_size(
padded_num_rows, mat.get_single_item_num_nnz());
const auto sconf =
gko::kernels::batch_bicgstab::compute_shared_storage<PrecType,
value_type>(
Expand Down
41 changes: 0 additions & 41 deletions hip/solver/batch_cg_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,47 +56,6 @@ template <typename T>
using settings = gko::kernels::batch_cg::settings<T>;


template <typename HipValueType>
class kernel_caller {
public:
using value_type = HipValueType;

kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
const settings<remove_complex<value_type>> settings)
: exec_{exec}, settings_{settings}
{}

template <typename StopType, const int n_shared,
const bool prec_shared_bool, typename PrecType, typename LogType,
typename BatchMatrixType>
void launch_apply_kernel(
const gko::kernels::batch_cg::storage_config& sconf, LogType& logger,
PrecType& prec, const BatchMatrixType& mat,
const value_type* const __restrict__ b_values,
value_type* const __restrict__ x_values,
value_type* const __restrict__ workspace_data, const int& block_size,
const size_t& shared_size) const
{
GKO_NOT_IMPLEMENTED;
}


template <typename BatchMatrixType, typename PrecType, typename StopType,
typename LogType>
void call_kernel(
LogType logger, const BatchMatrixType& mat, PrecType prec,
const gko::batch::multi_vector::uniform_batch<const value_type>& b,
const gko::batch::multi_vector::uniform_batch<value_type>& x) const
{
GKO_NOT_IMPLEMENTED;
}

private:
std::shared_ptr<const DefaultExecutor> exec_;
const settings<remove_complex<value_type>> settings_;
};


template <typename ValueType>
void apply(std::shared_ptr<const DefaultExecutor> exec,
const settings<remove_complex<ValueType>>& settings,
Expand Down
2 changes: 1 addition & 1 deletion include/ginkgo/core/solver/batch_cg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ namespace solver {


/**
* Cg or the Conjugate Gradient-Stabilized is a Krylov subspace solver.
* Cg or the Conjugate Gradient is a Krylov subspace solver.
* It is a short recurrence solver that is generally used to solve linear
* systems with SPD matrices.
*
Expand Down
22 changes: 15 additions & 7 deletions omp/solver/batch_bicgstab_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
#include "core/solver/batch_bicgstab_kernels.hpp"


#include <omp.h>


#include <ginkgo/core/base/array.hpp>


#include "core/solver/batch_dispatch.hpp"


Expand Down Expand Up @@ -66,19 +72,21 @@ class kernel_caller {
gko::kernels::batch_bicgstab::local_memory_requirement<ValueType>(
num_rows, num_rhs) +
PrecondType::dynamic_work_size(num_rows,
mat.get_single_item_num_nnz()) *
sizeof(ValueType);
mat.get_single_item_num_nnz());
int max_threads = omp_get_max_threads();
auto local_space =
array<unsigned char>(exec_, local_size_bytes * max_threads);

#pragma omp parallel for
for (size_type batch_id = 0; batch_id < num_batch_items; batch_id++) {
// TODO: Align to cache line boundary
// TODO: Allocate and free once per thread rather than once per
// work-item.
auto local_space = array<unsigned char>(exec_, local_size_bytes);
auto thread_local_space = gko::make_array_view(
exec_, local_size_bytes,
local_space.get_data() +
omp_get_thread_num() * local_size_bytes);
batch_entry_bicgstab_impl<StopType, PrecondType, LogType,
BatchMatrixType, ValueType>(
settings_, logger, precond, mat, b, x, batch_id,
local_space.get_data());
thread_local_space.get_data());
}
}

Expand Down
24 changes: 16 additions & 8 deletions omp/solver/batch_cg_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
#include "core/solver/batch_cg_kernels.hpp"


#include <omp.h>


#include <ginkgo/core/base/array.hpp>


#include "core/solver/batch_dispatch.hpp"


Expand Down Expand Up @@ -66,18 +72,20 @@ class kernel_caller {
gko::kernels::batch_cg::local_memory_requirement<ValueType>(
num_rows, num_rhs) +
PrecondType::dynamic_work_size(num_rows,
mat.get_single_item_num_nnz()) *
sizeof(ValueType);

mat.get_single_item_num_nnz());
int max_threads = omp_get_max_threads();
auto local_space =
array<unsigned char>(exec_, local_size_bytes * max_threads);
#pragma omp parallel for
for (size_type batch_id = 0; batch_id < num_batch_items; batch_id++) {
// TODO: Align to cache line boundary
// TODO: Allocate and free once per thread rather than once per
// work-item.
auto local_space = array<unsigned char>(exec_, local_size_bytes);
auto thread_local_space = gko::make_array_view(
exec_, local_size_bytes,
local_space.get_data() +
omp_get_thread_num() * local_size_bytes);
batch_entry_cg_impl<StopType, PrecondType, LogType, BatchMatrixType,
ValueType>(settings_, logger, precond, mat, b,
x, batch_id, local_space.get_data());
x, batch_id,
thread_local_space.get_data());
}
}

Expand Down
3 changes: 2 additions & 1 deletion reference/preconditioner/batch_identity.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ class Identity final {
static constexpr int work_size = 0;

/**
* The size of the work vector required in case of dynamic allocation.
* The size of the work vector required in case of dynamic allocation in
* bytes.
*/
static int dynamic_work_size(int, int) { return 0; }

Expand Down
3 changes: 1 addition & 2 deletions reference/solver/batch_bicgstab_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,7 @@ class kernel_caller {
gko::kernels::batch_bicgstab::local_memory_requirement<ValueType>(
num_rows, num_rhs) +
PrecType::dynamic_work_size(num_rows,
mat.get_single_item_num_nnz()) *
sizeof(ValueType);
mat.get_single_item_num_nnz());
array<unsigned char> local_space(exec_, local_size_bytes);

for (size_type batch_id = 0; batch_id < num_batch_items; batch_id++) {
Expand Down
4 changes: 1 addition & 3 deletions reference/solver/batch_cg_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,8 @@ class kernel_caller {
const size_type local_size_bytes =
gko::kernels::batch_cg::local_memory_requirement<ValueType>(
num_rows, num_rhs) +
// TODO: return bytes for dynamic work size as well
PrecType::dynamic_work_size(num_rows,
mat.get_single_item_num_nnz()) *
sizeof(ValueType);
mat.get_single_item_num_nnz());
array<unsigned char> local_space(exec_, local_size_bytes);

for (size_type batch_id = 0; batch_id < num_batch_items; batch_id++) {
Expand Down
6 changes: 4 additions & 2 deletions reference/test/solver/batch_cg_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,9 +163,11 @@ TYPED_TEST(BatchCg, ApplyLogsResAndIters)

auto iter_counts = logger->get_num_iterations();
auto res_norm = logger->get_residual_norm();
GKO_ASSERT_BATCH_MTX_NEAR(res.x, linear_system.exact_sol, tol * 1000);
GKO_ASSERT_BATCH_MTX_NEAR(res.x, linear_system.exact_sol, tol * 2000);
for (size_t i = 0; i < num_batch_items; i++) {
ASSERT_LE(iter_counts.get_const_data()[i], max_iters);
ASSERT_NEAR(res_norm.get_const_data()[i],
res.host_res_norm->get_const_values()[i], tol * 100);
}
}

Expand Down Expand Up @@ -197,5 +199,5 @@ TYPED_TEST(BatchCg, CanSolveHpdSystem)
auto res =
gko::test::solve_linear_system(this->exec, linear_system, solver);

GKO_ASSERT_BATCH_MTX_NEAR(res.x, linear_system.exact_sol, tol * 500);
GKO_ASSERT_BATCH_MTX_NEAR(res.x, linear_system.exact_sol, tol * 1000);
}

0 comments on commit 5992056

Please sign in to comment.