Review updates

Co-authored-by: Marcel Koch <[email protected]>
Co-authored-by: Tobias Ribizel <[email protected]>
3 people committed on Jul 21, 2023
1 parent a60ae1e commit b724eeb
Showing 14 changed files with 184 additions and 184 deletions.
26 changes: 19 additions & 7 deletions common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc
@@ -36,11 +36,16 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
const BatchMultiVector<ValueType>* const alpha,
BatchMultiVector<ValueType>* const x)
{
const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier;
const auto num_blocks = x->get_num_batch_entries();
const auto alpha_ub = get_batch_struct(alpha);
const auto x_ub = get_batch_struct(x);
scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
alpha_ub, x_ub);
if (alpha->get_common_size()[1] == 1) {
scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
alpha_ub, x_ub, [] __device__(int col) { return 0; });
} else {
scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
alpha_ub, x_ub, [] __device__(int col) { return col; });
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -53,13 +58,20 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
const BatchMultiVector<ValueType>* const x,
BatchMultiVector<ValueType>* const y)
{
const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier;
const auto num_blocks = x->get_num_batch_entries();
const size_type nrhs = x->get_common_size()[1];
const auto alpha_ub = get_batch_struct(alpha);
const auto x_ub = get_batch_struct(x);
const auto y_ub = get_batch_struct(y);
add_scaled_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(alpha_ub, x_ub, y_ub);
if (alpha->get_common_size()[1] == 1) {
add_scaled_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(
alpha_ub, x_ub, y_ub, [] __device__(int col) { return 0; });
} else {
add_scaled_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(
alpha_ub, x_ub, y_ub, [] __device__(int col) { return col; });
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -107,7 +119,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
const BatchMultiVector<ValueType>* x,
BatchMultiVector<ValueType>* result)
{
const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier;
const auto num_blocks = x->get_num_batch_entries();
const auto result_ub = get_batch_struct(result);
const auto x_ub = get_batch_struct(x);
copy_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
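The launcher changes above all follow the same pattern: the grid is sized as one block per batch entry instead of get_num_multiprocessor() * sm_multiplier, and the runtime branch on alpha's number of columns moves out of the device code — the host picks one of two __device__ lambdas (always column 0 for a single scalar, the identity map for per-column scaling) and passes it as the Mapping argument. Below is a minimal, self-contained CUDA sketch of that dispatch idiom; the names (scale_rows, raw pointers, managed memory) are hypothetical stand-ins for Ginkgo's batch structs, and the extended device lambdas require nvcc's --extended-lambda flag.

```cuda
#include <cstdio>

// Hypothetical stand-in for the real kernel: scales an n x m row-major
// matrix x, where map(col) selects which alpha entry applies to a column
// (always 0 for a single scalar alpha, col for per-column scaling).
template <typename ValueType, typename Mapping>
__global__ void scale_rows(int n, int m, const ValueType* alpha,
                           ValueType* x, Mapping map)
{
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n * m;
         i += blockDim.x * gridDim.x) {
        const int col = i % m;
        x[i] *= alpha[map(col)];
    }
}

int main()
{
    const int n = 4, m = 3;
    double* x;
    double* alpha;
    cudaMallocManaged(&x, n * m * sizeof(double));
    cudaMallocManaged(&alpha, m * sizeof(double));
    for (int i = 0; i < n * m; ++i) x[i] = 1.0;
    for (int j = 0; j < m; ++j) alpha[j] = j + 1.0;

    // Host-side dispatch, as in the launchers above: each branch
    // instantiates its own kernel with a different mapping lambda.
    const bool single_alpha = false;
    if (single_alpha) {
        scale_rows<<<2, 128>>>(n, m, alpha, x,
                               [] __device__(int) { return 0; });
    } else {
        scale_rows<<<2, 128>>>(n, m, alpha, x,
                               [] __device__(int col) { return col; });
    }
    cudaDeviceSynchronize();
    printf("x[0..2] = %g %g %g\n", x[0], x[1], x[2]);
    cudaFree(x);
    cudaFree(alpha);
}
```

Each branch compiles into its own kernel instantiation, so the per-element if (alpha.num_rhs == 1) check from the old device code disappears entirely; the single-scalar case simply reads alpha[0].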
104 changes: 49 additions & 55 deletions common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
@@ -35,85 +35,75 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Scales the vectors in global or shared memory with a factor of alpha (alpha
* is in global memory or shared memory)
*/
template <typename ValueType>
template <typename ValueType, typename Mapping>
__device__ __forceinline__ void scale(
const gko::batch_multi_vector::BatchEntry<const ValueType>& alpha,
const gko::batch_multi_vector::BatchEntry<ValueType>& x)
const gko::batch_multi_vector::batch_entry<const ValueType>& alpha,
const gko::batch_multi_vector::batch_entry<ValueType>& x, Mapping map)
{
const int max_li = x.num_rows * x.num_rhs;
for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
const int row = li / x.num_rhs;
const int col = li % x.num_rhs;

if (alpha.num_rhs == 1) {
x.values[row * x.stride + col] =
alpha.values[0] * x.values[row * x.stride + col];
} else {
x.values[row * x.stride + col] =
alpha.values[col] * x.values[row * x.stride + col];
}
x.values[row * x.stride + col] =
alpha.values[map(col)] * x.values[row * x.stride + col];
}
}

template <typename ValueType>
template <typename ValueType, typename Mapping>
__global__
__launch_bounds__(default_block_size, sm_multiplier) void scale_kernel(
const gko::batch_multi_vector::UniformBatch<const ValueType> alpha,
const gko::batch_multi_vector::UniformBatch<ValueType> x)
__launch_bounds__(default_block_size, sm_multiplier) void scale_kernel(
const gko::batch_multi_vector::uniform_batch<const ValueType> alpha,
const gko::batch_multi_vector::uniform_batch<ValueType> x, Mapping map)
{
for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries;
ibatch += gridDim.x) {
const auto alpha_b = gko::batch::batch_entry(alpha, ibatch);
const auto x_b = gko::batch::batch_entry(x, ibatch);
scale(alpha_b, x_b);
scale(alpha_b, x_b, map);
}
}


template <typename ValueType>
template <typename ValueType, typename Mapping>
__device__ __forceinline__ void add_scaled(
const gko::batch_multi_vector::BatchEntry<const ValueType>& alpha,
const gko::batch_multi_vector::BatchEntry<const ValueType>& x,
const gko::batch_multi_vector::BatchEntry<ValueType>& y)
const gko::batch_multi_vector::batch_entry<const ValueType>& alpha,
const gko::batch_multi_vector::batch_entry<const ValueType>& x,
const gko::batch_multi_vector::batch_entry<ValueType>& y, Mapping map)
{
const int max_li = x.num_rows * x.num_rhs;
for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
const int row = li / x.num_rhs;
const int col = li % x.num_rhs;

if (alpha.num_rhs == 1) {
y.values[row * y.stride + col] +=
alpha.values[0] * x.values[row * x.stride + col];
} else {
y.values[row * y.stride + col] +=
alpha.values[col] * x.values[row * x.stride + col];
}
y.values[row * y.stride + col] +=
alpha.values[map(col)] * x.values[row * x.stride + col];
}
}

template <typename ValueType>
template <typename ValueType, typename Mapping>
__global__
__launch_bounds__(default_block_size, sm_multiplier) void add_scaled_kernel(
const gko::batch_multi_vector::UniformBatch<const ValueType> alpha,
const gko::batch_multi_vector::UniformBatch<const ValueType> x,
const gko::batch_multi_vector::UniformBatch<ValueType> y)
__launch_bounds__(default_block_size, sm_multiplier) void add_scaled_kernel(
const gko::batch_multi_vector::uniform_batch<const ValueType> alpha,
const gko::batch_multi_vector::uniform_batch<const ValueType> x,
const gko::batch_multi_vector::uniform_batch<ValueType> y, Mapping map)
{
for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries;
ibatch += gridDim.x) {
const auto alpha_b = gko::batch::batch_entry(alpha, ibatch);
const auto x_b = gko::batch::batch_entry(x, ibatch);
const auto y_b = gko::batch::batch_entry(y, ibatch);
add_scaled(alpha_b, x_b, y_b);
add_scaled(alpha_b, x_b, y_b, map);
}
}


template <typename ValueType>
__device__ __forceinline__ void one_dot(
const gko::batch_multi_vector::BatchEntry<const ValueType>& x,
const gko::batch_multi_vector::BatchEntry<const ValueType>& y,
const gko::batch_multi_vector::batch_entry<const ValueType>& x,
const gko::batch_multi_vector::batch_entry<const ValueType>& y,
const int rhs_index,
const gko::batch_multi_vector::BatchEntry<ValueType>& result,
const gko::batch_multi_vector::batch_entry<ValueType>& result,
group::thread_block_tile<config::warp_size>& subwarp_grp)
{
ValueType val = zero<ValueType>();
@@ -143,9 +133,9 @@ __device__ __forceinline__ void one_dot(
*/
template <typename ValueType>
__device__ __forceinline__ void compute_dot_product(
const gko::batch_multi_vector::BatchEntry<const ValueType>& x,
const gko::batch_multi_vector::BatchEntry<const ValueType>& y,
const gko::batch_multi_vector::BatchEntry<ValueType>& result)
const gko::batch_multi_vector::batch_entry<const ValueType>& x,
const gko::batch_multi_vector::batch_entry<const ValueType>& y,
const gko::batch_multi_vector::batch_entry<ValueType>& result)
{
constexpr auto tile_size = config::warp_size;
auto thread_block = group::this_thread_block();
@@ -165,17 +155,17 @@ __global__ __launch_bounds__(
default_block_size,
sm_multiplier) void compute_dot_product_kernel(const gko::
batch_multi_vector::
UniformBatch<
uniform_batch<
const ValueType>
x,
const gko::
batch_multi_vector::
UniformBatch<
uniform_batch<
const ValueType>
y,
const gko::
batch_multi_vector::
UniformBatch<
uniform_batch<
ValueType>
result)
{
@@ -191,9 +181,9 @@

template <typename ValueType>
__device__ __forceinline__ void one_norm2(
const gko::batch_multi_vector::BatchEntry<const ValueType>& x,
const gko::batch_multi_vector::batch_entry<const ValueType>& x,
const int rhs_index,
const gko::batch_multi_vector::BatchEntry<remove_complex<ValueType>>&
const gko::batch_multi_vector::batch_entry<remove_complex<ValueType>>&
result,
group::thread_block_tile<config::warp_size>& subwarp_grp)
{
@@ -225,8 +215,8 @@ __device__ __forceinline__ void one_norm2(
*/
template <typename ValueType>
__device__ __forceinline__ void compute_norm2(
const gko::batch_multi_vector::BatchEntry<const ValueType>& x,
const gko::batch_multi_vector::BatchEntry<remove_complex<ValueType>>&
const gko::batch_multi_vector::batch_entry<const ValueType>& x,
const gko::batch_multi_vector::batch_entry<remove_complex<ValueType>>&
result)
{
constexpr auto tile_size = config::warp_size;
@@ -243,11 +233,15 @@ __device__ __forceinline__ void compute_norm2(


template <typename ValueType>
__global__
__launch_bounds__(default_block_size, sm_multiplier) void compute_norm2_kernel(
const gko::batch_multi_vector::UniformBatch<const ValueType> x,
const gko::batch_multi_vector::UniformBatch<remove_complex<ValueType>>
result)
__global__ __launch_bounds__(
default_block_size,
sm_multiplier) void compute_norm2_kernel(const gko::batch_multi_vector::
uniform_batch<const ValueType>
x,
const gko::batch_multi_vector::
uniform_batch<
remove_complex<ValueType>>
result)
{
for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries;
ibatch += gridDim.x) {
@@ -266,8 +260,8 @@ __launch_bounds__(default_block_size, sm_multiplier) void compute_norm2_kernel(
*/
template <typename ValueType>
__device__ __forceinline__ void copy(
const gko::batch_multi_vector::BatchEntry<const ValueType>& in,
const gko::batch_multi_vector::BatchEntry<ValueType>& out)
const gko::batch_multi_vector::batch_entry<const ValueType>& in,
const gko::batch_multi_vector::batch_entry<ValueType>& out)
{
for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs;
iz += blockDim.x) {
Expand All @@ -280,9 +274,9 @@ __device__ __forceinline__ void copy(

template <typename ValueType>
__global__
__launch_bounds__(default_block_size, sm_multiplier) void copy_kernel(
const gko::batch_multi_vector::UniformBatch<const ValueType> src,
const gko::batch_multi_vector::UniformBatch<ValueType> dst)
__launch_bounds__(default_block_size, sm_multiplier) void copy_kernel(
const gko::batch_multi_vector::uniform_batch<const ValueType> src,
const gko::batch_multi_vector::uniform_batch<ValueType> dst)
{
for (size_type ibatch = blockIdx.x; ibatch < src.num_batch_entries;
ibatch += gridDim.x) {
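Throughout this file the kernels share a two-level structure that the hunks above only show in fragments: each __global__ kernel grid-strides over the batch entries via blockIdx.x, extracts a per-entry view with gko::batch::batch_entry, and forwards it to a __device__ __forceinline__ helper in which the threads of the block stride over the num_rows * num_rhs elements of that entry. A stripped-down sketch of that structure, using a hypothetical flat_batch view in place of Ginkgo's uniform_batch/batch_entry types:

```cuda
// Hypothetical flattened view of a uniform batch: num_batch dense entries
// of size num_rows x num_rhs, stored contiguously one after another.
template <typename ValueType>
struct flat_batch {
    ValueType* values;
    int num_batch;
    int num_rows;
    int num_rhs;
};

// Per-entry helper: the threads of one block cooperatively stride over
// the num_rows * num_rhs elements of a single batch entry.
template <typename ValueType>
__device__ __forceinline__ void scale_entry(ValueType* entry, int num_rows,
                                            int num_rhs, ValueType alpha)
{
    for (int li = threadIdx.x; li < num_rows * num_rhs; li += blockDim.x) {
        entry[li] *= alpha;
    }
}

// Global kernel: blocks grid-stride over the batch entries, so the kernel
// stays correct whether the grid has one block per entry or fewer.
template <typename ValueType>
__global__ void scale_batch_kernel(flat_batch<ValueType> x, ValueType alpha)
{
    const int entry_size = x.num_rows * x.num_rhs;
    for (int ibatch = blockIdx.x; ibatch < x.num_batch; ibatch += gridDim.x) {
        scale_entry(x.values + ibatch * entry_size, x.num_rows, x.num_rhs,
                    alpha);
    }
}
```

With the launcher change above (grid size equal to the number of batch entries), the outer loop runs exactly once per block; keeping it as a grid-stride loop is what makes the kernels independent of the launch configuration.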
36 changes: 12 additions & 24 deletions core/base/batch_multi_vector.cpp
@@ -72,12 +72,10 @@ void BatchMultiVector<ValueType>::scale_impl(
GKO_ASSERT_EQ(alpha->get_num_batch_entries(),
this->get_num_batch_entries());
GKO_ASSERT_EQUAL_ROWS(alpha->get_common_size(), dim<2>(1, 1));
for (size_type b = 0; b < alpha->get_num_batch_entries(); ++b) {
if (alpha->get_common_size()[1] != 1) {
// different alpha for each column
GKO_ASSERT_EQUAL_COLS(this->get_common_size(),
alpha->get_common_size());
}
if (alpha->get_common_size()[1] != 1) {
// different alpha for each column
GKO_ASSERT_EQUAL_COLS(this->get_common_size(),
alpha->get_common_size());
}
this->get_executor()->run(batch_multi_vector::make_scale(alpha, this));
}
@@ -91,12 +89,10 @@ void BatchMultiVector<ValueType>::add_scaled_impl(
GKO_ASSERT_EQ(alpha->get_num_batch_entries(),
this->get_num_batch_entries());
GKO_ASSERT_EQUAL_ROWS(alpha->get_common_size(), dim<2>(1, 1));
for (size_type b = 0; b < alpha->get_num_batch_entries(); ++b) {
if (alpha->get_common_size()[1] != 1) {
// different alpha for each column
GKO_ASSERT_EQUAL_COLS(this->get_common_size(),
alpha->get_common_size());
}
if (alpha->get_common_size()[1] != 1) {
// different alpha for each column
GKO_ASSERT_EQUAL_COLS(this->get_common_size(),
alpha->get_common_size());
}
GKO_ASSERT_EQ(b->get_num_batch_entries(), this->get_num_batch_entries());
GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size());
@@ -162,11 +158,11 @@ void BatchMultiVector<ValueType>::move_to(


template <typename MatrixType, typename MatrixData>
inline void read_impl(MatrixType* mtx, const std::vector<MatrixData>& data)
void read_impl(MatrixType* mtx, const std::vector<MatrixData>& data)
{
GKO_ASSERT(data.size() > 0);
auto common_size = data[0].size;
auto batch_size = batch_dim<2>(data.size(), common_size);
size_type ind = 0;
for (const auto& b : data) {
auto b_size = b.size;
GKO_ASSERT_EQUAL_DIMENSIONS(common_size, b_size);
Expand Down Expand Up @@ -208,17 +204,9 @@ void BatchMultiVector<ValueType>::read(const std::vector<mat_data32>& data)


template <typename MatrixType, typename MatrixData>
inline void write_impl(const MatrixType* mtx, std::vector<MatrixData>& data)
void write_impl(const MatrixType* mtx, std::vector<MatrixData>& data)
{
std::unique_ptr<const BatchMultiVector<typename MatrixData::value_type>>
op{};
const MatrixType* tmp{};
if (mtx->get_executor()->get_master() != mtx->get_executor()) {
op = mtx->clone(mtx->get_executor()->get_master());
tmp = static_cast<const MatrixType*>(op.get());
} else {
tmp = mtx;
}
auto tmp = make_temporary_clone(mtx->get_executor()->get_master(), mtx);

data = std::vector<MatrixData>(mtx->get_num_batch_entries());
for (size_type b = 0; b < mtx->get_num_batch_entries(); ++b) {
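In write_impl, the hand-written "clone to the master executor if needed" branch is replaced by a single make_temporary_clone call. Conceptually, the helper is an RAII wrapper that either aliases the object (when it already lives on the requested executor) or owns a copy created there; because write_impl only reads the matrix, nothing has to be copied back to the device. A minimal sketch of that ownership pattern — the Executor stand-in and the class name are hypothetical, not Ginkgo's actual temporary_clone implementation:

```cpp
#include <memory>

// Hypothetical stand-in for an executor handle.
struct Executor {};

// Sketch of a read-only temporary clone. T is expected to provide
// get_executor() returning a std::shared_ptr<Executor> and
// clone(std::shared_ptr<Executor>) returning a std::unique_ptr<T>.
template <typename T>
class temporary_clone_sketch {
public:
    temporary_clone_sketch(std::shared_ptr<Executor> exec, const T* obj)
    {
        if (obj->get_executor() == exec) {
            ptr_ = obj;                // already on `exec`: just alias it
        } else {
            copy_ = obj->clone(exec);  // otherwise copy to `exec` ...
            ptr_ = copy_.get();        // ... and expose the copy instead
        }
    }

    const T* operator->() const { return ptr_; }
    const T* get() const { return ptr_; }

private:
    std::unique_ptr<const T> copy_;  // owns the copy (empty when aliasing)
    const T* ptr_;                   // what callers actually dereference
};
```

This mirrors the logic the removed lines of write_impl spelled out by hand; the new version simply delegates it to the library helper.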