Skip to content

Commit

Permalink
Merge Use index map in distributed::Matrix
Browse files Browse the repository at this point in the history
This merge uses the index map in `distributed::Matrix::read_distributed`. The `index_map` is used to create the numbering for the non-local columns. For now, the created index map is neither stored in the matrix nor returned from the function; this will be added later.

Related PR: #1544
  • Loading branch information
MarcelKoch authored May 20, 2024
2 parents e8af940 + 0d3b710 commit a61989f
Show file tree
Hide file tree
Showing 19 changed files with 338 additions and 530 deletions.
30 changes: 28 additions & 2 deletions benchmark/test/reference/distributed_solver.profile.stderr
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,34 @@ DEBUG: begin dense::fill
DEBUG: end dense::fill
DEBUG: begin components::aos_to_soa
DEBUG: end components::aos_to_soa
DEBUG: begin distributed_matrix::build_local_nonlocal
DEBUG: end distributed_matrix::build_local_nonlocal
DEBUG: begin distributed_matrix::separate_local_nonlocal
DEBUG: end distributed_matrix::separate_local_nonlocal
DEBUG: begin components::fill_array
DEBUG: end components::fill_array
DEBUG: begin components::fill_array
DEBUG: end components::fill_array
DEBUG: begin index_map::build_mapping
DEBUG: end index_map::build_mapping
DEBUG: begin copy
DEBUG: end copy
DEBUG: begin components::prefix_sum_nonnegative
DEBUG: end components::prefix_sum_nonnegative
DEBUG: begin copy
DEBUG: end copy
DEBUG: begin components::fill_array
DEBUG: end components::fill_array
DEBUG: begin copy
DEBUG: end copy
DEBUG: begin components::prefix_sum_nonnegative
DEBUG: end components::prefix_sum_nonnegative
DEBUG: begin copy
DEBUG: end copy
DEBUG: begin components::fill_array
DEBUG: end components::fill_array
DEBUG: begin index_map::map_to_local
DEBUG: end index_map::map_to_local
DEBUG: begin copy
DEBUG: end copy
DEBUG: begin copy
DEBUG: end copy
DEBUG: begin copy
Expand Down
30 changes: 28 additions & 2 deletions benchmark/test/reference/spmv_distributed.profile.stderr
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,34 @@ DEBUG: begin dense::fill
DEBUG: end dense::fill
DEBUG: begin components::aos_to_soa
DEBUG: end components::aos_to_soa
DEBUG: begin distributed_matrix::build_local_nonlocal
DEBUG: end distributed_matrix::build_local_nonlocal
DEBUG: begin distributed_matrix::separate_local_nonlocal
DEBUG: end distributed_matrix::separate_local_nonlocal
DEBUG: begin components::fill_array
DEBUG: end components::fill_array
DEBUG: begin components::fill_array
DEBUG: end components::fill_array
DEBUG: begin index_map::build_mapping
DEBUG: end index_map::build_mapping
DEBUG: begin copy
DEBUG: end copy
DEBUG: begin components::prefix_sum_nonnegative
DEBUG: end components::prefix_sum_nonnegative
DEBUG: begin copy
DEBUG: end copy
DEBUG: begin components::fill_array
DEBUG: end components::fill_array
DEBUG: begin copy
DEBUG: end copy
DEBUG: begin components::prefix_sum_nonnegative
DEBUG: end components::prefix_sum_nonnegative
DEBUG: begin copy
DEBUG: end copy
DEBUG: begin components::fill_array
DEBUG: end components::fill_array
DEBUG: begin index_map::map_to_local
DEBUG: end index_map::map_to_local
DEBUG: begin copy
DEBUG: end copy
DEBUG: begin copy
DEBUG: end copy
DEBUG: begin copy
Expand Down
2 changes: 1 addition & 1 deletion benchmark/test/reference/spmv_distributed.profile.stdout
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"comm_pattern": "stencil",
"spmv": {
"csr-csr": {
"storage": 6564,
"storage": 6420,
"time": 1.0,
"repetitions": 1,
"completed": true
Expand Down
2 changes: 1 addition & 1 deletion benchmark/test/reference/spmv_distributed.simple.stdout
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"comm_pattern": "stencil",
"spmv": {
"csr-csr": {
"storage": 6564,
"storage": 6420,
"max_relative_norm2": 1.0,
"time": 1.0,
"repetitions": 10,
Expand Down
8 changes: 5 additions & 3 deletions benchmark/utils/generator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,9 +223,11 @@ struct DistributedDefaultSystemGenerator {
const gko::matrix_data<value_type, index_type>& data,
json* spmv_case = nullptr) const
{
auto part = gko::experimental::distributed::
Partition<itype, global_itype>::build_from_global_size_uniform(
exec, comm.size(), static_cast<global_itype>(data.size[0]));
auto part = gko::share(
gko::experimental::distributed::Partition<itype, global_itype>::
build_from_global_size_uniform(
exec, comm.size(),
static_cast<global_itype>(data.size[0])));
auto formats = split(format_name, '-');
if (formats.size() != 2) {
throw std::runtime_error{"Invalid distributed format specifier " +
Expand Down
141 changes: 25 additions & 116 deletions common/cuda_hip/distributed/matrix_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -24,26 +24,23 @@ struct input_type {


template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
void build_local_nonlocal(
void separate_local_nonlocal(
std::shared_ptr<const DefaultExecutor> exec,
const device_matrix_data<ValueType, GlobalIndexType>& input,
const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
row_partition,
const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
col_partition,
comm_index_type local_part, array<LocalIndexType>& local_row_idxs,
experimental::distributed::comm_index_type local_part,
array<LocalIndexType>& local_row_idxs,
array<LocalIndexType>& local_col_idxs, array<ValueType>& local_values,
array<LocalIndexType>& non_local_row_idxs,
array<LocalIndexType>& non_local_col_idxs,
array<ValueType>& non_local_values,
array<LocalIndexType>& local_gather_idxs,
array<comm_index_type>& recv_sizes,
array<GlobalIndexType>& non_local_to_global)
array<GlobalIndexType>& non_local_col_idxs,
array<ValueType>& non_local_values)
{
auto input_vals = input.get_const_values();
auto row_part_ids = row_partition->get_part_ids();
auto col_part_ids = col_partition->get_part_ids();
auto num_parts = static_cast<size_type>(row_partition->get_num_parts());
const auto* row_range_bounds = row_partition->get_range_bounds();
const auto* col_range_bounds = col_partition->get_range_bounds();
const auto* row_range_starting_indices =
Expand All @@ -54,25 +51,29 @@ void build_local_nonlocal(
const auto num_col_ranges = col_partition->get_num_ranges();
const auto num_input_elements = input.get_num_stored_elements();

auto policy = thrust_policy(exec);

// precompute the row and column range id of each input element
auto input_row_idxs = input.get_const_row_idxs();
auto input_col_idxs = input.get_const_col_idxs();
array<size_type> row_range_ids{exec, num_input_elements};
thrust::upper_bound(thrust_policy(exec), row_range_bounds + 1,
thrust::upper_bound(policy, row_range_bounds + 1,
row_range_bounds + num_row_ranges + 1, input_row_idxs,
input_row_idxs + num_input_elements,
row_range_ids.get_data());
array<size_type> col_range_ids{exec, input.get_num_stored_elements()};
thrust::upper_bound(thrust_policy(exec), col_range_bounds + 1,
thrust::upper_bound(policy, col_range_bounds + 1,
col_range_bounds + num_col_ranges + 1, input_col_idxs,
input_col_idxs + num_input_elements,
col_range_ids.get_data());

// count number of local<0> and non-local<1> elements
// count number of local<0> and non-local<1> elements. Since the input
// may contain non-local rows, we don't have
// num_local + num_non_local = num_elements and can't just count one of them
auto range_ids_it = thrust::make_zip_iterator(thrust::make_tuple(
row_range_ids.get_const_data(), col_range_ids.get_const_data()));
auto num_elements_pair = thrust::transform_reduce(
thrust_policy(exec), range_ids_it, range_ids_it + num_input_elements,
policy, range_ids_it, range_ids_it + num_input_elements,
[local_part, row_part_ids, col_part_ids] __host__ __device__(
const thrust::tuple<size_type, size_type>& tuple) {
auto row_part = row_part_ids[thrust::get<0>(tuple)];
Expand Down Expand Up @@ -128,8 +129,8 @@ void build_local_nonlocal(
return thrust::make_tuple(local_row, local_col, input.val);
});
thrust::copy_if(
thrust_policy(exec), local_it,
local_it + input.get_num_stored_elements(), range_ids_it,
policy, local_it, local_it + input.get_num_stored_elements(),
range_ids_it,
thrust::make_zip_iterator(thrust::make_tuple(local_row_idxs.get_data(),
local_col_idxs.get_data(),
local_values.get_data())),
Expand All @@ -139,125 +140,33 @@ void build_local_nonlocal(
auto col_part = col_part_ids[thrust::get<1>(tuple)];
return row_part == local_part && col_part == local_part;
});


// copy and transform non-local entries into arrays. this keeps global
// column indices, and also stores the column part id for each non-local
// entry in an array
non_local_row_idxs.resize_and_reset(num_non_local_elements);
non_local_col_idxs.resize_and_reset(num_non_local_elements);
non_local_values.resize_and_reset(num_non_local_elements);
array<GlobalIndexType> non_local_global_col_idxs{exec,
num_non_local_elements};
array<comm_index_type> non_local_col_part_ids{exec, num_non_local_elements};
array<size_type> non_local_col_range_ids{exec, num_non_local_elements};
auto non_local_it = thrust::make_transform_iterator(
input_it, [map_to_local_row, map_to_local_col,
input_it, [map_to_local_row,
col_part_ids] __host__ __device__(const input_type input) {
auto local_row = map_to_local_row(input.row, input.row_range);
return thrust::make_tuple(local_row, input.col, input.val,
col_part_ids[input.col_range],
input.col_range);
return thrust::make_tuple(local_row, input.col, input.val);
});
thrust::copy_if(
thrust_policy(exec), non_local_it,
non_local_it + input.get_num_stored_elements(), range_ids_it,
policy, non_local_it, non_local_it + input.get_num_stored_elements(),
range_ids_it,
thrust::make_zip_iterator(thrust::make_tuple(
non_local_row_idxs.get_data(), non_local_global_col_idxs.get_data(),
non_local_values.get_data(), non_local_col_part_ids.get_data(),
non_local_col_range_ids.get_data())),
non_local_row_idxs.get_data(), non_local_col_idxs.get_data(),
non_local_values.get_data())),
[local_part, row_part_ids, col_part_ids] __host__ __device__(
const thrust::tuple<size_type, size_type>& tuple) {
auto row_part = row_part_ids[thrust::get<0>(tuple)];
auto col_part = col_part_ids[thrust::get<1>(tuple)];
return row_part == local_part && col_part != local_part;
});

// 1. sort global columns, part-id and range-id according to
// their part-id and global columns
// the previous `non_local_global_col_idxs` is not modify to
// keep it consistent with the non-local row and values array
array<GlobalIndexType> sorted_non_local_global_col_idxs{
exec, non_local_global_col_idxs};
auto key_it = thrust::make_zip_iterator(
thrust::make_tuple(non_local_col_part_ids.get_data(),
sorted_non_local_global_col_idxs.get_data()));
thrust::sort_by_key(thrust_policy(exec), key_it,
key_it + num_non_local_elements,
non_local_col_range_ids.get_data());

// 2. remove duplicate columns, now the new column i has global index
// non_local_global_col_idxs[i]
auto non_local_global_col_idxs_begin =
sorted_non_local_global_col_idxs.get_data();
auto non_local_global_col_idxs_end = thrust::get<0>(thrust::unique_by_key(
thrust_policy(exec), non_local_global_col_idxs_begin,
non_local_global_col_idxs_begin + num_non_local_elements,
thrust::make_zip_iterator(
thrust::make_tuple(non_local_col_part_ids.get_data(),
non_local_col_range_ids.get_data()))));
auto num_non_local_cols = static_cast<size_type>(thrust::distance(
non_local_global_col_idxs_begin, non_local_global_col_idxs_end));

// 2.5 copy unique_columns to non_local_to_global map
non_local_to_global.resize_and_reset(num_non_local_cols);
exec->copy(num_non_local_cols, non_local_global_col_idxs_begin,
non_local_to_global.get_data());

// 3. create mapping from unique_columns
// since we don't have hash tables on GPUs I'm first sorting the non-local
// global column indices and their new local index again by the global
// column index. Then I'm using binary searches to find the new local column
// index.
array<LocalIndexType> permutation{exec, num_non_local_cols};
thrust::sequence(thrust_policy(exec), permutation.get_data(),
permutation.get_data() + num_non_local_cols);
thrust::sort_by_key(
thrust_policy(exec), non_local_global_col_idxs_begin,
non_local_global_col_idxs_begin + num_non_local_cols,
thrust::make_zip_iterator(thrust::make_tuple(
non_local_col_part_ids.get_data(), permutation.get_data())));

// 4. map column index of non-local entries to new columns
non_local_col_idxs.resize_and_reset(num_non_local_elements);
array<size_type> lower_bounds{exec, num_non_local_elements};
// I have to precompute the lower bounds because the calling binary
// searches from the device does not work:
// https://github.com/NVIDIA/thrust/issues/1415
// TODO: compute lower bounds on-the-fly if available
thrust::lower_bound(
thrust_policy(exec), non_local_global_col_idxs_begin,
non_local_global_col_idxs_begin + num_non_local_cols,
non_local_global_col_idxs.get_data(),
non_local_global_col_idxs.get_data() + num_non_local_elements,
lower_bounds.get_data());
auto permutation_data = permutation.get_data();
thrust::transform(
thrust_policy(exec), lower_bounds.get_data(),
lower_bounds.get_data() + num_non_local_elements,
non_local_col_idxs.get_data(),
[permutation_data] __host__ __device__(const size_type lower_bound) {
return permutation_data[lower_bound];
});

// 5. compute gather idxs and recv_sizes
local_gather_idxs.resize_and_reset(num_non_local_cols);
auto transform_it = thrust::make_zip_iterator(thrust::make_tuple(
non_local_to_global.get_data(), non_local_col_range_ids.get_data()));
thrust::transform(
thrust_policy(exec), transform_it, transform_it + num_non_local_cols,
local_gather_idxs.get_data(),
[map_to_local_col] __host__ __device__(
const thrust::tuple<GlobalIndexType, size_type>& tuple) {
return map_to_local_col(thrust::get<0>(tuple),
thrust::get<1>(tuple));
});

auto recv_sizes_ptr = recv_sizes.get_data();
thrust::fill_n(thrust_policy(exec), recv_sizes_ptr, num_parts, 0);
thrust::for_each_n(thrust_policy(exec), non_local_col_part_ids.get_data(),
num_non_local_cols,
[recv_sizes_ptr] __device__(const size_type part) {
atomic_add(recv_sizes_ptr + part, 1);
});
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
GKO_DECLARE_BUILD_LOCAL_NONLOCAL);
GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL);
2 changes: 1 addition & 1 deletion core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
namespace distributed_matrix {


GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_BUILD_LOCAL_NONLOCAL);
GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL);


} // namespace distributed_matrix
Expand Down
Loading

0 comments on commit a61989f

Please sign in to comment.