From da9919048d355e3040f71d3afd331ba6ffaa4683 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 14 Feb 2024 16:38:50 +0000 Subject: [PATCH 1/6] [dist-mat] introduce kernel to separate local and non-local matrix data --- .../distributed/matrix_kernels.hpp.inc | 141 ++++-------------- core/device_hooks/common_kernels.inc.cpp | 2 +- core/distributed/matrix_kernels.hpp | 17 +-- dpcpp/distributed/matrix_kernels.dp.cpp | 11 +- omp/distributed/matrix_kernels.cpp | 70 +-------- reference/distributed/matrix_kernels.cpp | 64 +------- 6 files changed, 49 insertions(+), 256 deletions(-) diff --git a/common/cuda_hip/distributed/matrix_kernels.hpp.inc b/common/cuda_hip/distributed/matrix_kernels.hpp.inc index 5caf3522f62..8848e490c18 100644 --- a/common/cuda_hip/distributed/matrix_kernels.hpp.inc +++ b/common/cuda_hip/distributed/matrix_kernels.hpp.inc @@ -24,26 +24,23 @@ struct input_type { template -void build_local_nonlocal( +void separate_local_nonlocal( std::shared_ptr exec, const device_matrix_data& input, const experimental::distributed::Partition* row_partition, const experimental::distributed::Partition* col_partition, - comm_index_type local_part, array& local_row_idxs, + experimental::distributed::comm_index_type local_part, + array& local_row_idxs, array& local_col_idxs, array& local_values, array& non_local_row_idxs, - array& non_local_col_idxs, - array& non_local_values, - array& local_gather_idxs, - array& recv_sizes, - array& non_local_to_global) + array& non_local_col_idxs, + array& non_local_values) { auto input_vals = input.get_const_values(); auto row_part_ids = row_partition->get_part_ids(); auto col_part_ids = col_partition->get_part_ids(); - auto num_parts = static_cast(row_partition->get_num_parts()); const auto* row_range_bounds = row_partition->get_range_bounds(); const auto* col_range_bounds = col_partition->get_range_bounds(); const auto* row_range_starting_indices = @@ -54,25 +51,29 @@ void build_local_nonlocal( const auto num_col_ranges = col_partition->get_num_ranges(); const auto num_input_elements = input.get_num_stored_elements(); + auto policy = thrust_policy(exec); + // precompute the row and column range id of each input element auto input_row_idxs = input.get_const_row_idxs(); auto input_col_idxs = input.get_const_col_idxs(); array row_range_ids{exec, num_input_elements}; - thrust::upper_bound(thrust_policy(exec), row_range_bounds + 1, + thrust::upper_bound(policy, row_range_bounds + 1, row_range_bounds + num_row_ranges + 1, input_row_idxs, input_row_idxs + num_input_elements, row_range_ids.get_data()); array col_range_ids{exec, input.get_num_stored_elements()}; - thrust::upper_bound(thrust_policy(exec), col_range_bounds + 1, + thrust::upper_bound(policy, col_range_bounds + 1, col_range_bounds + num_col_ranges + 1, input_col_idxs, input_col_idxs + num_input_elements, col_range_ids.get_data()); - // count number of local<0> and non-local<1> elements + // count number of local<0> and non-local<1> elements. Since the input + // may contain non-local rows, we don't have + // num_local + num_non_local = num_elements and can't just count one of them auto range_ids_it = thrust::make_zip_iterator(thrust::make_tuple( row_range_ids.get_const_data(), col_range_ids.get_const_data())); auto num_elements_pair = thrust::transform_reduce( - thrust_policy(exec), range_ids_it, range_ids_it + num_input_elements, + policy, range_ids_it, range_ids_it + num_input_elements, [local_part, row_part_ids, col_part_ids] __host__ __device__( const thrust::tuple& tuple) { auto row_part = row_part_ids[thrust::get<0>(tuple)]; @@ -128,8 +129,8 @@ void build_local_nonlocal( return thrust::make_tuple(local_row, local_col, input.val); }); thrust::copy_if( - thrust_policy(exec), local_it, - local_it + input.get_num_stored_elements(), range_ids_it, + policy, local_it, local_it + input.get_num_stored_elements(), + range_ids_it, thrust::make_zip_iterator(thrust::make_tuple(local_row_idxs.get_data(), local_col_idxs.get_data(), local_values.get_data())), @@ -139,125 +140,33 @@ void build_local_nonlocal( auto col_part = col_part_ids[thrust::get<1>(tuple)]; return row_part == local_part && col_part == local_part; }); + + // copy and transform non-local entries into arrays. this keeps global // column indices, and also stores the column part id for each non-local // entry in an array non_local_row_idxs.resize_and_reset(num_non_local_elements); + non_local_col_idxs.resize_and_reset(num_non_local_elements); non_local_values.resize_and_reset(num_non_local_elements); - array non_local_global_col_idxs{exec, - num_non_local_elements}; - array non_local_col_part_ids{exec, num_non_local_elements}; - array non_local_col_range_ids{exec, num_non_local_elements}; auto non_local_it = thrust::make_transform_iterator( - input_it, [map_to_local_row, map_to_local_col, + input_it, [map_to_local_row, col_part_ids] __host__ __device__(const input_type input) { auto local_row = map_to_local_row(input.row, input.row_range); - return thrust::make_tuple(local_row, input.col, input.val, - col_part_ids[input.col_range], - input.col_range); + return thrust::make_tuple(local_row, input.col, input.val); }); thrust::copy_if( - thrust_policy(exec), non_local_it, - non_local_it + input.get_num_stored_elements(), range_ids_it, + policy, non_local_it, non_local_it + input.get_num_stored_elements(), + range_ids_it, thrust::make_zip_iterator(thrust::make_tuple( - non_local_row_idxs.get_data(), non_local_global_col_idxs.get_data(), - non_local_values.get_data(), non_local_col_part_ids.get_data(), - non_local_col_range_ids.get_data())), + non_local_row_idxs.get_data(), non_local_col_idxs.get_data(), + non_local_values.get_data())), [local_part, row_part_ids, col_part_ids] __host__ __device__( const thrust::tuple& tuple) { auto row_part = row_part_ids[thrust::get<0>(tuple)]; auto col_part = col_part_ids[thrust::get<1>(tuple)]; return row_part == local_part && col_part != local_part; }); - - // 1. sort global columns, part-id and range-id according to - // their part-id and global columns - // the previous `non_local_global_col_idxs` is not modify to - // keep it consistent with the non-local row and values array - array sorted_non_local_global_col_idxs{ - exec, non_local_global_col_idxs}; - auto key_it = thrust::make_zip_iterator( - thrust::make_tuple(non_local_col_part_ids.get_data(), - sorted_non_local_global_col_idxs.get_data())); - thrust::sort_by_key(thrust_policy(exec), key_it, - key_it + num_non_local_elements, - non_local_col_range_ids.get_data()); - - // 2. remove duplicate columns, now the new column i has global index - // non_local_global_col_idxs[i] - auto non_local_global_col_idxs_begin = - sorted_non_local_global_col_idxs.get_data(); - auto non_local_global_col_idxs_end = thrust::get<0>(thrust::unique_by_key( - thrust_policy(exec), non_local_global_col_idxs_begin, - non_local_global_col_idxs_begin + num_non_local_elements, - thrust::make_zip_iterator( - thrust::make_tuple(non_local_col_part_ids.get_data(), - non_local_col_range_ids.get_data())))); - auto num_non_local_cols = static_cast(thrust::distance( - non_local_global_col_idxs_begin, non_local_global_col_idxs_end)); - - // 2.5 copy unique_columns to non_local_to_global map - non_local_to_global.resize_and_reset(num_non_local_cols); - exec->copy(num_non_local_cols, non_local_global_col_idxs_begin, - non_local_to_global.get_data()); - - // 3. create mapping from unique_columns - // since we don't have hash tables on GPUs I'm first sorting the non-local - // global column indices and their new local index again by the global - // column index. Then I'm using binary searches to find the new local column - // index. - array permutation{exec, num_non_local_cols}; - thrust::sequence(thrust_policy(exec), permutation.get_data(), - permutation.get_data() + num_non_local_cols); - thrust::sort_by_key( - thrust_policy(exec), non_local_global_col_idxs_begin, - non_local_global_col_idxs_begin + num_non_local_cols, - thrust::make_zip_iterator(thrust::make_tuple( - non_local_col_part_ids.get_data(), permutation.get_data()))); - - // 4. map column index of non-local entries to new columns - non_local_col_idxs.resize_and_reset(num_non_local_elements); - array lower_bounds{exec, num_non_local_elements}; - // I have to precompute the lower bounds because the calling binary - // searches from the device does not work: - // https://github.com/NVIDIA/thrust/issues/1415 - // TODO: compute lower bounds on-the-fly if available - thrust::lower_bound( - thrust_policy(exec), non_local_global_col_idxs_begin, - non_local_global_col_idxs_begin + num_non_local_cols, - non_local_global_col_idxs.get_data(), - non_local_global_col_idxs.get_data() + num_non_local_elements, - lower_bounds.get_data()); - auto permutation_data = permutation.get_data(); - thrust::transform( - thrust_policy(exec), lower_bounds.get_data(), - lower_bounds.get_data() + num_non_local_elements, - non_local_col_idxs.get_data(), - [permutation_data] __host__ __device__(const size_type lower_bound) { - return permutation_data[lower_bound]; - }); - - // 5. compute gather idxs and recv_sizes - local_gather_idxs.resize_and_reset(num_non_local_cols); - auto transform_it = thrust::make_zip_iterator(thrust::make_tuple( - non_local_to_global.get_data(), non_local_col_range_ids.get_data())); - thrust::transform( - thrust_policy(exec), transform_it, transform_it + num_non_local_cols, - local_gather_idxs.get_data(), - [map_to_local_col] __host__ __device__( - const thrust::tuple& tuple) { - return map_to_local_col(thrust::get<0>(tuple), - thrust::get<1>(tuple)); - }); - - auto recv_sizes_ptr = recv_sizes.get_data(); - thrust::fill_n(thrust_policy(exec), recv_sizes_ptr, num_parts, 0); - thrust::for_each_n(thrust_policy(exec), non_local_col_part_ids.get_data(), - num_non_local_cols, - [recv_sizes_ptr] __device__(const size_type part) { - atomic_add(recv_sizes_ptr + part, 1); - }); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( - GKO_DECLARE_BUILD_LOCAL_NONLOCAL); + GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL); diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index cb74ad443f6..f26b221a799 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -275,7 +275,7 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( namespace distributed_matrix { -GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_BUILD_LOCAL_NONLOCAL); +GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL); } // namespace distributed_matrix diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp index 6f984ef9d71..a424c49c442 100644 --- a/core/distributed/matrix_kernels.hpp +++ b/core/distributed/matrix_kernels.hpp @@ -20,9 +20,9 @@ namespace gko { namespace kernels { -#define GKO_DECLARE_BUILD_LOCAL_NONLOCAL(ValueType, LocalIndexType, \ - GlobalIndexType) \ - void build_local_nonlocal( \ +#define GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType, \ + GlobalIndexType) \ + void separate_local_nonlocal( \ std::shared_ptr exec, \ const device_matrix_data& input, \ const experimental::distributed::Partition< \ @@ -32,19 +32,16 @@ namespace kernels { comm_index_type local_part, array& local_row_idxs, \ array& local_col_idxs, array& local_values, \ array& non_local_row_idxs, \ - array& non_local_col_idxs, \ - array& non_local_values, \ - array& local_gather_idxs, \ - array& recv_offsets, \ - array& non_local_to_global) + array& non_local_col_idxs, \ + array& non_local_values) #define GKO_DECLARE_ALL_AS_TEMPLATES \ using comm_index_type = experimental::distributed::comm_index_type; \ template \ - GKO_DECLARE_BUILD_LOCAL_NONLOCAL(ValueType, LocalIndexType, \ - GlobalIndexType) + GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType, \ + GlobalIndexType) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(distributed_matrix, diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp index bf5ebeaa685..5649cb579c9 100644 --- a/dpcpp/distributed/matrix_kernels.dp.cpp +++ b/dpcpp/distributed/matrix_kernels.dp.cpp @@ -15,7 +15,7 @@ namespace distributed_matrix { template -void build_local_nonlocal( +void separate_local_nonlocal( std::shared_ptr exec, const device_matrix_data& input, const experimental::distributed::Partition* @@ -25,14 +25,11 @@ void build_local_nonlocal( comm_index_type local_part, array& local_row_idxs, array& local_col_idxs, array& local_values, array& non_local_row_idxs, - array& non_local_col_idxs, - array& non_local_values, - array& local_gather_idxs, - array& recv_sizes, - array& non_local_to_global) GKO_NOT_IMPLEMENTED; + array& non_local_col_idxs, + array& non_local_values) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( - GKO_DECLARE_BUILD_LOCAL_NONLOCAL); + GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL); } // namespace distributed_matrix diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp index 1bea0898304..076136e99e0 100644 --- a/omp/distributed/matrix_kernels.cpp +++ b/omp/distributed/matrix_kernels.cpp @@ -24,7 +24,7 @@ namespace distributed_matrix { template -void build_local_nonlocal( +void separate_local_nonlocal( std::shared_ptr exec, const device_matrix_data& input, const experimental::distributed::Partition* @@ -34,14 +34,9 @@ void build_local_nonlocal( comm_index_type local_part, array& local_row_idxs, array& local_col_idxs, array& local_values, array& non_local_row_idxs, - array& non_local_col_idxs, - array& non_local_values, - array& local_gather_idxs, - array& recv_sizes, - array& non_local_to_global) + array& non_local_col_idxs, + array& non_local_values) { - using partition_type = - experimental::distributed::Partition; using range_index_type = GlobalIndexType; using global_nonzero = matrix_data_entry; using local_nonzero = matrix_data_entry; @@ -51,14 +46,9 @@ void build_local_nonlocal( auto row_part_ids = row_partition->get_part_ids(); auto col_part_ids = col_partition->get_part_ids(); auto num_parts = row_partition->get_num_parts(); - auto recv_sizes_ptr = recv_sizes.get_data(); size_type row_range_id_hint = 0; size_type col_range_id_hint = 0; - // zero recv_sizes values - std::fill_n(recv_sizes_ptr, num_parts, comm_index_type{}); - // store non-local columns and their range indices - map non_local_cols(exec); // store non-local entries with global column idxs vector non_local_entries(exec); vector local_entries(exec); @@ -71,11 +61,8 @@ void build_local_nonlocal( #pragma omp parallel firstprivate(col_range_id_hint, row_range_id_hint) { - std::unordered_map - thread_non_local_cols; std::vector thread_non_local_entries; std::vector thread_local_entries; - std::vector thread_recv_sizes; auto thread_id = omp_get_thread_num(); auto thread_begin = thread_id * size_per_thread; auto thread_end = std::min(thread_begin + size_per_thread, num_input); @@ -103,7 +90,6 @@ void build_local_nonlocal( thread_local_entries.emplace_back(local_row, local_col, value); } else { - thread_non_local_cols.emplace(global_col, col_range_id); thread_non_local_entries.emplace_back(local_row, global_col, value); } @@ -112,12 +98,6 @@ void build_local_nonlocal( local_entry_offsets[thread_id] = thread_local_entries.size(); non_local_entry_offsets[thread_id] = thread_non_local_entries.size(); -#pragma omp critical - { - // collect global non-local columns - non_local_cols.insert(thread_non_local_cols.begin(), - thread_non_local_cols.end()); - } #pragma omp barrier #pragma omp single { @@ -158,45 +138,6 @@ void build_local_nonlocal( local_col_idxs.get_data()[i] = entry.column; local_values.get_data()[i] = entry.value; } - - // count non-local columns per part - for (const auto& entry : non_local_cols) { - auto col_range_id = entry.second; - recv_sizes_ptr[col_part_ids[col_range_id]]++; - } - const auto num_non_local_cols = std::accumulate( - recv_sizes_ptr, recv_sizes_ptr + num_parts, size_type{}); - components::prefix_sum_nonnegative(exec, recv_sizes_ptr, num_parts); - - // collect and renumber offdiagonal columns - local_gather_idxs.resize_and_reset(num_non_local_cols); - std::unordered_map - non_local_global_to_local; - for (const auto& entry : non_local_cols) { - auto range = entry.second; - auto part = col_part_ids[range]; - auto idx = recv_sizes_ptr[part]; - local_gather_idxs.get_data()[idx] = - map_to_local(entry.first, col_partition, entry.second); - non_local_global_to_local[entry.first] = idx; - ++recv_sizes_ptr[part]; - } - - // build local-to-global map for non-local columns - non_local_to_global.resize_and_reset(num_non_local_cols); - std::fill_n(non_local_to_global.get_data(), non_local_to_global.get_size(), - invalid_index()); - for (const auto& key_value : non_local_global_to_local) { - const auto global_idx = key_value.first; - const auto local_idx = key_value.second; - non_local_to_global.get_data()[local_idx] = global_idx; - } - - // compute sizes from shifted offsets - for (size_type i = num_parts - 1; i > 0; --i) { - recv_sizes_ptr[i] -= recv_sizes_ptr[i - 1]; - } - // map non-local values to local column indices non_local_row_idxs.resize_and_reset(non_local_entries.size()); non_local_col_idxs.resize_and_reset(non_local_entries.size()); @@ -206,14 +147,13 @@ void build_local_nonlocal( auto global = non_local_entries[i]; non_local_row_idxs.get_data()[i] = static_cast(global.row); - non_local_col_idxs.get_data()[i] = - non_local_global_to_local[global.column]; + non_local_col_idxs.get_data()[i] = global.column; non_local_values.get_data()[i] = global.value; } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( - GKO_DECLARE_BUILD_LOCAL_NONLOCAL); + GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL); } // namespace distributed_matrix diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp index a9fd28889e8..9b4ff9231df 100644 --- a/reference/distributed/matrix_kernels.cpp +++ b/reference/distributed/matrix_kernels.cpp @@ -8,7 +8,6 @@ #include "core/base/allocator.hpp" #include "core/base/device_matrix_data_kernels.hpp" #include "core/base/iterator_factory.hpp" -#include "core/components/prefix_sum_kernels.hpp" #include "reference/distributed/partition_helpers.hpp" @@ -19,7 +18,7 @@ namespace distributed_matrix { template -void build_local_nonlocal( +void separate_local_nonlocal( std::shared_ptr exec, const device_matrix_data& input, const experimental::distributed::Partition* @@ -29,14 +28,9 @@ void build_local_nonlocal( comm_index_type local_part, array& local_row_idxs, array& local_col_idxs, array& local_values, array& non_local_row_idxs, - array& non_local_col_idxs, - array& non_local_values, - array& local_gather_idxs, - array& recv_sizes, - array& non_local_to_global) + array& non_local_col_idxs, + array& non_local_values) { - using partition_type = - experimental::distributed::Partition; using global_nonzero = matrix_data_entry; auto input_row_idxs = input.get_const_row_idxs(); auto input_col_idxs = input.get_const_col_idxs(); @@ -80,65 +74,21 @@ void build_local_nonlocal( } // create non-local matrix - // 1. stable sort global columns according to their part-id and global - // columns - auto find_col_part = [&](GlobalIndexType idx) { - auto range_id = find_range(idx, col_partition, 0); - return col_part_ids[range_id]; - }; - vector unique_columns(exec); - std::transform(non_local_entries.begin(), non_local_entries.end(), - std::back_inserter(unique_columns), - [](const auto& entry) { return entry.column; }); - std::sort(unique_columns.begin(), unique_columns.end(), - [&](const auto& a, const auto& b) { - auto part_a = find_col_part(a); - auto part_b = find_col_part(b); - return std::tie(part_a, a) < std::tie(part_b, b); - }); - - // 2. remove duplicate columns, now the new column i has global index - // unique_columns[i] - unique_columns.erase( - std::unique(unique_columns.begin(), unique_columns.end()), - unique_columns.end()); - - // 3. create mapping from unique_columns - unordered_map non_local_column_map(exec); - for (size_type i = 0; i < unique_columns.size(); ++i) { - non_local_column_map[unique_columns[i]] = - static_cast(i); - } - - // 3.5 copy unique_columns to array - non_local_to_global = array{exec, unique_columns.begin(), - unique_columns.end()}; - - // 4. fill non_local_data + // copy non-local data into row and value array + // copy non-local global column indices into temporary vector non_local_row_idxs.resize_and_reset(non_local_entries.size()); non_local_col_idxs.resize_and_reset(non_local_entries.size()); non_local_values.resize_and_reset(non_local_entries.size()); for (size_type i = 0; i < non_local_entries.size(); ++i) { const auto& entry = non_local_entries[i]; non_local_row_idxs.get_data()[i] = entry.row; - non_local_col_idxs.get_data()[i] = non_local_column_map[entry.column]; + non_local_col_idxs.get_data()[i] = entry.column; non_local_values.get_data()[i] = entry.value; } - - // compute gather idxs and recv_sizes - local_gather_idxs.resize_and_reset(unique_columns.size()); - std::fill_n(recv_sizes.get_data(), num_parts, 0); - for (size_type i = 0; i < unique_columns.size(); ++i) { - col_range_id = - find_range(unique_columns[i], col_partition, col_range_id); - local_gather_idxs.get_data()[i] = - map_to_local(unique_columns[i], col_partition, col_range_id); - recv_sizes.get_data()[find_col_part(unique_columns[i])]++; - } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( - GKO_DECLARE_BUILD_LOCAL_NONLOCAL); + GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL); } // namespace distributed_matrix From 2eb2ff918fcb3c0826e417bf916fe8cb872046b4 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 6 May 2024 16:16:23 +0200 Subject: [PATCH 2/6] [dist-mat] use index_map in matrix::read_distributed --- core/distributed/matrix.cpp | 104 +++++++++++++-------- include/ginkgo/core/distributed/matrix.hpp | 26 +++--- 2 files changed, 80 insertions(+), 50 deletions(-) diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index dbeea7c9757..7b23a947fc3 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -21,8 +21,8 @@ namespace matrix { namespace { -GKO_REGISTER_OPERATION(build_local_nonlocal, - distributed_matrix::build_local_nonlocal); +GKO_REGISTER_OPERATION(separate_local_nonlocal, + distributed_matrix::separate_local_nonlocal); } // namespace @@ -51,7 +51,6 @@ Matrix::Matrix( recv_offsets_(comm.size() + 1), recv_sizes_(comm.size()), gather_idxs_{exec}, - recv_gather_idxs_{exec}, non_local_to_global_{exec}, one_scalar_{}, local_mtx_{local_matrix_template->clone(exec)}, @@ -79,7 +78,6 @@ Matrix::Matrix( recv_offsets_(comm.size() + 1), recv_sizes_(comm.size()), gather_idxs_{exec}, - recv_gather_idxs_{exec}, non_local_to_global_{exec}, one_scalar_{}, non_local_mtx_(::gko::matrix::Coo::create( @@ -106,16 +104,14 @@ Matrix::Matrix( recv_offsets_(comm.size() + 1), recv_sizes_(comm.size()), gather_idxs_{exec}, - recv_gather_idxs_{exec}, non_local_to_global_{exec}, one_scalar_{} { this->set_size(size); - local_mtx_ = local_linop; - non_local_mtx_ = non_local_linop; - recv_offsets_ = recv_offsets; - recv_sizes_ = recv_sizes; - recv_gather_idxs_ = recv_gather_idxs; + local_mtx_ = std::move(local_linop); + non_local_mtx_ = std::move(non_local_linop); + recv_offsets_ = std::move(recv_offsets); + recv_sizes_ = std::move(recv_sizes); // build send information from recv copy // exchange step 1: determine recv_sizes, send_sizes, send_offsets std::partial_sum(recv_sizes_.begin(), recv_sizes_.end(), @@ -129,18 +125,17 @@ Matrix::Matrix( // exchange step 2: exchange gather_idxs from receivers to senders auto use_host_buffer = mpi::requires_host_buffer(exec, comm); if (use_host_buffer) { - recv_gather_idxs_.set_executor(exec->get_master()); + recv_gather_idxs.set_executor(exec->get_master()); gather_idxs_.clear(); gather_idxs_.set_executor(exec->get_master()); } gather_idxs_.resize_and_reset(send_offsets_.back()); comm.all_to_all_v(use_host_buffer ? exec->get_master() : exec, - recv_gather_idxs_.get_const_data(), recv_sizes_.data(), + recv_gather_idxs.get_const_data(), recv_sizes_.data(), recv_offsets_.data(), gather_idxs_.get_data(), send_sizes_.data(), send_offsets_.data()); if (use_host_buffer) { gather_idxs_.set_executor(exec); - recv_gather_idxs_.set_executor(exec); } one_scalar_.init(exec, dim<2>{1, 1}); @@ -244,11 +239,12 @@ void Matrix::move_to( template -void Matrix::read_distributed( +index_map +Matrix::read_distributed( const device_matrix_data& data, - ptr_param> + std::shared_ptr> row_partition, - ptr_param> + std::shared_ptr> col_partition) { const auto comm = this->get_communicator(); @@ -271,29 +267,40 @@ void Matrix::read_distributed( array local_col_idxs{exec}; array local_values{exec}; array non_local_row_idxs{exec}; - array non_local_col_idxs{exec}; + array global_non_local_col_idxs{exec}; array non_local_values{exec}; - array recv_sizes_array{exec, num_parts}; - // build local, non-local matrix data and communication structures - exec->run(matrix::make_build_local_nonlocal( + // separate input into local and non-local block + // The rows and columns of the local block are mapped into local indexing, + // as well as the rows of the non-local block. The columns of the non-local + // block are still in global indices. + exec->run(matrix::make_separate_local_nonlocal( data, make_temporary_clone(exec, row_partition).get(), make_temporary_clone(exec, col_partition).get(), local_part, local_row_idxs, local_col_idxs, local_values, non_local_row_idxs, - non_local_col_idxs, non_local_values, recv_gather_idxs_, - recv_sizes_array, non_local_to_global_)); + global_non_local_col_idxs, non_local_values)); + + auto imap = index_map( + exec, col_partition, comm.rank(), global_non_local_col_idxs); + + auto non_local_col_idxs = + imap.map_to_local(global_non_local_col_idxs, index_space::non_local); + non_local_to_global_ = + make_const_array_view( + imap.get_executor(), imap.get_remote_global_idxs().get_size(), + imap.get_remote_global_idxs().get_const_flat_data()) + .copy_to_array(); // read the local matrix data const auto num_local_rows = static_cast(row_partition->get_part_size(local_part)); const auto num_local_cols = static_cast(col_partition->get_part_size(local_part)); - const auto num_non_local_cols = non_local_to_global_.get_size(); device_matrix_data local_data{ exec, dim<2>{num_local_rows, num_local_cols}, std::move(local_row_idxs), std::move(local_col_idxs), std::move(local_values)}; device_matrix_data non_local_data{ - exec, dim<2>{num_local_rows, num_non_local_cols}, + exec, dim<2>{num_local_rows, imap.get_remote_global_idxs().get_size()}, std::move(non_local_row_idxs), std::move(non_local_col_idxs), std::move(non_local_values)}; as>(this->local_mtx_) @@ -302,8 +309,16 @@ void Matrix::read_distributed( ->read(std::move(non_local_data)); // exchange step 1: determine recv_sizes, send_sizes, send_offsets - exec->get_master()->copy_from( - exec, num_parts, recv_sizes_array.get_const_data(), recv_sizes_.data()); + auto host_recv_targets = + make_temporary_clone(exec->get_master(), &imap.get_remote_target_ids()); + auto host_offsets = make_temporary_clone( + exec->get_master(), &imap.get_remote_global_idxs().get_offsets()); + std::fill(recv_sizes_.begin(), recv_sizes_.end(), 0); + for (size_type i = 0; i < host_recv_targets->get_size(); ++i) { + recv_sizes_[host_recv_targets->get_const_data()[i]] = + host_offsets->get_const_data()[i + 1] - + host_offsets->get_const_data()[i]; + } std::partial_sum(recv_sizes_.begin(), recv_sizes_.end(), recv_offsets_.begin() + 1); comm.all_to_all(exec, recv_sizes_.data(), 1, send_sizes_.data(), 1); @@ -313,33 +328,40 @@ void Matrix::read_distributed( recv_offsets_[0] = 0; // exchange step 2: exchange gather_idxs from receivers to senders + auto recv_gather_idxs = + make_const_array_view( + imap.get_executor(), imap.get_non_local_size(), + imap.get_remote_local_idxs().get_const_flat_data()) + .copy_to_array(); auto use_host_buffer = mpi::requires_host_buffer(exec, comm); if (use_host_buffer) { - recv_gather_idxs_.set_executor(exec->get_master()); + recv_gather_idxs.set_executor(exec->get_master()); gather_idxs_.clear(); gather_idxs_.set_executor(exec->get_master()); } gather_idxs_.resize_and_reset(send_offsets_.back()); comm.all_to_all_v(use_host_buffer ? exec->get_master() : exec, - recv_gather_idxs_.get_const_data(), recv_sizes_.data(), + recv_gather_idxs.get_const_data(), recv_sizes_.data(), recv_offsets_.data(), gather_idxs_.get_data(), send_sizes_.data(), send_offsets_.data()); if (use_host_buffer) { gather_idxs_.set_executor(exec); - recv_gather_idxs_.set_executor(exec); } + + return imap; } template -void Matrix::read_distributed( +index_map +Matrix::read_distributed( const matrix_data& data, - ptr_param> + std::shared_ptr> row_partition, - ptr_param> + std::shared_ptr> col_partition) { - this->read_distributed( + return this->read_distributed( device_matrix_data::create_from_host( this->get_executor(), data), row_partition, col_partition); @@ -347,11 +369,13 @@ void Matrix::read_distributed( template -void Matrix::read_distributed( +index_map +Matrix::read_distributed( const matrix_data& data, - ptr_param> partition) + std::shared_ptr> + partition) { - this->read_distributed( + return this->read_distributed( device_matrix_data::create_from_host( this->get_executor(), data), partition, partition); @@ -359,11 +383,13 @@ void Matrix::read_distributed( template -void Matrix::read_distributed( +index_map +Matrix::read_distributed( const device_matrix_data& data, - ptr_param> partition) + std::shared_ptr> + partition) { - this->read_distributed(data, partition, partition); + return this->read_distributed(data, partition, partition); } diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index e9f497800b5..66ed5f7089b 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -289,10 +290,12 @@ class Matrix * * @param data The device_matrix_data structure. * @param partition The global row and column partition. + * + * @return the index_map induced by the partitions and the matrix structure */ - void read_distributed( + index_map read_distributed( const device_matrix_data& data, - ptr_param> + std::shared_ptr> partition); /** @@ -304,9 +307,9 @@ class Matrix * @note For efficiency it is advised to use the device_matrix_data * overload. */ - void read_distributed( + index_map read_distributed( const matrix_data& data, - ptr_param> + std::shared_ptr> partition); /** @@ -323,12 +326,14 @@ class Matrix * @param data The device_matrix_data structure. * @param row_partition The global row partition. * @param col_partition The global col partition. + * + * @return the index_map induced by the partitions and the matrix structure */ - void read_distributed( + index_map read_distributed( const device_matrix_data& data, - ptr_param> + std::shared_ptr> row_partition, - ptr_param> + std::shared_ptr> col_partition); /** @@ -340,11 +345,11 @@ class Matrix * @note For efficiency it is advised to use the device_matrix_data * overload. */ - void read_distributed( + index_map read_distributed( const matrix_data& data, - ptr_param> + std::shared_ptr> row_partition, - ptr_param> + std::shared_ptr> col_partition); /** @@ -612,7 +617,6 @@ class Matrix std::vector recv_offsets_; std::vector recv_sizes_; array gather_idxs_; - array recv_gather_idxs_; array non_local_to_global_; gko::detail::DenseCache one_scalar_; gko::detail::DenseCache host_send_buffer_; From 7d75f3da5e60634c9cab726ff1cfdb165fd613e9 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 14 Feb 2024 16:40:41 +0000 Subject: [PATCH 3/6] [dist-mat] adjust usages of read_distributed --- benchmark/utils/generator.hpp | 8 +- reference/test/distributed/matrix_kernels.cpp | 218 ++++++------------ test/distributed/matrix_kernels.cpp | 38 +-- test/mpi/matrix.cpp | 8 +- test/mpi/multigrid/pgm.cpp | 2 +- test/mpi/preconditioner/schwarz.cpp | 2 +- test/mpi/solver/solver.cpp | 37 +-- 7 files changed, 112 insertions(+), 201 deletions(-) diff --git a/benchmark/utils/generator.hpp b/benchmark/utils/generator.hpp index 25e8f6f81ce..6b173651aa3 100644 --- a/benchmark/utils/generator.hpp +++ b/benchmark/utils/generator.hpp @@ -223,9 +223,11 @@ struct DistributedDefaultSystemGenerator { const gko::matrix_data& data, json* spmv_case = nullptr) const { - auto part = gko::experimental::distributed:: - Partition::build_from_global_size_uniform( - exec, comm.size(), static_cast(data.size[0])); + auto part = gko::share( + gko::experimental::distributed::Partition:: + build_from_global_size_uniform( + exec, comm.size(), + static_cast(data.size[0]))); auto formats = split(format_name, '-'); if (formats.size() != 2) { throw std::runtime_error{"Invalid distributed format specifier " + diff --git a/reference/test/distributed/matrix_kernels.cpp b/reference/test/distributed/matrix_kernels.cpp index eb2473fbfda..302e16a2d6f 100644 --- a/reference/test/distributed/matrix_kernels.cpp +++ b/reference/test/distributed/matrix_kernels.cpp @@ -46,10 +46,7 @@ class Matrix : public ::testing::Test { local_values{ref}, non_local_row_idxs{ref}, non_local_col_idxs{ref}, - non_local_values{ref}, - gather_idxs{ref}, - recv_sizes{ref}, - non_local_to_global{ref} + non_local_values{ref} {} void validate( @@ -72,56 +69,50 @@ class Matrix : public ::testing::Test { std::tuple, std::initializer_list, std::initializer_list, std::initializer_list>> - non_local_entries, - std::initializer_list> - gather_idx_entries, - std::initializer_list> - recv_sizes_entries) + non_local_entries) { std::vector> ref_locals; - std::vector> + std::vector< + std::tuple, gko::array, + gko::array, gko::array>> ref_non_locals; - std::vector> ref_gather_idxs; - std::vector> ref_recv_sizes; auto input = gko::device_matrix_data{ ref, size, gko::array{ref, input_rows}, gko::array{ref, input_cols}, gko::array{ref, input_vals}}; - this->recv_sizes.resize_and_reset( - static_cast(row_partition->get_num_parts())); for (auto entry : local_entries) { ref_locals.emplace_back(ref, std::get<0>(entry), std::get<1>(entry), std::get<2>(entry), std::get<3>(entry)); } for (auto entry : non_local_entries) { - ref_non_locals.emplace_back(ref, std::get<0>(entry), - std::get<1>(entry), std::get<2>(entry), - std::get<3>(entry)); - } - for (auto entry : gather_idx_entries) { - ref_gather_idxs.emplace_back(ref, entry); - } - for (auto entry : recv_sizes_entries) { - ref_recv_sizes.emplace_back(ref, entry); + ref_non_locals.emplace_back( + std::get<0>(entry), + gko::array{ref, std::get<1>(entry)}, + gko::array{ref, std::get<2>(entry)}, + gko::array{ref, std::get<3>(entry)}); } for (comm_index_type part = 0; part < row_partition->get_num_parts(); ++part) { - gko::kernels::reference::distributed_matrix::build_local_nonlocal( - ref, input, row_partition.get(), col_partition.get(), part, - local_row_idxs, local_col_idxs, local_values, - non_local_row_idxs, non_local_col_idxs, non_local_values, - gather_idxs, recv_sizes, non_local_to_global); - - assert_device_matrix_data_equal(local_row_idxs, local_col_idxs, - local_values, ref_locals[part]); - assert_device_matrix_data_equal( - non_local_row_idxs, non_local_col_idxs, non_local_values, - ref_non_locals[part]); - GKO_ASSERT_ARRAY_EQ(gather_idxs, ref_gather_idxs[part]); - GKO_ASSERT_ARRAY_EQ(recv_sizes, ref_recv_sizes[part]); + gko::kernels::reference::distributed_matrix:: + separate_local_nonlocal( + ref, input, row_partition.get(), col_partition.get(), part, + local_row_idxs, local_col_idxs, local_values, + non_local_row_idxs, non_local_col_idxs, non_local_values); + + + auto local_arrays = ref_locals[part].empty_out(); + GKO_ASSERT_ARRAY_EQ(local_row_idxs, local_arrays.row_idxs); + GKO_ASSERT_ARRAY_EQ(local_col_idxs, local_arrays.col_idxs); + GKO_ASSERT_ARRAY_EQ(local_values, local_arrays.values); + GKO_ASSERT_ARRAY_EQ(non_local_row_idxs, + std::get<1>(ref_non_locals[part])); + GKO_ASSERT_ARRAY_EQ(non_local_col_idxs, + std::get<2>(ref_non_locals[part])); + GKO_ASSERT_ARRAY_EQ(non_local_values, + std::get<3>(ref_non_locals[part])); } } @@ -165,18 +156,15 @@ class Matrix : public ::testing::Test { gko::array local_col_idxs; gko::array local_values; gko::array non_local_row_idxs; - gko::array non_local_col_idxs; + gko::array non_local_col_idxs; gko::array non_local_values; - gko::array gather_idxs; - gko::array recv_sizes; - gko::array non_local_to_global; }; TYPED_TEST_SUITE(Matrix, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); -TYPED_TEST(Matrix, BuildsLocalNonLocalEmpty) +TYPED_TEST(Matrix, SeparateLocalNonLocalEmpty) { using lit = typename TestFixture::local_index_type; using git = typename TestFixture::global_index_type; @@ -194,12 +182,11 @@ TYPED_TEST(Matrix, BuildsLocalNonLocalEmpty) std::make_tuple(gko::dim<2>{3, 3}, I{}, I{}, I{})}, {std::make_tuple(gko::dim<2>{2, 0}, I{}, I{}, I{}), std::make_tuple(gko::dim<2>{3, 0}, I{}, I{}, I{}), - std::make_tuple(gko::dim<2>{3, 0}, I{}, I{}, I{})}, - {{}, {}, {}}, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}); + std::make_tuple(gko::dim<2>{3, 0}, I{}, I{}, I{})}); } -TYPED_TEST(Matrix, BuildsLocalNonLocalSmall) +TYPED_TEST(Matrix, SeparateLocalNonLocalSmall) { using lit = typename TestFixture::local_index_type; using git = typename TestFixture::global_index_type; @@ -216,12 +203,11 @@ TYPED_TEST(Matrix, BuildsLocalNonLocalSmall) {std::make_tuple(gko::dim<2>{1, 1}, I{0}, I{0}, I{4}), std::make_tuple(gko::dim<2>{1, 1}, I{0}, I{0}, I{1})}, {std::make_tuple(gko::dim<2>{1, 1}, I{0}, I{0}, I{3}), - std::make_tuple(gko::dim<2>{1, 1}, I{0}, I{0}, I{2})}, - {{0}, {0}}, {{0, 1}, {1, 0}}); + std::make_tuple(gko::dim<2>{1, 1}, I{0}, I{1}, I{2})}); } -TYPED_TEST(Matrix, BuildsLocalNonLocalNoNonLocal) +TYPED_TEST(Matrix, SeparateLocalNonLocalNoNonLocal) { using lit = typename TestFixture::local_index_type; using git = typename TestFixture::global_index_type; @@ -243,12 +229,11 @@ TYPED_TEST(Matrix, BuildsLocalNonLocalNoNonLocal) I{3, 4, 7})}, {std::make_tuple(gko::dim<2>{2, 0}, I{}, I{}, I{}), std::make_tuple(gko::dim<2>{2, 0}, I{}, I{}, I{}), - std::make_tuple(gko::dim<2>{2, 0}, I{}, I{}, I{})}, - {{}, {}, {}}, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}); + std::make_tuple(gko::dim<2>{2, 0}, I{}, I{}, I{})}); } -TYPED_TEST(Matrix, BuildsLocalNonLocalNoLocal) +TYPED_TEST(Matrix, SeparateLocalNonLocalNoLocal) { using lit = typename TestFixture::local_index_type; using git = typename TestFixture::global_index_type; @@ -265,16 +250,15 @@ TYPED_TEST(Matrix, BuildsLocalNonLocalNoLocal) {std::make_tuple(gko::dim<2>{2, 2}, I{}, I{}, I{}), std::make_tuple(gko::dim<2>{2, 2}, I{}, I{}, I{}), std::make_tuple(gko::dim<2>{2, 2}, I{}, I{}, I{})}, - {std::make_tuple(gko::dim<2>{2, 1}, I{1}, I{0}, I{6}), - std::make_tuple(gko::dim<2>{2, 3}, I{0, 0, 1}, I{2, 1, 0}, + {std::make_tuple(gko::dim<2>{2, 1}, I{1}, I{1}, I{6}), + std::make_tuple(gko::dim<2>{2, 3}, I{0, 0, 1}, I{1, 3, 2}, I{1, 2, 8}), - std::make_tuple(gko::dim<2>{2, 2}, I{0, 1}, I{1, 0}, - I{5, 7})}, - {{0}, {0, 1, 0}, {1, 1}}, {{0, 0, 1}, {2, 0, 1}, {1, 1, 0}}); + std::make_tuple(gko::dim<2>{2, 2}, I{0, 1}, I{5, 3}, + I{5, 7})}); } -TYPED_TEST(Matrix, BuildsLocalNonLocalMixed) +TYPED_TEST(Matrix, SeparateLocalNonLocalMixed) { using lit = typename TestFixture::local_index_type; using git = typename TestFixture::global_index_type; @@ -287,26 +271,26 @@ TYPED_TEST(Matrix, BuildsLocalNonLocalMixed) this->validate( gko::dim<2>{6, 6}, partition, partition, + // clang-format on {0, 0, 0, 0, 1, 1, 1, 2, 3, 3, 4, 4, 5, 5}, {0, 1, 3, 5, 1, 4, 5, 3, 1, 2, 3, 4, 0, 2}, {11, 1, 2, 12, 13, 14, 5, 15, 6, 16, 7, 17, 18, 8}, - + // clang-format off {std::make_tuple(gko::dim<2>{2, 2}, I{0, 1}, I{1, 0}, I{15, 16}), std::make_tuple(gko::dim<2>{2, 2}, I{0, 0, 1}, I{0, 1, 0}, I{11, 12, 18}), std::make_tuple(gko::dim<2>{2, 2}, I{0, 0, 1}, I{0, 1, 1}, I{13, 14, 17})}, - {std::make_tuple(gko::dim<2>{2, 1}, I{1}, I{0}, I{6}), - std::make_tuple(gko::dim<2>{2, 3}, I{0, 0, 1}, I{2, 1, 0}, + {std::make_tuple(gko::dim<2>{2, 1}, I{1}, I{1}, I{6}), + std::make_tuple(gko::dim<2>{2, 3}, I{0, 0, 1}, I{1, 3, 2}, I{1, 2, 8}), - std::make_tuple(gko::dim<2>{2, 2}, I{0, 1}, I{1, 0}, - I{5, 7})}, - {{0}, {0, 1, 0}, {1, 1}}, {{0, 0, 1}, {2, 0, 1}, {1, 1, 0}}); + std::make_tuple(gko::dim<2>{2, 2}, I{0, 1}, I{5, 3}, + I{5, 7})}); } -TYPED_TEST(Matrix, BuildsLocalNonLocalEmptyWithColPartition) +TYPED_TEST(Matrix, SeparateLocalNonLocalEmptyWithColPartition) { using lit = typename TestFixture::local_index_type; using git = typename TestFixture::global_index_type; @@ -329,12 +313,11 @@ TYPED_TEST(Matrix, BuildsLocalNonLocalEmptyWithColPartition) std::make_tuple(gko::dim<2>{3, 3}, I{}, I{}, I{})}, {std::make_tuple(gko::dim<2>{2, 0}, I{}, I{}, I{}), std::make_tuple(gko::dim<2>{3, 0}, I{}, I{}, I{}), - std::make_tuple(gko::dim<2>{3, 0}, I{}, I{}, I{})}, - {{}, {}, {}}, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}); + std::make_tuple(gko::dim<2>{3, 0}, I{}, I{}, I{})}); } -TYPED_TEST(Matrix, BuildsLocalNonLocalSmallWithColPartition) +TYPED_TEST(Matrix, SeparateLocalNonLocalSmallWithColPartition) { using lit = typename TestFixture::local_index_type; using git = typename TestFixture::global_index_type; @@ -354,12 +337,11 @@ TYPED_TEST(Matrix, BuildsLocalNonLocalSmallWithColPartition) {1, 2, 3, 4}, {std::make_tuple(gko::dim<2>{1, 1}, I{0}, I{0}, I{3}), std::make_tuple(gko::dim<2>{1, 1}, I{0}, I{0}, I{2})}, - {std::make_tuple(gko::dim<2>{1, 1}, I{0}, I{0}, I{4}), - std::make_tuple(gko::dim<2>{1, 1}, I{0}, I{0}, I{1})}, - {{0}, {0}}, {{0, 1}, {1, 0}}); + {std::make_tuple(gko::dim<2>{1, 1}, I{0}, I{1}, I{4}), + std::make_tuple(gko::dim<2>{1, 1}, I{0}, I{0}, I{1})}); } -TYPED_TEST(Matrix, BuildsLocalNonLocalNoNonLocalWithColPartition) +TYPED_TEST(Matrix, SeparateLocalNonLocalNoNonLocalWithColPartition) { using lit = typename TestFixture::local_index_type; using git = typename TestFixture::global_index_type; @@ -384,12 +366,11 @@ TYPED_TEST(Matrix, BuildsLocalNonLocalNoNonLocalWithColPartition) I{4, 5, 6})}, {std::make_tuple(gko::dim<2>{2, 0}, I{}, I{}, I{}), std::make_tuple(gko::dim<2>{2, 0}, I{}, I{}, I{}), - std::make_tuple(gko::dim<2>{2, 0}, I{}, I{}, I{})}, - {{}, {}, {}}, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}); + std::make_tuple(gko::dim<2>{2, 0}, I{}, I{}, I{})}); } -TYPED_TEST(Matrix, BuildsLocalNonLocalNoLocalWithColPartition) +TYPED_TEST(Matrix, SeparateLocalNonLocalNoLocalWithColPartition) { using lit = typename TestFixture::local_index_type; using git = typename TestFixture::global_index_type; @@ -410,17 +391,16 @@ TYPED_TEST(Matrix, BuildsLocalNonLocalNoLocalWithColPartition) {std::make_tuple(gko::dim<2>{2, 2}, I{}, I{}, I{}), std::make_tuple(gko::dim<2>{2, 2}, I{}, I{}, I{}), std::make_tuple(gko::dim<2>{2, 2}, I{}, I{}, I{})}, - {std::make_tuple(gko::dim<2>{2, 3}, I{0, 1, 0}, I{1, 2, 0}, + {std::make_tuple(gko::dim<2>{2, 3}, I{0, 1, 0}, I{2, 3, 5}, I{1, 2, 3}), std::make_tuple(gko::dim<2>{2, 2}, I{0, 1}, I{0, 1}, I{4, 5}), - std::make_tuple(gko::dim<2>{2, 2}, I{0, 0}, I{0, 1}, - I{6, 7})}, - {{1, 0, 1}, {0, 1}, {1, 0}}, {{0, 1, 2}, {2, 0, 0}, {1, 1, 0}}); + std::make_tuple(gko::dim<2>{2, 2}, I{0, 0}, I{1, 4}, + I{6, 7})}); } -TYPED_TEST(Matrix, BuildsLocalNonLocalMixedWithColPartition) +TYPED_TEST(Matrix, SeparateLocalNonLocalMixedWithColPartition) { using lit = typename TestFixture::local_index_type; using git = typename TestFixture::global_index_type; @@ -436,9 +416,11 @@ TYPED_TEST(Matrix, BuildsLocalNonLocalMixedWithColPartition) this->ref, col_mapping, num_parts); this->validate(gko::dim<2>{6, 6}, partition, col_partition, + // clang-format off {2, 3, 3, 0, 5, 1, 4, 2, 3, 2, 0, 0, 1, 1, 4, 4}, - {0, 0, 1, 5, 4, 2, 2, 3, 2, 4, 1, 2, 4, 5, 0, 5}, + { 0, 0, 1, 5, 4, 2, 2, 3, 2, 4, 1, 2, 4, 5, 0, 5}, {11, 12, 13, 14, 15, 16, 17, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + // clang-format on {std::make_tuple(gko::dim<2>{2, 2}, I{0, 1, 1}, I{0, 0, 1}, I{11, 12, 13}), std::make_tuple(gko::dim<2>{2, 2}, I{0, 1}, @@ -446,17 +428,15 @@ TYPED_TEST(Matrix, BuildsLocalNonLocalMixedWithColPartition) std::make_tuple(gko::dim<2>{2, 2}, I{0, 1}, I{0, 0}, I{16, 17})}, {std::make_tuple(gko::dim<2>{2, 3}, I{0, 1, 0}, - I{2, 1, 0}, I{1, 2, 3}), + I{3, 2, 4}, I{1, 2, 3}), std::make_tuple(gko::dim<2>{2, 2}, I{0, 0}, - I{0, 1}, I{4, 5}), + I{1, 2}, I{4, 5}), std::make_tuple(gko::dim<2>{2, 3}, I{0, 0, 1, 1}, - I{1, 2, 0, 2}, I{6, 7, 8, 9})}, - {{0, 0, 1}, {1, 0}, {0, 0, 1}}, - {{0, 1, 2}, {1, 0, 1}, {1, 2, 0}}); + I{4, 5, 0, 5}, I{6, 7, 8, 9})}); } -TYPED_TEST(Matrix, BuildsLocalNonLocalNonSquare) +TYPED_TEST(Matrix, SeparateLocalNonLocalNonSquare) { using lit = typename TestFixture::local_index_type; using git = typename TestFixture::global_index_type; @@ -473,76 +453,22 @@ TYPED_TEST(Matrix, BuildsLocalNonLocalNonSquare) this->validate( gko::dim<2>{6, 4}, partition, col_partition, - {2, 3, 0, 1, 4, 3, 3, 0, 1, 4}, {0, 0, 3, 2, 1, 2, 3, 0, 3, 3}, + // clang-format off + {2, 3, 0, 1, 4, 3, 3, 0, 1, 4}, + {0, 0, 3, 2, 1, 2, 3, 0, 3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + // clang-format on {std::make_tuple(gko::dim<2>{2, 1}, I{0, 1}, I{0, 0}, I{1, 2}), std::make_tuple(gko::dim<2>{2, 1}, I{0}, I{0}, I{3}), std::make_tuple(gko::dim<2>{2, 2}, I{0, 1}, I{1, 0}, I{4, 5})}, - {std::make_tuple(gko::dim<2>{2, 2}, I{1, 1}, I{1, 0}, + {std::make_tuple(gko::dim<2>{2, 2}, I{1, 1}, I{2, 3}, I{6, 7}), std::make_tuple(gko::dim<2>{2, 1}, I{0}, I{0}, I{8}), - std::make_tuple(gko::dim<2>{2, 1}, I{0, 1}, I{0, 0}, - I{9, 10})}, - {{0, 1}, {0}, {0}}, {{0, 1, 1}, {1, 0, 0}, {0, 1, 0}}); + std::make_tuple(gko::dim<2>{2, 1}, I{0, 1}, I{3, 3}, + I{9, 10})}); } -TYPED_TEST(Matrix, BuildGhostMapContinuous) -{ - using value_type = typename TestFixture::value_type; - using local_index_type = typename TestFixture::local_index_type; - using global_index_type = typename TestFixture::global_index_type; - this->mapping = {this->ref, {0, 0, 0, 1, 1, 2, 2}}; - constexpr comm_index_type num_parts = 3; - auto partition = gko::experimental::distributed::Partition< - local_index_type, global_index_type>::build_from_mapping(this->ref, - this->mapping, - num_parts); - this->recv_sizes.resize_and_reset(num_parts + 1); - gko::array result[num_parts] = { - {this->ref, {3}}, {this->ref, {0, 6}}, {this->ref, {4}}}; - - for (int local_id = 0; local_id < num_parts; ++local_id) { - gko::kernels::reference::distributed_matrix::build_local_nonlocal( - this->ref, this->create_input_full_rank(), partition.get(), - partition.get(), local_id, this->local_row_idxs, - this->local_col_idxs, this->local_values, this->non_local_row_idxs, - this->non_local_col_idxs, this->non_local_values, this->gather_idxs, - this->recv_sizes, this->non_local_to_global); - - GKO_ASSERT_ARRAY_EQ(result[local_id], this->non_local_to_global); - } -} - -TYPED_TEST(Matrix, BuildGhostMapScattered) -{ - using value_type = typename TestFixture::value_type; - using local_index_type = typename TestFixture::local_index_type; - using global_index_type = typename TestFixture::global_index_type; - this->mapping = {this->ref, {0, 1, 2, 0, 1, 2, 0}}; - constexpr comm_index_type num_parts = 3; - auto partition = gko::experimental::distributed::Partition< - local_index_type, global_index_type>::build_from_mapping(this->ref, - this->mapping, - num_parts); - this->recv_sizes.resize_and_reset(num_parts + 1); - gko::array result[num_parts] = { - {this->ref, {5}}, - {this->ref, {6, 2}}, - {this->ref, {4}}}; // the columns are sorted by their part_id - - for (int local_id = 0; local_id < num_parts; ++local_id) { - gko::kernels::reference::distributed_matrix::build_local_nonlocal( - this->ref, this->create_input_full_rank(), partition.get(), - partition.get(), local_id, this->local_row_idxs, - this->local_col_idxs, this->local_values, this->non_local_row_idxs, - this->non_local_col_idxs, this->non_local_values, this->gather_idxs, - this->recv_sizes, this->non_local_to_global); - - GKO_ASSERT_ARRAY_EQ(result[local_id], this->non_local_to_global); - } -} - } // namespace diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp index 8342898506d..5e3677db2f4 100644 --- a/test/distributed/matrix_kernels.cpp +++ b/test/distributed/matrix_kernels.cpp @@ -6,7 +6,6 @@ #include -#include #include @@ -15,8 +14,6 @@ #include #include -#include -#include #include "core/test/utils.hpp" @@ -35,7 +32,6 @@ class Matrix : public CommonTestFixture { 1, decltype(ValueLocalGlobalIndexType())>::type; using global_index_type = typename std::tuple_element< 2, decltype(ValueLocalGlobalIndexType())>::type; - using Mtx = gko::matrix::Csr; Matrix() : engine(42) {} @@ -65,34 +61,23 @@ class Matrix : public CommonTestFixture { gko::array d_local_col_idxs{exec}; gko::array d_local_values{exec}; gko::array non_local_row_idxs{ref}; - gko::array non_local_col_idxs{ref}; + gko::array non_local_col_idxs{ref}; gko::array non_local_values{ref}; gko::array d_non_local_row_idxs{exec}; - gko::array d_non_local_col_idxs{exec}; + gko::array d_non_local_col_idxs{exec}; gko::array d_non_local_values{exec}; - gko::array gather_idxs{ref}; - gko::array d_gather_idxs{exec}; - gko::array recv_sizes{ - ref, - static_cast(row_partition->get_num_parts())}; - gko::array d_recv_sizes{ - exec, - static_cast(row_partition->get_num_parts())}; - gko::array local_to_global_col{ref}; - gko::array d_local_to_global_col{exec}; - - gko::kernels::reference::distributed_matrix::build_local_nonlocal( - ref, input, row_partition.get(), col_partition.get(), part, - local_row_idxs, local_col_idxs, local_values, - non_local_row_idxs, non_local_col_idxs, non_local_values, - gather_idxs, recv_sizes, local_to_global_col); + + gko::kernels::reference::distributed_matrix:: + separate_local_nonlocal( + ref, input, row_partition.get(), col_partition.get(), part, + local_row_idxs, local_col_idxs, local_values, + non_local_row_idxs, non_local_col_idxs, non_local_values); gko::kernels::EXEC_NAMESPACE::distributed_matrix:: - build_local_nonlocal( + separate_local_nonlocal( exec, d_input, d_row_partition.get(), d_col_partition.get(), part, d_local_row_idxs, d_local_col_idxs, d_local_values, d_non_local_row_idxs, d_non_local_col_idxs, - d_non_local_values, d_gather_idxs, d_recv_sizes, - d_local_to_global_col); + d_non_local_values); GKO_ASSERT_ARRAY_EQ(local_row_idxs, d_local_row_idxs); GKO_ASSERT_ARRAY_EQ(local_col_idxs, d_local_col_idxs); @@ -100,9 +85,6 @@ class Matrix : public CommonTestFixture { GKO_ASSERT_ARRAY_EQ(non_local_row_idxs, d_non_local_row_idxs); GKO_ASSERT_ARRAY_EQ(non_local_col_idxs, d_non_local_col_idxs); GKO_ASSERT_ARRAY_EQ(non_local_values, d_non_local_values); - GKO_ASSERT_ARRAY_EQ(gather_idxs, d_gather_idxs); - GKO_ASSERT_ARRAY_EQ(recv_sizes, d_recv_sizes); - GKO_ASSERT_ARRAY_EQ(local_to_global_col, d_local_to_global_col); } } diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index 6b4e92a87bf..d836eb008d9 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -403,10 +403,10 @@ class Matrix : public CommonMpiTestFixture { gko::dim<2> size; - std::unique_ptr row_part; - std::unique_ptr col_part; - std::unique_ptr row_part_large; - std::unique_ptr col_part_large; + std::shared_ptr row_part; + std::shared_ptr col_part; + std::shared_ptr row_part_large; + std::shared_ptr col_part_large; std::unique_ptr dist_mat; std::unique_ptr dist_mat_large; diff --git a/test/mpi/multigrid/pgm.cpp b/test/mpi/multigrid/pgm.cpp index 0ef824f1a9d..8e72588128b 100644 --- a/test/mpi/multigrid/pgm.cpp +++ b/test/mpi/multigrid/pgm.cpp @@ -71,7 +71,7 @@ class Pgm : public CommonMpiTestFixture { exec, I{0, 2, 4, 8})); dist_mat = dist_mtx_type::create(exec, comm); - dist_mat->read_distributed(mat_input, row_part.get()); + dist_mat->read_distributed(mat_input, row_part); } void SetUp() override { ASSERT_EQ(comm.size(), 3); } diff --git a/test/mpi/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp index b4c77203bca..5c17254a970 100644 --- a/test/mpi/preconditioner/schwarz.cpp +++ b/test/mpi/preconditioner/schwarz.cpp @@ -87,7 +87,7 @@ class SchwarzPreconditioner : public CommonMpiTestFixture { exec, I{0, 2, 4, 8})); dist_mat = dist_mtx_type::create(exec, comm); - dist_mat->read_distributed(mat_input, row_part.get()); + dist_mat->read_distributed(mat_input, row_part); non_dist_mat = non_dist_matrix_type::create(exec); non_dist_mat->read(mat_input); diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp index bb17207a556..52df7232e32 100644 --- a/test/mpi/solver/solver.cpp +++ b/test/mpi/solver/solver.cpp @@ -186,7 +186,7 @@ class Solver : public CommonMpiTestFixture { Solver() : rand_engine(15) {} - std::unique_ptr gen_part(int size, int num_active_parts) + std::shared_ptr gen_part(int size, int num_active_parts) { auto mapping = gko::test::generate_random_array< gko::experimental::distributed::comm_index_type>( @@ -195,12 +195,12 @@ class Solver : public CommonMpiTestFixture { gko::experimental::distributed::comm_index_type>( 0, num_active_parts - 1), rand_engine, ref); - return Part::build_from_mapping(ref, mapping, comm.size()); + return gko::share(Part::build_from_mapping(ref, mapping, comm.size())); } - std::shared_ptr gen_mtx(const Part* part, int num_rows, int num_cols, - int min_cols, int max_cols) + std::shared_ptr gen_mtx(std::shared_ptr part, int num_rows, + int num_cols, int min_cols, int max_cols) { auto data = gko::test::generate_random_matrix_data( @@ -224,8 +224,8 @@ class Solver : public CommonMpiTestFixture { template std::shared_ptr gen_in_vec( - const Part* part, const std::shared_ptr& solver, int nrhs, - int stride) + std::shared_ptr part, + const std::shared_ptr& solver, int nrhs, int stride) { auto global_size = gko::dim<2>{solver->get_size()[1], static_cast(nrhs)}; @@ -255,8 +255,8 @@ class Solver : public CommonMpiTestFixture { template std::shared_ptr gen_out_vec( - const Part* part, const std::shared_ptr& solver, int nrhs, - int stride) + std::shared_ptr part, + const std::shared_ptr& solver, int nrhs, int stride) { auto global_size = gko::dim<2>{solver->get_size()[0], static_cast(nrhs)}; @@ -312,7 +312,8 @@ class Solver : public CommonMpiTestFixture { } template - void forall_matrix_scenarios(const Part* part, TestFunction fn) + void forall_matrix_scenarios(std::shared_ptr part, + TestFunction fn) { auto guarded_fn = [&](auto mtx) { try { @@ -352,7 +353,7 @@ class Solver : public CommonMpiTestFixture { template - void forall_vector_scenarios(const Part* part, + void forall_vector_scenarios(std::shared_ptr part, const std::shared_ptr& solver, TestFunction fn) { @@ -483,10 +484,10 @@ TYPED_TEST_SUITE(Solver, SolverTypes, TypenameNameGenerator); TYPED_TEST(Solver, ApplyIsEquivalentToRef) { this->forall_partition_scenarios([&](auto part) { - this->forall_matrix_scenarios(part.get(), [&](auto mtx) { + this->forall_matrix_scenarios(part, [&](auto mtx) { this->forall_solver_scenarios(mtx, [&](auto solver) { this->forall_vector_scenarios( - part.get(), solver, [&](auto b, auto x) { + part, solver, [&](auto b, auto x) { solver->apply(b, x); this->assert_residual_near(mtx, x, b, this->tol(x)); @@ -500,10 +501,10 @@ TYPED_TEST(Solver, ApplyIsEquivalentToRef) TYPED_TEST(Solver, AdvancedApplyIsEquivalentToRef) { this->forall_partition_scenarios([&](auto part) { - this->forall_matrix_scenarios(part.get(), [&](auto mtx) { + this->forall_matrix_scenarios(part, [&](auto mtx) { this->forall_solver_scenarios(mtx, [&](auto solver) { this->forall_vector_scenarios( - part.get(), solver, [&](auto b, auto x) { + part, solver, [&](auto b, auto x) { auto alpha = this->gen_scalar(); auto beta = this->gen_scalar(); auto x_old = gko::share(gko::clone(x)); @@ -523,10 +524,10 @@ TYPED_TEST(Solver, MixedApplyIsEquivalentToRef) { using MixedVec = typename TestFixture::MixedVec; this->forall_partition_scenarios([&](auto part) { - this->forall_matrix_scenarios(part.get(), [&](auto mtx) { + this->forall_matrix_scenarios(part, [&](auto mtx) { this->forall_solver_scenarios(mtx, [&](auto solver) { this->template forall_vector_scenarios( - part.get(), solver, [&](auto b, auto x) { + part, solver, [&](auto b, auto x) { solver->apply(b, x); this->assert_residual_near(mtx, x, b, @@ -543,10 +544,10 @@ TYPED_TEST(Solver, MixedAdvancedApplyIsEquivalentToRef) using MixedVec = typename TestFixture::MixedVec; using MixedLocalVec = typename TestFixture::MixedLocalVec; this->forall_partition_scenarios([&](auto part) { - this->forall_matrix_scenarios(part.get(), [&](auto mtx) { + this->forall_matrix_scenarios(part, [&](auto mtx) { this->forall_solver_scenarios(mtx, [&](auto solver) { this->template forall_vector_scenarios( - part.get(), solver, [&](auto b, auto x) { + part, solver, [&](auto b, auto x) { auto alpha = this->template gen_scalar(); auto beta = this->template gen_scalar(); auto x_old = gko::share(gko::clone(x)); From be2835284642b52555d48f677082e388a1d2d851 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 17 May 2024 13:40:40 +0000 Subject: [PATCH 4/6] [dist-mat] don't return an index map from read-distributed --- core/distributed/matrix.cpp | 14 ++++---------- include/ginkgo/core/distributed/matrix.hpp | 8 ++++---- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 7b23a947fc3..2f5050730a4 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -239,8 +239,7 @@ void Matrix::move_to( template -index_map -Matrix::read_distributed( +void Matrix::read_distributed( const device_matrix_data& data, std::shared_ptr> row_partition, @@ -347,14 +346,11 @@ Matrix::read_distributed( if (use_host_buffer) { gather_idxs_.set_executor(exec); } - - return imap; } template -index_map -Matrix::read_distributed( +void Matrix::read_distributed( const matrix_data& data, std::shared_ptr> row_partition, @@ -369,8 +365,7 @@ Matrix::read_distributed( template -index_map -Matrix::read_distributed( +void Matrix::read_distributed( const matrix_data& data, std::shared_ptr> partition) @@ -383,8 +378,7 @@ Matrix::read_distributed( template -index_map -Matrix::read_distributed( +void Matrix::read_distributed( const device_matrix_data& data, std::shared_ptr> partition) diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index 66ed5f7089b..4689c3d3381 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -293,7 +293,7 @@ class Matrix * * @return the index_map induced by the partitions and the matrix structure */ - index_map read_distributed( + void read_distributed( const device_matrix_data& data, std::shared_ptr> partition); @@ -307,7 +307,7 @@ class Matrix * @note For efficiency it is advised to use the device_matrix_data * overload. */ - index_map read_distributed( + void read_distributed( const matrix_data& data, std::shared_ptr> partition); @@ -329,7 +329,7 @@ class Matrix * * @return the index_map induced by the partitions and the matrix structure */ - index_map read_distributed( + void read_distributed( const device_matrix_data& data, std::shared_ptr> row_partition, @@ -345,7 +345,7 @@ class Matrix * @note For efficiency it is advised to use the device_matrix_data * overload. */ - index_map read_distributed( + void read_distributed( const matrix_data& data, std::shared_ptr> row_partition, From 277a8fbcca67c550b6b0ca4cf9639bbd66e85908 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 17 May 2024 13:41:41 +0000 Subject: [PATCH 5/6] [dist-mat] review updates: - comment tests - replace std::vectors in backend - refactoring Co-authored-by: Tobias Ribizel --- core/distributed/matrix.cpp | 14 ++-- omp/distributed/matrix_kernels.cpp | 8 +- reference/test/distributed/matrix_kernels.cpp | 74 ++++++++++++------- 3 files changed, 62 insertions(+), 34 deletions(-) diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 2f5050730a4..1dcddbd1a6a 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -312,12 +312,16 @@ void Matrix::read_distributed( make_temporary_clone(exec->get_master(), &imap.get_remote_target_ids()); auto host_offsets = make_temporary_clone( exec->get_master(), &imap.get_remote_global_idxs().get_offsets()); + auto compute_recv_sizes = [](const auto* recv_targets, size_type size, + const auto* offsets, auto& recv_sizes) { + for (size_type i = 0; i < size; ++i) { + recv_sizes[recv_targets[i]] = offsets[i + 1] - offsets[i]; + } + }; std::fill(recv_sizes_.begin(), recv_sizes_.end(), 0); - for (size_type i = 0; i < host_recv_targets->get_size(); ++i) { - recv_sizes_[host_recv_targets->get_const_data()[i]] = - host_offsets->get_const_data()[i + 1] - - host_offsets->get_const_data()[i]; - } + compute_recv_sizes(host_recv_targets->get_const_data(), + host_recv_targets->get_size(), + host_offsets->get_const_data(), recv_sizes_); std::partial_sum(recv_sizes_.begin(), recv_sizes_.end(), recv_offsets_.begin() + 1); comm.all_to_all(exec, recv_sizes_.data(), 1, send_sizes_.data(), 1); diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp index 076136e99e0..9f7b5594fa7 100644 --- a/omp/distributed/matrix_kernels.cpp +++ b/omp/distributed/matrix_kernels.cpp @@ -56,13 +56,13 @@ void separate_local_nonlocal( auto num_threads = static_cast(omp_get_max_threads()); auto num_input = input.get_num_stored_elements(); auto size_per_thread = (num_input + num_threads - 1) / num_threads; - std::vector local_entry_offsets(num_threads, 0); - std::vector non_local_entry_offsets(num_threads, 0); + vector local_entry_offsets(num_threads, 0, exec); + vector non_local_entry_offsets(num_threads, 0, exec); #pragma omp parallel firstprivate(col_range_id_hint, row_range_id_hint) { - std::vector thread_non_local_entries; - std::vector thread_local_entries; + vector thread_non_local_entries(exec); + vector thread_local_entries(exec); auto thread_id = omp_get_thread_num(); auto thread_begin = thread_id * size_per_thread; auto thread_end = std::min(thread_begin + size_per_thread, num_input); diff --git a/reference/test/distributed/matrix_kernels.cpp b/reference/test/distributed/matrix_kernels.cpp index 302e16a2d6f..5d96f4f9c64 100644 --- a/reference/test/distributed/matrix_kernels.cpp +++ b/reference/test/distributed/matrix_kernels.cpp @@ -49,7 +49,31 @@ class Matrix : public ::testing::Test { non_local_values{ref} {} - void validate( + /** + * apply the `separate_local_nonlocal` kernel and validate the result + * against provided reference values + * + * @param size the expected global matrix size + * @param row_partition the row partition passed to the kernel + * @param col_partition the column partition passed to the kernel + * @param input_rows the row indices passed to the kernel + * @param input_cols the column indices passed to the kernel + * @param input_vals the values passed to the kernel + * @param local_entries the reference local matrix data. It is provided + * as a list of tuples for each part of the row + * partition. Each tuple consists of the size of + * the local matrix, a list of row indices, + * a list of column indices, and a list of values. + * The indices are mapped to local indexing. + * @param non_local_entries the reference non-local matrix data. It is + * provided as a list of tuples for each part + * of the row partition. Each tuple contains + * the size of the non-local matrix, a list of + * row indices (mapped to local indexing), a + * list of column indices (NOT mapped to local + * indexing), and a list of values. + */ + void act_and_assert( gko::dim<2> size, gko::ptr_param> @@ -175,7 +199,7 @@ TYPED_TEST(Matrix, SeparateLocalNonLocalEmpty) gko::experimental::distributed::Partition::build_from_mapping( this->ref, this->mapping, num_parts); - this->validate( + this->act_and_assert( gko::dim<2>{8, 8}, partition, partition, {}, {}, {}, {std::make_tuple(gko::dim<2>{2, 2}, I{}, I{}, I{}), std::make_tuple(gko::dim<2>{3, 3}, I{}, I{}, I{}), @@ -197,7 +221,7 @@ TYPED_TEST(Matrix, SeparateLocalNonLocalSmall) gko::experimental::distributed::Partition::build_from_mapping( this->ref, this->mapping, num_parts); - this->validate( + this->act_and_assert( gko::dim<2>{2, 2}, partition, partition, {0, 0, 1, 1}, {0, 1, 0, 1}, {1, 2, 3, 4}, {std::make_tuple(gko::dim<2>{1, 1}, I{0}, I{0}, I{4}), @@ -218,7 +242,7 @@ TYPED_TEST(Matrix, SeparateLocalNonLocalNoNonLocal) gko::experimental::distributed::Partition::build_from_mapping( this->ref, this->mapping, num_parts); - this->validate( + this->act_and_assert( gko::dim<2>{6, 6}, partition, partition, {0, 0, 1, 1, 2, 3, 4, 5}, {0, 5, 1, 4, 3, 2, 4, 0}, {1, 2, 3, 4, 5, 6, 7, 8}, {std::make_tuple(gko::dim<2>{2, 2}, I{0, 1}, I{1, 0}, @@ -244,7 +268,7 @@ TYPED_TEST(Matrix, SeparateLocalNonLocalNoLocal) gko::experimental::distributed::Partition::build_from_mapping( this->ref, this->mapping, num_parts); - this->validate( + this->act_and_assert( gko::dim<2>{6, 6}, partition, partition, {0, 0, 1, 3, 4, 5}, {1, 3, 5, 1, 3, 2}, {1, 2, 5, 6, 7, 8}, {std::make_tuple(gko::dim<2>{2, 2}, I{}, I{}, I{}), @@ -269,7 +293,7 @@ TYPED_TEST(Matrix, SeparateLocalNonLocalMixed) gko::experimental::distributed::Partition::build_from_mapping( this->ref, this->mapping, num_parts); - this->validate( + this->act_and_assert( gko::dim<2>{6, 6}, partition, partition, // clang-format on {0, 0, 0, 0, 1, 1, 1, 2, 3, 3, 4, 4, 5, 5}, @@ -306,7 +330,7 @@ TYPED_TEST(Matrix, SeparateLocalNonLocalEmptyWithColPartition) gko::experimental::distributed::Partition::build_from_mapping( this->ref, col_mapping, num_parts); - this->validate( + this->act_and_assert( gko::dim<2>{8, 8}, partition, col_partition, {}, {}, {}, {std::make_tuple(gko::dim<2>{2, 2}, I{}, I{}, I{}), std::make_tuple(gko::dim<2>{3, 3}, I{}, I{}, I{}), @@ -332,7 +356,7 @@ TYPED_TEST(Matrix, SeparateLocalNonLocalSmallWithColPartition) gko::experimental::distributed::Partition::build_from_mapping( this->ref, col_mapping, num_parts); - this->validate( + this->act_and_assert( gko::dim<2>{2, 2}, partition, col_partition, {0, 0, 1, 1}, {0, 1, 0, 1}, {1, 2, 3, 4}, {std::make_tuple(gko::dim<2>{1, 1}, I{0}, I{0}, I{3}), @@ -356,7 +380,7 @@ TYPED_TEST(Matrix, SeparateLocalNonLocalNoNonLocalWithColPartition) gko::experimental::distributed::Partition::build_from_mapping( this->ref, col_mapping, num_parts); - this->validate( + this->act_and_assert( gko::dim<2>{6, 6}, partition, col_partition, {3, 0, 5, 1, 1, 4}, {1, 4, 5, 2, 3, 3}, {1, 2, 3, 4, 5, 6}, {std::make_tuple(gko::dim<2>{2, 2}, I{1}, I{1}, I{1}), @@ -385,7 +409,7 @@ TYPED_TEST(Matrix, SeparateLocalNonLocalNoLocalWithColPartition) gko::experimental::distributed::Partition::build_from_mapping( this->ref, col_mapping, num_parts); - this->validate( + this->act_and_assert( gko::dim<2>{6, 6}, partition, col_partition, {2, 3, 2, 0, 5, 1, 1}, {2, 3, 5, 0, 1, 1, 4}, {1, 2, 3, 4, 5, 6, 7}, {std::make_tuple(gko::dim<2>{2, 2}, I{}, I{}, I{}), @@ -415,24 +439,24 @@ TYPED_TEST(Matrix, SeparateLocalNonLocalMixedWithColPartition) gko::experimental::distributed::Partition::build_from_mapping( this->ref, col_mapping, num_parts); - this->validate(gko::dim<2>{6, 6}, partition, col_partition, + this->act_and_assert(gko::dim<2>{6, 6}, partition, col_partition, // clang-format off {2, 3, 3, 0, 5, 1, 4, 2, 3, 2, 0, 0, 1, 1, 4, 4}, { 0, 0, 1, 5, 4, 2, 2, 3, 2, 4, 1, 2, 4, 5, 0, 5}, {11, 12, 13, 14, 15, 16, 17, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - // clang-format on - {std::make_tuple(gko::dim<2>{2, 2}, I{0, 1, 1}, - I{0, 0, 1}, I{11, 12, 13}), - std::make_tuple(gko::dim<2>{2, 2}, I{0, 1}, - I{1, 0}, I{14, 15}), - std::make_tuple(gko::dim<2>{2, 2}, I{0, 1}, - I{0, 0}, I{16, 17})}, - {std::make_tuple(gko::dim<2>{2, 3}, I{0, 1, 0}, - I{3, 2, 4}, I{1, 2, 3}), - std::make_tuple(gko::dim<2>{2, 2}, I{0, 0}, - I{1, 2}, I{4, 5}), - std::make_tuple(gko::dim<2>{2, 3}, I{0, 0, 1, 1}, - I{4, 5, 0, 5}, I{6, 7, 8, 9})}); + // clang-format on + {std::make_tuple(gko::dim<2>{2, 2}, I{0, 1, 1}, I{0, 0, 1}, + I{11, 12, 13}), + std::make_tuple(gko::dim<2>{2, 2}, I{0, 1}, I{1, 0}, + I{14, 15}), + std::make_tuple(gko::dim<2>{2, 2}, I{0, 1}, I{0, 0}, + I{16, 17})}, + {std::make_tuple(gko::dim<2>{2, 3}, I{0, 1, 0}, I{3, 2, 4}, + I{1, 2, 3}), + std::make_tuple(gko::dim<2>{2, 2}, I{0, 0}, I{1, 2}, + I{4, 5}), + std::make_tuple(gko::dim<2>{2, 3}, I{0, 0, 1, 1}, + I{4, 5, 0, 5}, I{6, 7, 8, 9})}); } @@ -451,7 +475,7 @@ TYPED_TEST(Matrix, SeparateLocalNonLocalNonSquare) gko::experimental::distributed::Partition::build_from_mapping( this->ref, col_mapping, num_parts); - this->validate( + this->act_and_assert( gko::dim<2>{6, 4}, partition, col_partition, // clang-format off {2, 3, 0, 1, 4, 3, 3, 0, 1, 4}, From 0d3b7106d98e7da88492fb3c3f8752ec08d71d1f Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 8 May 2024 15:20:47 +0200 Subject: [PATCH 6/6] [bench] update benchmark tests with new output --- .../distributed_solver.profile.stderr | 30 +++++++++++++++++-- .../reference/spmv_distributed.profile.stderr | 30 +++++++++++++++++-- .../reference/spmv_distributed.profile.stdout | 2 +- .../reference/spmv_distributed.simple.stdout | 2 +- 4 files changed, 58 insertions(+), 6 deletions(-) diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr index 47918f049ea..907ff8a9c98 100644 --- a/benchmark/test/reference/distributed_solver.profile.stderr +++ b/benchmark/test/reference/distributed_solver.profile.stderr @@ -52,8 +52,34 @@ DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin distributed_matrix::build_local_nonlocal -DEBUG: end distributed_matrix::build_local_nonlocal +DEBUG: begin distributed_matrix::separate_local_nonlocal +DEBUG: end distributed_matrix::separate_local_nonlocal +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin index_map::build_mapping +DEBUG: end index_map::build_mapping +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::prefix_sum_nonnegative +DEBUG: end components::prefix_sum_nonnegative +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::prefix_sum_nonnegative +DEBUG: end components::prefix_sum_nonnegative +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin index_map::map_to_local +DEBUG: end index_map::map_to_local +DEBUG: begin copy +DEBUG: end copy DEBUG: begin copy DEBUG: end copy DEBUG: begin copy diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr b/benchmark/test/reference/spmv_distributed.profile.stderr index d5e4913b51a..d3a645aa0f7 100644 --- a/benchmark/test/reference/spmv_distributed.profile.stderr +++ b/benchmark/test/reference/spmv_distributed.profile.stderr @@ -104,8 +104,34 @@ DEBUG: begin dense::fill DEBUG: end dense::fill DEBUG: begin components::aos_to_soa DEBUG: end components::aos_to_soa -DEBUG: begin distributed_matrix::build_local_nonlocal -DEBUG: end distributed_matrix::build_local_nonlocal +DEBUG: begin distributed_matrix::separate_local_nonlocal +DEBUG: end distributed_matrix::separate_local_nonlocal +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin index_map::build_mapping +DEBUG: end index_map::build_mapping +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::prefix_sum_nonnegative +DEBUG: end components::prefix_sum_nonnegative +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::prefix_sum_nonnegative +DEBUG: end components::prefix_sum_nonnegative +DEBUG: begin copy +DEBUG: end copy +DEBUG: begin components::fill_array +DEBUG: end components::fill_array +DEBUG: begin index_map::map_to_local +DEBUG: end index_map::map_to_local +DEBUG: begin copy +DEBUG: end copy DEBUG: begin copy DEBUG: end copy DEBUG: begin copy diff --git a/benchmark/test/reference/spmv_distributed.profile.stdout b/benchmark/test/reference/spmv_distributed.profile.stdout index 5eb5710a134..bbef87d0b89 100644 --- a/benchmark/test/reference/spmv_distributed.profile.stdout +++ b/benchmark/test/reference/spmv_distributed.profile.stdout @@ -5,7 +5,7 @@ "comm_pattern": "stencil", "spmv": { "csr-csr": { - "storage": 6564, + "storage": 6420, "time": 1.0, "repetitions": 1, "completed": true diff --git a/benchmark/test/reference/spmv_distributed.simple.stdout b/benchmark/test/reference/spmv_distributed.simple.stdout index 1bb9378eb7c..77bdef168d3 100644 --- a/benchmark/test/reference/spmv_distributed.simple.stdout +++ b/benchmark/test/reference/spmv_distributed.simple.stdout @@ -5,7 +5,7 @@ "comm_pattern": "stencil", "spmv": { "csr-csr": { - "storage": 6564, + "storage": 6420, "max_relative_norm2": 1.0, "time": 1.0, "repetitions": 10,