diff --git a/common/cuda_hip/reorder/rcm_kernels.hpp.inc b/common/cuda_hip/reorder/rcm_kernels.hpp.inc index 172d4bc1689..06cb43dd85f 100644 --- a/common/cuda_hip/reorder/rcm_kernels.hpp.inc +++ b/common/cuda_hip/reorder/rcm_kernels.hpp.inc @@ -3,10 +3,494 @@ // SPDX-License-Identifier: BSD-3-Clause template -void get_permutation( - std::shared_ptr exec, const IndexType num_vertices, +array compute_node_degrees( + std::shared_ptr exec, + const IndexType* const row_ptrs, const IndexType num_rows) +{ + const auto policy = thrust_policy(exec); + array node_degrees{exec, static_cast(num_rows)}; + const auto row_ptr_zip_it = + thrust::make_zip_iterator(thrust::make_tuple(row_ptrs, row_ptrs + 1)); + thrust::transform(policy, row_ptr_zip_it, row_ptr_zip_it + num_rows, + node_degrees.get_data(), [] __device__(auto pair) { + return thrust::get<1>(pair) - thrust::get<0>(pair); + }); + return node_degrees; +} + + +template +struct components_data { + /** Mapping node -> component ID */ + array node_component; + /** Segmented storage of node IDs for each component */ + array nodes; + /** mapping entries in nodes to their component ID */ + array sorted_ids; + /** Pointers into nodes */ + array ptrs; + /** Minimum degree node for each component */ + array min_deg_node; + + components_data(std::shared_ptr exec, + size_type num_rows) + : node_component{exec, num_rows}, + nodes{exec, num_rows}, + sorted_ids{exec, num_rows}, + ptrs{exec}, + min_deg_node{exec} + {} + + void set_num_components(size_type num_components) + { + ptrs.resize_and_reset(num_components + 1); + min_deg_node.resize_and_reset(num_components); + } + + size_type get_num_components() const { return min_deg_node.get_size(); } +}; + + +template +__global__ +__launch_bounds__(default_block_size) void connected_components_attach( + const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, IndexType num_rows, + IndexType* __restrict__ components) +{ + const auto row = thread::get_thread_id_flat(); + if (row >= num_rows) { + return; + } + const auto begin = row_ptrs[row]; + const auto end = row_ptrs[row + 1]; + auto parent = static_cast(row); + for (auto nz = begin; nz < end; nz++) { + const auto col = col_idxs[nz]; + parent = min(col, parent); + } + components[row] = parent; +} + + +__global__ void reset_flag(bool* finished) { *finished = true; } + + +template +__global__ +__launch_bounds__(default_block_size) void connected_components_path_compress( + IndexType num_rows, IndexType* components, bool* finished) +{ + const auto row = thread::get_thread_id_flat(); + if (row >= num_rows) { + return; + } + const auto old_value = components[row]; + const auto new_value = components[old_value]; + if (old_value != new_value && *finished) { + *finished = false; + } + components[row] = new_value; +} + + +template +struct adj_not_predicate { + __device__ __forceinline__ bool operator()(IndexType i) + { + return i == 0 || data[i - 1] != data[i]; + } + + const IndexType* data; +}; + + +template +struct node_min_degree_reduction { + __device__ __forceinline__ IndexType operator()(IndexType u, IndexType v) + { + return thrust::make_pair(degree[u], u) < thrust::make_pair(degree[v], v) + ? u + : v; + } + + const IndexType* degree; +}; + + +template +components_data compute_connected_components( + std::shared_ptr exec, const IndexType num_rows, const IndexType* const row_ptrs, const IndexType* const col_idxs, - IndexType* const permutation, IndexType* const inv_permutation, - const gko::reorder::starting_strategy strategy) GKO_NOT_IMPLEMENTED; + const IndexType* const node_degrees) +{ + const auto policy = thrust_policy(exec); + components_data result{exec, static_cast(num_rows)}; + array finished_array{exec, 1}; + const auto node_component = result.node_component.get_data(); + const auto nodes = result.nodes.get_data(); + const auto finished = finished_array.get_data(); + // attach every node to its minimum neighbor + const auto num_blocks = ceildiv(num_rows, default_block_size); + connected_components_attach<<get_stream()>>>( + row_ptrs, col_idxs, num_rows, node_component); + // double pointers until all paths are loops or edges + do { + reset_flag<<<1, 1, 0, exec->get_stream()>>>(finished); + connected_components_path_compress<<get_stream()>>>( + num_rows, node_component, finished); + } while (!exec->copy_val_to_host(finished)); + // group nodes by component ID + result.sorted_ids = result.node_component; + const auto sorted_component_ids = result.sorted_ids.get_data(); + thrust::sequence(policy, nodes, nodes + num_rows, IndexType{}); + thrust::stable_sort_by_key(policy, sorted_component_ids, + sorted_component_ids + num_rows, nodes); + // find beginning of all components + auto it = thrust::make_counting_iterator(size_type{}); + const auto predicate = adj_not_predicate{sorted_component_ids}; + const auto num_components = static_cast( + thrust::count_if(policy, it, it + num_rows, predicate)); + result.set_num_components(num_components); + const auto ptrs = result.ptrs.get_data(); + const auto min_deg_node = result.min_deg_node.get_data(); + thrust::copy_if(policy, it, it + num_rows, ptrs, predicate); + // set the sentinel entry + result.ptrs.set_value(num_components, num_rows); + // find minimum degree node for each component + array component_id_array{exec, num_components}; + const auto component_ids = component_id_array.get_data(); + thrust::reduce_by_key(policy, sorted_component_ids, + sorted_component_ids + num_rows, nodes, component_ids, + min_deg_node, thrust::equal_to{}, + node_min_degree_reduction{node_degrees}); + // map component IDs to consecutive indexing + array compacted_node_component{exec, + static_cast(num_rows)}; + thrust::lower_bound(policy, component_ids, component_ids + num_components, + node_component, node_component + num_rows, + compacted_node_component.get_data()); + result.node_component = std::move(compacted_node_component); + return result; +} + + +template +struct ubfs_levels { + /** Mapping node -> level */ + array node_level; + /** Segmented list of nodes for each level */ + array nodes; + /** Pointers into nodes for each level */ + array ptrs; + + ubfs_levels(std::shared_ptr exec, size_type num_rows) + : node_level{exec, num_rows}, + nodes{exec, num_rows}, + ptrs{exec, num_rows + 1} + {} +}; + + +template +struct atomic_map {}; + + +template <> +struct atomic_map { + using type = int; +}; + + +template <> +struct atomic_map { + using type = unsigned long long; +}; + + +template +__global__ __launch_bounds__(default_block_size) void ubfs_level_kernel( + const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, + const IndexType* __restrict__ sources, size_type num_sources, + IndexType level, IndexType* __restrict__ node_levels, + IndexType* __restrict__ level_nodes, IndexType* __restrict__ output_ptr) +{ + using atomic_type = typename atomic_map::type; + const auto source = thread::get_thread_id_flat(); + if (source >= num_sources) { + return; + } + const auto row = sources[source]; + const auto begin = row_ptrs[row]; + const auto end = row_ptrs[row + 1]; + atomic_type unsigned_unattached{}; + const auto unattached = invalid_index(); + memcpy(&unsigned_unattached, &unattached, sizeof(IndexType)); + auto parent = static_cast(row); + for (auto nz = begin; nz < end; nz++) { + const auto col = col_idxs[nz]; + if (node_levels[col] == unattached && + atomicCAS(reinterpret_cast(node_levels + col), + unsigned_unattached, + static_cast(level)) == unsigned_unattached) { + const auto output_pos = + atomicAdd(reinterpret_cast(output_ptr), 1); + level_nodes[output_pos] = col; + } + } +} + + +template +IndexType ubfs(std::shared_ptr exec, + const IndexType num_rows, const IndexType* const row_ptrs, + const IndexType* const col_idxs, ubfs_levels& levels) +{ + const auto policy = thrust_policy(exec); + const auto node_levels = levels.node_level.get_data(); + const auto level_nodes = levels.nodes.get_data(); + IndexType level{}; + IndexType level_begin{}; + thrust::fill_n(policy, node_levels, num_rows, invalid_index()); + auto level_end_ptr = levels.ptrs.get_data() + level + 1; + auto level_end = exec->copy_val_to_host(level_end_ptr); + const auto level_order_level_it = + thrust::make_permutation_iterator(node_levels, level_nodes); + thrust::fill_n(policy, level_order_level_it, level_end, 0); + while (level_end > level_begin) { + level++; + level_end_ptr++; + const auto level_size = level_end - level_begin; + const auto num_blocks = ceildiv(level_size, default_block_size); + // copy end of previous level pointer to atomic counter for this level + exec->copy(1, level_end_ptr - 1, level_end_ptr); + ubfs_level_kernel<<get_stream()>>>( + row_ptrs, col_idxs, level_nodes + level_begin, level_size, level, + node_levels, level_nodes, level_end_ptr); + level_begin = + std::exchange(level_end, exec->copy_val_to_host(level_end_ptr)); + } + return level; +} + + +template +struct node_max_level_min_degree_reduction { + __device__ __forceinline__ IndexType operator()(IndexType u, IndexType v) + { + // return node with larger level (smaller degree or ID as tie-breakers) + return thrust::make_tuple(level[v], degree[u], u) < + thrust::make_tuple(level[u], degree[v], v) + ? u + : v; + } + + const IndexType* degree; + const IndexType* level; +}; + + +template +struct node_compare_functor { + __device__ void operator()(IndexType component) + { + const auto new_node = candidate_node[component]; + const auto new_level = level[new_node]; + const auto old_level = best_level[component]; + // if the candidate has a larger level, swap it + if (new_level > old_level) { + *improved = true; + best_node[component] = new_node; + best_level[component] = new_level; + } + } + + const IndexType* candidate_node; + const IndexType* level; + IndexType* best_node; + IndexType* best_level; + bool* improved; +}; + + +template +void find_pseudo_peripheral_nodes(std::shared_ptr exec, + const IndexType num_rows, + const IndexType* const row_ptrs, + const IndexType* const col_idxs, + const IndexType* const node_degrees, + const components_data& components, + ubfs_levels& levels) +{ + const auto policy = thrust_policy(exec); + const auto num_components = components.get_num_components(); + array candidate_node_array{exec, num_components}; + array best_level_array{exec, num_components}; + array improved{exec, 1}; + const auto candidate_nodes = candidate_node_array.get_data(); + const auto best_levels = best_level_array.get_data(); + const auto level_nodes = levels.nodes.get_data(); + const auto node_levels = levels.node_level.get_const_data(); + const auto component_nodes = components.nodes.get_const_data(); + const auto sorted_component_ids = components.sorted_ids.get_const_data(); + const auto reduction = node_max_level_min_degree_reduction{ + node_degrees, node_levels}; + const auto discard_it = thrust::discard_iterator{}; + const auto eq_op = thrust::equal_to{}; + const auto counting_it = thrust::make_counting_iterator(IndexType{}); + const auto compare_fn = node_compare_functor{ + candidate_nodes, node_levels, level_nodes, best_levels, + improved.get_data()}; + // initialize best_levels to the initial nodes at level 0 + thrust::fill_n(policy, best_levels, num_components, IndexType{}); + do { + ubfs(exec, num_rows, row_ptrs, col_idxs, levels); + // write a last-level node of min degree to candidate_nodes for each + // component + thrust::reduce_by_key(policy, sorted_component_ids, + sorted_component_ids + num_rows, component_nodes, + discard_it, candidate_nodes, eq_op, reduction); + improved.set_value(0, false); + thrust::for_each_n(policy, counting_it, num_components, compare_fn); + } while (improved.get_value(0)); + // the best nodes stay on the 0th level +} + + +template +__global__ __launch_bounds__(default_block_size) void ubfs_min_neighbor_kernel( + const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, const IndexType level_begin, + const IndexType level_end, const IndexType* __restrict__ level_nodes, + const IndexType* __restrict__ inv_permutation, + const IndexType* __restrict__ node_levels, + IndexType* __restrict__ min_neighbors) +{ + const auto target = thread::get_thread_id_flat() + level_begin; + if (target >= level_end) { + return; + } + const auto row = level_nodes[target]; + const auto begin = row_ptrs[row]; + const auto end = row_ptrs[row + 1]; + const auto cur_level = node_levels[row]; + auto min_neighbor = device_numeric_limits::max; + for (auto nz = begin; nz < end; nz++) { + const auto col = col_idxs[nz]; + const auto neighbor_level = node_levels[col]; + if (neighbor_level < cur_level) { + min_neighbor = min(min_neighbor, inv_permutation[col]); + } + } + min_neighbors[target] = min_neighbor; +} + + +template +__global__ __launch_bounds__(default_block_size) void build_permutation_level( + const IndexType level_begin, const IndexType level_end, + const IndexType* const level_nodes, IndexType* const inv_permutation) +{ + const auto target = thread::get_thread_id_flat() + level_begin; + if (target >= level_end) { + return; + } + inv_permutation[level_nodes[target]] = target; +} + + +template +void sort_levels(std::shared_ptr exec, + const IndexType num_rows, const IndexType* const row_ptrs, + const IndexType* const col_idxs, + const IndexType* const degrees, const IndexType num_levels, + const components_data& comps, + ubfs_levels& levels, IndexType* const permutation) +{ + const auto policy = thrust_policy(exec); + array inv_permutation_array{exec, + static_cast(num_rows)}; + array key_array{exec, static_cast(num_rows)}; + levels.ptrs.set_executor(exec->get_master()); + const auto level_ptrs = levels.ptrs.get_data(); + const auto level_nodes = levels.nodes.get_data(); + const auto node_levels = levels.node_level.get_const_data(); + const auto inv_permutation = inv_permutation_array.get_data(); + const auto key = key_array.get_data(); + const auto it = thrust::make_counting_iterator(IndexType{}); + const auto inv_permutation_it = + thrust::make_permutation_iterator(inv_permutation, level_nodes); + thrust::fill_n(policy, inv_permutation, num_rows, + std::numeric_limits::max()); + // fill inverse permutation for first level + const auto num_components = level_ptrs[1]; + thrust::copy_n(policy, it, num_components, inv_permutation_it); + for (IndexType lvl = 1; lvl < num_levels; lvl++) { + const auto level_begin = level_ptrs[lvl]; + const auto level_end = level_ptrs[lvl + 1]; + const auto level_size = level_end - level_begin; + // sort by node ID for determinism + thrust::sort(policy, level_nodes + level_begin, + level_nodes + level_end); + // sort by degree as tie-breaker + thrust::copy_n(policy, + thrust::make_permutation_iterator( + degrees, level_nodes + level_begin), + level_size, key + level_begin); + thrust::stable_sort_by_key(policy, key + level_begin, key + level_end, + level_nodes + level_begin); + // sort by minimum parent in CM order + const auto num_blocks = ceildiv(level_size, default_block_size); + ubfs_min_neighbor_kernel<<get_stream()>>>( + row_ptrs, col_idxs, level_begin, level_end, level_nodes, + inv_permutation, node_levels, key); + thrust::stable_sort_by_key(policy, key + level_begin, key + level_end, + level_nodes + level_begin); + // fill inverse permutation for next level + thrust::copy_n(policy, it + level_begin, level_size, + inv_permutation_it + level_begin); + } + // sort by component + thrust::copy_n(policy, + thrust::make_permutation_iterator( + comps.node_component.get_const_data(), level_nodes), + num_rows, key); + thrust::stable_sort_by_key(policy, key, key + num_rows, level_nodes); + thrust::copy_n(policy, level_nodes, num_rows, permutation); +} + + +template +void get_permutation(std::shared_ptr exec, + const IndexType num_rows, const IndexType* const row_ptrs, + const IndexType* const col_idxs, + IndexType* const permutation, + IndexType* const inv_permutation, + const gko::reorder::starting_strategy strategy) +{ + if (num_rows == 0) { + return; + } + const auto degrees = compute_node_degrees(exec, row_ptrs, num_rows); + auto comps = compute_connected_components( + exec, num_rows, row_ptrs, col_idxs, degrees.get_const_data()); + const auto num_components = comps.get_num_components(); + ubfs_levels levels{exec, static_cast(num_rows)}; + levels.ptrs.set_value(0, 0); + levels.ptrs.set_value(1, num_components); + if (strategy == reorder::starting_strategy::pseudo_peripheral) { + find_pseudo_peripheral_nodes(exec, num_rows, row_ptrs, col_idxs, + degrees.get_const_data(), comps, levels); + } + const auto num_levels = ubfs(exec, num_rows, row_ptrs, col_idxs, levels); + sort_levels(exec, num_rows, row_ptrs, col_idxs, degrees.get_const_data(), + num_levels, comps, levels, permutation); + thrust::reverse(thrust_policy(exec), permutation, permutation + num_rows); +} -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_RCM_GET_PERMUTATION_KERNEL); \ No newline at end of file +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_RCM_GET_PERMUTATION_KERNEL); diff --git a/cuda/reorder/rcm_kernels.cu b/cuda/reorder/rcm_kernels.cu index 1eb90903351..4d3960b78d8 100644 --- a/cuda/reorder/rcm_kernels.cu +++ b/cuda/reorder/rcm_kernels.cu @@ -5,6 +5,18 @@ #include "core/reorder/rcm_kernels.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + #include #include #include @@ -13,9 +25,8 @@ #include -#include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/prefix_sum.cuh" +#include "cuda/base/thrust.cuh" +#include "cuda/components/thread_ids.cuh" namespace gko { @@ -29,6 +40,9 @@ namespace cuda { namespace rcm { +constexpr int default_block_size = 512; + + #include "common/cuda_hip/reorder/rcm_kernels.hpp.inc" diff --git a/hip/reorder/rcm_kernels.hip.cpp b/hip/reorder/rcm_kernels.hip.cpp index 84306290ffa..c4455fe14f2 100644 --- a/hip/reorder/rcm_kernels.hip.cpp +++ b/hip/reorder/rcm_kernels.hip.cpp @@ -5,6 +5,18 @@ #include "core/reorder/rcm_kernels.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + #include #include #include @@ -13,9 +25,8 @@ #include -#include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/prefix_sum.hip.hpp" +#include "hip/base/thrust.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" namespace gko { @@ -29,6 +40,9 @@ namespace hip { namespace rcm { +constexpr int default_block_size = 512; + + #include "common/cuda_hip/reorder/rcm_kernels.hpp.inc"