Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add local to global index mapping #1707

Open
wants to merge 10 commits into
base: develop
Choose a base branch
from
86 changes: 85 additions & 1 deletion common/cuda_hip/distributed/index_map_kernels.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

Expand Down Expand Up @@ -296,6 +296,90 @@ GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
GKO_DECLARE_INDEX_MAP_MAP_TO_LOCAL);


template <typename LocalIndexType, typename GlobalIndexType>
void map_to_global(
std::shared_ptr<const DefaultExecutor> exec,
device_partition<const LocalIndexType, const GlobalIndexType> partition,
device_segmented_array<const GlobalIndexType> remote_global_idxs,
experimental::distributed::comm_index_type rank,
const array<LocalIndexType>& local_ids,
experimental::distributed::index_space is,
array<GlobalIndexType>& global_ids)
{
auto range_bounds = partition.offsets_begin;
auto starting_indices = partition.starting_indices_begin;
const auto& ranges_by_part = partition.ranges_by_part;
auto local_ids_it = local_ids.get_const_data();
auto input_size = local_ids.get_size();

auto policy = thrust_policy(exec);

global_ids.resize_and_reset(local_ids.get_size());
auto global_ids_it = global_ids.get_data();

auto map_local = [rank, ranges_by_part, range_bounds, starting_indices,
partition] __device__(auto lid) {
auto local_size =
static_cast<LocalIndexType>(partition.part_sizes_begin[rank]);

if (lid < 0 || lid >= local_size) {
return invalid_index<GlobalIndexType>();
}

auto local_ranges = ranges_by_part.get_segment(rank);
auto local_ranges_size =
static_cast<int64>(local_ranges.end - local_ranges.begin);

auto it = binary_search(int64(0), local_ranges_size, [=](const auto i) {
return starting_indices[local_ranges.begin[i]] >= lid;
});
auto local_range_id =
it != local_ranges_size ? it : max(int64(0), it - 1);
auto range_id = local_ranges.begin[local_range_id];

return static_cast<GlobalIndexType>(lid - starting_indices[range_id]) +
range_bounds[range_id];
};
auto map_non_local = [remote_global_idxs] __device__(auto lid) {
auto remote_size = static_cast<LocalIndexType>(
remote_global_idxs.flat_end - remote_global_idxs.flat_begin);

if (lid < 0 || lid >= remote_size) {
return invalid_index<GlobalIndexType>();
}

return remote_global_idxs.flat_begin[lid];
};
auto map_combined = [map_local, map_non_local, partition,
rank] __device__(auto lid) {
auto local_size =
static_cast<LocalIndexType>(partition.part_sizes_begin[rank]);

if (lid < local_size) {
return map_local(lid);
} else {
return map_non_local(lid - local_size);
}
};

if (is == experimental::distributed::index_space::local) {
thrust::transform(policy, local_ids_it, local_ids_it + input_size,
global_ids_it, map_local);
}
if (is == experimental::distributed::index_space::non_local) {
thrust::transform(policy, local_ids_it, local_ids_it + input_size,
global_ids_it, map_non_local);
}
if (is == experimental::distributed::index_space::combined) {
thrust::transform(policy, local_ids_it, local_ids_it + input_size,
global_ids_it, map_combined);
}
}

GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
GKO_DECLARE_INDEX_MAP_MAP_TO_GLOBAL);


} // namespace index_map
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
Expand Down
35 changes: 34 additions & 1 deletion common/cuda_hip/distributed/partition_kernels.cpp
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#include "core/distributed/partition_kernels.hpp"

#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/scan.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>

#include "common/cuda_hip/base/thrust.hpp"
#include "common/cuda_hip/components/atomic.hpp"
#include "common/unified/base/kernel_launch.hpp"
#include "core/components/fill_array_kernels.hpp"

Expand Down Expand Up @@ -132,6 +135,36 @@ GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
GKO_DECLARE_PARTITION_BUILD_STARTING_INDICES);


void build_ranges_by_part(std::shared_ptr<const DefaultExecutor> exec,
const int* range_parts, size_type num_ranges,
int num_parts, array<size_type>& range_ids,
array<int64>& sizes)
{
auto policy = thrust_policy(exec);

range_ids.resize_and_reset(num_ranges);
auto range_ids_ptr = range_ids.get_data();
thrust::sequence(policy, range_ids_ptr, range_ids_ptr + num_ranges);

// mutable copy of range_parts such that it can be used as keys for sorting
array<int> range_parts_copy{exec, num_ranges};
thrust::copy_n(policy, range_parts, num_ranges,
range_parts_copy.get_data());
auto range_parts_ptr = range_parts_copy.get_data();

thrust::stable_sort_by_key(policy, range_parts_ptr,
range_parts_ptr + num_ranges, range_ids_ptr);

sizes.resize_and_reset(num_parts);
auto sizes_ptr = sizes.get_data();
thrust::fill_n(policy, sizes_ptr, num_parts, 0);
thrust::for_each_n(policy, range_parts_ptr, num_ranges,
[sizes_ptr] __device__(const size_type pid) {
atomic_add(sizes_ptr + pid, int64(1));
});
}


} // namespace partition
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
Expand Down
4 changes: 2 additions & 2 deletions core/base/segmented_array.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

Expand Down Expand Up @@ -29,7 +29,7 @@ struct device_segmented_array {
T* end;
};

constexpr segment get_segment(size_type segment_id)
constexpr segment get_segment(size_type segment_id) const
{
GKO_ASSERT(segment_id < (offsets_end - offsets_begin));
return {flat_begin + offsets_begin[segment_id],
Expand Down
2 changes: 2 additions & 0 deletions core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ GKO_STUB_INDEX_TYPE(GKO_PARTITION_BUILD_FROM_MAPPING);
GKO_STUB_INDEX_TYPE(GKO_PARTITION_BUILD_FROM_GLOBAL_SIZE);
GKO_STUB_LOCAL_GLOBAL_TYPE(GKO_DECLARE_PARTITION_BUILD_STARTING_INDICES);
GKO_STUB_LOCAL_GLOBAL_TYPE(GKO_DECLARE_PARTITION_IS_ORDERED);
GKO_STUB(GKO_DECLARE_PARTITION_BUILD_RANGES_BY_PART);


} // namespace partition
Expand All @@ -343,6 +344,7 @@ namespace index_map {

GKO_STUB_LOCAL_GLOBAL_TYPE(GKO_DECLARE_INDEX_MAP_BUILD_MAPPING);
GKO_STUB_LOCAL_GLOBAL_TYPE(GKO_DECLARE_INDEX_MAP_MAP_TO_LOCAL);
GKO_STUB_LOCAL_GLOBAL_TYPE(GKO_DECLARE_INDEX_MAP_MAP_TO_GLOBAL);


} // namespace index_map
Expand Down
61 changes: 61 additions & 0 deletions core/distributed/device_partition.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// SPDX-FileCopyrightText: 2024 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include <ginkgo/core/distributed/partition.hpp>

#include "core/base/segmented_array.hpp"

namespace gko {


template <typename LocalIndexType, typename GlobalIndexType>
struct device_partition {
using local_index_type = LocalIndexType;
using global_index_type = GlobalIndexType;
using comm_index_type = experimental::distributed::comm_index_type;

comm_index_type num_parts;
comm_index_type num_empty_parts;
size_type size;
global_index_type* offsets_begin;
global_index_type* offsets_end;
local_index_type* starting_indices_begin;
local_index_type* starting_indices_end;
local_index_type* part_sizes_begin;
local_index_type* part_sizes_end;
const comm_index_type* part_ids_begin;
const comm_index_type* part_ids_end;
device_segmented_array<const size_type> ranges_by_part;
};


/**
* Explicitly create a const version of device_partition.
*/
template <typename LocalIndexType, typename GlobalIndexType>
constexpr device_partition<const LocalIndexType, const GlobalIndexType>
to_device_const(
MarcelKoch marked this conversation as resolved.
Show resolved Hide resolved
const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
partition)
{
auto num_ranges = partition->get_num_ranges();
auto num_parts = partition->get_num_parts();
return {num_parts,
partition->get_num_empty_parts(),
partition->get_size(),
partition->get_range_bounds(),
partition->get_range_bounds() + num_ranges + 1,
partition->get_range_starting_indices(),
partition->get_range_starting_indices() + num_ranges,
partition->get_part_sizes(),
partition->get_part_sizes() + num_parts,
partition->get_part_ids(),
partition->get_part_ids() + num_parts,
to_device(partition->get_ranges_by_part())};
}


} // namespace gko
18 changes: 17 additions & 1 deletion core/distributed/index_map.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

Expand All @@ -13,6 +13,7 @@ namespace index_map_kernels {

GKO_REGISTER_OPERATION(build_mapping, index_map::build_mapping);
GKO_REGISTER_OPERATION(map_to_local, index_map::map_to_local);
GKO_REGISTER_OPERATION(map_to_global, index_map::map_to_global);


} // namespace index_map_kernels
Expand Down Expand Up @@ -89,6 +90,21 @@ array<LocalIndexType> index_map<LocalIndexType, GlobalIndexType>::map_to_local(
}


template <typename LocalIndexType, typename GlobalIndexType>
array<GlobalIndexType>
index_map<LocalIndexType, GlobalIndexType>::map_to_global(
const array<LocalIndexType>& local_ids, index_space index_space_v) const
{
array<GlobalIndexType> global_ids(exec_);

exec_->run(index_map_kernels::make_map_to_global(
to_device_const(partition_.get()), to_device(remote_global_idxs_),
rank_, local_ids, index_space_v, global_ids));

return global_ids;
}


template <typename LocalIndexType, typename GlobalIndexType>
index_map<LocalIndexType, GlobalIndexType>::index_map(
std::shared_ptr<const Executor> exec,
Expand Down
39 changes: 34 additions & 5 deletions core/distributed/index_map_kernels.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

Expand All @@ -13,6 +13,7 @@

#include "core/base/kernel_declaration.hpp"
#include "core/base/segmented_array.hpp"
#include "core/distributed/device_partition.hpp"


namespace gko {
Expand Down Expand Up @@ -53,12 +54,15 @@ namespace kernels {
* space defined by is. The resulting indices are stored in local_ids.
* The index map is defined by the input parameters:
*
* - partition: the global partition
* - partition: the global partition
* - remote_target_ids: the owning part ids of each segment of
* remote_global_idxs
* remote_global_idxs
* - remote_global_idxs: the remote global indices, segmented by the owning part
* ids
* ids, and each segment sorted
* - rank: the part id of this process
*
* Any global index that is not in the specified local index space is mapped
* to invalid_index.
*/
#define GKO_DECLARE_INDEX_MAP_MAP_TO_LOCAL(_ltype, _gtype) \
void map_to_local( \
Expand All @@ -72,11 +76,36 @@ namespace kernels {
experimental::distributed::index_space is, array<_ltype>& local_ids)


/**
* This kernels maps local indices to global indices.
*
* The relevant input parameter from the index map are:
*
* - partition: the global partition
* - remote_global_idxs: the remote global indices, segmented by the owning part
* ids
* - rank: the part id of this process
*
* Any local index that is not part of the specified index space is mapped to
* invalid_index.
*/
#define GKO_DECLARE_INDEX_MAP_MAP_TO_GLOBAL(_ltype, _gtype) \
void map_to_global( \
std::shared_ptr<const DefaultExecutor> exec, \
device_partition<const _ltype, const _gtype> partition, \
device_segmented_array<const _gtype> remote_global_idxs, \
experimental::distributed::comm_index_type rank, \
const array<_ltype>& local_ids, \
experimental::distributed::index_space is, array<_gtype>& global_ids)


#define GKO_DECLARE_ALL_AS_TEMPLATES \
template <typename LocalIndexType, typename GlobalIndexType> \
GKO_DECLARE_INDEX_MAP_BUILD_MAPPING(LocalIndexType, GlobalIndexType); \
template <typename LocalIndexType, typename GlobalIndexType> \
GKO_DECLARE_INDEX_MAP_MAP_TO_LOCAL(LocalIndexType, GlobalIndexType)
GKO_DECLARE_INDEX_MAP_MAP_TO_LOCAL(LocalIndexType, GlobalIndexType); \
template <typename LocalIndexType, typename GlobalIndexType> \
GKO_DECLARE_INDEX_MAP_MAP_TO_GLOBAL(LocalIndexType, GlobalIndexType)


GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(index_map,
Expand Down
13 changes: 11 additions & 2 deletions core/distributed/partition.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

Expand All @@ -21,6 +21,7 @@ GKO_REGISTER_OPERATION(build_ranges_from_global_size,
partition::build_ranges_from_global_size);
GKO_REGISTER_OPERATION(build_starting_indices,
partition::build_starting_indices);
GKO_REGISTER_OPERATION(build_ranges_by_part, partition::build_ranges_by_part);
GKO_REGISTER_OPERATION(has_ordered_parts, partition::has_ordered_parts);


Expand All @@ -38,7 +39,8 @@ Partition<LocalIndexType, GlobalIndexType>::Partition(
offsets_{exec, num_ranges + 1},
starting_indices_{exec, num_ranges},
part_sizes_{exec, static_cast<size_type>(num_parts)},
part_ids_{exec, num_ranges}
part_ids_{exec, num_ranges},
ranges_by_part_{exec}
{
offsets_.fill(0);
starting_indices_.fill(0);
Expand Down Expand Up @@ -126,6 +128,13 @@ void Partition<LocalIndexType, GlobalIndexType>::finalize_construction()
get_num_parts(), num_empty_parts_, starting_indices_.get_data(),
part_sizes_.get_data()));
size_ = get_element(offsets_, get_num_ranges());
array<size_type> range_ids(exec);
array<int64> num_ranges_per_part(exec);
exec->run(partition::make_build_ranges_by_part(
part_ids_.get_const_data(), get_num_ranges(), get_num_parts(),
range_ids, num_ranges_per_part));
ranges_by_part_ = segmented_array<size_type>::create_from_sizes(
std::move(range_ids), num_ranges_per_part);
}
Comment on lines +133 to +137
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

currently, this rely on the atomic add to count the size.
then segmented_array will convert size to offset.
Maybe you can implement a similar kernel with convert_idxs_to_ptrs + additional index map from i to range_id[i] because range_part[range_id[i]] is sorted. then segmented_array can create_from_offsets directly


template <typename LocalIndexType, typename GlobalIndexType>
Expand Down
Loading
Loading