Skip to content

Commit

Permalink
Update for CCCL 2.x (#3862)
Browse files Browse the repository at this point in the history
This PR adds `cuda::proclaim_return_type` to device lambdas used in `thrust::transform` and `thrust::make_transform_iterator`.

This PR requires libcudacxx 2.1.0, which was provided by rapidsai/rapids-cmake#464.

Closes #3863.

Authors:
  - Seunghwa Kang (https://github.com/seunghwak)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: #3862
  • Loading branch information
seunghwak authored Dec 8, 2023
1 parent 1df6217 commit 18ab76b
Show file tree
Hide file tree
Showing 15 changed files with 235 additions and 167 deletions.
5 changes: 4 additions & 1 deletion cpp/include/cugraph/utilities/misc_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
#include <thrust/iterator/transform_iterator.h>
#include <thrust/optional.h>

#include <cuda/functional>

#include <optional>
#include <tuple>
#include <vector>
Expand All @@ -44,7 +46,8 @@ std::tuple<std::vector<vertex_t>, std::vector<edge_t>> compute_offset_aligned_ed
{
auto search_offset_first = thrust::make_transform_iterator(
thrust::make_counting_iterator(size_t{1}),
[approx_edge_chunk_size] __device__(auto i) { return i * approx_edge_chunk_size; });
cuda::proclaim_return_type<size_t>(
[approx_edge_chunk_size] __device__(auto i) { return i * approx_edge_chunk_size; }));
auto num_chunks = (num_edges + approx_edge_chunk_size - 1) / approx_edge_chunk_size;

if (num_chunks > 1) {
Expand Down
48 changes: 28 additions & 20 deletions cpp/include/cugraph/utilities/shuffle_comm.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
#include <thrust/tabulate.h>
#include <thrust/tuple.h>

#include <cuda/functional>

#include <algorithm>
#include <numeric>
#include <vector>
Expand Down Expand Up @@ -197,12 +199,13 @@ void multi_partition(ValueIterator value_first,
value_last,
thrust::make_zip_iterator(
thrust::make_tuple(group_ids.begin(), intra_partition_offsets.begin())),
[value_to_group_id_op, group_first, counts = counts.data()] __device__(auto value) {
auto group_id = value_to_group_id_op(value);
cuda::std::atomic_ref<size_t> counter(counts[group_id - group_first]);
return thrust::make_tuple(group_id,
counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed));
});
cuda::proclaim_return_type<thrust::tuple<int, size_t>>(
[value_to_group_id_op, group_first, counts = counts.data()] __device__(auto value) {
auto group_id = value_to_group_id_op(value);
cuda::std::atomic_ref<size_t> counter(counts[group_id - group_first]);
return thrust::make_tuple(group_id,
counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed));
}));

rmm::device_uvector<size_t> displacements(num_groups, stream_view);
thrust::exclusive_scan(
Expand Down Expand Up @@ -245,17 +248,19 @@ void multi_partition(KeyIterator key_first,
rmm::device_uvector<int> group_ids(num_keys, stream_view);
rmm::device_uvector<size_t> intra_partition_offsets(num_keys, stream_view);
thrust::fill(rmm::exec_policy(stream_view), counts.begin(), counts.end(), size_t{0});
thrust::transform(rmm::exec_policy(stream_view),
key_first,
key_last,
thrust::make_zip_iterator(
thrust::make_tuple(group_ids.begin(), intra_partition_offsets.begin())),
[key_to_group_id_op, group_first, counts = counts.data()] __device__(auto key) {
auto group_id = key_to_group_id_op(key);
cuda::std::atomic_ref<size_t> counter(counts[group_id - group_first]);
return thrust::make_tuple(
group_id, counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed));
});
thrust::transform(
rmm::exec_policy(stream_view),
key_first,
key_last,
thrust::make_zip_iterator(
thrust::make_tuple(group_ids.begin(), intra_partition_offsets.begin())),
cuda::proclaim_return_type<thrust::tuple<int, size_t>>(
[key_to_group_id_op, group_first, counts = counts.data()] __device__(auto key) {
auto group_id = key_to_group_id_op(key);
cuda::std::atomic_ref<size_t> counter(counts[group_id - group_first]);
return thrust::make_tuple(group_id,
counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed));
}));

rmm::device_uvector<size_t> displacements(num_groups, stream_view);
thrust::exclusive_scan(
Expand Down Expand Up @@ -761,8 +766,9 @@ rmm::device_uvector<size_t> groupby_and_count(ValueIterator tx_value_first /* [I
stream_view);

auto group_id_first = thrust::make_transform_iterator(
tx_value_first,
[value_to_group_id_op] __device__(auto value) { return value_to_group_id_op(value); });
tx_value_first, cuda::proclaim_return_type<int>([value_to_group_id_op] __device__(auto value) {
return value_to_group_id_op(value);
}));
rmm::device_uvector<int> d_tx_dst_ranks(num_groups, stream_view);
rmm::device_uvector<size_t> d_tx_value_counts(d_tx_dst_ranks.size(), stream_view);
auto rank_count_pair_first = thrust::make_zip_iterator(
Expand Down Expand Up @@ -795,7 +801,9 @@ rmm::device_uvector<size_t> groupby_and_count(VertexIterator tx_key_first /* [IN
stream_view);

auto group_id_first = thrust::make_transform_iterator(
tx_key_first, [key_to_group_id_op] __device__(auto key) { return key_to_group_id_op(key); });
tx_key_first, cuda::proclaim_return_type<int>([key_to_group_id_op] __device__(auto key) {
return key_to_group_id_op(key);
}));
rmm::device_uvector<int> d_tx_dst_ranks(num_groups, stream_view);
rmm::device_uvector<size_t> d_tx_value_counts(d_tx_dst_ranks.size(), stream_view);
auto rank_count_pair_first = thrust::make_zip_iterator(
Expand Down
16 changes: 9 additions & 7 deletions cpp/src/community/detail/mis_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
#include <thrust/transform.h>
#include <thrust/transform_reduce.h>

#include <cuda/functional>

#include <cmath>

namespace cugraph {
Expand Down Expand Up @@ -78,13 +80,13 @@ rmm::device_uvector<vertex_t> maximal_independent_set(
thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, ranks.begin());

// Set ranks of zero out-degree vetices to std::numeric_limits<vertex_t>::lowest()
thrust::transform_if(
handle.get_thrust_policy(),
out_degrees.begin(),
out_degrees.end(),
ranks.begin(),
[] __device__(auto) { return std::numeric_limits<vertex_t>::lowest(); },
[] __device__(auto deg) { return deg == 0; });
thrust::transform_if(handle.get_thrust_policy(),
out_degrees.begin(),
out_degrees.end(),
ranks.begin(),
cuda::proclaim_return_type<vertex_t>(
[] __device__(auto) { return std::numeric_limits<vertex_t>::lowest(); }),
[] __device__(auto deg) { return deg == 0; });

out_degrees.resize(0, handle.get_stream());
out_degrees.shrink_to_fit(handle.get_stream());
Expand Down
39 changes: 22 additions & 17 deletions cpp/src/community/detail/refine_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
#include <thrust/transform_reduce.h>
#include <thrust/tuple.h>

#include <cuda/functional>

CUCO_DECLARE_BITWISE_COMPARABLE(float)
CUCO_DECLARE_BITWISE_COMPARABLE(double)
// FIXME: a temporary workaround for a compiler error, should be deleted once cuco gets patched.
Expand Down Expand Up @@ -213,16 +215,17 @@ refine_clustering(
: detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(
louvain_assignment_of_vertices.data(), vertex_t{0}),
*edge_weight_view,
[] __device__(auto src, auto dst, auto src_cluster, auto dst_cluster, auto wt) {
weight_t weighted_cut_contribution{0};
cuda::proclaim_return_type<weight_t>(
[] __device__(auto src, auto dst, auto src_cluster, auto dst_cluster, auto wt) {
weight_t weighted_cut_contribution{0};

if (src == dst) // self loop
weighted_cut_contribution = 0;
else if (src_cluster == dst_cluster)
weighted_cut_contribution = wt;
if (src == dst) // self loop
weighted_cut_contribution = 0;
else if (src_cluster == dst_cluster)
weighted_cut_contribution = wt;

return weighted_cut_contribution;
},
return weighted_cut_contribution;
}),
weight_t{0},
cugraph::reduce_op::plus<weight_t>{},
weighted_cut_of_vertices_to_louvain.begin());
Expand All @@ -243,13 +246,14 @@ refine_clustering(
wcut_deg_and_cluster_vol_triple_begin,
wcut_deg_and_cluster_vol_triple_end,
singleton_and_connected_flags.begin(),
[resolution, total_edge_weight] __device__(auto wcut_wdeg_and_louvain_volume) {
cuda::proclaim_return_type<uint8_t>([resolution, total_edge_weight] __device__(
auto wcut_wdeg_and_louvain_volume) {
auto wcut = thrust::get<0>(wcut_wdeg_and_louvain_volume);
auto wdeg = thrust::get<1>(wcut_wdeg_and_louvain_volume);
auto louvain_volume = thrust::get<2>(wcut_wdeg_and_louvain_volume);
return wcut >
(resolution * wdeg * (louvain_volume - wdeg) / total_edge_weight);
});
return static_cast<uint8_t>(
wcut > (resolution * wdeg * (louvain_volume - wdeg) / total_edge_weight));
}));

edge_src_property_t<GraphViewType, weight_t> src_louvain_cluster_weight_cache(handle);
edge_src_property_t<GraphViewType, weight_t> src_cut_to_louvain_cache(handle);
Expand Down Expand Up @@ -718,11 +722,12 @@ refine_clustering(
vertices_in_mis.begin(),
vertices_in_mis.end(),
dst_vertices.begin(),
[dst_first = thrust::get<1>(gain_and_dst_first.get_iterator_tuple()),
v_first = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) {
auto dst = *(dst_first + v - v_first);
return dst;
});
cuda::proclaim_return_type<vertex_t>(
[dst_first = thrust::get<1>(gain_and_dst_first.get_iterator_tuple()),
v_first = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) {
auto dst = *(dst_first + v - v_first);
return dst;
}));

cugraph::resize_dataframe_buffer(gain_and_dst_output_pairs, 0, handle.get_stream());
cugraph::shrink_to_fit_dataframe_buffer(gain_and_dst_output_pairs, handle.get_stream());
Expand Down
5 changes: 4 additions & 1 deletion cpp/src/detail/collect_local_vertex_values.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#include <cugraph/graph_functions.hpp>
#include <cugraph/utilities/shuffle_comm.cuh>

#include <cuda/functional>

namespace cugraph {
namespace detail {

Expand Down Expand Up @@ -64,7 +66,8 @@ rmm::device_uvector<value_t> collect_local_vertex_values_from_ext_vertex_value_p

auto vertex_iterator = thrust::make_transform_iterator(
d_vertices.begin(),
[local_vertex_first] __device__(vertex_t v) { return v - local_vertex_first; });
cuda::proclaim_return_type<vertex_t>(
[local_vertex_first] __device__(vertex_t v) { return v - local_vertex_first; }));

d_local_values.resize(local_vertex_last - local_vertex_first, handle.get_stream());
thrust::fill(
Expand Down
24 changes: 14 additions & 10 deletions cpp/src/generators/erdos_renyi_generator.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -28,6 +28,8 @@
#include <thrust/transform.h>
#include <thrust/tuple.h>

#include <cuda/functional>

namespace cugraph {

template <typename vertex_t>
Expand All @@ -42,12 +44,13 @@ generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle,
"Implementation cannot support specified value");

auto random_iterator = thrust::make_transform_iterator(
thrust::make_counting_iterator<size_t>(0), [seed] __device__(size_t index) {
thrust::make_counting_iterator<size_t>(0),
cuda::proclaim_return_type<float>([seed] __device__(size_t index) {
thrust::default_random_engine rng(seed);
thrust::uniform_real_distribution<float> dist(0.0, 1.0);
rng.discard(index);
return dist(rng);
});
}));

size_t count = thrust::count_if(handle.get_thrust_policy(),
random_iterator,
Expand All @@ -69,13 +72,14 @@ generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle,
indices_v.begin(),
indices_v.end(),
thrust::make_zip_iterator(thrust::make_tuple(src_v.begin(), src_v.end())),
[num_vertices] __device__(size_t index) {
size_t src = index / num_vertices;
size_t dst = index % num_vertices;

return thrust::make_tuple(static_cast<vertex_t>(src),
static_cast<vertex_t>(dst));
});
cuda::proclaim_return_type<thrust::tuple<vertex_t, vertex_t>>(
[num_vertices] __device__(size_t index) {
size_t src = index / num_vertices;
size_t dst = index % num_vertices;

return thrust::make_tuple(static_cast<vertex_t>(src),
static_cast<vertex_t>(dst));
}));

handle.sync_stream();

Expand Down
39 changes: 21 additions & 18 deletions cpp/src/generators/simple_generators.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -27,6 +27,8 @@
#include <thrust/sequence.h>
#include <thrust/tuple.h>

#include <cuda/functional>

#include <numeric>

namespace cugraph {
Expand Down Expand Up @@ -264,23 +266,24 @@ generate_complete_graph_edgelist(

auto transform_iter = thrust::make_transform_iterator(
thrust::make_counting_iterator<size_t>(0),
[base_vertex_id, num_vertices, invalid_vertex] __device__(size_t index) {
size_t graph_index = index / (num_vertices * num_vertices);
size_t local_index = index % (num_vertices * num_vertices);

vertex_t src = base_vertex_id + static_cast<vertex_t>(local_index / num_vertices);
vertex_t dst = base_vertex_id + static_cast<vertex_t>(local_index % num_vertices);

if (src == dst) {
src = invalid_vertex;
dst = invalid_vertex;
} else {
src += (graph_index * num_vertices);
dst += (graph_index * num_vertices);
}

return thrust::make_tuple(src, dst);
});
cuda::proclaim_return_type<thrust::tuple<vertex_t, vertex_t>>(
[base_vertex_id, num_vertices, invalid_vertex] __device__(size_t index) {
size_t graph_index = index / (num_vertices * num_vertices);
size_t local_index = index % (num_vertices * num_vertices);

vertex_t src = base_vertex_id + static_cast<vertex_t>(local_index / num_vertices);
vertex_t dst = base_vertex_id + static_cast<vertex_t>(local_index % num_vertices);

if (src == dst) {
src = invalid_vertex;
dst = invalid_vertex;
} else {
src += (graph_index * num_vertices);
dst += (graph_index * num_vertices);
}

return thrust::make_tuple(src, dst);
}));

output_iterator = thrust::copy_if(handle.get_thrust_policy(),
transform_iter,
Expand Down
10 changes: 6 additions & 4 deletions cpp/src/mtmg/vertex_result.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include <detail/graph_partition_utils.cuh>

#include <thrust/functional.h>
#include <thrust/gather.h>

namespace cugraph {
Expand Down Expand Up @@ -91,10 +92,11 @@ rmm::device_uvector<result_t> vertex_result_view_t<result_t>::gather(
auto vertex_partition =
vertex_partition_device_view_t<vertex_t, multi_gpu>(vertex_partition_view);

auto iter =
thrust::make_transform_iterator(local_vertices.begin(), [vertex_partition] __device__(auto v) {
auto iter = thrust::make_transform_iterator(
local_vertices.begin(),
cuda::proclaim_return_type<vertex_t>([vertex_partition] __device__(auto v) {
return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
});
}));

thrust::gather(handle.get_thrust_policy(),
iter,
Expand All @@ -111,7 +113,7 @@ rmm::device_uvector<result_t> vertex_result_view_t<result_t>::gather(
vertex_gpu_ids.begin(),
vertex_gpu_ids.end(),
thrust::make_zip_iterator(local_vertices.begin(), vertex_pos.begin(), tmp_result.begin()),
[] __device__(int gpu) { return gpu; },
thrust::identity{},
handle.get_stream());

//
Expand Down
12 changes: 8 additions & 4 deletions cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
#include <thrust/tuple.h>
#include <thrust/unique.h>

#include <cuda/functional>

#include <optional>
#include <tuple>

Expand Down Expand Up @@ -596,8 +598,9 @@ rmm::device_uvector<edge_t> get_sampling_index_without_replacement(
multiplier_t<size_t>{high_partition_over_sampling_K}),
thrust::make_transform_iterator(
thrust::make_counting_iterator(size_t{0}),
[high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__(
size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; }),
cuda::proclaim_return_type<size_t>(
[high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__(
size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; })),
handle.get_stream());
if (tmp_storage_bytes > d_tmp_storage.size()) {
d_tmp_storage = rmm::device_uvector<std::byte>(tmp_storage_bytes, handle.get_stream());
Expand All @@ -615,8 +618,9 @@ rmm::device_uvector<edge_t> get_sampling_index_without_replacement(
multiplier_t<size_t>{high_partition_over_sampling_K}),
thrust::make_transform_iterator(
thrust::make_counting_iterator(size_t{0}),
[high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__(
size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; }),
cuda::proclaim_return_type<size_t>(
[high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__(
size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; })),
handle.get_stream());

// copy the neighbor indices back to sample_nbr_indices
Expand Down
Loading

0 comments on commit 18ab76b

Please sign in to comment.