From f592e9c4bfcc2d8e887ad5f96e5167ee0ee2c73a Mon Sep 17 00:00:00 2001 From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com> Date: Tue, 9 Jul 2024 21:00:57 -0700 Subject: [PATCH] Add groupby_max multi-threaded benchmark (#16154) This PR adds a **groupby_max** multi-threaded benchmark. The benchmark runs multiple **max groupby aggregations** concurrently using one CUDA stream per host thread. Closes #16134 Authors: - Srinivas Yadav (https://github.com/srinivasyadav18) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/16154 --- cpp/benchmarks/CMakeLists.txt | 4 +- cpp/benchmarks/groupby/group_max.cpp | 16 ++- .../groupby/group_max_multithreaded.cpp | 102 ++++++++++++++++++ 3 files changed, 115 insertions(+), 7 deletions(-) create mode 100644 cpp/benchmarks/groupby/group_max_multithreaded.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index a5b248135c1..ff431c7f260 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -231,8 +231,8 @@ ConfigureBench( ) ConfigureNVBench( - GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_nunique.cpp groupby/group_rank.cpp - groupby/group_struct_keys.cpp + GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_max_multithreaded.cpp + groupby/group_nunique.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp index 01ca23ebbf8..f41285008c4 100644 --- a/cpp/benchmarks/groupby/group_max.cpp +++ b/cpp/benchmarks/groupby/group_max.cpp @@ -48,20 +48,25 @@ void groupby_max_helper(nvbench::state& state, cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); }(); + auto const num_aggregations = state.get_int64("num_aggregations"); + auto keys_view = keys->view(); auto 
gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view})); std::vector requests; - requests.emplace_back(cudf::groupby::aggregation_request()); - requests[0].values = vals->view(); - requests[0].aggregations.push_back(cudf::make_max_aggregation()); + for (int64_t i = 0; i < num_aggregations; i++) { + requests.emplace_back(cudf::groupby::aggregation_request()); + requests[i].values = vals->view(); + requests[i].aggregations.push_back(cudf::make_max_aggregation()); + } auto const mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s"); + state.add_element_count( + static_cast(num_rows * num_aggregations) / elapsed_time / 1'000'000., "Mrows/s"); state.add_buffer_size( mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } @@ -91,7 +96,8 @@ NVBENCH_BENCH_TYPES(bench_groupby_max, .set_name("groupby_max") .add_int64_axis("cardinality", {0}) .add_int64_power_of_two_axis("num_rows", {12, 18, 24}) - .add_float64_axis("null_probability", {0, 0.1, 0.9}); + .add_float64_axis("null_probability", {0, 0.1, 0.9}) + .add_int64_axis("num_aggregations", {1, 2, 4, 8, 16, 32}); NVBENCH_BENCH_TYPES(bench_groupby_max_cardinality, NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("groupby_max_cardinality") diff --git a/cpp/benchmarks/groupby/group_max_multithreaded.cpp b/cpp/benchmarks/groupby/group_max_multithreaded.cpp new file mode 100644 index 00000000000..3b8faba618f --- /dev/null +++ b/cpp/benchmarks/groupby/group_max_multithreaded.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include + +#include + +template +void bench_groupby_max_multithreaded(nvbench::state& state, nvbench::type_list) +{ + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const null_probability = state.get_float64("null_probability"); + auto const num_threads = state.get_int64("num_threads"); + auto const num_aggregations = state.get_int64("num_aggregations"); + + auto const keys = [&] { + data_profile const profile = + data_profile_builder() + .cardinality(cardinality) + .no_validity() + .distribution(cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + return create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + }(); + + auto const vals = [&] { + auto builder = data_profile_builder().cardinality(0).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + if (null_probability > 0) { + builder.null_probability(null_probability); + } else { + builder.no_validity(); + } + return create_random_column( + cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); + }(); + + auto keys_view = keys->view(); + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view})); + + auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); + cudf::detail::thread_pool 
threads(num_threads); + + std::vector> requests(num_threads); + for (auto& thread_requests : requests) { + for (int64_t j = 0; j < num_aggregations; j++) { + thread_requests.emplace_back(); + thread_requests.back().values = vals->view(); + thread_requests.back().aggregations.push_back( + cudf::make_max_aggregation()); + } + } + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + auto perform_agg = [&](int64_t index) { gb_obj.aggregate(requests[index], streams[index]); }; + timer.start(); + for (int64_t i = 0; i < num_threads; ++i) { + threads.submit(perform_agg, i); + } + threads.wait_for_tasks(); + cudf::detail::join_streams(streams, cudf::get_default_stream()); + cudf::get_default_stream().synchronize(); + timer.stop(); + }); + + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count( + static_cast(num_rows * num_threads * num_aggregations) / elapsed_time / 1'000'000., + "Mrows/s"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +NVBENCH_BENCH_TYPES(bench_groupby_max_multithreaded, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("groupby_max_multithreaded") + .add_int64_axis("cardinality", {0}) + .add_int64_power_of_two_axis("num_rows", {12, 18}) + .add_float64_axis("null_probability", {0, 0.1, 0.9}) + .add_int64_axis("num_aggregations", {1}) + .add_int64_axis("num_threads", {1, 2, 4, 8});