Commit b62eb85
Merge branch 'branch-24.12' into all_host_alloc_single_api_v3
revans2 committed Nov 4, 2024
2 parents: 18df720 + 0d37506
Showing 150 changed files with 3,574 additions and 596 deletions.
11 changes: 2 additions & 9 deletions ci/cudf_pandas_scripts/run_tests.sh
@@ -54,15 +54,8 @@ else
     RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist
     RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist

-    echo "" > ./constraints.txt
-    if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
-        # `test_python_cudf_pandas` constraints are for `[test]` not `[cudf-pandas-tests]`
-        rapids-dependency-file-generator \
-          --output requirements \
-          --file-key test_python_cudf_pandas \
-          --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
-        | tee ./constraints.txt
-    fi
+    # generate constraints (possibly pinning to oldest supported versions of dependencies)
+    rapids-generate-pip-constraints test_python_cudf_pandas ./constraints.txt

     python -m pip install \
       -v \
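This change appears in four CI scripts in this commit: the per-script "oldest dependencies" boilerplate is collapsed into one call to a shared rapids-generate-pip-constraints helper. The helper ships with the RAPIDS CI tooling rather than this repository, so its source is not part of the diff; below is a minimal sketch of what it presumably wraps, reconstructed from the removed lines above (the internals are an assumption). The three wheel-test scripts that follow make the same swap with their own file keys.

# Hypothetical reconstruction of the shared helper; not its actual source.
rapids-generate-pip-constraints() {
  local file_key=$1  # dependencies.yaml key, e.g. test_python_cudf_pandas
  local output=$2    # constraints file consumed by pip, e.g. ./constraints.txt
  # Default to an empty constraints file so it is a no-op for normal jobs.
  echo "" > "${output}"
  if [[ "${RAPIDS_DEPENDENCIES}" == "oldest" ]]; then
    # Pin to the oldest supported versions when the job tests minimum dependencies.
    rapids-dependency-file-generator \
      --output requirements \
      --file-key "${file_key}" \
      --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
    | tee "${output}"
  fi
}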
11 changes: 2 additions & 9 deletions ci/test_wheel_cudf.sh
@@ -12,15 +12,8 @@ RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels

rapids-logger "Install cudf, pylibcudf, and test requirements"

-# Constrain to minimum dependency versions if job is set up as "oldest"
-echo "" > ./constraints.txt
-if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
-  rapids-dependency-file-generator \
-    --output requirements \
-    --file-key py_test_cudf \
-    --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
-  | tee ./constraints.txt
-fi
+# generate constraints (possibly pinning to oldest supported versions of dependencies)
+rapids-generate-pip-constraints py_test_cudf ./constraints.txt

# echo to expand wildcard before adding `[extra]` requires for pip
python -m pip install \
12 changes: 3 additions & 9 deletions ci/test_wheel_cudf_polars.sh
@@ -29,15 +29,9 @@ RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f
RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist

rapids-logger "Installing cudf_polars and its dependencies"
-# Constraint to minimum dependency versions if job is set up as "oldest"
-echo "" > ./constraints.txt
-if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
-  rapids-dependency-file-generator \
-    --output requirements \
-    --file-key py_test_cudf_polars \
-    --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
-  | tee ./constraints.txt
-fi
+
+# generate constraints (possibly pinning to oldest supported versions of dependencies)
+rapids-generate-pip-constraints py_test_cudf_polars ./constraints.txt

# echo to expand wildcard before adding `[test]` requires for pip
python -m pip install \
12 changes: 3 additions & 9 deletions ci/test_wheel_dask_cudf.sh
@@ -12,15 +12,9 @@ RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f
RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist

rapids-logger "Install dask_cudf, cudf, pylibcudf, and test requirements"
-# Constraint to minimum dependency versions if job is set up as "oldest"
-echo "" > ./constraints.txt
-if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
-  rapids-dependency-file-generator \
-    --output requirements \
-    --file-key py_test_dask_cudf \
-    --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
-  | tee ./constraints.txt
-fi
+
+# generate constraints (possibly pinning to oldest supported versions of dependencies)
+rapids-generate-pip-constraints py_test_dask_cudf ./constraints.txt

# echo to expand wildcard before adding `[extra]` requires for pip
python -m pip install \
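In all four scripts the generated constraints file is then consumed by the pip install that follows (the flag itself sits in the truncated portion of each diff). The standard pip mechanism, shown here with an assumed wheel path and extra, would look like:

# Constraints cap resolved versions without adding new requirements.
# Wheel path and `[test]` extra are illustrative; each script uses its own.
python -m pip install -v --constraint ./constraints.txt "$(echo ./dist/cudf_*.whl)[test]"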
4 changes: 3 additions & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -46,6 +46,7 @@ dependencies:
 - librdkafka>=2.5.0,<2.6.0a0
 - librmm==24.12.*,>=0.0.0a0
 - make
+- mmh3
 - moto>=4.0.8
 - msgpack-python
 - myst-nb
@@ -65,7 +66,7 @@
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.11,<1.12
+- polars>=1.11,<1.13
 - pre-commit
 - ptxcompiler
 - pyarrow>=14.0.0,<18.0.0a0
@@ -76,6 +77,7 @@
 - pytest-xdist
 - pytest<8
 - python-confluent-kafka>=2.5.0,<2.6.0a0
+- python-xxhash
 - python>=3.10,<3.13
 - pytorch>=2.1.0
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
4 changes: 3 additions & 1 deletion conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -45,6 +45,7 @@ dependencies:
 - librdkafka>=2.5.0,<2.6.0a0
 - librmm==24.12.*,>=0.0.0a0
 - make
+- mmh3
 - moto>=4.0.8
 - msgpack-python
 - myst-nb
@@ -63,7 +64,7 @@
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.11,<1.12
+- polars>=1.11,<1.13
 - pre-commit
 - pyarrow>=14.0.0,<18.0.0a0
 - pydata-sphinx-theme!=0.14.2
@@ -74,6 +75,7 @@
 - pytest-xdist
 - pytest<8
 - python-confluent-kafka>=2.5.0,<2.6.0a0
+- python-xxhash
 - python>=3.10,<3.13
 - pytorch>=2.1.0
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
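Both environment files pick up the same three edits: new mmh3 and python-xxhash test dependencies, and a polars pin relaxed from <1.12 to <1.13. They are consumed in the usual conda way, for example:

# Standard usage of a generated environment file (environment name is illustrative).
conda env create --name cudf_dev --file conda/environments/all_cuda-125_arch-x86_64.yaml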
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -371,6 +371,7 @@ add_library(
   src/groupby/hash/compute_groupby.cu
   src/groupby/hash/compute_mapping_indices.cu
   src/groupby/hash/compute_mapping_indices_null.cu
+  src/groupby/hash/compute_shared_memory_aggs.cu
   src/groupby/hash/compute_single_pass_aggs.cu
   src/groupby/hash/create_sparse_results_table.cu
   src/groupby/hash/flatten_single_pass_aggs.cpp
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
@@ -356,7 +356,6 @@ ConfigureNVBench(
 # * strings benchmark -------------------------------------------------------------------
 ConfigureBench(
   STRINGS_BENCH
-  string/combine.cpp
   string/convert_datetime.cpp
   string/convert_durations.cpp
   string/convert_fixed_point.cpp
@@ -374,6 +373,7 @@
   STRINGS_NVBENCH
   string/case.cpp
   string/char_types.cpp
+  string/combine.cpp
   string/contains.cpp
   string/copy_if_else.cpp
   string/copy_range.cpp
57 changes: 38 additions & 19 deletions cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
@@ -45,6 +45,7 @@ std::string get_label(std::string const& test_name, nvbench::state const& state)
   auto const num_cols       = state.get_int64("num_cols");
   size_t const read_size_mb = get_read_size(state) / (1024 * 1024);
   return {test_name + ", " + std::to_string(num_cols) + " columns, " +
+          std::to_string(state.get_int64("num_iterations")) + " iterations, " +
          std::to_string(state.get_int64("num_threads")) + " threads " + " (" +
          std::to_string(read_size_mb) + " MB each)"};
 }
@@ -90,9 +91,10 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state,
                                           std::vector<cudf::type_id> const& d_types,
                                           std::string const& label)
 {
-  size_t const data_size = state.get_int64("total_data_size");
-  auto const num_threads = state.get_int64("num_threads");
-  auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
+  size_t const data_size    = state.get_int64("total_data_size");
+  auto const num_threads    = state.get_int64("num_threads");
+  auto const num_iterations = state.get_int64("num_iterations");
+  auto const source_type    = retrieve_io_type_enum(state.get_string("io_type"));

   auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
   BS::thread_pool threads(num_threads);
@@ -109,12 +111,15 @@

   nvtxRangePushA(("(read) " + label).c_str());
   state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
-             [&](nvbench::launch& launch, auto& timer) {
+             [&, num_files = num_files](nvbench::launch& launch, auto& timer) {
               auto read_func = [&](int index) {
                 auto const stream = streams[index % num_threads];
                 cudf::io::parquet_reader_options read_opts =
                   cudf::io::parquet_reader_options::builder(source_info_vector[index]);
-                cudf::io::read_parquet(read_opts, stream, cudf::get_current_device_resource_ref());
+                for (int i = 0; i < num_iterations; ++i) {
+                  cudf::io::read_parquet(
+                    read_opts, stream, cudf::get_current_device_resource_ref());
+                }
               };

               threads.pause();
@@ -128,7 +133,8 @@
   nvtxRangePop();

   auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
-  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_element_count(num_iterations * static_cast<double>(data_size) / time,
+                          "bytes_per_second");
   state.add_buffer_size(
     mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
   state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size");
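Because each thread now re-reads its input num_iterations times inside the timed region, the throughput summary must scale the byte count by the same factor; otherwise adding iterations would appear to slow the benchmark down. Worked through with illustrative numbers (not from any recorded run):

\[
\text{bytes\_per\_second} = \frac{\text{num\_iterations} \times \text{data\_size}}{t_{\text{mean}}} = \frac{4 \times 512\,\text{MiB}}{0.5\,\text{s}} = 4096\,\text{MiB/s}
\]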
@@ -173,6 +179,7 @@
 {
   size_t const data_size    = state.get_int64("total_data_size");
   auto const num_threads    = state.get_int64("num_threads");
+  auto const num_iterations = state.get_int64("num_iterations");
   size_t const input_limit  = state.get_int64("input_limit");
   size_t const output_limit = state.get_int64("output_limit");
   auto const source_type    = retrieve_io_type_enum(state.get_string("io_type"));
@@ -192,22 +199,25 @@
   nvtxRangePushA(("(read) " + label).c_str());
   std::vector<cudf::io::table_with_metadata> chunks;
   state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
-             [&](nvbench::launch& launch, auto& timer) {
+             [&, num_files = num_files](nvbench::launch& launch, auto& timer) {
               auto read_func = [&](int index) {
                 auto const stream = streams[index % num_threads];
                 cudf::io::parquet_reader_options read_opts =
                   cudf::io::parquet_reader_options::builder(source_info_vector[index]);
-                // divide chunk limits by number of threads so the number of chunks produced is the
-                // same for all cases. this seems better than the alternative, which is to keep the
-                // limits the same. if we do that, as the number of threads goes up, the number of
-                // chunks goes down - so are actually benchmarking the same thing in that case?
-                auto reader = cudf::io::chunked_parquet_reader(
-                  output_limit / num_threads, input_limit / num_threads, read_opts, stream);
-
-                // read all the chunks
-                do {
-                  auto table = reader.read_chunk();
-                } while (reader.has_next());
+                for (int i = 0; i < num_iterations; ++i) {
+                  // divide chunk limits by number of threads so the number of chunks produced is
+                  // the same for all cases. this seems better than the alternative, which is to
+                  // keep the limits the same. if we do that, as the number of threads goes up, the
+                  // number of chunks goes down - so are we actually benchmarking the same thing in
+                  // that case?
+                  auto reader = cudf::io::chunked_parquet_reader(
+                    output_limit / num_threads, input_limit / num_threads, read_opts, stream);
+
+                  // read all the chunks
+                  do {
+                    auto table = reader.read_chunk();
+                  } while (reader.has_next());
+                }
              };

              threads.pause();
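The comment in the loop encodes the invariant behind dividing the limits: assuming the total input is split evenly across one file per thread (as the setup suggests), each reader produces roughly

\[
\frac{\text{data\_size}/N}{\text{input\_limit}/N} = \frac{\text{data\_size}}{\text{input\_limit}}
\]

chunks, independent of the thread count N, so every point on the num_threads axis does comparable per-chunk work.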
@@ -221,7 +231,8 @@
   nvtxRangePop();

   auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
-  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_element_count(num_iterations * static_cast<double>(data_size) / time,
+                          "bytes_per_second");
   state.add_buffer_size(
     mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
   state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size");
@@ -267,6 +278,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_mixed)
   .add_int64_axis("cardinality", {1000})
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_iterations", {1})
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_string_axis("io_type", {"PINNED_BUFFER"});
@@ -277,6 +289,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width)
   .add_int64_axis("cardinality", {1000})
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_iterations", {1})
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_string_axis("io_type", {"PINNED_BUFFER"});
@@ -287,6 +300,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_string)
   .add_int64_axis("cardinality", {1000})
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_iterations", {1})
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_string_axis("io_type", {"PINNED_BUFFER"});
@@ -297,6 +311,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_list)
   .add_int64_axis("cardinality", {1000})
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_iterations", {1})
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_string_axis("io_type", {"PINNED_BUFFER"});
@@ -308,6 +323,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed)
   .add_int64_axis("cardinality", {1000})
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_iterations", {1})
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_int64_axis("input_limit", {640 * 1024 * 1024})
@@ -320,6 +336,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width)
   .add_int64_axis("cardinality", {1000})
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_iterations", {1})
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_int64_axis("input_limit", {640 * 1024 * 1024})
@@ -332,6 +349,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string)
   .add_int64_axis("cardinality", {1000})
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_iterations", {1})
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_int64_axis("input_limit", {640 * 1024 * 1024})
@@ -344,6 +362,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list)
   .add_int64_axis("cardinality", {1000})
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_iterations", {1})
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_int64_axis("input_limit", {640 * 1024 * 1024})
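The new num_iterations axis defaults to {1} in every registration, so existing result baselines stay comparable; larger counts can be requested at run time through nvbench's axis overrides. A hypothetical invocation (the binary name and the registered benchmark name are assumptions, since the set_name calls fall outside the visible hunks):

# Hypothetical: sweep iterations on the mixed-type multithreaded reader benchmark.
./PARQUET_MULTITHREADED_READ_NVBENCH -b BM_parquet_multithreaded_read_mixed \
  -a num_iterations=[1,4,16] -a num_threads=[4]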
58 changes: 21 additions & 37 deletions cpp/benchmarks/string/combine.cpp
@@ -14,57 +14,41 @@
  * limitations under the License.
  */

-#include "string_bench_args.hpp"
-
 #include <benchmarks/common/generate_input.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>

 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>

-class StringCombine : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>

-static void BM_combine(benchmark::State& state)
+static void bench_combine(nvbench::state& state)
 {
-  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
-  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
-  data_profile const table_profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+
+  data_profile const profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
   auto const table = create_random_table(
-    {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{n_rows}, table_profile);
+    {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, profile);
   cudf::strings_column_view input1(table->view().column(0));
   cudf::strings_column_view input2(table->view().column(1));
   cudf::string_scalar separator("+");

-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    cudf::strings::concatenate(table->view(), separator);
-  }
-
-  state.SetBytesProcessed(state.iterations() * (input1.chars_size(cudf::get_default_stream()) +
-                                                input2.chars_size(cudf::get_default_stream())));
-}
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto chars_size =
+    input1.chars_size(stream) + input2.chars_size(stream) + (num_rows * separator.size());
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);  // all bytes are read
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size);

-static void generate_bench_args(benchmark::internal::Benchmark* b)
-{
-  int const min_rows   = 1 << 12;
-  int const max_rows   = 1 << 24;
-  int const row_mult   = 8;
-  int const min_rowlen = 1 << 4;
-  int const max_rowlen = 1 << 11;
-  int const len_mult   = 4;
-  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto result = cudf::strings::concatenate(table->view(), separator);
+  });
 }

-#define STRINGS_BENCHMARK_DEFINE(name)          \
-  BENCHMARK_DEFINE_F(StringCombine, name)       \
-  (::benchmark::State & st) { BM_combine(st); } \
-  BENCHMARK_REGISTER_F(StringCombine, name)     \
-    ->Apply(generate_bench_args)                \
-    ->UseManualTime()                           \
-    ->Unit(benchmark::kMillisecond);
-
-STRINGS_BENCHMARK_DEFINE(concat)
+NVBENCH_BENCH(bench_combine)
+  .set_name("concat")
+  .add_int64_axis("row_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152});
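With this port, combine.cpp moves from the google-benchmark STRINGS_BENCH target to the nvbench STRINGS_NVBENCH target (see the benchmarks CMakeLists.txt change above), and the benchmark registers under the name "concat" with explicit row_width and num_rows axes. Axis values can then be overridden on the command line, for example (binary path assumed from the CMake target name):

# Run one configuration of the ported benchmark via nvbench's -b/-a flags.
./STRINGS_NVBENCH -b concat -a num_rows=[262144] -a row_width=[128]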
(The remaining changed files in this commit are not shown.)