diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 8190b5d0297..315a389339a 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -31,6 +31,6 @@ ENV PYTHONDONTWRITEBYTECODE="1" ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" -ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai" +ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV HISTFILE="/home/coder/.cache/._bash_history" ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache" diff --git a/.github/workflows/auto-assign.yml b/.github/workflows/auto-assign.yml new file mode 100644 index 00000000000..1bf4ac08b69 --- /dev/null +++ b/.github/workflows/auto-assign.yml @@ -0,0 +1,17 @@ +name: "Auto Assign PR" + +on: + pull_request_target: + types: + - opened + - reopened + - synchronize + +jobs: + add_assignees: + runs-on: ubuntu-latest + steps: + - uses: actions-ecosystem/action-add-assignees@v1 + with: + github_token: "${{ secrets.GITHUB_TOKEN }}" + assignees: ${{ github.actor }} diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 31e78f82a62..f5cb71bfc14 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -1,4 +1,5 @@ name: "Pull Request Labeler" + on: - pull_request_target diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 174dc72bf02..f5234f58efe 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,17 +16,6 @@ repos: ^cpp/cmake/thirdparty/patches/.*| ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* ) - - repo: https://github.com/PyCQA/isort - rev: 5.13.2 - hooks: - - id: isort - # Use the config file specific to each subproject so that each - # project can specify its own first/third-party packages. 
- args: ["--config-root=python/", "--resolve-all-configs"] - files: python/.* - exclude: | - (?x)^(^python/cudf_polars/.*) - types_or: [python, cython, pyi] - repo: https://github.com/MarcoGorelli/cython-lint rev: v0.16.2 hooks: @@ -150,6 +139,7 @@ repos: rev: v0.4.8 hooks: - id: ruff + args: ["--fix"] files: python/.*$ - id: ruff-format files: python/.*$ @@ -165,7 +155,7 @@ repos: ) - id: verify-alpha-spec - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.13.11 + rev: v1.16.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f9cdde7c2b7..3db1ed35294 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -38,6 +38,7 @@ conduct. More information can be found at: 8. Verify that CI passes all [status checks](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks). Fix if needed. 9. Wait for other developers to review your code and update code as needed. + Changes to any C++ files require at least 2 approvals from the cudf-cpp-codeowners before merging. 10. Once reviewed and approved, a RAPIDS developer will merge your pull request. If you are unsure about anything, don't hesitate to comment on issues and ask for clarification! @@ -293,8 +294,8 @@ In order to run doxygen as a linter on C++/CUDA code, run ./ci/checks/doxygen.sh ``` -Python code runs several linters including [Black](https://black.readthedocs.io/en/stable/), -[isort](https://pycqa.github.io/isort/), and [flake8](https://flake8.pycqa.org/en/latest/). +Python code runs several linters including [Ruff](https://docs.astral.sh/ruff/) +with its various rules for Black-like formatting or Isort. cuDF also uses [codespell](https://github.com/codespell-project/codespell) to find spelling mistakes, and this check is run as a pre-commit hook. 
To apply the suggested spelling fixes, diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index e5fcef17a83..3d06eacf9ff 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -15,8 +15,12 @@ rapids-print-env rapids-logger "Begin cpp build" +sccache --zero-stats + # With boa installed conda build forward to boa RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild \ conda/recipes/libcudf +sccache --show-adv-stats + rapids-upload-conda-to-s3 cpp diff --git a/ci/build_python.sh b/ci/build_python.sh index 823d7f62290..ed90041cc77 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -19,6 +19,8 @@ rapids-logger "Begin py build" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) +sccache --zero-stats + # TODO: Remove `--no-test` flag once importing on a CPU # node works correctly # With boa installed conda build forwards to the boa builder @@ -28,12 +30,18 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --channel "${CPP_CHANNEL}" \ conda/recipes/pylibcudf +sccache --show-adv-stats +sccache --zero-stats + RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cudf +sccache --show-adv-stats +sccache --zero-stats + RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ @@ -46,6 +54,8 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cudf_kafka +sccache --show-adv-stats + RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index bf76f4ed29a..78b8a8a08cf 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -3,7 +3,8 @@ set -euo pipefail -package_dir=$1 +package_name=$1 +package_dir=$2 source rapids-configure-sccache source 
rapids-date-string @@ -12,4 +13,14 @@ rapids-generate-version > ./VERSION cd "${package_dir}" -python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check +sccache --zero-stats + +rapids-logger "Building '${package_name}' wheel" +python -m pip wheel \ + -w dist \ + -v \ + --no-deps \ + --disable-pip-version-check \ + . + +sccache --show-adv-stats diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index fb93b06dbe2..fef4416a366 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -18,7 +18,7 @@ echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf echo "pylibcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/pylibcudf_dist/pylibcudf_*.whl)" >> /tmp/constraints.txt export PIP_CONSTRAINT="/tmp/constraints.txt" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh cudf ${package_dir} python -m auditwheel repair \ --exclude libcudf.so \ diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh index 9c945e11c00..79853cdbdb2 100755 --- a/ci/build_wheel_cudf_polars.sh +++ b/ci/build_wheel_cudf_polars.sh @@ -5,7 +5,7 @@ set -euo pipefail package_dir="python/cudf_polars" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh cudf-polars ${package_dir} RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh index eb2a91289f7..00c64afa2ef 100755 --- a/ci/build_wheel_dask_cudf.sh +++ b/ci/build_wheel_dask_cudf.sh @@ -5,7 +5,7 @@ set -euo pipefail package_dir="python/dask_cudf" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh dask-cudf ${package_dir} RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" 
-RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist +RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index 91bc071583e..b3d6778ea04 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -3,10 +3,30 @@ set -euo pipefail +package_name="libcudf" package_dir="python/libcudf" +rapids-logger "Generating build requirements" + +rapids-dependency-file-generator \ + --output requirements \ + --file-key "py_build_${package_name}" \ + --file-key "py_rapids_build_${package_name}" \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};cuda_suffixed=true" \ +| tee /tmp/requirements-build.txt + +rapids-logger "Installing build requirements" +python -m pip install \ + -v \ + --prefer-binary \ + -r /tmp/requirements-build.txt + +# build with '--no-build-isolation', for better sccache hit rate +# 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735) +export PIP_NO_BUILD_ISOLATION=0 + export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh "${package_name}" "${package_dir}" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" @@ -16,4 +36,4 @@ python -m auditwheel repair \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* -RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist +RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp "${package_dir}/final_dist" diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh index 5e9f7f8a0c4..839d98846fe 100755 --- a/ci/build_wheel_pylibcudf.sh +++ b/ci/build_wheel_pylibcudf.sh @@ -16,7 +16,7 @@ 
RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf_*.whl)" > /tmp/constraints.txt export PIP_CONSTRAINT="/tmp/constraints.txt" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh pylibcudf ${package_dir} python -m auditwheel repair \ --exclude libcudf.so \ @@ -24,4 +24,4 @@ python -m auditwheel repair \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index f6bdc6f9484..61361fffb07 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -54,15 +54,8 @@ else RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist - echo "" > ./constraints.txt - if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then - # `test_python_cudf_pandas` constraints are for `[test]` not `[cudf-pandas-tests]` - rapids-dependency-file-generator \ - --output requirements \ - --file-key test_python_cudf_pandas \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ - | tee ./constraints.txt - fi + # generate constraints (possibly pinning to oldest supported versions of dependencies) + rapids-generate-pip-constraints test_python_cudf_pandas ./constraints.txt python -m pip install \ -v \ diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index a701bfe15e0..ce12744c9e3 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -12,15 +12,8 @@ RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" 
rapids-download-wheels rapids-logger "Install cudf, pylibcudf, and test requirements" -# Constrain to minimum dependency versions if job is set up as "oldest" -echo "" > ./constraints.txt -if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then - rapids-dependency-file-generator \ - --output requirements \ - --file-key py_test_cudf \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ - | tee ./constraints.txt -fi +# generate constraints (possibly pinning to oldest supported versions of dependencies) +rapids-generate-pip-constraints py_test_cudf ./constraints.txt # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 05f882a475b..2884757e46b 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -29,15 +29,9 @@ RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist rapids-logger "Installing cudf_polars and its dependencies" -# Constraint to minimum dependency versions if job is set up as "oldest" -echo "" > ./constraints.txt -if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then - rapids-dependency-file-generator \ - --output requirements \ - --file-key py_test_cudf_polars \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ - | tee ./constraints.txt -fi + +# generate constraints (possibly pinning to oldest supported versions of dependencies) +rapids-generate-pip-constraints py_test_cudf_polars ./constraints.txt # echo to expand wildcard before adding `[test]` requires for pip python -m pip install \ diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 361a42ccda9..e15949f4bdb 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -12,15 +12,9 
@@ RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist rapids-logger "Install dask_cudf, cudf, pylibcudf, and test requirements" -# Constraint to minimum dependency versions if job is set up as "oldest" -echo "" > ./constraints.txt -if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then - rapids-dependency-file-generator \ - --output requirements \ - --file-key py_test_dask_cudf \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ - | tee ./constraints.txt -fi + +# generate constraints (possibly pinning to oldest supported versions of dependencies) +rapids-generate-pip-constraints py_test_dask_cudf ./constraints.txt # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index bd5e6c3d569..9d9fec97731 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -46,6 +46,7 @@ dependencies: - librdkafka>=2.5.0,<2.6.0a0 - librmm==24.12.*,>=0.0.0a0 - make +- mmh3 - moto>=4.0.8 - msgpack-python - myst-nb @@ -58,14 +59,14 @@ dependencies: - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 -- nvcomp==4.0.1 +- nvcomp==4.1.0.6 - nvtx>=0.2.1 - openpyxl - packaging - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.8,<1.9 +- polars>=1.11,<1.13 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<18.0.0a0 @@ -76,6 +77,7 @@ dependencies: - pytest-xdist - pytest<8 - python-confluent-kafka>=2.5.0,<2.6.0a0 +- python-xxhash - python>=3.10,<3.13 - pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 565a3ebfa3c..19e3eafd641 100644 --- 
a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -45,6 +45,7 @@ dependencies: - librdkafka>=2.5.0,<2.6.0a0 - librmm==24.12.*,>=0.0.0a0 - make +- mmh3 - moto>=4.0.8 - msgpack-python - myst-nb @@ -56,14 +57,14 @@ dependencies: - numba-cuda>=0.0.13 - numpy>=1.23,<3.0a0 - numpydoc -- nvcomp==4.0.1 +- nvcomp==4.1.0.6 - nvtx>=0.2.1 - openpyxl - packaging - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.8,<1.9 +- polars>=1.11,<1.13 - pre-commit - pyarrow>=14.0.0,<18.0.0a0 - pydata-sphinx-theme!=0.14.2 @@ -74,6 +75,7 @@ dependencies: - pytest-xdist - pytest<8 - python-confluent-kafka>=2.5.0,<2.6.0a0 +- python-xxhash - python>=3.10,<3.13 - pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index e8fef715c60..edf92b930d9 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.8,<1.9 + - polars >=1.11,<1.12 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index dc75eb4b252..c78ca326005 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -35,7 +35,7 @@ spdlog_version: - ">=1.14.1,<1.15" nvcomp_version: - - "=4.0.1" + - "=4.1.0.6" zlib_version: - ">=1.2.13" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e4b9cbf8921..bfa4bf80724 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -369,6 +369,9 @@ add_library( src/filling/sequence.cu src/groupby/groupby.cu src/groupby/hash/compute_groupby.cu + src/groupby/hash/compute_mapping_indices.cu + src/groupby/hash/compute_mapping_indices_null.cu + src/groupby/hash/compute_shared_memory_aggs.cu src/groupby/hash/compute_single_pass_aggs.cu 
src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index f013b31b3de..68781889c53 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -345,18 +345,17 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- -ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) +ConfigureBench(TEXT_BENCH text/subword.cpp) ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp + TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp ) # ################################################################################################## # * strings benchmark ------------------------------------------------------------------- ConfigureBench( STRINGS_BENCH - string/combine.cpp string/convert_datetime.cpp string/convert_durations.cpp string/convert_fixed_point.cpp @@ -374,6 +373,7 @@ ConfigureNVBench( STRINGS_NVBENCH string/case.cpp string/char_types.cpp + string/combine.cpp string/contains.cpp string/copy_if_else.cpp string/copy_range.cpp diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp index f44f26e4d2c..2533ea9611c 100644 --- a/cpp/benchmarks/ast/transform.cpp +++ b/cpp/benchmarks/ast/transform.cpp @@ -16,16 +16,29 @@ #include +#include + +#include +#include +#include +#include #include #include +#include +#include #include #include #include +#include #include +#include +#include +#include +#include #include #include #include @@ -39,14 
+52,14 @@ enum class TreeType { template static void BM_ast_transform(nvbench::state& state) { - auto const table_size = static_cast(state.get_int64("table_size")); + auto const num_rows = static_cast(state.get_int64("num_rows")); auto const tree_levels = static_cast(state.get_int64("tree_levels")); // Create table data auto const n_cols = reuse_columns ? 1 : tree_levels + 1; auto const source_table = create_sequence_table(cycle_dtypes({cudf::type_to_id()}, n_cols), - row_count{table_size}, + row_count{num_rows}, Nullable ? std::optional{0.5} : std::nullopt); auto table = source_table->view(); @@ -86,7 +99,71 @@ static void BM_ast_transform(nvbench::state& state) auto const& expression_tree_root = expressions.back(); // Use the number of bytes read from global memory - state.add_global_memory_reads(table_size * (tree_levels + 1)); + state.add_global_memory_reads(static_cast(num_rows) * (tree_levels + 1)); + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); }); +} + +template +static void BM_string_compare_ast_transform(nvbench::state& state) +{ + auto const string_width = static_cast(state.get_int64("string_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const tree_levels = static_cast(state.get_int64("tree_levels")); + auto const hit_rate = static_cast(state.get_int64("hit_rate")); + + CUDF_EXPECTS(tree_levels > 0, "benchmarks require 1 or more comparisons"); + + // Create table data + auto const num_cols = tree_levels * 2; + std::vector> columns; + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) { + columns.emplace_back(create_string_column(num_rows, string_width, hit_rate)); + }); + + cudf::table table{std::move(columns)}; + cudf::table_view const table_view = table.view(); + + int64_t const chars_size = std::accumulate( + table_view.begin(), + table_view.end(), + 
static_cast(0), + [](int64_t size, auto& column) -> int64_t { + return size + cudf::strings_column_view{column}.chars_size(cudf::get_default_stream()); + }); + + // Create column references + auto column_refs = std::vector(); + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_cols), + std::back_inserter(column_refs), + [](auto const& column_id) { return cudf::ast::column_reference(column_id); }); + + // Create expression trees + std::list expressions; + + // Construct AST tree (a == b && c == d && e == f && ...) + + expressions.emplace_back(cudf::ast::operation(cmp_op, column_refs[0], column_refs[1])); + + std::for_each(thrust::make_counting_iterator(1), + thrust::make_counting_iterator(tree_levels), + [&](size_t idx) { + auto const& lhs = expressions.back(); + auto const& rhs = expressions.emplace_back( + cudf::ast::operation(cmp_op, column_refs[idx * 2], column_refs[idx * 2 + 1])); + expressions.emplace_back(cudf::ast::operation(reduce_op, lhs, rhs)); + }); + + auto const& expression_tree_root = expressions.back(); + + // Use the number of bytes read from global memory + state.add_element_count(chars_size, "chars_size"); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); }); @@ -100,7 +177,7 @@ static void BM_ast_transform(nvbench::state& state) NVBENCH_BENCH(name) \ .set_name(#name) \ .add_int64_axis("tree_levels", {1, 5, 10}) \ - .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false); @@ -115,3 +192,19 @@ AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_reuse_nulls, int32_t, TreeType::IMBALANCED_LEFT, true, true); AST_TRANSFORM_BENCHMARK_DEFINE( 
ast_double_imbalanced_unique_nulls, double, TreeType::IMBALANCED_LEFT, false, true); + +#define AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(name, cmp_op, reduce_op) \ + static void name(::nvbench::state& st) \ + { \ + ::BM_string_compare_ast_transform(st); \ + } \ + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("string_width", {32, 64, 128, 256}) \ + .add_int64_axis("num_rows", {32768, 262144, 2097152}) \ + .add_int64_axis("tree_levels", {1, 2, 3, 4}) \ + .add_int64_axis("hit_rate", {50, 100}) + +AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(ast_string_equal_logical_and, + cudf::ast::ast_operator::EQUAL, + cudf::ast::ast_operator::LOGICAL_AND); diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp index 7d267a88764..75c91d270a7 100644 --- a/cpp/benchmarks/binaryop/binaryop.cpp +++ b/cpp/benchmarks/binaryop/binaryop.cpp @@ -17,12 +17,18 @@ #include #include +#include +#include #include #include +#include + #include #include +#include +#include // This set of benchmarks is designed to be a comparison for the AST benchmarks @@ -34,17 +40,18 @@ enum class TreeType { template static void BM_binaryop_transform(nvbench::state& state) { - auto const table_size{static_cast(state.get_int64("table_size"))}; + auto const num_rows{static_cast(state.get_int64("num_rows"))}; auto const tree_levels{static_cast(state.get_int64("tree_levels"))}; // Create table data auto const n_cols = reuse_columns ? 
1 : tree_levels + 1; auto const source_table = create_sequence_table( - cycle_dtypes({cudf::type_to_id()}, n_cols), row_count{table_size}); + cycle_dtypes({cudf::type_to_id()}, n_cols), row_count{num_rows}); cudf::table_view table{*source_table}; // Use the number of bytes read from global memory - state.add_global_memory_reads(table_size * (tree_levels + 1)); + state.add_global_memory_reads(static_cast(num_rows) * (tree_levels + 1)); + state.add_global_memory_writes(num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { // Execute tree that chains additions like (((a + b) + c) + d) @@ -64,15 +71,69 @@ static void BM_binaryop_transform(nvbench::state& state) }); } +template +static void BM_string_compare_binaryop_transform(nvbench::state& state) +{ + auto const string_width = static_cast(state.get_int64("string_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const tree_levels = static_cast(state.get_int64("tree_levels")); + auto const hit_rate = static_cast(state.get_int64("hit_rate")); + + CUDF_EXPECTS(tree_levels > 0, "benchmarks require 1 or more comparisons"); + + // Create table data + auto const num_cols = tree_levels * 2; + std::vector> columns; + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) { + columns.emplace_back(create_string_column(num_rows, string_width, hit_rate)); + }); + + cudf::table table{std::move(columns)}; + cudf::table_view const table_view = table.view(); + + int64_t const chars_size = std::accumulate( + table_view.begin(), table_view.end(), static_cast(0), [](int64_t size, auto& column) { + return size + cudf::strings_column_view{column}.chars_size(cudf::get_default_stream()); + }); + + // Create column references + + // Use the number of bytes read from global memory + state.add_element_count(chars_size, "chars_size"); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows); + + // Construct binary 
operations (a == b && c == d && e == f && ...) + auto constexpr bool_type = cudf::data_type{cudf::type_id::BOOL8}; + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream{launch.get_stream().get_stream()}; + std::unique_ptr reduction = + cudf::binary_operation(table.get_column(0), table.get_column(1), cmp_op, bool_type, stream); + std::for_each( + thrust::make_counting_iterator(1), + thrust::make_counting_iterator(tree_levels), + [&](size_t idx) { + std::unique_ptr comparison = cudf::binary_operation( + table.get_column(idx * 2), table.get_column(idx * 2 + 1), cmp_op, bool_type, stream); + std::unique_ptr reduced = + cudf::binary_operation(*comparison, *reduction, reduce_op, bool_type, stream); + stream.synchronize(); + reduction = std::move(reduced); + }); + }); +} + #define BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \ \ static void name(::nvbench::state& st) \ { \ - BM_binaryop_transform(st); \ + ::BM_binaryop_transform(st); \ } \ NVBENCH_BENCH(name) \ .add_int64_axis("tree_levels", {1, 2, 5, 10}) \ - .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_int32_imbalanced_unique, int32_t, @@ -86,3 +147,20 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique, double, TreeType::IMBALANCED_LEFT, false); + +#define STRING_COMPARE_BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, cmp_op, reduce_op) \ + \ + static void name(::nvbench::state& st) \ + { \ + ::BM_string_compare_binaryop_transform(st); \ + } \ + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("string_width", {32, 64, 128, 256}) \ + .add_int64_axis("num_rows", {32768, 262144, 2097152}) \ + .add_int64_axis("tree_levels", {1, 2, 3, 4}) \ + .add_int64_axis("hit_rate", {50, 100}) + 
+STRING_COMPARE_BINARYOP_TRANSFORM_BENCHMARK_DEFINE(string_compare_binaryop_transform, + cudf::binary_operator::EQUAL, + cudf::binary_operator::LOGICAL_AND); diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp index bc0ff69bce9..426f44a4fa1 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp @@ -23,10 +23,10 @@ template void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop) { - auto const table_size = static_cast(state.get_int64("table_size")); + auto const num_rows = static_cast(state.get_int64("num_rows")); auto const source_table = create_random_table( - {cudf::type_to_id(), cudf::type_to_id()}, row_count{table_size}); + {cudf::type_to_id(), cudf::type_to_id()}, row_count{num_rows}); auto lhs = cudf::column_view(source_table->get_column(0)); auto rhs = cudf::column_view(source_table->get_column(1)); @@ -37,9 +37,9 @@ void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop) cudf::binary_operation(lhs, rhs, binop, output_dtype); // use number of bytes read and written to global memory - state.add_global_memory_reads(table_size); - state.add_global_memory_reads(table_size); - state.add_global_memory_reads(table_size); + state.add_global_memory_reads(num_rows); + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); }); @@ -55,7 +55,7 @@ void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop) } \ NVBENCH_BENCH(name) \ .set_name("compiled_binary_op_" BM_STRINGIFY(name)) \ - .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}) + .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}) #define build_name(a, b, c, d) a##_##b##_##c##_##d diff --git a/cpp/benchmarks/common/generate_input.cu 
b/cpp/benchmarks/common/generate_input.cu index dc258e32dc5..bdce8a31176 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -17,13 +17,17 @@ #include "generate_input.hpp" #include "random_distribution_factory.cuh" +#include + #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -918,6 +922,58 @@ std::unique_ptr create_sequence_table(std::vector co return std::make_unique(std::move(columns)); } +std::unique_ptr create_string_column(cudf::size_type num_rows, + cudf::size_type row_width, + int32_t hit_rate) +{ + // build input table using the following data + auto raw_data = cudf::test::strings_column_wrapper( + { + "123 abc 4567890 DEFGHI 0987 5W43", // matches both patterns; + "012345 6789 01234 56789 0123 456", // the rest do not match + "abc 4567890 DEFGHI 0987 Wxyz 123", + "abcdefghijklmnopqrstuvwxyz 01234", + "", + "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", + "9876543210,abcdefghijklmnopqrstU", + "9876543210,abcdefghijklmnopqrstU", + "123 édf 4567890 DéFG 0987 X5", + "1", + }) + .release(); + + if (row_width / 32 > 1) { + std::vector columns; + for (int i = 0; i < row_width / 32; ++i) { + columns.push_back(raw_data->view()); + } + raw_data = cudf::strings::concatenate(cudf::table_view(columns)); + } + auto data_view = raw_data->view(); + + // compute number of rows in n_rows that should match + auto const num_matches = (static_cast(num_rows) * hit_rate) / 100; + + // Create a randomized gather-map to build a column out of the strings in data. 
+ data_profile gather_profile = + data_profile_builder().cardinality(0).null_probability(0.0).distribution( + cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1); + auto gather_table = + create_random_table({cudf::type_id::INT32}, row_count{num_rows}, gather_profile); + gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); + + // Create scatter map by placing 0-index values throughout the gather-map + auto scatter_data = cudf::sequence(num_matches, + cudf::numeric_scalar(0), + cudf::numeric_scalar(num_rows / num_matches)); + auto zero_scalar = cudf::numeric_scalar(0); + auto table = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view()); + auto gather_map = table->view().column(0); + table = cudf::gather(cudf::table_view({data_view}), gather_map); + + return std::move(table->release().front()); +} + std::pair create_random_null_mask( cudf::size_type size, std::optional null_probability, unsigned seed) { diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp index 68d3dc492f5..57834fd11d2 100644 --- a/cpp/benchmarks/common/generate_input.hpp +++ b/cpp/benchmarks/common/generate_input.hpp @@ -670,6 +670,18 @@ std::unique_ptr create_random_column(cudf::type_id dtype_id, data_profile const& data_params = data_profile{}, unsigned seed = 1); +/** + * @brief Deterministically generates a large string column filled with data with the given + * parameters. + * + * @param num_rows Number of rows in the output column + * @param row_width Width of each string in the column + * @param hit_rate The hit rate percentage, ranging from 0 to 100 + */ +std::unique_ptr create_string_column(cudf::size_type num_rows, + cudf::size_type row_width, + int32_t hit_rate); + /** * @brief Generate sequence columns starting with value 0 in first row and increasing by 1 in * subsequent rows. 
diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index fe24fb58728..45b46005c47 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -186,7 +186,7 @@ std::string exec_cmd(std::string_view cmd) std::fflush(nullptr); // Switch stderr and stdout to only capture stderr auto const redirected_cmd = std::string{"( "}.append(cmd).append(" 3>&2 2>&1 1>&3) 2>/dev/null"); - std::unique_ptr pipe(popen(redirected_cmd.c_str(), "r"), pclose); + std::unique_ptr pipe(popen(redirected_cmd.c_str(), "r"), pclose); CUDF_EXPECTS(pipe != nullptr, "popen() failed"); std::array buffer; diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index 7121cb9f034..bf7039269bc 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -45,6 +45,7 @@ std::string get_label(std::string const& test_name, nvbench::state const& state) auto const num_cols = state.get_int64("num_cols"); size_t const read_size_mb = get_read_size(state) / (1024 * 1024); return {test_name + ", " + std::to_string(num_cols) + " columns, " + + std::to_string(state.get_int64("num_iterations")) + " iterations, " + std::to_string(state.get_int64("num_threads")) + " threads " + " (" + std::to_string(read_size_mb) + " MB each)"}; } @@ -90,9 +91,10 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, std::vector const& d_types, std::string const& label) { - size_t const data_size = state.get_int64("total_data_size"); - auto const num_threads = state.get_int64("num_threads"); - auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); + size_t const data_size = state.get_int64("total_data_size"); + auto const num_threads = state.get_int64("num_threads"); + auto const num_iterations = state.get_int64("num_iterations"); + auto const source_type = 
retrieve_io_type_enum(state.get_string("io_type")); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); BS::thread_pool threads(num_threads); @@ -109,12 +111,15 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, nvtxRangePushA(("(read) " + label).c_str()); state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [&](nvbench::launch& launch, auto& timer) { + [&, num_files = num_files](nvbench::launch& launch, auto& timer) { auto read_func = [&](int index) { auto const stream = streams[index % num_threads]; cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(source_info_vector[index]); - cudf::io::read_parquet(read_opts, stream, cudf::get_current_device_resource_ref()); + for (int i = 0; i < num_iterations; ++i) { + cudf::io::read_parquet( + read_opts, stream, cudf::get_current_device_resource_ref()); + } }; threads.pause(); @@ -128,7 +133,8 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, nvtxRangePop(); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); + state.add_element_count(num_iterations * static_cast(data_size) / time, + "bytes_per_second"); state.add_buffer_size( mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size"); @@ -173,6 +179,7 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, { size_t const data_size = state.get_int64("total_data_size"); auto const num_threads = state.get_int64("num_threads"); + auto const num_iterations = state.get_int64("num_iterations"); size_t const input_limit = state.get_int64("input_limit"); size_t const output_limit = state.get_int64("output_limit"); auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); @@ -192,22 +199,25 @@ void 
BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, nvtxRangePushA(("(read) " + label).c_str()); std::vector chunks; state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [&](nvbench::launch& launch, auto& timer) { + [&, num_files = num_files](nvbench::launch& launch, auto& timer) { auto read_func = [&](int index) { auto const stream = streams[index % num_threads]; cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(source_info_vector[index]); - // divide chunk limits by number of threads so the number of chunks produced is the - // same for all cases. this seems better than the alternative, which is to keep the - // limits the same. if we do that, as the number of threads goes up, the number of - // chunks goes down - so are actually benchmarking the same thing in that case? - auto reader = cudf::io::chunked_parquet_reader( - output_limit / num_threads, input_limit / num_threads, read_opts, stream); - - // read all the chunks - do { - auto table = reader.read_chunk(); - } while (reader.has_next()); + for (int i = 0; i < num_iterations; ++i) { + // divide chunk limits by number of threads so the number of chunks produced is + // the same for all cases. this seems better than the alternative, which is to + // keep the limits the same. if we do that, as the number of threads goes up, the + // number of chunks goes down - so are actually benchmarking the same thing in + // that case? 
+ auto reader = cudf::io::chunked_parquet_reader( + output_limit / num_threads, input_limit / num_threads, read_opts, stream); + + // read all the chunks + do { + auto table = reader.read_chunk(); + } while (reader.has_next()); + } }; threads.pause(); @@ -221,7 +231,8 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, nvtxRangePop(); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); + state.add_element_count(num_iterations * static_cast(data_size) / time, + "bytes_per_second"); state.add_buffer_size( mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size"); @@ -267,6 +278,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_mixed) .add_int64_axis("cardinality", {1000}) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_iterations", {1}) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_string_axis("io_type", {"PINNED_BUFFER"}); @@ -277,6 +289,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width) .add_int64_axis("cardinality", {1000}) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_iterations", {1}) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_string_axis("io_type", {"PINNED_BUFFER"}); @@ -287,6 +300,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_string) .add_int64_axis("cardinality", {1000}) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_iterations", {1}) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_string_axis("io_type", {"PINNED_BUFFER"}); @@ -297,6 +311,7 @@ 
NVBENCH_BENCH(BM_parquet_multithreaded_read_list) .add_int64_axis("cardinality", {1000}) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_iterations", {1}) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_string_axis("io_type", {"PINNED_BUFFER"}); @@ -308,6 +323,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed) .add_int64_axis("cardinality", {1000}) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_iterations", {1}) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) @@ -320,6 +336,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width) .add_int64_axis("cardinality", {1000}) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_iterations", {1}) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) @@ -332,6 +349,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string) .add_int64_axis("cardinality", {1000}) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_iterations", {1}) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) @@ -344,6 +362,7 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list) .add_int64_axis("cardinality", {1000}) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) + .add_int64_axis("num_iterations", {1}) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) diff --git 
a/cpp/benchmarks/string/combine.cpp b/cpp/benchmarks/string/combine.cpp index 7acfb1ffb0d..d6ccfae63e8 100644 --- a/cpp/benchmarks/string/combine.cpp +++ b/cpp/benchmarks/string/combine.cpp @@ -14,57 +14,41 @@ * limitations under the License. */ -#include "string_bench_args.hpp" - #include -#include -#include #include #include #include #include -class StringCombine : public cudf::benchmark {}; +#include -static void BM_combine(benchmark::State& state) +static void bench_combine(nvbench::state& state) { - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + data_profile const profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); auto const table = create_random_table( - {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{n_rows}, table_profile); + {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, profile); cudf::strings_column_view input1(table->view().column(0)); cudf::strings_column_view input2(table->view().column(1)); cudf::string_scalar separator("+"); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - cudf::strings::concatenate(table->view(), separator); - } - - state.SetBytesProcessed(state.iterations() * (input1.chars_size(cudf::get_default_stream()) + - input2.chars_size(cudf::get_default_stream()))); -} + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto chars_size = + input1.chars_size(stream) + input2.chars_size(stream) + (num_rows * separator.size()); + state.add_global_memory_reads(chars_size); // all bytes 
are read; + state.add_global_memory_writes(chars_size); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 4; - int const max_rowlen = 1 << 11; - int const len_mult = 4; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::strings::concatenate(table->view(), separator); + }); } -#define STRINGS_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(StringCombine, name) \ - (::benchmark::State & st) { BM_combine(st); } \ - BENCHMARK_REGISTER_F(StringCombine, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -STRINGS_BENCHMARK_DEFINE(concat) +NVBENCH_BENCH(bench_combine) + .set_name("concat") + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index ae6c8b844c8..a73017dda18 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -17,10 +17,6 @@ #include #include -#include - -#include -#include #include #include #include @@ -28,57 +24,6 @@ #include -std::unique_ptr build_input_column(cudf::size_type n_rows, - cudf::size_type row_width, - int32_t hit_rate) -{ - // build input table using the following data - auto raw_data = cudf::test::strings_column_wrapper( - { - "123 abc 4567890 DEFGHI 0987 5W43", // matches both patterns; - "012345 6789 01234 56789 0123 456", // the rest do not match - "abc 4567890 DEFGHI 0987 Wxyz 123", - "abcdefghijklmnopqrstuvwxyz 01234", - "", - "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", - "9876543210,abcdefghijklmnopqrstU", - "9876543210,abcdefghijklmnopqrstU", - "123 édf 4567890 DéFG 0987 X5", - "1", - }) - .release(); - - if (row_width / 32 > 1) { - std::vector 
columns; - for (int i = 0; i < row_width / 32; ++i) { - columns.push_back(raw_data->view()); - } - raw_data = cudf::strings::concatenate(cudf::table_view(columns)); - } - auto data_view = raw_data->view(); - - // compute number of rows in n_rows that should match - auto matches = static_cast(n_rows * hit_rate) / 100; - - // Create a randomized gather-map to build a column out of the strings in data. - data_profile gather_profile = - data_profile_builder().cardinality(0).null_probability(0.0).distribution( - cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1); - auto gather_table = - create_random_table({cudf::type_id::INT32}, row_count{n_rows}, gather_profile); - gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); - - // Create scatter map by placing 0-index values throughout the gather-map - auto scatter_data = cudf::sequence( - matches, cudf::numeric_scalar(0), cudf::numeric_scalar(n_rows / matches)); - auto zero_scalar = cudf::numeric_scalar(0); - auto table = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view()); - auto gather_map = table->view().column(0); - table = cudf::gather(cudf::table_view({data_view}), gather_map); - - return std::move(table->release().front()); -} - // longer pattern lengths demand more working memory per string std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43"}; @@ -94,7 +39,7 @@ static void bench_contains(nvbench::state& state) state.skip("Skip benchmarks greater than size_type limit"); } - auto col = build_input_column(n_rows, row_width, hit_rate); + auto col = create_string_column(n_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); auto pattern = patterns[pattern_index]; diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index a9c620e4bf0..996bdcf0332 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -19,7 +19,6 @@ #include -#include #include #include 
#include @@ -29,10 +28,6 @@ #include -std::unique_ptr build_input_column(cudf::size_type n_rows, - cudf::size_type row_width, - int32_t hit_rate); - static void bench_find_string(nvbench::state& state) { auto const n_rows = static_cast(state.get_int64("num_rows")); @@ -46,7 +41,7 @@ static void bench_find_string(nvbench::state& state) } auto const stream = cudf::get_default_stream(); - auto const col = build_input_column(n_rows, row_width, hit_rate); + auto const col = create_string_column(n_rows, row_width, hit_rate); auto const input = cudf::strings_column_view(col->view()); std::vector h_targets({"5W", "5W43", "0987 5W43"}); diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp index 99cef640dc3..105ae65cbe8 100644 --- a/cpp/benchmarks/string/like.cpp +++ b/cpp/benchmarks/string/like.cpp @@ -18,68 +18,12 @@ #include -#include -#include -#include #include #include #include #include -namespace { -std::unique_ptr build_input_column(cudf::size_type n_rows, - cudf::size_type row_width, - int32_t hit_rate) -{ - // build input table using the following data - auto raw_data = cudf::test::strings_column_wrapper( - { - "123 abc 4567890 DEFGHI 0987 5W43", // matches always; - "012345 6789 01234 56789 0123 456", // the rest do not match - "abc 4567890 DEFGHI 0987 Wxyz 123", - "abcdefghijklmnopqrstuvwxyz 01234", - "", - "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", - "9876543210,abcdefghijklmnopqrstU", - "9876543210,abcdefghijklmnopqrstU", - "123 édf 4567890 DéFG 0987 X5", - "1", - }) - .release(); - if (row_width / 32 > 1) { - std::vector columns; - for (int i = 0; i < row_width / 32; ++i) { - columns.push_back(raw_data->view()); - } - raw_data = cudf::strings::concatenate(cudf::table_view(columns)); - } - auto data_view = raw_data->view(); - - // compute number of rows in n_rows that should match - auto matches = static_cast(n_rows * hit_rate) / 100; - - // Create a randomized gather-map to build a column out of the strings in data. 
- data_profile gather_profile = - data_profile_builder().cardinality(0).null_probability(0.0).distribution( - cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1); - auto gather_table = - create_random_table({cudf::type_id::INT32}, row_count{n_rows}, gather_profile); - gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); - - // Create scatter map by placing 0-index values throughout the gather-map - auto scatter_data = cudf::sequence( - matches, cudf::numeric_scalar(0), cudf::numeric_scalar(n_rows / matches)); - auto zero_scalar = cudf::numeric_scalar(0); - auto table = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view()); - auto gather_map = table->view().column(0); - table = cudf::gather(cudf::table_view({data_view}), gather_map); - - return std::move(table->release().front()); -} - -} // namespace - static void bench_like(nvbench::state& state) { auto const n_rows = static_cast(state.get_int64("num_rows")); @@ -91,7 +35,7 @@ static void bench_like(nvbench::state& state) state.skip("Skip benchmarks greater than size_type limit"); } - auto col = build_input_column(n_rows, row_width, hit_rate); + auto col = create_string_column(n_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); // This pattern forces reading the entire target string (when matched expected) diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp index 8e48f8e9a05..43d57201b20 100644 --- a/cpp/benchmarks/text/ngrams.cpp +++ b/cpp/benchmarks/text/ngrams.cpp @@ -15,58 +15,45 @@ */ #include -#include -#include -#include #include #include #include -class TextNGrams : public cudf::benchmark {}; +#include -enum class ngrams_type { tokens, characters }; - -static void BM_ngrams(benchmark::State& state, ngrams_type nt) +static void bench_ngrams(nvbench::state& state) { - auto const n_rows = static_cast(state.range(0)); - auto const max_str_length = static_cast(state.range(1)); + auto const num_rows = 
static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const ngram_type = state.get_string("type"); + data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); auto const separator = cudf::string_scalar("_"); - for (auto _ : state) { - cuda_event_timer raii(state, true); - switch (nt) { - case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break; - case ngrams_type::characters: nvtext::generate_character_ngrams(input); break; - } - } + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); -} + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(chars_size * 2); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 5; - int const max_rowlen = 40; - int const len_mult = 2; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); + if (ngram_type == "chars") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::generate_character_ngrams(input); + }); + } else { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::generate_ngrams(input, 2, separator); + }); + } } -#define NVTEXT_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(TextNGrams, name) \ - 
(::benchmark::State & st) { BM_ngrams(st, ngrams_type::name); } \ - BENCHMARK_REGISTER_F(TextNGrams, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -NVTEXT_BENCHMARK_DEFINE(tokens) -NVTEXT_BENCHMARK_DEFINE(characters) +NVBENCH_BENCH(bench_ngrams) + .set_name("ngrams") + .add_int64_axis("num_rows", {131072, 262144, 524288, 1048578}) + .add_int64_axis("row_width", {10, 20, 40, 100}) + .add_string_axis("type", {"chars", "tokens"}); diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index 311539efbfc..1c1052487f2 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -1483,6 +1483,17 @@ struct, and therefore `cudf::struct_view` is the data type of a `cudf::column` o `cudf::type_dispatcher` dispatches to the `struct_view` data type when invoked on a `STRUCT` column. +# Empty Columns + +The libcudf columns support empty, typed content. These columns have no data and no validity mask. +Empty strings or lists columns may or may not contain a child offsets column. +It is undefined behavior (UB) to access the offsets child of an empty strings or lists column. +Nested columns like lists and structs may require other children columns to provide the +nested structure of the empty types. + +Use `cudf::make_empty_column()` to create fixed-width and strings columns. +Use `cudf::empty_like()` to create an empty column from an existing `cudf::column_view`. 
+ # cuIO: file reading and writing cuIO is a component of libcudf that provides GPU-accelerated reading and writing of data file diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index a254171ef11..f4cce8e6da6 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -17,12 +17,8 @@ #include #include -#include #include #include -#include - -#include #include #include diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index 6bbe32de134..e72661ce49a 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -24,8 +24,6 @@ #include -#include - namespace CUDF_EXPORT cudf { /** * @addtogroup column_factories diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 48f89b8be25..6db5c8b3c7b 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -16,7 +16,6 @@ #pragma once #include -#include #include #include #include diff --git a/cpp/include/cudf/detail/aggregation/result_cache.hpp b/cpp/include/cudf/detail/aggregation/result_cache.hpp index ec5a511bb7c..486808ebe18 100644 --- a/cpp/include/cudf/detail/aggregation/result_cache.hpp +++ b/cpp/include/cudf/detail/aggregation/result_cache.hpp @@ -19,7 +19,6 @@ #include #include #include -#include #include diff --git a/cpp/include/cudf/detail/is_element_valid.hpp b/cpp/include/cudf/detail/is_element_valid.hpp index 4b74d12f306..26b1bec2ced 100644 --- a/cpp/include/cudf/detail/is_element_valid.hpp +++ b/cpp/include/cudf/detail/is_element_valid.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 482265d633e..025e2ccc3ec 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ 
b/cpp/include/cudf/detail/null_mask.cuh @@ -166,16 +166,9 @@ size_type inplace_bitmask_binop(Binop op, rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref(); cudf::detail::device_scalar d_counter{0, stream, mr}; - rmm::device_uvector d_masks(masks.size(), stream, mr); - rmm::device_uvector d_begin_bits(masks_begin_bits.size(), stream, mr); - - CUDF_CUDA_TRY(cudaMemcpyAsync( - d_masks.data(), masks.data(), masks.size_bytes(), cudaMemcpyDefault, stream.value())); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_begin_bits.data(), - masks_begin_bits.data(), - masks_begin_bits.size_bytes(), - cudaMemcpyDefault, - stream.value())); + + auto d_masks = cudf::detail::make_device_uvector_async(masks, stream, mr); + auto d_begin_bits = cudf::detail::make_device_uvector_async(masks_begin_bits, stream, mr); auto constexpr block_size = 256; cudf::detail::grid_1d config(dest_mask.size(), block_size); diff --git a/cpp/include/cudf/dictionary/dictionary_column_view.hpp b/cpp/include/cudf/dictionary/dictionary_column_view.hpp index 5596f78a90b..0a799f27d00 100644 --- a/cpp/include/cudf/dictionary/dictionary_column_view.hpp +++ b/cpp/include/cudf/dictionary/dictionary_column_view.hpp @@ -15,7 +15,6 @@ */ #pragma once -#include #include /** diff --git a/cpp/include/cudf/fixed_point/floating_conversion.hpp b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp similarity index 99% rename from cpp/include/cudf/fixed_point/floating_conversion.hpp rename to cpp/include/cudf/fixed_point/detail/floating_conversion.hpp index f0d50edccd1..fce08b4a5c4 100644 --- a/cpp/include/cudf/fixed_point/floating_conversion.hpp +++ b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp @@ -26,14 +26,6 @@ #include namespace CUDF_EXPORT numeric { - -/** - * @addtogroup floating_conversion - * @{ - * @file - * @brief fixed_point <--> floating-point conversion functions. 
- */ - namespace detail { /** @@ -1141,6 +1133,4 @@ CUDF_HOST_DEVICE inline FloatingType convert_integral_to_floating(Rep const& val } } // namespace detail - -/** @} */ // end of group } // namespace CUDF_EXPORT numeric diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 0c5327edb91..307a52cd242 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -22,26 +22,27 @@ namespace CUDF_EXPORT cudf { -/** - * @addtogroup column_hash - * @{ - * @file - */ - /** * @brief Type of hash value - * + * @ingroup column_hash */ using hash_value_type = uint32_t; /** * @brief The default seed value for hash functions + * @ingroup column_hash */ static constexpr uint32_t DEFAULT_HASH_SEED = 0; //! Hash APIs namespace hashing { +/** + * @addtogroup column_hash + * @{ + * @file + */ + /** * @brief Computes the MurmurHash3 32-bit hash value of each row in the given table * @@ -183,7 +184,8 @@ std::unique_ptr xxhash_64( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** @} */ // end of group + } // namespace hashing -/** @} */ // end of group } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/config_utils.hpp b/cpp/include/cudf/io/config_utils.hpp index 13a76d50346..070b59a117c 100644 --- a/cpp/include/cudf/io/config_utils.hpp +++ b/cpp/include/cudf/io/config_utils.hpp @@ -37,10 +37,13 @@ bool is_gds_enabled(); bool is_kvikio_enabled(); /** - * @brief Set kvikIO thread pool size according to the environment variable KVIKIO_NTHREADS. If - * KVIKIO_NTHREADS is not set, use 8 threads by default. + * @brief Set KvikIO parameters, including: + * - Compatibility mode, according to the environment variable KVIKIO_COMPAT_MODE. If + * KVIKIO_COMPAT_MODE is not set, enable it by default, which enforces the use of POSIX I/O. + * - Thread pool size, according to the environment variable KVIKIO_NTHREADS. 
If KVIKIO_NTHREADS is + * not set, use 4 threads by default. */ -void set_thread_pool_nthreads_from_env(); +void set_up_kvikio(); } // namespace cufile_integration diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 7d2cc4ad493..7bec40893fd 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -79,7 +79,7 @@ class datasource { template static std::unique_ptr create(Container&& data_owner) { - return std::make_unique>(std::move(data_owner)); + return std::make_unique>(std::forward(data_owner)); } }; @@ -335,13 +335,19 @@ class datasource { template class owning_buffer : public buffer { public: + // Require that the argument passed to the constructor be an rvalue (Container&& being an rvalue + // reference). + static_assert(std::is_rvalue_reference_v, + "The container argument passed to the constructor must be an rvalue."); + /** * @brief Moves the input container into the newly created object. * - * @param data_owner The container to construct the buffer from (ownership is transferred) + * @param moved_data_owner The container to construct the buffer from. Callers should explicitly + * pass std::move(data_owner) to this function to transfer the ownership. */ - owning_buffer(Container&& data_owner) - : _data(std::move(data_owner)), _data_ptr(_data.data()), _size(_data.size()) + owning_buffer(Container&& moved_data_owner) + : _data(std::move(moved_data_owner)), _data_ptr(_data.data()), _size(_data.size()) { } @@ -349,12 +355,13 @@ class datasource { * @brief Moves the input container into the newly created object, and exposes a subspan of the * buffer. * - * @param data_owner The container to construct the buffer from (ownership is transferred) + * @param moved_data_owner The container to construct the buffer from. Callers should explicitly + * pass std::move(data_owner) to this function to transfer the ownership. 
* @param data_ptr Pointer to the start of the subspan * @param size The size of the subspan */ - owning_buffer(Container&& data_owner, uint8_t const* data_ptr, size_t size) - : _data(std::move(data_owner)), _data_ptr(data_ptr), _size(size) + owning_buffer(Container&& moved_data_owner, uint8_t const* data_ptr, size_t size) + : _data(std::move(moved_data_owner)), _data_ptr(data_ptr), _size(size) { } diff --git a/cpp/include/cudf/io/text/detail/bgzip_utils.hpp b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp index 11eb4518210..5659f86b0c4 100644 --- a/cpp/include/cudf/io/text/detail/bgzip_utils.hpp +++ b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp @@ -16,16 +16,10 @@ #pragma once -#include #include #include -#include - -#include -#include #include -#include namespace CUDF_EXPORT cudf { namespace io::text::detail::bgzip { diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp index 385da993262..f9a68e4fffc 100644 --- a/cpp/include/cudf/partitioning.hpp +++ b/cpp/include/cudf/partitioning.hpp @@ -70,6 +70,7 @@ enum class hash_id { * @param partition_map Non-nullable column of integer values that map each row * in `t` to it's partition. 
* @param num_partitions The total number of partitions + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return Pair containing the reordered table and vector of `num_partitions + * 1` offsets to each partition such that the size of partition `i` is @@ -79,6 +80,7 @@ std::pair, std::vector> partition( table_view const& t, column_view const& partition_map, size_type num_partitions, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -242,6 +244,7 @@ std::pair, std::vector> hash_partition( * @param[in] input The input table to be round-robin partitioned * @param[in] num_partitions Number of partitions for the table * @param[in] start_partition Index of the 1st partition + * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned table's device memory * * @return A std::pair consisting of a unique_ptr to the partitioned table @@ -251,6 +254,7 @@ std::pair, std::vector> round_robi table_view const& input, cudf::size_type num_partitions, cudf::size_type start_partition = 0, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group diff --git a/cpp/include/cudf/round.hpp b/cpp/include/cudf/round.hpp index ba56ff34b97..158e6df7e5f 100644 --- a/cpp/include/cudf/round.hpp +++ b/cpp/include/cudf/round.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -66,6 +67,7 @@ enum class rounding_method : int32_t { HALF_UP, HALF_EVEN }; * @param decimal_places Number of decimal places to round to (default 0). If negative, this * specifies the number of positions to the left of the decimal point. 
* @param method Rounding method + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @return Column with each of the values rounded @@ -74,6 +76,7 @@ std::unique_ptr round( column_view const& input, int32_t decimal_places = 0, rounding_method method = rounding_method::HALF_UP, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 53e0f3a15d2..046e9745a71 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -16,8 +16,8 @@ #pragma once +#include #include -#include #include #include #include diff --git a/cpp/include/cudf/utilities/default_stream.hpp b/cpp/include/cudf/utilities/default_stream.hpp index 97a42243250..3e740b81cc9 100644 --- a/cpp/include/cudf/utilities/default_stream.hpp +++ b/cpp/include/cudf/utilities/default_stream.hpp @@ -16,10 +16,8 @@ #pragma once -#include #include -#include #include namespace CUDF_EXPORT cudf { diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 3f37ae02151..cf8413b597f 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -22,8 +22,6 @@ #include #include -#include - namespace CUDF_EXPORT cudf { /** diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index 15b5f921c1b..6351a84e38f 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -16,7 +16,6 @@ #pragma once -#include #include #include #include diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 7c909f1a948..42124461cdf 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -41,6 +41,8 @@ 
namespace CUDF_EXPORT nvtext { * * This function uses MurmurHash3_x86_32 for the hash algorithm. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if the width < 2 * * @param input Strings column to compute minhash @@ -51,7 +53,7 @@ namespace CUDF_EXPORT nvtext { * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values for each string in input */ -std::unique_ptr minhash( +[[deprecated]] std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::numeric_scalar seed = 0, cudf::size_type width = 4, @@ -71,6 +73,8 @@ std::unique_ptr minhash( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 - to be replaced in a future release + * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit @@ -83,7 +87,7 @@ std::unique_ptr minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash( +[[deprecated]] std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width = 4, @@ -102,6 +106,8 @@ std::unique_ptr minhash( * The hash function returns 2 uint64 values but only the first value * is used with the minhash calculation. 
* + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if the width < 2 * * @param input Strings column to compute minhash @@ -112,7 +118,7 @@ std::unique_ptr minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values as UINT64 for each string in input */ -std::unique_ptr minhash64( +[[deprecated]] std::unique_ptr minhash64( cudf::strings_column_view const& input, cudf::numeric_scalar seed = 0, cudf::size_type width = 4, @@ -132,6 +138,8 @@ std::unique_ptr minhash64( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 - to be replaced in a future release + * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit @@ -144,7 +152,7 @@ std::unique_ptr minhash64( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash64( +[[deprecated]] std::unique_ptr minhash64( cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width = 4, @@ -164,6 +172,8 @@ std::unique_ptr minhash64( * * Any null row entries result in corresponding null output rows. 
* + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * @@ -173,7 +183,7 @@ std::unique_ptr minhash64( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr word_minhash( +[[deprecated]] std::unique_ptr word_minhash( cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -193,6 +203,8 @@ std::unique_ptr word_minhash( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * @@ -202,7 +214,7 @@ std::unique_ptr word_minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr word_minhash64( +[[deprecated]] std::unique_ptr word_minhash64( cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream = cudf::get_default_stream(), diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp index c4210699975..4d06aa5d4bc 100644 --- a/cpp/include/nvtext/subword_tokenize.hpp +++ b/cpp/include/nvtext/subword_tokenize.hpp @@ -62,11 +62,13 @@ struct hashed_vocabulary { * @param filename_hashed_vocabulary A path to the preprocessed vocab.txt file. * Note that this is the file AFTER python/perfect_hash.py has been used * for preprocessing. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Memory resource to allocate any returned objects. 
* @return vocabulary hash-table elements */ std::unique_ptr load_vocabulary_file( std::string const& filename_hashed_vocabulary, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -147,6 +149,7 @@ struct tokenizer_result { * @param do_truncate If true, the tokenizer will discard all the token-ids after * `max_sequence_length` for each input string. If false, it will use a new row * in the output token-ids to continue generating the output. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Memory resource to allocate any returned objects. * @return token-ids, attention-mask, and metadata */ @@ -157,6 +160,7 @@ tokenizer_result subword_tokenize( uint32_t stride, bool do_lower_case, bool do_truncate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp index e61601c6fea..e345587f88b 100644 --- a/cpp/include/nvtext/tokenize.hpp +++ b/cpp/include/nvtext/tokenize.hpp @@ -292,7 +292,7 @@ std::unique_ptr load_vocabulary( * @throw cudf::logic_error if `delimiter` is invalid * * @param input Strings column to tokenize - * @param vocabulary Used to lookup tokens within + * @param vocabulary Used to lookup tokens within `input` * @param delimiter Used to identify tokens within `input` * @param default_id The token id to be used for tokens not found in the `vocabulary`; * Default is -1 diff --git a/cpp/src/ast/expression_parser.cpp b/cpp/src/ast/expression_parser.cpp index 3b650d791aa..5815ce33e33 100644 --- a/cpp/src/ast/expression_parser.cpp +++ b/cpp/src/ast/expression_parser.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,6 @@ #include #include #include -#include -#include -#include #include #include #include diff --git a/cpp/src/ast/expressions.cpp b/cpp/src/ast/expressions.cpp index b45b9d0c78c..4c2b56dd4f5 100644 --- a/cpp/src/ast/expressions.cpp +++ b/cpp/src/ast/expressions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,6 @@ #include #include #include -#include -#include -#include #include #include diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index a6c878efbbc..1b23ea12a5e 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -27,15 +27,10 @@ #include #include #include -#include #include -#include #include #include -#include -#include #include -#include #include #include diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index 482413d0ccb..972f97e8668 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -15,19 +15,13 @@ */ #include -#include #include #include #include -#include #include -#include #include -#include #include -#include - namespace cudf { namespace { struct size_of_helper { diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 386c5ebe478..e831aa9645d 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -27,9 +26,7 @@ #include #include -#include #include -#include #include namespace cudf { diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp index d60fb5ce110..5e2065ba844 100644 --- a/cpp/src/copying/copy.cpp +++ 
b/cpp/src/copying/copy.cpp @@ -20,16 +20,11 @@ #include #include #include -#include #include -#include -#include #include #include -#include - #include namespace cudf { diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index 1282eec6c44..a001807c82b 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include diff --git a/cpp/src/copying/split.cpp b/cpp/src/copying/split.cpp index 832a72ed5b0..116e3516460 100644 --- a/cpp/src/copying/split.cpp +++ b/cpp/src/copying/split.cpp @@ -14,10 +14,8 @@ * limitations under the License. */ -#include #include #include -#include #include #include diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index 2196ee97fee..f786624680c 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -13,12 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include #include #include #include #include -#include #include #include diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cu b/cpp/src/groupby/hash/compute_mapping_indices.cu new file mode 100644 index 00000000000..519d7cd2f1c --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices.cu @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "compute_mapping_indices.cuh" +#include "compute_mapping_indices.hpp" + +namespace cudf::groupby::detail::hash { +template cudf::size_type max_occupancy_grid_size>( + cudf::size_type n); + +template void compute_mapping_indices>( + cudf::size_type grid_size, + cudf::size_type num, + hash_set_ref_t global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh new file mode 100644 index 00000000000..d353830780f --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "compute_mapping_indices.hpp" +#include "helpers.cuh" + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +template +__device__ void find_local_mapping(cooperative_groups::thread_block const& block, + cudf::size_type idx, + cudf::size_type num_input_rows, + SetType shared_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* cardinality, + cudf::size_type* local_mapping_index, + cudf::size_type* shared_set_indices) +{ + auto const is_valid_input = + idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx)); + auto const [result_idx, inserted] = [&]() { + if (is_valid_input) { + auto const result = shared_set.insert_and_find(idx); + auto const matched_idx = *result.first; + auto const inserted = result.second; + // inserted a new element + if (result.second) { + auto const shared_set_index = atomicAdd(cardinality, 1); + shared_set_indices[shared_set_index] = idx; + local_mapping_index[idx] = shared_set_index; + } + return cuda::std::pair{matched_idx, inserted}; + } + return cuda::std::pair{0, false}; // dummy values + }(); + // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all + // threads in the thread block. 
+ block.sync(); + if (is_valid_input) { + // element was already in set + if (!inserted) { local_mapping_index[idx] = local_mapping_index[result_idx]; } + } +} + +template +__device__ void find_global_mapping(cooperative_groups::thread_block const& block, + cudf::size_type cardinality, + SetRef global_set, + cudf::size_type* shared_set_indices, + cudf::size_type* global_mapping_index) +{ + // for all unique keys in shared memory hash set, stores their matches in + // global hash set to `global_mapping_index` + for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { + auto const input_idx = shared_set_indices[idx]; + global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx] = + *global_set.insert_and_find(input_idx).first; + } +} + +/* + * @brief Inserts keys into the shared memory hash set, and stores the block-wise rank for a given + * row index in `local_mapping_index`. If the number of unique keys found in a threadblock exceeds + * `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without updating + * `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to the + * global hash set, and save the row index of the global sparse table in `global_mapping_index`. 
+ */ +template +CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, + SetRef global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback) +{ + __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; + + // Shared set initialization + __shared__ cuco::window windows[window_extent.value()]; + + auto raw_set = cuco::static_set_ref{ + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + global_set.key_eq(), + probing_scheme_t{global_set.hash_function()}, + cuco::thread_scope_block, + cuco::aow_storage_ref{ + window_extent, windows}}; + auto shared_set = raw_set.rebind_operators(cuco::insert_and_find); + + auto const block = cooperative_groups::this_thread_block(); + shared_set.initialize(block); + + __shared__ cudf::size_type cardinality; + if (block.thread_rank() == 0) { cardinality = 0; } + block.sync(); + + auto const stride = cudf::detail::grid_1d::grid_stride(); + + for (auto idx = cudf::detail::grid_1d::global_thread_id(); + idx - block.thread_rank() < num_input_rows; + idx += stride) { + find_local_mapping(block, + idx, + num_input_rows, + shared_set, + row_bitmask, + skip_rows_with_nulls, + &cardinality, + local_mapping_index, + shared_set_indices); + + block.sync(); + + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { + if (block.thread_rank() == 0) { needs_global_memory_fallback->test_and_set(); } + break; + } + } + + // Insert unique keys from shared to global hash set if block-cardinality + // doesn't exceed the threshold upper-limit + if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { + find_global_mapping(block, cardinality, global_set, shared_set_indices, global_mapping_index); + } + + if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } +} + +template +cudf::size_type 
max_occupancy_grid_size(cudf::size_type n) +{ + cudf::size_type max_active_blocks{-1}; + CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, mapping_indices_kernel, GROUPBY_BLOCK_SIZE, 0)); + auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); + auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); + return std::min(grid_size, num_blocks); +} + +template +void compute_mapping_indices(cudf::size_type grid_size, + cudf::size_type num, + SetRef global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback, + rmm::cuda_stream_view stream) +{ + mapping_indices_kernel<<>>( + num, + global_set, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index, + global_mapping_index, + block_cardinality, + needs_global_memory_fallback); +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.hpp b/cpp/src/groupby/hash/compute_mapping_indices.hpp new file mode 100644 index 00000000000..473ad99e650 --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices.hpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include + +#include + +namespace cudf::groupby::detail::hash { +/* + * @brief Computes the maximum number of active blocks of the given kernel that can be executed on + * the underlying device + */ +template +[[nodiscard]] cudf::size_type max_occupancy_grid_size(cudf::size_type n); + +template +void compute_mapping_indices(cudf::size_type grid_size, + cudf::size_type num, + SetRef global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices_null.cu b/cpp/src/groupby/hash/compute_mapping_indices_null.cu new file mode 100644 index 00000000000..81c3c9e456f --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices_null.cu @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "compute_mapping_indices.cuh" +#include "compute_mapping_indices.hpp" + +namespace cudf::groupby::detail::hash { +template cudf::size_type +max_occupancy_grid_size>(cudf::size_type n); + +template void compute_mapping_indices>( + cudf::size_type grid_size, + cudf::size_type num, + nullable_hash_set_ref_t global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu new file mode 100644 index 00000000000..12c02a1865e --- /dev/null +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "compute_shared_memory_aggs.hpp" +#include "global_memory_aggregator.cuh" +#include "helpers.cuh" +#include "shared_memory_aggregator.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +namespace { +/// Functor used by type dispatcher returning the size of the underlying C++ type +struct size_of_functor { + template + __device__ constexpr cudf::size_type operator()() + { + return sizeof(T); + } +}; + +/// Shared memory data alignment +CUDF_HOST_DEVICE cudf::size_type constexpr ALIGNMENT = 8; + +// Prepares shared memory data required by each output column, exits if +// there is not enough memory space to perform the shared memory aggregation for the +// current output column +__device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, + cudf::size_type& col_end, + cudf::mutable_table_device_view output_values, + cudf::size_type output_size, + cudf::size_type* shmem_agg_res_offsets, + cudf::size_type* shmem_agg_mask_offsets, + cudf::size_type cardinality, + cudf::size_type total_agg_size) +{ + col_start = col_end; + cudf::size_type bytes_allocated = 0; + + auto const valid_col_size = + cudf::util::round_up_safe(static_cast(sizeof(bool) * cardinality), ALIGNMENT); + + while (bytes_allocated < total_agg_size && col_end < output_size) { + auto const col_idx = col_end; + auto const next_col_size = + cudf::util::round_up_safe(cudf::type_dispatcher( + output_values.column(col_idx).type(), size_of_functor{}) * + cardinality, + ALIGNMENT); + auto const next_col_total_size = next_col_size + valid_col_size; + + if (bytes_allocated + next_col_total_size > total_agg_size) { + CUDF_UNREACHABLE("Not enough memory for shared memory aggregations"); + } + + shmem_agg_res_offsets[col_end] = bytes_allocated; + shmem_agg_mask_offsets[col_end] = bytes_allocated + next_col_size; + + bytes_allocated += next_col_total_size; +
++col_end; + } +} + +// Each block initializes its own shared memory aggregation results +__device__ void initialize_shmem_aggregations(cooperative_groups::thread_block const& block, + cudf::size_type col_start, + cudf::size_type col_end, + cudf::mutable_table_device_view output_values, + cuda::std::byte* shmem_agg_storage, + cudf::size_type* shmem_agg_res_offsets, + cudf::size_type* shmem_agg_mask_offsets, + cudf::size_type cardinality, + cudf::aggregation::Kind const* d_agg_kinds) +{ + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { + auto target = + reinterpret_cast(shmem_agg_storage + shmem_agg_res_offsets[col_idx]); + auto target_mask = + reinterpret_cast(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]); + cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), + d_agg_kinds[col_idx], + initialize_shmem{}, + target, + target_mask, + idx); + } + } + block.sync(); +} + +__device__ void compute_pre_aggregrations(cudf::size_type col_start, + cudf::size_type col_end, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::table_device_view source, + cudf::size_type num_input_rows, + cudf::size_type* local_mapping_index, + cuda::std::byte* shmem_agg_storage, + cudf::size_type* shmem_agg_res_offsets, + cudf::size_type* shmem_agg_mask_offsets, + cudf::aggregation::Kind const* d_agg_kinds) +{ + // Aggregates global memory sources to shared memory targets + for (auto source_idx = cudf::detail::grid_1d::global_thread_id(); source_idx < num_input_rows; + source_idx += cudf::detail::grid_1d::grid_stride()) { + if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, source_idx)) { + auto const target_idx = local_mapping_index[source_idx]; + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto const source_col = source.column(col_idx); + + cuda::std::byte* target = + reinterpret_cast(shmem_agg_storage +
shmem_agg_res_offsets[col_idx]); + bool* target_mask = + reinterpret_cast(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]); + + cudf::detail::dispatch_type_and_aggregation(source_col.type(), + d_agg_kinds[col_idx], + shmem_element_aggregator{}, + target, + target_mask, + target_idx, + source_col, + source_idx); + } + } + } +} + +__device__ void compute_final_aggregations(cooperative_groups::thread_block const& block, + cudf::size_type col_start, + cudf::size_type col_end, + cudf::table_device_view input_values, + cudf::mutable_table_device_view target, + cudf::size_type cardinality, + cudf::size_type* global_mapping_index, + cuda::std::byte* shmem_agg_storage, + cudf::size_type* agg_res_offsets, + cudf::size_type* agg_mask_offsets, + cudf::aggregation::Kind const* d_agg_kinds) +{ + // Aggregates shared memory sources to global memory targets + for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { + auto const target_idx = + global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx]; + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto target_col = target.column(col_idx); + + cuda::std::byte* source = + reinterpret_cast(shmem_agg_storage + agg_res_offsets[col_idx]); + bool* source_mask = reinterpret_cast(shmem_agg_storage + agg_mask_offsets[col_idx]); + + cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), + d_agg_kinds[col_idx], + gmem_element_aggregator{}, + target_col, + target_idx, + input_values.column(col_idx), + source, + source_mask, + idx); + } + } + block.sync(); +} + +/* Takes the local_mapping_index and global_mapping_index to compute + * pre (shared) and final (global) aggregates*/ +CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + 
cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + cudf::size_type total_agg_size, + cudf::size_type offsets_size) +{ + auto const block = cooperative_groups::this_thread_block(); + auto const cardinality = block_cardinality[block.group_index().x]; + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } + + auto const num_cols = output_values.num_columns(); + + __shared__ cudf::size_type col_start; + __shared__ cudf::size_type col_end; + extern __shared__ cuda::std::byte shmem_agg_storage[]; + + cudf::size_type* shmem_agg_res_offsets = + reinterpret_cast(shmem_agg_storage + total_agg_size); + cudf::size_type* shmem_agg_mask_offsets = + reinterpret_cast(shmem_agg_storage + total_agg_size + offsets_size); + + if (block.thread_rank() == 0) { + col_start = 0; + col_end = 0; + } + block.sync(); + + while (col_end < num_cols) { + if (block.thread_rank() == 0) { + calculate_columns_to_aggregate(col_start, + col_end, + output_values, + num_cols, + shmem_agg_res_offsets, + shmem_agg_mask_offsets, + cardinality, + total_agg_size); + } + block.sync(); + + initialize_shmem_aggregations(block, + col_start, + col_end, + output_values, + shmem_agg_storage, + shmem_agg_res_offsets, + shmem_agg_mask_offsets, + cardinality, + d_agg_kinds); + + compute_pre_aggregrations(col_start, + col_end, + row_bitmask, + skip_rows_with_nulls, + input_values, + num_rows, + local_mapping_index, + shmem_agg_storage, + shmem_agg_res_offsets, + shmem_agg_mask_offsets, + d_agg_kinds); + block.sync(); + + compute_final_aggregations(block, + col_start, + col_end, + input_values, + output_values, + cardinality, + global_mapping_index, + shmem_agg_storage, + shmem_agg_res_offsets, + shmem_agg_mask_offsets, + d_agg_kinds); + } +} +} // namespace + +std::size_t available_shared_memory_size(cudf::size_type grid_size) +{ + auto const active_blocks_per_sm = + cudf::util::div_rounding_up_safe(grid_size, 
cudf::detail::num_multiprocessors()); + + size_t dynamic_shmem_size = 0; + CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( + &dynamic_shmem_size, single_pass_shmem_aggs_kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); + return cudf::util::round_down_safe(static_cast(0.5 * dynamic_shmem_size), + ALIGNMENT); +} + +void compute_shared_memory_aggs(cudf::size_type grid_size, + std::size_t available_shmem_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + rmm::cuda_stream_view stream) +{ + // For each aggregation, need one offset determining where the aggregation is + // performed, another indicating the validity of the aggregation + auto const shmem_offsets_size = output_values.num_columns() * sizeof(cudf::size_type); + // The rest of shmem is utilized for the actual arrays in shmem + CUDF_EXPECTS(available_shmem_size > shmem_offsets_size * 2, + "No enough space for shared memory aggregations"); + auto const shmem_agg_size = available_shmem_size - shmem_offsets_size * 2; + single_pass_shmem_aggs_kernel<<>>( + num_input_rows, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index, + global_mapping_index, + block_cardinality, + input_values, + output_values, + d_agg_kinds, + shmem_agg_size, + shmem_offsets_size); +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp new file mode 100644 index 00000000000..653821fd53b --- /dev/null +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { + +std::size_t available_shared_memory_size(cudf::size_type grid_size); + +void compute_shared_memory_aggs(cudf::size_type grid_size, + std::size_t available_shmem_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + rmm::cuda_stream_view stream); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp index 2bf983e5e90..dfad51f27d4 100644 --- a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp @@ -17,7 +17,6 @@ #include #include -#include #include #include diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 0d117ca35b3..00836567b4f 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -54,15 +54,6 @@ using shmem_extent_t = CUDF_HOST_DEVICE auto constexpr window_extent = cuco::make_window_extent(shmem_extent_t{}); -/** - * @brief Returns the smallest multiple of 8 that is greater 
than or equal to the given integer. - */ -CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num) -{ - std::size_t constexpr base = 8; - return cudf::util::div_rounding_up_safe(num, base) * base; -} - using row_hash_t = cudf::experimental::row::hash::device_row_hasher; diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 73791b3aa71..28a5b578e00 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -23,6 +23,87 @@ #include namespace cudf::groupby::detail::hash { +// TODO: TO BE REMOVED issue tracked via #17171 +template +__device__ constexpr bool is_supported() +{ + return cudf::is_fixed_width() and + ((k == cudf::aggregation::SUM) or (k == cudf::aggregation::SUM_OF_SQUARES) or + (k == cudf::aggregation::MIN) or (k == cudf::aggregation::MAX) or + (k == cudf::aggregation::COUNT_VALID) or (k == cudf::aggregation::COUNT_ALL) or + (k == cudf::aggregation::ARGMIN) or (k == cudf::aggregation::ARGMAX) or + (k == cudf::aggregation::STD) or (k == cudf::aggregation::VARIANCE) or + (k == cudf::aggregation::PRODUCT) and cudf::detail::is_product_supported()); +} + +template +__device__ std::enable_if_t, void>, T> +identity_from_operator() +{ + using DeviceType = cudf::device_storage_type_t; + return cudf::detail::corresponding_operator_t::template identity(); +} + +template +__device__ std::enable_if_t, void>, T> +identity_from_operator() +{ + CUDF_UNREACHABLE("Unable to get identity/sentinel from device operator"); +} + +template +__device__ T get_identity() +{ + if ((k == cudf::aggregation::ARGMAX) or (k == cudf::aggregation::ARGMIN)) { + if constexpr (cudf::is_timestamp()) { + return k == cudf::aggregation::ARGMAX + ? T{typename T::duration(cudf::detail::ARGMAX_SENTINEL)} + : T{typename T::duration(cudf::detail::ARGMIN_SENTINEL)}; + } else { + using DeviceType = cudf::device_storage_type_t; + return k == cudf::aggregation::ARGMAX + ? 
static_cast(cudf::detail::ARGMAX_SENTINEL) + : static_cast(cudf::detail::ARGMIN_SENTINEL); + } + } + return identity_from_operator(); +} + +template +struct initialize_target_element { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type idx) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct initialize_target_element()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type idx) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + DeviceType* target_casted = reinterpret_cast(target); + + target_casted[idx] = get_identity(); + + target_mask[idx] = (k == cudf::aggregation::COUNT_ALL) or (k == cudf::aggregation::COUNT_VALID); + } +}; + +struct initialize_shmem { + template + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type idx) const noexcept + { + initialize_target_element{}(target, target_mask, idx); + } +}; + /** * @brief Computes single-pass aggregations and store results into a sparse `output_values` table, * and populate `set` with indices of unique keys diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index a9085a1f1fd..3041e261945 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu index c7bfd4aecf4..a0c51940c87 100644 --- a/cpp/src/hash/md5_hash.cu +++ b/cpp/src/hash/md5_hash.cu @@ -302,7 +302,8 @@ std::unique_ptr md5(table_view const& input, } return md5_leaf_type_check(col.type()); }), - "Unsupported column type for hash function."); + "Unsupported column type for hash function.", + cudf::data_type_error); // Digest size in bytes auto constexpr digest_size = 32; diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh index 
ebaec8e2775..eb002cf9c6f 100644 --- a/cpp/src/hash/sha_hash.cuh +++ b/cpp/src/hash/sha_hash.cuh @@ -513,7 +513,8 @@ std::unique_ptr sha_hash(table_view const& input, CUDF_EXPECTS( std::all_of( input.begin(), input.end(), [](auto const& col) { return sha_leaf_type_check(col.type()); }), - "Unsupported column type for hash function."); + "Unsupported column type for hash function.", + cudf::data_type_error); // Result column allocation and creation auto begin = thrust::make_constant_iterator(Hasher::digest_size); diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp index a99262fb3bf..c69ebe12d2c 100644 --- a/cpp/src/interop/arrow_utilities.cpp +++ b/cpp/src/interop/arrow_utilities.cpp @@ -20,11 +20,6 @@ #include #include -#include - -#include -#include - #include namespace cudf { diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp index 1b79fbf9eda..e4bdedf6603 100644 --- a/cpp/src/interop/arrow_utilities.hpp +++ b/cpp/src/interop/arrow_utilities.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index a1be6aade4e..4395b741e53 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -16,11 +16,8 @@ #include #include #include -#include -#include #include #include -#include #include #include #include diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index d5caa4720ac..b3fcca62314 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -17,7 +17,6 @@ #include "avro.hpp" #include -#include #include namespace cudf { diff --git a/cpp/src/io/avro/avro.hpp b/cpp/src/io/avro/avro.hpp index 2e992546ccc..fd2c781b8a1 100644 --- a/cpp/src/io/avro/avro.hpp +++ b/cpp/src/io/avro/avro.hpp @@ -18,11 +18,9 @@ #include "avro_common.hpp" -#include #include #include #include -#include #include #include #include diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp 
b/cpp/src/io/comp/nvcomp_adapter.hpp index 583bd6a3523..2e1cda2d6b7 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -18,9 +18,7 @@ #include "gpuinflate.hpp" -#include #include -#include #include #include diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index d4d6f46b99a..fb8c308065d 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -24,8 +24,6 @@ #include #include -#include - #include // uncompress #include // memset diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index a8682e6a760..ceaeb5d8f85 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -32,10 +32,8 @@ #include #include #include -#include #include #include -#include #include #include diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index e024ac4c503..ecbe1e6c89b 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 1ef7c5f6a08..29ea6e45db6 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -19,10 +19,7 @@ #include #include #include -#include -#include #include -#include #include #include diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 8a740ae17ef..2bc15ea19cb 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -127,7 +128,8 @@ datasource::owning_buffer get_record_range_raw_input( std::size_t const total_source_size = sources_size(sources, 0, 0); auto constexpr num_delimiter_chars = 1; - auto const num_extra_delimiters = num_delimiter_chars * (sources.size() - 1); + auto const delimiter = reader_opts.get_delimiter(); + auto const num_extra_delimiters = 
num_delimiter_chars * sources.size(); compression_type const reader_compression = reader_opts.get_compression(); std::size_t const chunk_offset = reader_opts.get_byte_range_offset(); std::size_t chunk_size = reader_opts.get_byte_range_size(); @@ -135,10 +137,10 @@ datasource::owning_buffer get_record_range_raw_input( CUDF_EXPECTS(total_source_size ? chunk_offset < total_source_size : !chunk_offset, "Invalid offsetting", std::invalid_argument); - auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset; - chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size; + auto should_load_till_last_source = !chunk_size || chunk_size >= total_source_size - chunk_offset; + chunk_size = should_load_till_last_source ? total_source_size - chunk_offset : chunk_size; - int num_subchunks_prealloced = should_load_all_sources ? 0 : max_subchunks_prealloced; + int num_subchunks_prealloced = should_load_till_last_source ? 0 : max_subchunks_prealloced; std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); // The allocation for single source compressed input is estimated by assuming a ~4:1 @@ -155,17 +157,17 @@ datasource::owning_buffer get_record_range_raw_input( // Offset within buffer indicating first read position std::int64_t buffer_offset = 0; - auto readbufspan = - ingest_raw_input(bufspan, sources, reader_compression, chunk_offset, chunk_size, stream); + auto readbufspan = ingest_raw_input( + bufspan, sources, reader_compression, chunk_offset, chunk_size, delimiter, stream); auto const shift_for_nonzero_offset = std::min(chunk_offset, 1); auto const first_delim_pos = - chunk_offset == 0 ? 0 : find_first_delimiter(readbufspan, '\n', stream); + chunk_offset == 0 ? 
0 : find_first_delimiter(readbufspan, delimiter, stream); if (first_delim_pos == -1) { // return empty owning datasource buffer auto empty_buf = rmm::device_buffer(0, stream); return datasource::owning_buffer(std::move(empty_buf)); - } else if (!should_load_all_sources) { + } else if (!should_load_till_last_source) { // Find next delimiter std::int64_t next_delim_pos = -1; std::size_t next_subchunk_start = chunk_offset + chunk_size; @@ -180,14 +182,15 @@ datasource::owning_buffer get_record_range_raw_input( reader_compression, next_subchunk_start, size_per_subchunk, + delimiter, stream); - next_delim_pos = find_first_delimiter(readbufspan, '\n', stream) + buffer_offset; + next_delim_pos = find_first_delimiter(readbufspan, delimiter, stream) + buffer_offset; next_subchunk_start += size_per_subchunk; } if (next_delim_pos < buffer_offset) { if (next_subchunk_start >= total_source_size) { // If we have reached the end of source list but the source does not terminate with a - // newline character + // delimiter character next_delim_pos = buffer_offset + readbufspan.size(); } else { // Our buffer_size estimate is insufficient to read until the end of the line! We need to @@ -209,10 +212,26 @@ datasource::owning_buffer get_record_range_raw_input( reinterpret_cast(buffer.data()) + first_delim_pos + shift_for_nonzero_offset, next_delim_pos - first_delim_pos - shift_for_nonzero_offset); } + + // Add delimiter to end of buffer - possibly adding an empty line to the input buffer - iff we are + // reading till the end of the last source i.e. should_load_till_last_source is true Note that the + // table generated from the JSONL input remains unchanged since empty lines are ignored by the + // parser. 
+ size_t num_chars = readbufspan.size() - first_delim_pos - shift_for_nonzero_offset; + if (num_chars) { + auto last_char = delimiter; + cudf::detail::cuda_memcpy_async( + device_span(reinterpret_cast(buffer.data()), buffer.size()) + .subspan(readbufspan.size(), 1), + host_span(&last_char, 1, false), + stream); + num_chars++; + } + return datasource::owning_buffer( std::move(buffer), reinterpret_cast(buffer.data()) + first_delim_pos + shift_for_nonzero_offset, - readbufspan.size() - first_delim_pos - shift_for_nonzero_offset); + num_chars); } // Helper function to read the current batch using byte range offsets and size @@ -245,6 +264,7 @@ device_span ingest_raw_input(device_span buffer, compression_type compression, std::size_t range_offset, std::size_t range_size, + char delimiter, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); @@ -296,7 +316,7 @@ device_span ingest_raw_input(device_span buffer, if (sources.size() > 1) { static_assert(num_delimiter_chars == 1, "Currently only single-character delimiters are supported"); - auto const delimiter_source = thrust::make_constant_iterator('\n'); + auto const delimiter_source = thrust::make_constant_iterator(delimiter); auto const d_delimiter_map = cudf::detail::make_device_uvector_async( delimiter_map, stream, cudf::get_current_device_resource_ref()); thrust::scatter(rmm::exec_policy_nosync(stream), diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp index 982190eecb5..4def69cc629 100644 --- a/cpp/src/io/json/read_json.hpp +++ b/cpp/src/io/json/read_json.hpp @@ -56,6 +56,7 @@ device_span ingest_raw_input(device_span buffer, compression_type compression, size_t range_offset, size_t range_size, + char delimiter, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index c42348a165f..0081ed30d17 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -23,6 +23,7 @@ #include #include 
+#include #include #include #include diff --git a/cpp/src/io/orc/reader_impl_helpers.cpp b/cpp/src/io/orc/reader_impl_helpers.cpp index 4c1079cffe8..7e5db4b7617 100644 --- a/cpp/src/io/orc/reader_impl_helpers.cpp +++ b/cpp/src/io/orc/reader_impl_helpers.cpp @@ -16,8 +16,6 @@ #include "reader_impl_helpers.hpp" -#include - namespace cudf::io::orc::detail { std::unique_ptr create_empty_column(size_type orc_col_id, diff --git a/cpp/src/io/orc/reader_impl_helpers.hpp b/cpp/src/io/orc/reader_impl_helpers.hpp index 5528b2ee763..4cded30d89b 100644 --- a/cpp/src/io/orc/reader_impl_helpers.hpp +++ b/cpp/src/io/orc/reader_impl_helpers.hpp @@ -20,9 +20,6 @@ #include "io/orc/orc.hpp" #include "io/utilities/column_buffer.hpp" -#include -#include - #include #include diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index ddf65e9020f..d15435b2553 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -27,7 +27,6 @@ #include "ipc/Schema_generated.h" #include "writer_impl_helpers.hpp" -#include #include #include diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 9bc435bf6c8..66810ee163a 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -22,10 +22,9 @@ #pragma once #include -#include -#include +#include +#include #include -#include namespace cudf::io::parquet::detail { diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index 12c24e2b848..b87f2e9c692 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -22,10 +22,7 @@ #include #include -#include -#include #include -#include namespace CUDF_EXPORT cudf { namespace io::parquet::detail { diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index 
d4778b1ea15..05859d60c03 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -17,7 +17,6 @@ #pragma once #include "parquet.hpp" -#include "parquet_common.hpp" #include #include diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 4522ea7fe56..45380e6ea20 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -37,7 +37,14 @@ struct block_scan_results { }; template -static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_results& results) +using block_scan_temp_storage = int[decode_block_size / cudf::detail::warp_size]; + +// Similar to CUB, must __syncthreads() after calling if reusing temp_storage +template +__device__ inline static void scan_block_exclusive_sum( + int thread_bit, + block_scan_results& results, + block_scan_temp_storage& temp_storage) { int const t = threadIdx.x; int const warp_index = t / cudf::detail::warp_size; @@ -45,15 +52,19 @@ static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_resul uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; uint32_t warp_bits = ballot(thread_bit); - scan_block_exclusive_sum(warp_bits, warp_lane, warp_index, lane_mask, results); + scan_block_exclusive_sum( + warp_bits, warp_lane, warp_index, lane_mask, results, temp_storage); } +// Similar to CUB, must __syncthreads() after calling if reusing temp_storage template -__device__ static void scan_block_exclusive_sum(uint32_t warp_bits, - int warp_lane, - int warp_index, - uint32_t lane_mask, - block_scan_results& results) +__device__ static void scan_block_exclusive_sum( + uint32_t warp_bits, + int warp_lane, + int warp_index, + uint32_t lane_mask, + block_scan_results& results, + block_scan_temp_storage& temp_storage) { // Compute # warps constexpr int num_warps = decode_block_size / cudf::detail::warp_size; @@ -64,49 +75,64 @@ __device__ static void scan_block_exclusive_sum(uint32_t 
warp_bits, results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); // Share the warp counts amongst the block threads - __shared__ int warp_counts[num_warps]; - if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; } - __syncthreads(); + if (warp_lane == 0) { temp_storage[warp_index] = results.warp_count; } + __syncthreads(); // Sync to share counts between threads/warps // Compute block-wide results results.block_count = 0; results.thread_count_within_block = results.thread_count_within_warp; for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { - results.block_count += warp_counts[warp_idx]; - if (warp_idx < warp_index) { results.thread_count_within_block += warp_counts[warp_idx]; } + results.block_count += temp_storage[warp_idx]; + if (warp_idx < warp_index) { results.thread_count_within_block += temp_storage[warp_idx]; } } } -template -__device__ inline void gpuDecodeFixedWidthValues( +template +__device__ void gpuDecodeFixedWidthValues( page_state_s* s, state_buf* const sb, int start, int end, int t) { constexpr int num_warps = block_size / cudf::detail::warp_size; constexpr int max_batch_size = num_warps * cudf::detail::warp_size; - PageNestingDecodeInfo* nesting_info_base = s->nesting_info; - int const dtype = s->col.physical_type; + // nesting level that is storing actual leaf values + int const leaf_level_index = s->col.max_nesting_depth - 1; + auto const data_out = s->nesting_info[leaf_level_index].data_out; + + int const dtype = s->col.physical_type; + uint32_t const dtype_len = s->dtype_len; + + int const skipped_leaf_values = s->page.skipped_leaf_values; // decode values int pos = start; while (pos < end) { int const batch_size = min(max_batch_size, end - pos); - int const target_pos = pos + batch_size; - int const src_pos = pos + t; + int const thread_pos = pos + t; - // the position in the output column/buffer - int dst_pos = sb->nz_idx[rolling_index(src_pos)] - s->first_row; + // Index from value buffer (doesn't 
include nulls) to final array (has gaps for nulls) + int const dst_pos = [&]() { + int dst_pos = sb->nz_idx[rolling_index(thread_pos)]; + if constexpr (!has_lists_t) { dst_pos -= s->first_row; } + return dst_pos; + }(); // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. - if (src_pos < target_pos && dst_pos >= 0) { + if (thread_pos < target_pos && dst_pos >= 0) { // nesting level that is storing actual leaf values - int const leaf_level_index = s->col.max_nesting_depth - 1; - uint32_t dtype_len = s->dtype_len; - void* dst = - nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; + // src_pos represents the logical row position we want to read from. But in the case of + // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos + // has to take into account the # of values we have to skip in the page to get to the + // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. 
+ int const src_pos = [&]() { + if constexpr (has_lists_t) { return thread_pos + skipped_leaf_values; } + return thread_pos; + }(); + + void* const dst = data_out + (static_cast(dst_pos) * dtype_len); + if (s->col.logical_type.has_value() && s->col.logical_type->type == LogicalType::DECIMAL) { switch (dtype) { case INT32: gpuOutputFast(s, sb, src_pos, static_cast(dst)); break; @@ -145,15 +171,15 @@ __device__ inline void gpuDecodeFixedWidthValues( } } -template +template struct decode_fixed_width_values_func { __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t) { - gpuDecodeFixedWidthValues(s, sb, start, end, t); + gpuDecodeFixedWidthValues(s, sb, start, end, t); } }; -template +template __device__ inline void gpuDecodeFixedWidthSplitValues( page_state_s* s, state_buf* const sb, int start, int end, int t) { @@ -161,10 +187,15 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( constexpr int num_warps = block_size / warp_size; constexpr int max_batch_size = num_warps * warp_size; - PageNestingDecodeInfo* nesting_info_base = s->nesting_info; - int const dtype = s->col.physical_type; - auto const data_len = thrust::distance(s->data_start, s->data_end); - auto const num_values = data_len / s->dtype_len_in; + // nesting level that is storing actual leaf values + int const leaf_level_index = s->col.max_nesting_depth - 1; + auto const data_out = s->nesting_info[leaf_level_index].data_out; + + int const dtype = s->col.physical_type; + auto const data_len = thrust::distance(s->data_start, s->data_end); + auto const num_values = data_len / s->dtype_len_in; + + int const skipped_leaf_values = s->page.skipped_leaf_values; // decode values int pos = start; @@ -172,21 +203,34 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( int const batch_size = min(max_batch_size, end - pos); int const target_pos = pos + batch_size; - int const src_pos = pos + t; + int const thread_pos = pos + t; // the position in the output 
column/buffer - int dst_pos = sb->nz_idx[rolling_index(src_pos)] - s->first_row; + // Index from value buffer (doesn't include nulls) to final array (has gaps for nulls) + int const dst_pos = [&]() { + int dst_pos = sb->nz_idx[rolling_index(thread_pos)]; + if constexpr (!has_lists_t) { dst_pos -= s->first_row; } + return dst_pos; + }(); // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. - if (src_pos < target_pos && dst_pos >= 0) { - // nesting level that is storing actual leaf values - int const leaf_level_index = s->col.max_nesting_depth - 1; + if (thread_pos < target_pos && dst_pos >= 0) { + // src_pos represents the logical row position we want to read from. But in the case of + // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos + // has to take into account the # of values we have to skip in the page to get to the + // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. 
+ int const src_pos = [&]() { + if constexpr (has_lists_t) { + return thread_pos + skipped_leaf_values; + } else { + return thread_pos; + } + }(); - uint32_t dtype_len = s->dtype_len; - uint8_t const* src = s->data_start + src_pos; - uint8_t* dst = - nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; + uint32_t const dtype_len = s->dtype_len; + uint8_t const* const src = s->data_start + src_pos; + uint8_t* const dst = data_out + static_cast(dst_pos) * dtype_len; auto const is_decimal = s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL; @@ -239,11 +283,11 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( } } -template +template struct decode_fixed_width_split_values_func { __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t) { - gpuDecodeFixedWidthSplitValues(s, sb, start, end, t); + gpuDecodeFixedWidthSplitValues(s, sb, start, end, t); } }; @@ -274,12 +318,14 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const batch_size = min(max_batch_size, capped_target_value_count - value_count); // definition level - int d = 1; - if (t >= batch_size) { - d = -1; - } else if (def) { - d = static_cast(def[rolling_index(value_count + t)]); - } + int const d = [&]() { + if (t >= batch_size) { + return -1; + } else if (def) { + return static_cast(def[rolling_index(value_count + t)]); + } + return 1; + }(); int const thread_value_count = t; int const block_value_count = batch_size; @@ -340,6 +386,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( if (is_valid) { int const dst_pos = value_count + thread_value_count; int const src_pos = max_depth_valid_count + thread_valid_count; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; } // update stuff @@ -396,16 +443,16 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); // use definition 
level & row bounds to determine if is valid - int is_valid; - if (t >= batch_size) { - is_valid = 0; - } else if (def) { - int const def_level = - static_cast(def[rolling_index(value_count + t)]); - is_valid = ((def_level > 0) && in_row_bounds) ? 1 : 0; - } else { - is_valid = in_row_bounds; - } + int const is_valid = [&]() { + if (t >= batch_size) { + return 0; + } else if (def) { + int const def_level = + static_cast(def[rolling_index(value_count + t)]); + return ((def_level > 0) && in_row_bounds) ? 1 : 0; + } + return in_row_bounds; + }(); // thread and block validity count using block_scan = cub::BlockScan; @@ -447,8 +494,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( // output offset if (is_valid) { - int const dst_pos = value_count + thread_value_count; - int const src_pos = valid_count + thread_valid_count; + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; } @@ -460,7 +508,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( if (t == 0) { // update valid value count for decoding and total # of values we've processed ni.valid_count = valid_count; - ni.value_count = value_count; // TODO: remove? 
this is unused in the non-list path + ni.value_count = value_count; s->nz_count = valid_count; s->input_value_count = value_count; s->input_row_count = value_count; @@ -533,6 +581,239 @@ static __device__ int gpuUpdateValidityAndRowIndicesNonNullable(int32_t target_v return valid_count; } +template +static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_count, + page_state_s* s, + state_buf* sb, + level_t const* const def, + level_t const* const rep, + int t) +{ + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + constexpr int max_batch_size = num_warps * cudf::detail::warp_size; + + // how many (input) values we've processed in the page so far, prior to this loop iteration + int value_count = s->input_value_count; + + // how many rows we've processed in the page so far + int input_row_count = s->input_row_count; + + // cap by last row so that we don't process any rows past what we want to output. + int const first_row = s->first_row; + int const last_row = first_row + s->num_rows; + + int const row_index_lower_bound = s->row_index_lower_bound; + int const max_depth = s->col.max_nesting_depth - 1; + int max_depth_valid_count = s->nesting_info[max_depth].valid_count; + + int const warp_index = t / cudf::detail::warp_size; + int const warp_lane = t % cudf::detail::warp_size; + bool const is_first_lane = (warp_lane == 0); + + __syncthreads(); + __shared__ block_scan_temp_storage temp_storage; + + while (value_count < target_value_count) { + bool const within_batch = value_count + t < target_value_count; + + // get definition level, use repetition level to get start/end depth + // different for each thread, as each thread has a different r/d + auto const [def_level, start_depth, end_depth] = [&]() { + if (!within_batch) { return cuda::std::make_tuple(-1, -1, -1); } + + int const level_index = rolling_index(value_count + t); + int const rep_level = static_cast(rep[level_index]); + int const start_depth = 
s->nesting_info[rep_level].start_depth; + + if constexpr (!nullable) { + return cuda::std::make_tuple(-1, start_depth, max_depth); + } else { + if (def != nullptr) { + int const def_level = static_cast(def[level_index]); + return cuda::std::make_tuple( + def_level, start_depth, s->nesting_info[def_level].end_depth); + } else { + return cuda::std::make_tuple(1, start_depth, max_depth); + } + } + }(); + + // Determine value count & row index + // track (page-relative) row index for the thread so we can compare against input bounds + // keep track of overall # of rows we've read. + int const is_new_row = start_depth == 0 ? 1 : 0; + int num_prior_new_rows, total_num_new_rows; + { + block_scan_results new_row_scan_results; + scan_block_exclusive_sum(is_new_row, new_row_scan_results, temp_storage); + __syncthreads(); + num_prior_new_rows = new_row_scan_results.thread_count_within_block; + total_num_new_rows = new_row_scan_results.block_count; + } + + int const row_index = input_row_count + ((num_prior_new_rows + is_new_row) - 1); + input_row_count += total_num_new_rows; + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + // VALUE COUNT: + // in_nesting_bounds: if at a nesting level where we need to add value indices + // the bounds: from current rep to the rep AT the def depth + int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 
1 : 0; + int thread_value_count_within_warp, warp_value_count, thread_value_count, block_value_count; + { + block_scan_results value_count_scan_results; + scan_block_exclusive_sum( + in_nesting_bounds, value_count_scan_results, temp_storage); + __syncthreads(); + + thread_value_count_within_warp = value_count_scan_results.thread_count_within_warp; + warp_value_count = value_count_scan_results.warp_count; + thread_value_count = value_count_scan_results.thread_count_within_block; + block_value_count = value_count_scan_results.block_count; + } + + // iterate by depth + for (int d_idx = 0; d_idx <= max_depth; d_idx++) { + auto& ni = s->nesting_info[d_idx]; + + // everything up to the max_def_level is a non-null value + int const is_valid = [&](int input_def_level) { + if constexpr (nullable) { + return ((input_def_level >= ni.max_def_level) && in_nesting_bounds) ? 1 : 0; + } else { + return in_nesting_bounds; + } + }(def_level); + + // VALID COUNT: + // Not all values visited by this block will represent a value at this nesting level. + // the validity bit for thread t might actually represent output value t-6. + // the correct position for thread t's bit is thread_value_count. + uint32_t const warp_valid_mask = + WarpReduceOr32((uint32_t)is_valid << thread_value_count_within_warp); + int thread_valid_count, block_valid_count; + { + auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1; + + block_scan_results valid_count_scan_results; + scan_block_exclusive_sum(warp_valid_mask, + warp_lane, + warp_index, + thread_mask, + valid_count_scan_results, + temp_storage); + __syncthreads(); + thread_valid_count = valid_count_scan_results.thread_count_within_block; + block_valid_count = valid_count_scan_results.block_count; + } + + // compute warp and thread value counts for the -next- nesting level. we need to + // do this for lists so that we can emit an offset for the -current- nesting level. 
+ // the offset for the current nesting level == current length of the next nesting level + int next_thread_value_count_within_warp = 0, next_warp_value_count = 0; + int next_thread_value_count = 0, next_block_value_count = 0; + int next_in_nesting_bounds = 0; + if (d_idx < max_depth) { + // NEXT DEPTH VALUE COUNT: + next_in_nesting_bounds = + ((d_idx + 1 >= start_depth) && (d_idx + 1 <= end_depth) && in_row_bounds) ? 1 : 0; + { + block_scan_results next_value_count_scan_results; + scan_block_exclusive_sum( + next_in_nesting_bounds, next_value_count_scan_results, temp_storage); + __syncthreads(); + + next_thread_value_count_within_warp = + next_value_count_scan_results.thread_count_within_warp; + next_warp_value_count = next_value_count_scan_results.warp_count; + next_thread_value_count = next_value_count_scan_results.thread_count_within_block; + next_block_value_count = next_value_count_scan_results.block_count; + } + + // STORE OFFSET TO THE LIST LOCATION + // if we're -not- at a leaf column and we're within nesting/row bounds + // and we have a valid data_out pointer, it implies this is a list column, so + // emit an offset. + if (in_nesting_bounds && ni.data_out != nullptr) { + const auto& next_ni = s->nesting_info[d_idx + 1]; + int const idx = ni.value_count + thread_value_count; + cudf::size_type const ofs = + next_ni.value_count + next_thread_value_count + next_ni.page_start_value; + + (reinterpret_cast(ni.data_out))[idx] = ofs; + } + } + + // validity is processed per-warp (on lane 0's) + // thi is because when atomic writes are needed, they are 32-bit operations + // + // lists always read and write to the same bounds + // (that is, read and write positions are already pre-bounded by first_row/num_rows). + // since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. 
+ if constexpr (nullable) { + if (is_first_lane && (ni.valid_map != nullptr) && (warp_value_count > 0)) { + // absolute bit offset into the output validity map + // is cumulative sum of warp_value_count at the given nesting depth + // DON'T subtract by first_row: since it's lists it's not 1-row-per-value + int const bit_offset = ni.valid_map_offset + thread_value_count; + + store_validity(bit_offset, ni.valid_map, warp_valid_mask, warp_value_count); + } + + if (t == 0) { ni.null_count += block_value_count - block_valid_count; } + } + + // if this is valid and we're at the leaf, output dst_pos + // Read value_count before the sync, so that when thread 0 modifies it we've already read its + // value + int const current_value_count = ni.value_count; + __syncthreads(); // guard against modification of ni.value_count below + if (d_idx == max_depth) { + if (is_valid) { + int const dst_pos = current_value_count + thread_value_count; + int const src_pos = max_depth_valid_count + thread_valid_count; + int const output_index = rolling_index(src_pos); + + // Index from rolling buffer of values (which doesn't include nulls) to final array (which + // includes gaps for nulls) + sb->nz_idx[output_index] = dst_pos; + } + max_depth_valid_count += block_valid_count; + } + + // update stuff + if (t == 0) { + ni.value_count += block_value_count; + ni.valid_map_offset += block_value_count; + } + __syncthreads(); // sync modification of ni.value_count + + // propagate value counts for the next depth level + block_value_count = next_block_value_count; + thread_value_count = next_thread_value_count; + in_nesting_bounds = next_in_nesting_bounds; + warp_value_count = next_warp_value_count; + thread_value_count_within_warp = next_thread_value_count_within_warp; + } // END OF DEPTH LOOP + + int const batch_size = min(max_batch_size, target_value_count - value_count); + value_count += batch_size; + } + + if (t == 0) { + // update valid value count for decoding and total # of values we've 
processed + s->nesting_info[max_depth].valid_count = max_depth_valid_count; + s->nz_count = max_depth_valid_count; + s->input_value_count = value_count; + + // If we have lists # rows != # values + s->input_row_count = input_row_count; + } + + return max_depth_valid_count; +} + // is the page marked nullable or not __device__ inline bool is_nullable(page_state_s* s) { @@ -560,6 +841,23 @@ __device__ inline bool maybe_has_nulls(page_state_s* s) return run_val != s->col.max_level[lvl]; } +template +__device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t) +{ + // it could be that (e.g.) we skip 5000 but starting at row 4000 we have a run of length 2000: + // in that case skip_decode() only skips 4000, and we have to process the remaining 1000 up front + // modulo 2 * block_size of course, since that's as many as we process at once + int num_skipped = parquet_stream.skip_decode(t, num_to_skip); + while (num_skipped < num_to_skip) { + // TODO: Instead of decoding, skip within the run to the appropriate location + auto const to_decode = min(rolling_buf_size, num_to_skip - num_skipped); + num_skipped += parquet_stream.decode_next(t, to_decode); + __syncthreads(); + } + + return num_skipped; +} + /** * @brief Kernel for computing fixed width non dictionary column data stored in the pages * @@ -579,9 +877,10 @@ template + bool has_lists_t, + template typename DecodeValuesFunc> -CUDF_KERNEL void __launch_bounds__(decode_block_size_t) +CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) gpuDecodePageDataGeneric(PageInfo* pages, device_span chunks, size_t min_row, @@ -621,31 +920,29 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. 
if (s->num_rows == 0) { return; } - DecodeValuesFunc decode_values; + DecodeValuesFunc decode_values; - bool const nullable = is_nullable(s); - bool const should_process_nulls = nullable && maybe_has_nulls(s); + bool const should_process_nulls = is_nullable(s) && maybe_has_nulls(s); // shared buffer. all shared memory is suballocated out of here - // constexpr int shared_rep_size = has_lists_t ? cudf::util::round_up_unsafe(rle_run_buffer_size * - // sizeof(rle_run), size_t{16}) : 0; + constexpr int shared_rep_size = + has_lists_t + ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) + : 0; constexpr int shared_dict_size = has_dict_t ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) : 0; constexpr int shared_def_size = cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}); - constexpr int shared_buf_size = /*shared_rep_size +*/ shared_dict_size + shared_def_size; + constexpr int shared_buf_size = shared_rep_size + shared_dict_size + shared_def_size; __shared__ __align__(16) uint8_t shared_buf[shared_buf_size]; // setup all shared memory buffers - int shared_offset = 0; - /* - rle_run *rep_runs = reinterpret_cast*>(shared_buf + shared_offset); - if constexpr (has_lists_t){ - shared_offset += shared_rep_size; - } - */ + int shared_offset = 0; + rle_run* rep_runs = reinterpret_cast*>(shared_buf + shared_offset); + if constexpr (has_lists_t) { shared_offset += shared_rep_size; } + rle_run* dict_runs = reinterpret_cast*>(shared_buf + shared_offset); if constexpr (has_dict_t) { shared_offset += shared_dict_size; } rle_run* def_runs = reinterpret_cast*>(shared_buf + shared_offset); @@ -660,38 +957,51 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) def, s->page.num_input_values); } - /* + rle_stream rep_decoder{rep_runs}; level_t* const rep = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); - if constexpr(has_lists_t){ + if constexpr (has_lists_t) { 
rep_decoder.init(s->col.level_bits[level_type::REPETITION], s->abs_lvl_start[level_type::REPETITION], s->abs_lvl_end[level_type::REPETITION], rep, s->page.num_input_values); } - */ rle_stream dict_stream{dict_runs}; if constexpr (has_dict_t) { dict_stream.init( s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); } - __syncthreads(); // We use two counters in the loop below: processed_count and valid_count. - // - processed_count: number of rows out of num_input_values that we have decoded so far. + // - processed_count: number of values out of num_input_values that we have decoded so far. // the definition stream returns the number of total rows it has processed in each call // to decode_next and we accumulate in process_count. - // - valid_count: number of non-null rows we have decoded so far. In each iteration of the + // - valid_count: number of non-null values we have decoded so far. In each iteration of the // loop below, we look at the number of valid items (which could be all for non-nullable), // and valid_count is that running count. int processed_count = 0; int valid_count = 0; + + // Skip ahead in the decoding so that we don't repeat work (skipped_leaf_values = 0 for non-lists) + if constexpr (has_lists_t) { + auto const skipped_leaf_values = s->page.skipped_leaf_values; + if (skipped_leaf_values > 0) { + if (should_process_nulls) { + skip_decode(def_decoder, skipped_leaf_values, t); + } + processed_count = skip_decode(rep_decoder, skipped_leaf_values, t); + if constexpr (has_dict_t) { + skip_decode(dict_stream, skipped_leaf_values, t); + } + } + } + // the core loop. 
decode batches of level stream data using rle_stream objects // and pass the results to gpuDecodeValues // For chunked reads we may not process all of the rows on the page; if not stop early - int last_row = s->first_row + s->num_rows; + int const last_row = s->first_row + s->num_rows; while ((s->error == 0) && (processed_count < s->page.num_input_values) && (s->input_row_count <= last_row)) { int next_valid_count; @@ -701,7 +1011,12 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) processed_count += def_decoder.decode_next(t); __syncthreads(); - if constexpr (has_nesting_t) { + if constexpr (has_lists_t) { + rep_decoder.decode_next(t); + __syncthreads(); + next_valid_count = gpuUpdateValidityAndRowIndicesLists( + processed_count, s, sb, def, rep, t); + } else if constexpr (has_nesting_t) { next_valid_count = gpuUpdateValidityAndRowIndicesNested( processed_count, s, sb, def, t); } else { @@ -713,9 +1028,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // this function call entirely since all it will ever generate is a mapping of (i -> i) for // nz_idx. gpuDecodeFixedWidthValues would be the only work that happens. 
else { - processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); - next_valid_count = - gpuUpdateValidityAndRowIndicesNonNullable(processed_count, s, sb, t); + if constexpr (has_lists_t) { + processed_count += rep_decoder.decode_next(t); + __syncthreads(); + next_valid_count = gpuUpdateValidityAndRowIndicesLists( + processed_count, s, sb, nullptr, rep, t); + } else { + processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); + next_valid_count = + gpuUpdateValidityAndRowIndicesNonNullable(processed_count, s, sb, t); + } } __syncthreads(); @@ -745,6 +1067,7 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, size_t min_row, int level_type_size, bool has_nesting, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { @@ -754,12 +1077,23 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, dim3 dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - if (has_nesting) { + if (is_list) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -769,17 +1103,29 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, decode_kernel_mask::FIXED_WIDTH_NO_DICT, false, false, + false, decode_fixed_width_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); } } else { - if (has_nesting) { + if (is_list) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -789,6 +1135,7 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, decode_kernel_mask::FIXED_WIDTH_NO_DICT, false, false, + false, decode_fixed_width_values_func> <<>>( 
pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -802,6 +1149,7 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span pa size_t min_row, int level_type_size, bool has_nesting, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { @@ -811,12 +1159,23 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span pa dim3 dim_grid(pages.size(), 1); // 1 thread block per page => # blocks if (level_type_size == 1) { - if (has_nesting) { + if (is_list) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -826,17 +1185,29 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span pa decode_kernel_mask::FIXED_WIDTH_DICT, true, false, + false, decode_fixed_width_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); } } else { - if (has_nesting) { + if (is_list) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -846,6 +1217,7 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span pa decode_kernel_mask::FIXED_WIDTH_DICT, true, false, + true, decode_fixed_width_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -860,6 +1232,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, size_t min_row, int level_type_size, bool has_nesting, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { @@ -869,12 +1242,23 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, dim3 dim_grid(pages.size(), 1); // 1 thread block per page => # blocks if (level_type_size == 1) { - if (has_nesting) { + if (is_list) { + gpuDecodePageDataGeneric + <<>>( + 
pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -884,17 +1268,29 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT, false, false, + false, decode_fixed_width_split_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); } } else { - if (has_nesting) { + if (is_list) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -904,6 +1300,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT, false, false, + false, decode_fixed_width_split_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index d604642be54..52d53cb8225 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -183,17 +183,20 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, return decode_kernel_mask::STRING; } - if (!is_list(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) { + if (!is_byte_array(chunk) && !is_boolean(chunk)) { if (page.encoding == Encoding::PLAIN) { - return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED - : decode_kernel_mask::FIXED_WIDTH_NO_DICT; + return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST + : is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED + : decode_kernel_mask::FIXED_WIDTH_NO_DICT; } else if (page.encoding == Encoding::PLAIN_DICTIONARY || page.encoding == Encoding::RLE_DICTIONARY) { - return is_nested(chunk) ? 
decode_kernel_mask::FIXED_WIDTH_DICT_NESTED - : decode_kernel_mask::FIXED_WIDTH_DICT; + return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_LIST + : is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED + : decode_kernel_mask::FIXED_WIDTH_DICT; } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) { - return is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED - : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT; + return is_list(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST + : is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED + : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT; } } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index be502b581af..dba24b553e6 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -220,6 +220,10 @@ enum class decode_kernel_mask { (1 << 9), // Same as above but for nested, fixed-width data FIXED_WIDTH_NO_DICT_NESTED = (1 << 10), // Run decode kernel for fixed width non-dictionary pages FIXED_WIDTH_DICT_NESTED = (1 << 11), // Run decode kernel for fixed width dictionary pages + FIXED_WIDTH_DICT_LIST = (1 << 12), // Run decode kernel for fixed width dictionary pages + FIXED_WIDTH_NO_DICT_LIST = (1 << 13), // Run decode kernel for fixed width non-dictionary pages + BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST = + (1 << 14), // Run decode kernel for BYTE_STREAM_SPLIT encoded data for fixed width lists }; // mask representing all the ways in which a string can be encoded @@ -908,6 +912,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding * @param[in] has_nesting Whether or not the data contains nested (but not list) data. + * @param[in] is_list Whether or not the data contains list data. 
* @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -917,6 +922,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span pages, size_t min_row, int level_type_size, bool has_nesting, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream); @@ -932,6 +938,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span pages, * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding * @param[in] has_nesting Whether or not the data contains nested (but not list) data. + * @param[in] is_list Whether or not the data contains list data. * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -941,6 +948,7 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span pages, size_t min_row, int level_type_size, bool has_nesting, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream); @@ -956,6 +964,7 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span pages, * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding * @param[in] has_nesting Whether or not the data contains nested (but not list) data. + * @param[in] is_list Whether or not the data contains list data. 
* @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -965,6 +974,7 @@ void DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages size_t min_row, int level_type_size, bool has_nesting, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 32e922b04bb..a965f3325d5 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp index dd354b905f3..170c6e8857f 100644 --- a/cpp/src/io/parquet/reader.cpp +++ b/cpp/src/io/parquet/reader.cpp @@ -16,8 +16,6 @@ #include "reader_impl.hpp" -#include - namespace cudf::io::parquet::detail { reader::reader() = default; diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 0705ff6f5cc..689386b8957 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -21,11 +21,9 @@ #include #include #include -#include #include #include -#include #include #include @@ -274,6 +272,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, false, + false, error_code.data(), streams[s_idx++]); } @@ -286,6 +285,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, true, + false, + error_code.data(), + streams[s_idx++]); + } + + // launch byte stream split decoder, for list columns + if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST) != 0) { + DecodeSplitPageFixedWidthData(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + true, + true, error_code.data(), streams[s_idx++]); } @@ -309,6 +322,20 @@ void 
reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, false, + false, + error_code.data(), + streams[s_idx++]); + } + + // launch fixed width type decoder for lists + if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST) != 0) { + DecodePageDataFixed(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + true, + true, error_code.data(), streams[s_idx++]); } @@ -321,6 +348,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, true, + false, error_code.data(), streams[s_idx++]); } @@ -333,6 +361,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, false, + false, + error_code.data(), + streams[s_idx++]); + } + + // launch fixed width type decoder with dictionaries for lists + if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT_LIST) != 0) { + DecodePageDataFixedDict(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + true, + true, error_code.data(), streams[s_idx++]); } @@ -345,6 +387,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, true, + false, error_code.data(), streams[s_idx++]); } diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 4a0791d5c54..69e783a89d0 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -19,6 +19,7 @@ #include "parquet_gpu.hpp" #include +#include namespace cudf::io::parquet::detail { @@ -216,6 +217,26 @@ struct rle_stream { decode_index = -1; // signals the first iteration. Nothing to decode. 
} + __device__ inline int get_rle_run_info(rle_run& run) + { + run.start = cur; + run.level_run = get_vlq32(run.start, end); + + // run_bytes includes the header size + int run_bytes = run.start - cur; + if (is_literal_run(run.level_run)) { + // from the parquet spec: literal runs always come in multiples of 8 values. + run.size = (run.level_run >> 1) * 8; + run_bytes += util::div_rounding_up_unsafe(run.size * level_bits, 8); + } else { + // repeated value run + run.size = (run.level_run >> 1); + run_bytes += util::div_rounding_up_unsafe(level_bits, 8); + } + + return run_bytes; + } + __device__ inline void fill_run_batch() { // decode_index == -1 means we are on the very first decode iteration for this stream. @@ -226,31 +247,14 @@ struct rle_stream { while (((decode_index == -1 && fill_index < num_rle_stream_decode_warps) || fill_index < decode_index + run_buffer_size) && cur < end) { - auto& run = runs[rolling_index(fill_index)]; - // Encoding::RLE + // Pass by reference to fill the runs shared memory with the run data + auto& run = runs[rolling_index(fill_index)]; + int const run_bytes = get_rle_run_info(run); - // bytes for the varint header - uint8_t const* _cur = cur; - int const level_run = get_vlq32(_cur, end); - // run_bytes includes the header size - int run_bytes = _cur - cur; - - // literal run - if (is_literal_run(level_run)) { - // from the parquet spec: literal runs always come in multiples of 8 values. 
- run.size = (level_run >> 1) * 8; - run_bytes += ((run.size * level_bits) + 7) >> 3; - } - // repeated value run - else { - run.size = (level_run >> 1); - run_bytes += ((level_bits) + 7) >> 3; - } - run.output_pos = output_pos; - run.start = _cur; - run.level_run = level_run; run.remaining = run.size; + run.output_pos = output_pos; + cur += run_bytes; output_pos += run.size; fill_index++; @@ -372,6 +376,39 @@ struct rle_stream { return values_processed_shared; } + __device__ inline int skip_runs(int target_count) + { + // we want to process all runs UP TO BUT NOT INCLUDING the run that overlaps with the skip + // amount so threads spin like crazy on fill_run_batch(), skipping writing unnecessary run info. + // then when it hits the one that matters, we don't process it at all and bail as if we never + // started basically we're setting up the rle_stream vars necessary to start fill_run_batch for + // the first time + while (cur < end) { + rle_run run; + int run_bytes = get_rle_run_info(run); + + if ((output_pos + run.size) > target_count) { + return output_pos; // bail! we've reached the starting run + } + + // skip this run + output_pos += run.size; + cur += run_bytes; + } + + return output_pos; // we skipped everything + } + + __device__ inline int skip_decode(int t, int count) + { + int const output_count = min(count, total_values - cur_values); + + // if level_bits == 0, there's nothing to do + // a very common case: columns with no nulls, especially if they are non-nested + cur_values = (level_bits == 0) ? 
output_count : skip_runs(output_count); + return cur_values; + } + __device__ inline int decode_next(int t) { return decode_next(t, max_output_values); } }; diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 4baea8655e0..f4a2f29026a 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -22,10 +22,6 @@ #include #include -#include - -#include - #include namespace cudf::io::text { diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 249dc3b5875..6d954753af8 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -21,12 +21,12 @@ #include "column_buffer.hpp" +#include #include #include #include #include -#include #include namespace cudf::io::detail { diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index e73b2bc88de..31c8b781e77 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -22,12 +22,9 @@ #pragma once #include -#include #include -#include #include #include -#include #include #include @@ -35,6 +32,8 @@ #include +#include + namespace cudf { namespace io { namespace detail { diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index 813743fa7b4..3307b4fa539 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -16,14 +16,10 @@ #include "getenv_or.hpp" -#include #include #include -#include -#include -#include #include namespace cudf::io { @@ -56,11 +52,14 @@ bool is_gds_enabled() { return is_always_enabled() or get_env_policy() == usage_ bool is_kvikio_enabled() { return get_env_policy() == usage_policy::KVIKIO; } -void set_thread_pool_nthreads_from_env() +void set_up_kvikio() { static std::once_flag flag{}; std::call_once(flag, [] { - auto nthreads = getenv_or("KVIKIO_NTHREADS", 
8U); + auto const compat_mode = kvikio::detail::getenv_or("KVIKIO_COMPAT_MODE", true); + kvikio::defaults::compat_mode_reset(compat_mode); + + auto const nthreads = getenv_or("KVIKIO_NTHREADS", 4u); kvikio::defaults::thread_pool_nthreads_reset(nthreads); }); } diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index a8a275919d8..15de5d85614 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -42,7 +42,7 @@ class file_sink : public data_sink { if (!_output_stream.is_open()) { detail::throw_on_file_open_failure(filepath, true); } if (cufile_integration::is_kvikio_enabled()) { - cufile_integration::set_thread_pool_nthreads_from_env(); + cufile_integration::set_up_kvikio(); _kvikio_file = kvikio::FileHandle(filepath, "w"); CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode {}.", _kvikio_file.is_compat_mode_on() ? "on" : "off"); diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 4e8908a8942..15a4a270ce0 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -33,7 +33,6 @@ #include #include -#include #include namespace cudf { @@ -49,7 +48,7 @@ class file_source : public datasource { { detail::force_init_cuda_context(); if (cufile_integration::is_kvikio_enabled()) { - cufile_integration::set_thread_pool_nthreads_from_env(); + cufile_integration::set_up_kvikio(); _kvikio_file = kvikio::FileHandle(filepath); CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.", _kvikio_file.is_compat_mode_on() ? 
"on" : "off"); diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 98ed9b28f0a..cf19bc591cc 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -22,8 +22,6 @@ #include #include -#include - #include #include @@ -110,7 +108,11 @@ class cufile_shim { ~cufile_shim() { - if (driver_close != nullptr) driver_close(); + // Explicit cuFile driver close should not be performed here to avoid segfault. However, in the + // absence of driver_close(), cuFile will implicitly do that, which in most cases causes + // segfault anyway. TODO: Revisit this conundrum once cuFile is fixed. + // https://github.com/rapidsai/cudf/issues/17121 + if (cf_lib != nullptr) dlclose(cf_lib); } diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 7e47b5b3d10..584b6213fa3 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -104,7 +104,7 @@ class cufile_shim; /** * @brief Class that provides RAII for cuFile file registration. 
*/ -struct cufile_registered_file { +class cufile_registered_file { void register_handle(); public: diff --git a/cpp/src/io/utilities/row_selection.cpp b/cpp/src/io/utilities/row_selection.cpp index c0bbca39167..cf252fe63af 100644 --- a/cpp/src/io/utilities/row_selection.cpp +++ b/cpp/src/io/utilities/row_selection.cpp @@ -16,10 +16,7 @@ #include "io/utilities/row_selection.hpp" -#include - #include -#include namespace cudf::io::detail { diff --git a/cpp/src/io/utilities/row_selection.hpp b/cpp/src/io/utilities/row_selection.hpp index 7c607099cdc..e826feff201 100644 --- a/cpp/src/io/utilities/row_selection.hpp +++ b/cpp/src/io/utilities/row_selection.hpp @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include #include diff --git a/cpp/src/jit/cache.cpp b/cpp/src/jit/cache.cpp index 89c47d246d0..34a0bdce124 100644 --- a/cpp/src/jit/cache.cpp +++ b/cpp/src/jit/cache.cpp @@ -16,11 +16,8 @@ #include -#include - #include -#include #include namespace cudf { diff --git a/cpp/src/jit/util.cpp b/cpp/src/jit/util.cpp index 0585e02a031..d9a29203133 100644 --- a/cpp/src/jit/util.cpp +++ b/cpp/src/jit/util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,8 +19,6 @@ #include #include -#include - namespace cudf { namespace jit { struct get_data_ptr_functor { diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 17008e80e79..ebab3beb08f 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -834,10 +834,11 @@ std::pair, std::vector> partition( table_view const& t, column_view const& partition_map, size_type num_partitions, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::partition(t, partition_map, num_partitions, cudf::get_default_stream(), mr); + return detail::partition(t, partition_map, num_partitions, stream, mr); } } // namespace cudf diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu index 5a4c90a67a5..ab6ab393878 100644 --- a/cpp/src/partitioning/round_robin.cu +++ b/cpp/src/partitioning/round_robin.cu @@ -273,11 +273,11 @@ std::pair, std::vector> round_robi table_view const& input, cudf::size_type num_partitions, cudf::size_type start_partition, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::round_robin_partition( - input, num_partitions, start_partition, cudf::get_default_stream(), mr); + return detail::round_robin_partition(input, num_partitions, start_partition, stream, mr); } } // namespace cudf diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index b0a84a6d50c..d27420658d6 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -1126,12 +1126,8 @@ std::pair, rmm::device_uvector> generate_mer * `max` of 0. * * @param tdv input tdigests. The tdigests within this column are grouped by key. - * @param h_group_offsets a host iterator of the offsets to the start of each group. A group is - * counted as one even when the cluster is empty in it. 
The offsets should have the same values as - * the ones in `group_offsets`. * @param group_offsets a device iterator of the offsets to the start of each group. A group is - * counted as one even when the cluster is empty in it. The offsets should have the same values as - * the ones in `h_group_offsets`. + * counted as one even when the cluster is empty in it. * @param group_labels a device iterator of the the group label for each tdigest cluster including * empty clusters. * @param num_group_labels the number of unique group labels. @@ -1142,9 +1138,8 @@ std::pair, rmm::device_uvector> generate_mer * * @return A column containing the merged tdigests. */ -template +template std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, - HGroupOffsetIter h_group_offsets, GroupOffsetIter group_offsets, GroupLabelIter group_labels, size_t num_group_labels, @@ -1313,21 +1308,13 @@ std::unique_ptr reduce_merge_tdigest(column_view const& input, if (input.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_scalar(stream, mr); } - auto group_offsets_ = group_offsets_fn{input.size()}; - auto h_group_offsets = cudf::detail::make_counting_transform_iterator(0, group_offsets_); - auto group_offsets = cudf::detail::make_counting_transform_iterator(0, group_offsets_); - auto group_labels = thrust::make_constant_iterator(0); - return to_tdigest_scalar(merge_tdigests(tdv, - h_group_offsets, - group_offsets, - group_labels, - input.size(), - 1, - max_centroids, - stream, - mr), - stream, - mr); + auto group_offsets_ = group_offsets_fn{input.size()}; + auto group_offsets = cudf::detail::make_counting_transform_iterator(0, group_offsets_); + auto group_labels = thrust::make_constant_iterator(0); + return to_tdigest_scalar( + merge_tdigests(tdv, group_offsets, group_labels, input.size(), 1, max_centroids, stream, mr), + stream, + mr); } std::unique_ptr group_tdigest(column_view const& col, @@ -1376,16 +1363,7 @@ std::unique_ptr group_merge_tdigest(column_view const& 
input, return cudf::tdigest::detail::make_empty_tdigests_column(num_groups, stream, mr); } - // bring group offsets back to the host - std::vector h_group_offsets(group_offsets.size()); - cudaMemcpyAsync(h_group_offsets.data(), - group_offsets.begin(), - sizeof(size_type) * group_offsets.size(), - cudaMemcpyDefault, - stream); - return merge_tdigests(tdv, - h_group_offsets.begin(), group_offsets.data(), group_labels.data(), group_labels.size(), diff --git a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp index a9f86ac1b5f..17844b6bb0a 100644 --- a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp +++ b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index d187375b69f..75ebc078930 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -26,8 +26,6 @@ #include #include #include -#include -#include #include #include #include diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp index d3c0b54f286..b91ae19b51a 100644 --- a/cpp/src/reductions/scan/scan.cpp +++ b/cpp/src/reductions/scan/scan.cpp @@ -14,13 +14,10 @@ * limitations under the License. 
*/ -#include #include #include #include #include -#include -#include namespace cudf { diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp index 40d1d8a0a53..c4f6c135dde 100644 --- a/cpp/src/reductions/segmented/reductions.cpp +++ b/cpp/src/reductions/segmented/reductions.cpp @@ -13,16 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include #include #include #include #include #include -#include #include -#include #include #include diff --git a/cpp/src/rolling/detail/optimized_unbounded_window.cpp b/cpp/src/rolling/detail/optimized_unbounded_window.cpp index 72c23395a93..7cad31c0658 100644 --- a/cpp/src/rolling/detail/optimized_unbounded_window.cpp +++ b/cpp/src/rolling/detail/optimized_unbounded_window.cpp @@ -18,13 +18,10 @@ #include #include #include -#include #include #include #include #include -#include -#include #include namespace cudf::detail { diff --git a/cpp/src/rolling/detail/range_window_bounds.hpp b/cpp/src/rolling/detail/range_window_bounds.hpp index 8a53e937f98..77cb2a8c7f5 100644 --- a/cpp/src/rolling/detail/range_window_bounds.hpp +++ b/cpp/src/rolling/detail/range_window_bounds.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,10 +16,7 @@ #pragma once #include -#include #include -#include -#include namespace cudf { namespace detail { diff --git a/cpp/src/rolling/range_window_bounds.cpp b/cpp/src/rolling/range_window_bounds.cpp index 69792136c64..7f698dfcd6b 100644 --- a/cpp/src/rolling/range_window_bounds.cpp +++ b/cpp/src/rolling/range_window_bounds.cpp @@ -19,7 +19,6 @@ #include #include #include -#include namespace cudf { namespace { diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index 8988d73fb02..332c440aea9 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -358,10 +358,11 @@ std::unique_ptr round(column_view const& input, std::unique_ptr round(column_view const& input, int32_t decimal_places, rounding_method method, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::round(input, decimal_places, method, cudf::get_default_stream(), mr); + return detail::round(input, decimal_places, method, stream, mr); } } // namespace cudf diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp index 31535198c58..4ec2174a96f 100644 --- a/cpp/src/scalar/scalar.cpp +++ b/cpp/src/scalar/scalar.cpp @@ -26,8 +26,6 @@ #include #include -#include - #include namespace cudf { diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp index 656fe61fbbe..9f242bdffe0 100644 --- a/cpp/src/scalar/scalar_factories.cpp +++ b/cpp/src/scalar/scalar_factories.cpp @@ -16,10 +16,8 @@ #include #include -#include #include #include -#include #include #include diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 0db1adf1223..f5d052c6657 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -152,12 +153,8 @@ struct format_compiler { } // create program in device memory - d_items.resize(items.size(), 
stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_items.data(), - items.data(), - items.size() * sizeof(items[0]), - cudaMemcpyDefault, - stream.value())); + d_items = cudf::detail::make_device_uvector_sync( + items, stream, cudf::get_current_device_resource_ref()); } format_item const* compiled_format_items() { return d_items.data(); } diff --git a/cpp/src/strings/regex/regexec.cpp b/cpp/src/strings/regex/regexec.cpp index d1990733e81..60ad714dfec 100644 --- a/cpp/src/strings/regex/regexec.cpp +++ b/cpp/src/strings/regex/regexec.cpp @@ -24,7 +24,6 @@ #include #include -#include #include #include diff --git a/cpp/src/strings/strings_scalar_factories.cpp b/cpp/src/strings/strings_scalar_factories.cpp index 219d1174d42..1cc405234b2 100644 --- a/cpp/src/strings/strings_scalar_factories.cpp +++ b/cpp/src/strings/strings_scalar_factories.cpp @@ -16,7 +16,6 @@ #include #include -#include #include diff --git a/cpp/src/structs/structs_column_view.cpp b/cpp/src/structs/structs_column_view.cpp index b0284e9cb96..e14142a9ad1 100644 --- a/cpp/src/structs/structs_column_view.cpp +++ b/cpp/src/structs/structs_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include #include diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index 5df9943303d..4012ee3d21c 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -21,13 +21,10 @@ #include #include #include -#include #include #include #include -#include #include -#include #include #include diff --git a/cpp/src/table/table.cpp b/cpp/src/table/table.cpp index cb707c94288..41c64c6decb 100644 --- a/cpp/src/table/table.cpp +++ b/cpp/src/table/table.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp index 8a5340dc20d..659beb749af 100644 --- a/cpp/src/table/table_view.cpp +++ b/cpp/src/table/table_view.cpp @@ -20,10 +20,7 @@ #include #include -#include - #include -#include #include namespace cudf { diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu index eca703e2604..b13ad0a7de8 100644 --- a/cpp/src/text/subword/load_hash_file.cu +++ b/cpp/src/text/subword/load_hash_file.cu @@ -289,10 +289,12 @@ std::unique_ptr load_vocabulary_file( } // namespace detail std::unique_ptr load_vocabulary_file( - std::string const& filename_hashed_vocabulary, rmm::device_async_resource_ref mr) + std::string const& filename_hashed_vocabulary, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::load_vocabulary_file(filename_hashed_vocabulary, cudf::get_default_stream(), mr); + return detail::load_vocabulary_file(filename_hashed_vocabulary, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index d7e04a0c208..dee589d6daf 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -293,17 +293,12 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, uint32_t stride, bool do_lower_case, bool 
do_truncate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::subword_tokenize(strings, - vocabulary_table, - max_sequence_length, - stride, - do_lower_case, - do_truncate, - cudf::get_default_stream(), - mr); + return detail::subword_tokenize( + strings, vocabulary_table, max_sequence_length, stride, do_lower_case, do_truncate, stream, mr); } } // namespace nvtext diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp index 52b96bc9039..b919ac16956 100644 --- a/cpp/src/transform/transform.cpp +++ b/cpp/src/transform/transform.cpp @@ -23,8 +23,6 @@ #include #include #include -#include -#include #include #include diff --git a/cpp/src/utilities/cuda.cpp b/cpp/src/utilities/cuda.cpp index 53ca0608170..d979bda41d0 100644 --- a/cpp/src/utilities/cuda.cpp +++ b/cpp/src/utilities/cuda.cpp @@ -18,8 +18,6 @@ #include #include -#include - namespace cudf::detail { cudf::size_type num_multiprocessors() diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 9d8e3cf2fa6..e30806a5011 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp index 58971552758..000526723c4 100644 --- a/cpp/src/utilities/prefetch.cpp +++ b/cpp/src/utilities/prefetch.cpp @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 8c29182bfb5..7069b59be26 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -23,7 +23,6 @@ #include #include -#include #include #include diff --git a/cpp/src/utilities/traits.cpp b/cpp/src/utilities/traits.cpp index a68dc84e340..c1e71f5f8f9 100644 --- a/cpp/src/utilities/traits.cpp +++ b/cpp/src/utilities/traits.cpp @@ -19,8 +19,6 @@ #include #include -#include - namespace cudf { namespace { diff --git a/cpp/src/utilities/type_checks.cpp b/cpp/src/utilities/type_checks.cpp index 3095b342748..84c8529641d 100644 --- a/cpp/src/utilities/type_checks.cpp +++ b/cpp/src/utilities/type_checks.cpp @@ -21,8 +21,6 @@ #include #include -#include - #include namespace cudf { diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a4213dcbe94..23632f6fbba 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -611,7 +611,6 @@ ConfigureTest( text/bpe_tests.cpp text/edit_distance_tests.cpp text/jaccard_tests.cpp - text/minhash_tests.cpp text/ngrams_tests.cpp text/ngrams_tokenize_tests.cpp text/normalize_tests.cpp @@ -711,11 +710,13 @@ ConfigureTest(STREAM_MULTIBYTE_SPLIT_TEST streams/io/multibyte_split_test.cpp ST ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_PARTITIONING_TEST streams/partitioning_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing) ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_RESHAPE_TEST streams/reshape_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_ROLLING_TEST 
streams/rolling_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_ROUND_TEST streams/round_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_STREAM_COMPACTION_TEST streams/stream_compaction_test.cpp STREAM_MODE testing) @@ -742,6 +743,7 @@ ConfigureTest( streams/text/ngrams_test.cpp streams/text/replace_test.cpp streams/text/stemmer_test.cpp + streams/text/subword_tokenize_test.cpp streams/text/tokenize_test.cpp STREAM_MODE testing diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index a4bde50a21e..7af88d8aa34 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include @@ -26,14 +25,8 @@ #include #include #include -#include -#include -#include #include #include -#include - -#include #include @@ -41,7 +34,6 @@ #include #include #include -#include #include template diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp index aa5b49567e6..3bd67001c16 100644 --- a/cpp/tests/binaryop/binop-compiled-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-test.cpp @@ -26,9 +26,7 @@ #include #include #include -#include #include -#include #include diff --git a/cpp/tests/binaryop/binop-generic-ptx-test.cpp b/cpp/tests/binaryop/binop-generic-ptx-test.cpp index 03cc87a1968..e9a2761db4a 100644 --- a/cpp/tests/binaryop/binop-generic-ptx-test.cpp +++ b/cpp/tests/binaryop/binop-generic-ptx-test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. 
* Copyright 2018 Christian Noboa Mardini @@ -19,7 +19,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp index fe221fb1c48..799bf646e52 100644 --- a/cpp/tests/bitmask/bitmask_tests.cpp +++ b/cpp/tests/bitmask/bitmask_tests.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/column/bit_cast_test.cpp b/cpp/tests/column/bit_cast_test.cpp index ab230ab036e..5570a7d498c 100644 --- a/cpp/tests/column/bit_cast_test.cpp +++ b/cpp/tests/column/bit_cast_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -26,8 +25,6 @@ #include -#include - template struct rep_type_impl { using type = void; diff --git a/cpp/tests/column/column_test.cpp b/cpp/tests/column/column_test.cpp index 631f5150829..d700adaebd5 100644 --- a/cpp/tests/column/column_test.cpp +++ b/cpp/tests/column/column_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/tests/column/column_view_device_span_test.cpp b/cpp/tests/column/column_view_device_span_test.cpp index 6de9121158b..470437f4112 100644 --- a/cpp/tests/column/column_view_device_span_test.cpp +++ b/cpp/tests/column/column_view_device_span_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp index 37ab4b8f387..ad344476332 100644 --- a/cpp/tests/column/column_view_shallow_test.cpp +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -15,9 +15,7 @@ */ #include -#include #include -#include #include #include diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index 603187f0330..aa9d508b6aa 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -26,11 +26,8 @@ #include #include #include -#include #include -#include - #include class 
ColumnFactoryTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 18140c34abd..aedc498964a 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -34,8 +34,6 @@ #include #include -#include - #include #include #include diff --git a/cpp/tests/copying/copy_if_else_nested_tests.cpp b/cpp/tests/copying/copy_if_else_nested_tests.cpp index cfbd181f944..e1cdfe9beed 100644 --- a/cpp/tests/copying/copy_if_else_nested_tests.cpp +++ b/cpp/tests/copying/copy_if_else_nested_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/copy_range_tests.cpp b/cpp/tests/copying/copy_range_tests.cpp index 25d93da277b..e2133a546e4 100644 --- a/cpp/tests/copying/copy_range_tests.cpp +++ b/cpp/tests/copying/copy_range_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp index 4124f749012..9c00725d5d2 100644 --- a/cpp/tests/copying/copy_tests.cpp +++ b/cpp/tests/copying/copy_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/gather_list_tests.cpp b/cpp/tests/copying/gather_list_tests.cpp index 247090aac90..93f71345c5c 100644 --- a/cpp/tests/copying/gather_list_tests.cpp +++ b/cpp/tests/copying/gather_list_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,8 +17,6 @@ #include #include #include -#include -#include #include #include diff --git a/cpp/tests/copying/gather_str_tests.cpp b/cpp/tests/copying/gather_str_tests.cpp index 28098878086..795e3f30aa1 100644 --- a/cpp/tests/copying/gather_str_tests.cpp +++ b/cpp/tests/copying/gather_str_tests.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/gather_struct_tests.cpp b/cpp/tests/copying/gather_struct_tests.cpp index 1598ab2646a..b2c0f7acc3a 100644 --- a/cpp/tests/copying/gather_struct_tests.cpp +++ b/cpp/tests/copying/gather_struct_tests.cpp @@ -17,20 +17,15 @@ #include #include #include -#include #include #include #include #include #include -#include -#include -#include #include #include #include -#include #include diff --git a/cpp/tests/copying/gather_tests.cpp b/cpp/tests/copying/gather_tests.cpp index 07ce672b14d..908dcd67673 100644 --- a/cpp/tests/copying/gather_tests.cpp +++ b/cpp/tests/copying/gather_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp index 90ff97e7355..b2d64dac7c8 100644 --- a/cpp/tests/copying/get_value_tests.cpp +++ b/cpp/tests/copying/get_value_tests.cpp @@ -16,10 +16,8 @@ #include #include -#include #include #include -#include #include #include diff --git a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp index 4f28ff12941..1f76efdc4c3 100644 --- a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp +++ b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp @@ -16,13 +16,10 @@ #include #include #include -#include #include #include #include -#include -#include #include #include #include diff --git a/cpp/tests/copying/reverse_tests.cpp b/cpp/tests/copying/reverse_tests.cpp index e4b2d319ddf..46516436901 100644 --- a/cpp/tests/copying/reverse_tests.cpp +++ b/cpp/tests/copying/reverse_tests.cpp @@ -1,5 +1,5 
@@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,17 +17,13 @@ #include #include #include -#include #include #include #include -#include -#include #include #include -#include #include #include #include diff --git a/cpp/tests/copying/sample_tests.cpp b/cpp/tests/copying/sample_tests.cpp index 2f76e3f1fcd..8be5d8c1fbb 100644 --- a/cpp/tests/copying/sample_tests.cpp +++ b/cpp/tests/copying/sample_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,12 +15,9 @@ */ #include -#include #include -#include #include -#include #include #include #include diff --git a/cpp/tests/copying/scatter_list_scalar_tests.cpp b/cpp/tests/copying/scatter_list_scalar_tests.cpp index 42d2e004d6b..23faa6e5b86 100644 --- a/cpp/tests/copying/scatter_list_scalar_tests.cpp +++ b/cpp/tests/copying/scatter_list_scalar_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,7 +21,6 @@ #include #include -#include using mask_vector = std::vector; using size_column = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/copying/scatter_list_tests.cpp b/cpp/tests/copying/scatter_list_tests.cpp index a82860a3eec..1f87fcfcc99 100644 --- a/cpp/tests/copying/scatter_list_tests.cpp +++ b/cpp/tests/copying/scatter_list_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/scatter_struct_scalar_tests.cpp b/cpp/tests/copying/scatter_struct_scalar_tests.cpp index 78572b0bb37..1d1da8a1b1e 100644 --- a/cpp/tests/copying/scatter_struct_scalar_tests.cpp +++ b/cpp/tests/copying/scatter_struct_scalar_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/copying/scatter_struct_tests.cpp b/cpp/tests/copying/scatter_struct_tests.cpp index c92244d047b..7d88e9af85f 100644 --- a/cpp/tests/copying/scatter_struct_tests.cpp +++ b/cpp/tests/copying/scatter_struct_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include using namespace cudf::test::iterators; diff --git a/cpp/tests/copying/scatter_tests.cpp b/cpp/tests/copying/scatter_tests.cpp index 41a753cd0ac..74c04446bdd 100644 --- a/cpp/tests/copying/scatter_tests.cpp +++ b/cpp/tests/copying/scatter_tests.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -23,7 +22,6 @@ #include #include #include -#include #include diff --git a/cpp/tests/copying/segmented_gather_list_tests.cpp b/cpp/tests/copying/segmented_gather_list_tests.cpp index 8881fb344a2..a133ae43872 100644 --- a/cpp/tests/copying/segmented_gather_list_tests.cpp +++ b/cpp/tests/copying/segmented_gather_list_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright 
(c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp index ff6808d9a79..72a8e7357bc 100644 --- a/cpp/tests/copying/shift_tests.cpp +++ b/cpp/tests/copying/shift_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -30,7 +29,6 @@ #include #include -#include using TestTypes = cudf::test::Types; diff --git a/cpp/tests/copying/slice_tests.cpp b/cpp/tests/copying/slice_tests.cpp index aef0d4ad78a..3868a147fa8 100644 --- a/cpp/tests/copying/slice_tests.cpp +++ b/cpp/tests/copying/slice_tests.cpp @@ -22,12 +22,8 @@ #include #include -#include #include #include -#include -#include -#include #include #include diff --git a/cpp/tests/copying/utility_tests.cpp b/cpp/tests/copying/utility_tests.cpp index 0905f9babdc..90457f8d74c 100644 --- a/cpp/tests/copying/utility_tests.cpp +++ b/cpp/tests/copying/utility_tests.cpp @@ -23,7 +23,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 603edb27c7c..44f99adc0e9 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -23,14 +23,11 @@ #include #include -#include #include #include #include #include -#include - #define XXX false // stub for null values constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; diff --git a/cpp/tests/dictionary/add_keys_test.cpp b/cpp/tests/dictionary/add_keys_test.cpp index 46bf5468922..ebc8c11e86c 100644 --- a/cpp/tests/dictionary/add_keys_test.cpp +++ b/cpp/tests/dictionary/add_keys_test.cpp @@ -24,8 +24,6 @@ #include #include -#include - struct DictionaryAddKeysTest : 
public cudf::test::BaseFixture {}; TEST_F(DictionaryAddKeysTest, StringsColumn) diff --git a/cpp/tests/dictionary/encode_test.cpp b/cpp/tests/dictionary/encode_test.cpp index 5db0e9fa1e4..dfa3ede5d46 100644 --- a/cpp/tests/dictionary/encode_test.cpp +++ b/cpp/tests/dictionary/encode_test.cpp @@ -21,8 +21,6 @@ #include #include -#include - struct DictionaryEncodeTest : public cudf::test::BaseFixture {}; TEST_F(DictionaryEncodeTest, EncodeStringColumn) diff --git a/cpp/tests/dictionary/fill_test.cpp b/cpp/tests/dictionary/fill_test.cpp index 18696b66e48..bc7d19201aa 100644 --- a/cpp/tests/dictionary/fill_test.cpp +++ b/cpp/tests/dictionary/fill_test.cpp @@ -18,13 +18,10 @@ #include #include -#include #include #include #include -#include - struct DictionaryFillTest : public cudf::test::BaseFixture {}; TEST_F(DictionaryFillTest, StringsColumn) diff --git a/cpp/tests/dictionary/search_test.cpp b/cpp/tests/dictionary/search_test.cpp index 25501b4fde7..2774173b80a 100644 --- a/cpp/tests/dictionary/search_test.cpp +++ b/cpp/tests/dictionary/search_test.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/tests/dictionary/slice_test.cpp b/cpp/tests/dictionary/slice_test.cpp index d80f8dee079..8c15d6dbecd 100644 --- a/cpp/tests/dictionary/slice_test.cpp +++ b/cpp/tests/dictionary/slice_test.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/filling/fill_tests.cpp b/cpp/tests/filling/fill_tests.cpp index 26badefe698..a5e2db6a005 100644 --- a/cpp/tests/filling/fill_tests.cpp +++ b/cpp/tests/filling/fill_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/filling/repeat_tests.cpp b/cpp/tests/filling/repeat_tests.cpp index 6326765c68b..c856984a4a3 100644 --- a/cpp/tests/filling/repeat_tests.cpp +++ b/cpp/tests/filling/repeat_tests.cpp @@ -17,14 +17,11 @@ #include #include #include -#include #include #include #include #include -#include -#include 
#include #include @@ -33,7 +30,6 @@ #include #include -#include constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; diff --git a/cpp/tests/filling/sequence_tests.cpp b/cpp/tests/filling/sequence_tests.cpp index 0783b4e5bbb..53782c90c26 100644 --- a/cpp/tests/filling/sequence_tests.cpp +++ b/cpp/tests/filling/sequence_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index a222289216d..b96c6909e55 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -18,17 +18,14 @@ #include #include #include -#include #include #include -#include #include #include #include #include -#include #include using namespace numeric; diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp index a79b6a32916..ba456084a7c 100644 --- a/cpp/tests/groupby/collect_list_tests.cpp +++ b/cpp/tests/groupby/collect_list_tests.cpp @@ -20,8 +20,6 @@ #include #include -#include - template struct groupby_collect_list_test : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/groupby/collect_set_tests.cpp b/cpp/tests/groupby/collect_set_tests.cpp index 61d2838590b..dfd7eb82c4a 100644 --- a/cpp/tests/groupby/collect_set_tests.cpp +++ b/cpp/tests/groupby/collect_set_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/groupby/correlation_tests.cpp b/cpp/tests/groupby/correlation_tests.cpp index 26f714632dd..f8cc813e877 100644 --- a/cpp/tests/groupby/correlation_tests.cpp +++ b/cpp/tests/groupby/correlation_tests.cpp @@ -25,7 +25,6 @@ #include #include -#include using namespace cudf::test::iterators; diff --git a/cpp/tests/groupby/covariance_tests.cpp b/cpp/tests/groupby/covariance_tests.cpp index e3eb2da201f..81378bb91e8 100644 --- a/cpp/tests/groupby/covariance_tests.cpp +++ 
b/cpp/tests/groupby/covariance_tests.cpp @@ -23,10 +23,8 @@ #include #include -#include #include -#include using namespace cudf::test::iterators; diff --git a/cpp/tests/groupby/groupby_test_util.cpp b/cpp/tests/groupby/groupby_test_util.cpp index 5d99d15ae77..df0375d6a09 100644 --- a/cpp/tests/groupby/groupby_test_util.cpp +++ b/cpp/tests/groupby/groupby_test_util.cpp @@ -17,8 +17,8 @@ #include "groupby_test_util.hpp" #include -#include #include +#include #include #include @@ -27,9 +27,6 @@ #include #include #include -#include - -#include void test_single_agg(cudf::column_view const& keys, cudf::column_view const& values, diff --git a/cpp/tests/groupby/groupby_test_util.hpp b/cpp/tests/groupby/groupby_test_util.hpp index 755b0c20f17..9d2e613be3e 100644 --- a/cpp/tests/groupby/groupby_test_util.hpp +++ b/cpp/tests/groupby/groupby_test_util.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,11 +16,8 @@ #pragma once -#include #include -#include #include -#include enum class force_use_sort_impl : bool { NO, YES }; diff --git a/cpp/tests/groupby/histogram_tests.cpp b/cpp/tests/groupby/histogram_tests.cpp index 2d447025919..783cfb17e49 100644 --- a/cpp/tests/groupby/histogram_tests.cpp +++ b/cpp/tests/groupby/histogram_tests.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/groupby/max_scan_tests.cpp b/cpp/tests/groupby/max_scan_tests.cpp index d86de798844..6195e0179ec 100644 --- a/cpp/tests/groupby/max_scan_tests.cpp +++ b/cpp/tests/groupby/max_scan_tests.cpp @@ -22,7 +22,6 @@ #include #include -#include using namespace cudf::test::iterators; diff --git a/cpp/tests/groupby/merge_lists_tests.cpp b/cpp/tests/groupby/merge_lists_tests.cpp index 279d71560b4..4481e2dc022 100644 --- a/cpp/tests/groupby/merge_lists_tests.cpp +++ b/cpp/tests/groupby/merge_lists_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/groupby/merge_sets_tests.cpp b/cpp/tests/groupby/merge_sets_tests.cpp index 9736bb84dd6..1bfba265478 100644 --- a/cpp/tests/groupby/merge_sets_tests.cpp +++ b/cpp/tests/groupby/merge_sets_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/groupby/rank_scan_tests.cpp b/cpp/tests/groupby/rank_scan_tests.cpp index 7f31bc9089f..f2a50248b4a 100644 --- a/cpp/tests/groupby/rank_scan_tests.cpp +++ b/cpp/tests/groupby/rank_scan_tests.cpp @@ -22,8 +22,6 @@ #include #include -#include - using namespace cudf::test::iterators; template diff --git a/cpp/tests/groupby/shift_tests.cpp b/cpp/tests/groupby/shift_tests.cpp index 14c9ceb4508..49f9d7cb10a 100644 --- a/cpp/tests/groupby/shift_tests.cpp +++ b/cpp/tests/groupby/shift_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include template diff --git a/cpp/tests/hashing/md5_test.cpp b/cpp/tests/hashing/md5_test.cpp index 69e518cbf8d..b54adb52496 100644 --- 
a/cpp/tests/hashing/md5_test.cpp +++ b/cpp/tests/hashing/md5_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/hashing/murmurhash3_x86_32_test.cpp b/cpp/tests/hashing/murmurhash3_x86_32_test.cpp index c1a6e6ff6e1..b4622f5eb81 100644 --- a/cpp/tests/hashing/murmurhash3_x86_32_test.cpp +++ b/cpp/tests/hashing/murmurhash3_x86_32_test.cpp @@ -17,11 +17,9 @@ #include #include #include -#include #include #include -#include #include constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; diff --git a/cpp/tests/hashing/sha1_test.cpp b/cpp/tests/hashing/sha1_test.cpp index e28e71442a6..3aa0bda6ae8 100644 --- a/cpp/tests/hashing/sha1_test.cpp +++ b/cpp/tests/hashing/sha1_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -137,7 +136,7 @@ TEST_F(SHA1HashTest, ListsUnsupported) auto const input = cudf::table_view({strings_list_col}); - EXPECT_THROW(cudf::hashing::sha1(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha1(input), cudf::data_type_error); } TEST_F(SHA1HashTest, StructsUnsupported) @@ -146,7 +145,7 @@ TEST_F(SHA1HashTest, StructsUnsupported) auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; auto const input = cudf::table_view({struct_col}); - EXPECT_THROW(cudf::hashing::sha1(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha1(input), cudf::data_type_error); } template diff --git a/cpp/tests/hashing/sha224_test.cpp b/cpp/tests/hashing/sha224_test.cpp index 61b584f94df..3f6aeb9d5e6 100644 --- a/cpp/tests/hashing/sha224_test.cpp +++ b/cpp/tests/hashing/sha224_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -137,7 +136,7 @@ TEST_F(SHA224HashTest, ListsUnsupported) auto const input = cudf::table_view({strings_list_col}); - EXPECT_THROW(cudf::hashing::sha224(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha224(input), cudf::data_type_error); } 
TEST_F(SHA224HashTest, StructsUnsupported) @@ -146,7 +145,7 @@ TEST_F(SHA224HashTest, StructsUnsupported) auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; auto const input = cudf::table_view({struct_col}); - EXPECT_THROW(cudf::hashing::sha224(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha224(input), cudf::data_type_error); } template diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp index 8bc47c92c6b..9519e96fbae 100644 --- a/cpp/tests/hashing/sha256_test.cpp +++ b/cpp/tests/hashing/sha256_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -136,7 +135,7 @@ TEST_F(SHA256HashTest, ListsUnsupported) auto const input = cudf::table_view({strings_list_col}); - EXPECT_THROW(cudf::hashing::sha256(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha256(input), cudf::data_type_error); } TEST_F(SHA256HashTest, StructsUnsupported) @@ -145,7 +144,7 @@ TEST_F(SHA256HashTest, StructsUnsupported) auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; auto const input = cudf::table_view({struct_col}); - EXPECT_THROW(cudf::hashing::sha256(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha256(input), cudf::data_type_error); } template diff --git a/cpp/tests/hashing/sha384_test.cpp b/cpp/tests/hashing/sha384_test.cpp index 4c79934f98d..9de566b9d9b 100644 --- a/cpp/tests/hashing/sha384_test.cpp +++ b/cpp/tests/hashing/sha384_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -155,7 +154,7 @@ TEST_F(SHA384HashTest, ListsUnsupported) auto const input = cudf::table_view({strings_list_col}); - EXPECT_THROW(cudf::hashing::sha384(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha384(input), cudf::data_type_error); } TEST_F(SHA384HashTest, StructsUnsupported) @@ -164,7 +163,7 @@ TEST_F(SHA384HashTest, StructsUnsupported) auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; auto const input = 
cudf::table_view({struct_col}); - EXPECT_THROW(cudf::hashing::sha384(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha384(input), cudf::data_type_error); } template diff --git a/cpp/tests/hashing/sha512_test.cpp b/cpp/tests/hashing/sha512_test.cpp index 0eb1c60b8fc..95e5245f38e 100644 --- a/cpp/tests/hashing/sha512_test.cpp +++ b/cpp/tests/hashing/sha512_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -155,7 +154,7 @@ TEST_F(SHA512HashTest, ListsUnsupported) auto const input = cudf::table_view({strings_list_col}); - EXPECT_THROW(cudf::hashing::sha512(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha512(input), cudf::data_type_error); } TEST_F(SHA512HashTest, StructsUnsupported) @@ -164,7 +163,7 @@ TEST_F(SHA512HashTest, StructsUnsupported) auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; auto const input = cudf::table_view({struct_col}); - EXPECT_THROW(cudf::hashing::sha512(input), cudf::logic_error); + EXPECT_THROW(cudf::hashing::sha512(input), cudf::data_type_error); } template diff --git a/cpp/tests/hashing/xxhash_64_test.cpp b/cpp/tests/hashing/xxhash_64_test.cpp index ab4ed829681..d8694a72d94 100644 --- a/cpp/tests/hashing/xxhash_64_test.cpp +++ b/cpp/tests/hashing/xxhash_64_test.cpp @@ -17,11 +17,8 @@ #include #include #include -#include #include -#include -#include #include using NumericTypesNoBools = diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp index 2151ec6e22f..1ddc33e749a 100644 --- a/cpp/tests/interop/from_arrow_device_test.cpp +++ b/cpp/tests/interop/from_arrow_device_test.cpp @@ -17,17 +17,13 @@ #include "nanoarrow_utils.hpp" #include -#include #include #include -#include #include #include #include #include -#include -#include #include #include #include diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp index ef9936b214c..d93ef28aab8 100644 --- 
a/cpp/tests/interop/from_arrow_host_test.cpp +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -28,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/tests/interop/from_arrow_stream_test.cpp b/cpp/tests/interop/from_arrow_stream_test.cpp index 80a2e4b2ffd..3916025bf22 100644 --- a/cpp/tests/interop/from_arrow_stream_test.cpp +++ b/cpp/tests/interop/from_arrow_stream_test.cpp @@ -17,27 +17,14 @@ #include "nanoarrow_utils.hpp" #include -#include -#include #include -#include -#include -#include -#include #include -#include -#include -#include -#include #include #include #include -#include #include -#include - struct VectorOfArrays { std::vector arrays; nanoarrow::UniqueSchema schema; diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 6e742b9e4cf..18efae75cb1 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -25,9 +25,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -37,8 +35,6 @@ #include #include -#include -#include std::unique_ptr get_cudf_table() { diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 7ba586461dc..29aa928c277 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -17,21 +17,15 @@ #include "nanoarrow_utils.hpp" #include -#include #include -#include -#include #include #include -#include -#include #include #include #include #include #include -#include #include #include diff --git a/cpp/tests/interop/to_arrow_host_test.cpp b/cpp/tests/interop/to_arrow_host_test.cpp index fcb4433b42e..fa3aa82fee2 100644 --- a/cpp/tests/interop/to_arrow_host_test.cpp +++ b/cpp/tests/interop/to_arrow_host_test.cpp @@ -17,20 +17,14 @@ #include "nanoarrow_utils.hpp" #include -#include #include -#include -#include #include 
#include #include #include -#include #include #include -#include -#include #include #include #include diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index a6aa4b22eca..86295d8efb1 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -19,14 +19,12 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include #include diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index b265dcf9273..cc1e367d114 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -17,14 +17,12 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include #include @@ -32,18 +30,12 @@ #include #include #include -#include -#include #include -#include #include -#include - #include #include -#include #include #include #include diff --git a/cpp/tests/io/file_io_test.cpp b/cpp/tests/io/file_io_test.cpp index 3c41f21b0a4..1b85541687a 100644 --- a/cpp/tests/io/file_io_test.cpp +++ b/cpp/tests/io/file_io_test.cpp @@ -15,13 +15,10 @@ */ #include -#include #include #include -#include - // Base test fixture for tests struct CuFileIOTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp index d23acf3ae00..c8c2d18903f 100644 --- a/cpp/tests/io/json/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json/json_quote_normalization_test.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include @@ -29,7 +28,6 @@ #include #include -#include #include diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 36a9fa45d7d..5a833142695 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -39,8 +39,6 @@ #include -#include - #include #include #include @@ -2976,6 +2974,24 @@ TEST_F(JsonReaderTest, JsonDtypeSchema) 
cudf::test::debug_output_level::ALL_ERRORS); } +TEST_F(JsonReaderTest, LastRecordInvalid) +{ + std::string data = R"({"key": "1"} + {"key": "})"; + std::map schema{{"key", {dtype()}}}; + auto opts = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .dtypes(schema) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .build(); + auto const result = cudf::io::read_json(opts); + + EXPECT_EQ(result.metadata.schema_info[0].name, "key"); + cudf::test::strings_column_wrapper expected{{"1", ""}, cudf::test::iterators::nulls_at({1})}; + CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), cudf::table_view{{expected}}); +} + // Test case for dtype pruning with column order TEST_F(JsonReaderTest, JsonNestedDtypeFilterWithOrder) { @@ -3238,4 +3254,5 @@ TEST_F(JsonReaderTest, JsonNestedDtypeFilterWithOrder) } } } + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp index 15682c6ae6b..887d4fa783f 100644 --- a/cpp/tests/io/json/json_tree.cpp +++ b/cpp/tests/io/json/json_tree.cpp @@ -15,12 +15,8 @@ */ #include "io/json/nested_json.hpp" -#include "io/utilities/hostdevice_vector.hpp" #include -#include -#include -#include #include #include @@ -29,9 +25,9 @@ #include #include -#include #include +#include #include #include #include diff --git a/cpp/tests/io/json/json_utils.cuh b/cpp/tests/io/json/json_utils.cuh index 9383797d91b..c31bb2d24e0 100644 --- a/cpp/tests/io/json/json_utils.cuh +++ b/cpp/tests/io/json/json_utils.cuh @@ -52,6 +52,7 @@ std::vector split_byte_range_reading( reader_opts.get_compression(), reader_opts.get_byte_range_offset(), reader_opts.get_byte_range_size(), + reader_opts.get_delimiter(), stream); // Note: we cannot reuse cudf::io::json::detail::find_first_delimiter since the // return type of that function is size_type. 
However, when the chunk_size is diff --git a/cpp/tests/io/json/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp index f32aba0e632..e0e955c4f48 100644 --- a/cpp/tests/io/json/nested_json_test.cpp +++ b/cpp/tests/io/json/nested_json_test.cpp @@ -21,24 +21,16 @@ #include #include #include -#include #include -#include #include -#include #include -#include #include -#include #include #include #include #include -#include - -#include #include #include diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index cce0adbf317..fce99187516 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/tests/io/parquet_common.hpp b/cpp/tests/io/parquet_common.hpp index c90b81ed27a..d66aa3bde9d 100644 --- a/cpp/tests/io/parquet_common.hpp +++ b/cpp/tests/io/parquet_common.hpp @@ -22,13 +22,11 @@ #include #include -#include #include #include #include #include -#include #include #include diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp index f1286a00d22..d66f685cd9c 100644 --- a/cpp/tests/io/parquet_misc_test.cpp +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -20,8 +20,6 @@ #include #include -#include -#include #include diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index 7986a3c6d70..177e6163d4f 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -29,6 +29,8 @@ #include #include +#include + #include TEST_F(ParquetReaderTest, UserBounds) diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index be2ecd56424..5c3c8342cd2 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include // NOTE: this file exists to define the parquet test's `main()` function. 
diff --git a/cpp/tests/io/row_selection_test.cpp b/cpp/tests/io/row_selection_test.cpp index ebadd870091..c40d3bbd299 100644 --- a/cpp/tests/io/row_selection_test.cpp +++ b/cpp/tests/io/row_selection_test.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/tests/io/text/data_chunk_source_test.cpp b/cpp/tests/io/text/data_chunk_source_test.cpp index 6f46df20633..79ce908f3e0 100644 --- a/cpp/tests/io/text/data_chunk_source_test.cpp +++ b/cpp/tests/io/text/data_chunk_source_test.cpp @@ -15,14 +15,11 @@ */ #include -#include #include #include #include -#include - #include #include diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 74d08061df9..60244462e2c 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -19,16 +19,12 @@ #include #include #include -#include -#include #include -#include #include #include #include #include -#include #include using cudf::test::strings_column_wrapper; diff --git a/cpp/tests/iterator/value_iterator.cpp b/cpp/tests/iterator/value_iterator.cpp index 22bc7475dbe..f7f7c0f2721 100644 --- a/cpp/tests/iterator/value_iterator.cpp +++ b/cpp/tests/iterator/value_iterator.cpp @@ -13,7 +13,6 @@ * the License. 
*/ -#include #include CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/jit/parse_ptx_function.cpp b/cpp/tests/jit/parse_ptx_function.cpp index 6f9dfd06730..c9bb691907a 100644 --- a/cpp/tests/jit/parse_ptx_function.cpp +++ b/cpp/tests/jit/parse_ptx_function.cpp @@ -16,7 +16,6 @@ #include "jit/parser.hpp" -#include #include #include diff --git a/cpp/tests/join/cross_join_tests.cpp b/cpp/tests/join/cross_join_tests.cpp index d87f5e54153..971913443e5 100644 --- a/cpp/tests/join/cross_join_tests.cpp +++ b/cpp/tests/join/cross_join_tests.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp index 178edc52dd3..9070efa38fe 100644 --- a/cpp/tests/join/distinct_join_tests.cpp +++ b/cpp/tests/join/distinct_join_tests.cpp @@ -15,12 +15,8 @@ */ #include -#include #include -#include #include -#include -#include #include #include @@ -31,7 +27,6 @@ #include #include -#include #include template diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 3431e941359..6a8a54c8465 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -20,17 +20,12 @@ #include #include #include -#include #include -#include #include #include -#include -#include #include #include -#include #include #include #include diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp index 554d5754e39..ddc65c3f379 100644 --- a/cpp/tests/join/semi_anti_join_tests.cpp +++ b/cpp/tests/join/semi_anti_join_tests.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/json/json_tests.cpp b/cpp/tests/json/json_tests.cpp index 42a574ac5c0..53166e04173 100644 --- a/cpp/tests/json/json_tests.cpp +++ b/cpp/tests/json/json_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp 
b/cpp/tests/large_strings/large_strings_fixture.cpp index 7b61be113f9..f1404990354 100644 --- a/cpp/tests/large_strings/large_strings_fixture.cpp +++ b/cpp/tests/large_strings/large_strings_fixture.cpp @@ -16,12 +16,10 @@ #include "large_strings_fixture.hpp" -#include #include #include #include -#include #include #include diff --git a/cpp/tests/large_strings/parquet_tests.cpp b/cpp/tests/large_strings/parquet_tests.cpp index 007c08ce0fb..f47782a2d02 100644 --- a/cpp/tests/large_strings/parquet_tests.cpp +++ b/cpp/tests/large_strings/parquet_tests.cpp @@ -16,8 +16,6 @@ #include "large_strings_fixture.hpp" -#include -#include #include #include diff --git a/cpp/tests/lists/contains_tests.cpp b/cpp/tests/lists/contains_tests.cpp index 8fb2b403051..7ae7a6a7414 100644 --- a/cpp/tests/lists/contains_tests.cpp +++ b/cpp/tests/lists/contains_tests.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/lists/extract_tests.cpp b/cpp/tests/lists/extract_tests.cpp index 92dd5df5ec7..2c24f695c29 100644 --- a/cpp/tests/lists/extract_tests.cpp +++ b/cpp/tests/lists/extract_tests.cpp @@ -21,12 +21,8 @@ #include #include -#include -#include #include -#include - #include #include #include diff --git a/cpp/tests/lists/sequences_tests.cpp b/cpp/tests/lists/sequences_tests.cpp index 74545903eb3..dcb906cd2ef 100644 --- a/cpp/tests/lists/sequences_tests.cpp +++ b/cpp/tests/lists/sequences_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp index 5625b47e7ea..18aa118bb81 100644 --- a/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp +++ b/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp @@ -20,8 +20,6 @@ #include #include -#include -#include #include namespace cudf::test { diff --git a/cpp/tests/merge/merge_dictionary_test.cpp 
b/cpp/tests/merge/merge_dictionary_test.cpp index dd528c19e4e..1d7a31fd797 100644 --- a/cpp/tests/merge/merge_dictionary_test.cpp +++ b/cpp/tests/merge/merge_dictionary_test.cpp @@ -17,9 +17,7 @@ #include #include #include -#include -#include #include #include #include diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp index bea044496b3..d9fdb6099f0 100644 --- a/cpp/tests/merge/merge_string_test.cpp +++ b/cpp/tests/merge/merge_string_test.cpp @@ -17,10 +17,8 @@ #include #include #include -#include #include -#include #include #include #include @@ -30,10 +28,6 @@ #include -#include -#include -#include -#include #include #include diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index 6208d395f0a..fad390105d7 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -34,7 +33,6 @@ #include #include -#include #include diff --git a/cpp/tests/partitioning/round_robin_test.cpp b/cpp/tests/partitioning/round_robin_test.cpp index 89d23c39dca..3693cfbcc72 100644 --- a/cpp/tests/partitioning/round_robin_test.cpp +++ b/cpp/tests/partitioning/round_robin_test.cpp @@ -17,10 +17,8 @@ #include #include #include -#include #include -#include #include #include #include @@ -30,12 +28,7 @@ #include -#include -#include -#include -#include #include -#include #include using cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/quantiles/quantile_test.cpp b/cpp/tests/quantiles/quantile_test.cpp index 6e88365b6e8..23b58618fe1 100644 --- a/cpp/tests/quantiles/quantile_test.cpp +++ b/cpp/tests/quantiles/quantile_test.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/quantiles/quantiles_test.cpp b/cpp/tests/quantiles/quantiles_test.cpp index 44d4ec61852..c7e11af8c85 100644 --- a/cpp/tests/quantiles/quantiles_test.cpp +++ b/cpp/tests/quantiles/quantiles_test.cpp @@ -16,7 +16,6 @@ 
#include #include -#include #include #include diff --git a/cpp/tests/reductions/ewm_tests.cpp b/cpp/tests/reductions/ewm_tests.cpp index 09cec688509..1117b0d1acf 100644 --- a/cpp/tests/reductions/ewm_tests.cpp +++ b/cpp/tests/reductions/ewm_tests.cpp @@ -18,9 +18,7 @@ #include #include -#include -#include #include template diff --git a/cpp/tests/reductions/list_rank_test.cpp b/cpp/tests/reductions/list_rank_test.cpp index f5470f7d881..cb412f1e925 100644 --- a/cpp/tests/reductions/list_rank_test.cpp +++ b/cpp/tests/reductions/list_rank_test.cpp @@ -14,14 +14,9 @@ * limitations under the License. */ -#include - #include #include -#include -#include -#include #include struct ListRankScanTest : public cudf::test::BaseFixture { @@ -136,7 +131,7 @@ TEST_F(ListRankScanTest, ListOfStruct) false, false}}; auto col2 = cudf::test::strings_column_wrapper{ - {"x", "x", "a", "a", "b", "b", "a", "b", "a", "b", "a", "c", "a", "c", "a", "c", "b", "b"}, + {"x", "x", "a", "a", "b", "", "a", "b", "a", "b", "a", "c", "a", "c", "", "", "b", "b"}, {true, true, true, diff --git a/cpp/tests/reductions/rank_tests.cpp b/cpp/tests/reductions/rank_tests.cpp index 3ab1fc01eaa..130458548fc 100644 --- a/cpp/tests/reductions/rank_tests.cpp +++ b/cpp/tests/reductions/rank_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,7 +21,6 @@ #include #include -#include #include #include @@ -126,7 +125,7 @@ auto make_input_column() { if constexpr (std::is_same_v) { return cudf::test::strings_column_wrapper{ - {"0", "0", "4", "4", "4", "5", "7", "7", "7", "9", "9", "9"}, + {"0", "0", "4", "4", "4", "", "7", "7", "7", "9", "9", "9"}, cudf::test::iterators::null_at(5)}; } else { using fw_wrapper = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index bdb98372836..67083f19b3a 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -22,9 +22,7 @@ #include #include -#include #include -#include #include #include #include @@ -33,11 +31,9 @@ #include #include -#include #include #include -#include #include #include @@ -1259,6 +1255,12 @@ TEST_P(StringReductionTest, MinMax) // data and valid arrays std::vector host_strings(GetParam()); std::vector host_bools({true, false, true, true, true, true, false, false, true}); + std::transform(thrust::counting_iterator(0), + thrust::counting_iterator(host_strings.size()), + host_strings.begin(), + [host_strings, host_bools](auto idx) { + return host_bools[idx] ? host_strings[idx] : std::string{}; + }); bool succeed(true); std::string initial_value = "init"; @@ -1385,7 +1387,7 @@ TEST_F(StringReductionTest, AllNull) std::vector host_strings( {"one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}); std::vector host_bools(host_strings.size(), false); - auto initial_value = cudf::make_string_scalar("init"); + auto initial_value = cudf::make_string_scalar(""); initial_value->set_valid_async(false); // string column with nulls @@ -3086,21 +3088,28 @@ TEST_F(StructReductionTest, StructReductionMinMaxWithNulls) using cudf::test::iterators::null_at; using cudf::test::iterators::nulls_at; - // `null` means null at child column. - // `NULL` means null at parent column. 
auto const input = [] { auto child1 = STRINGS_CW{{"año", "bit", - "₹1" /*null*/, - "aaa" /*NULL*/, + "", // child null + "aaa", // parent null "zit", "bat", "aab", - "$1" /*null*/, - "€1" /*NULL*/, + "", // child null + "€1", // parent null "wut"}, nulls_at({2, 7})}; - auto child2 = INTS_CW{{1, 2, 3 /*null*/, 4 /*NULL*/, 5, 6, 7, 8 /*null*/, 9 /*NULL*/, 10}, + auto child2 = INTS_CW{{1, + 2, + 0, // child null + 4, // parent null + 5, + 6, + 7, + 0, // child null + 9, // parent NULL + 10}, nulls_at({2, 7})}; return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})}; }(); diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp index c4463d68a68..5f911597b02 100644 --- a/cpp/tests/reductions/scan_tests.cpp +++ b/cpp/tests/reductions/scan_tests.cpp @@ -20,13 +20,11 @@ #include #include -#include #include #include #include #include -#include #include #include @@ -414,12 +412,13 @@ TEST_F(ScanStringsTest, MoreStringsMinMax) { int row_count = 512; - auto data_begin = cudf::detail::make_counting_transform_iterator(0, [](auto idx) { + auto validity = cudf::detail::make_counting_transform_iterator( + 0, [](auto idx) -> bool { return (idx % 23) != 22; }); + auto data_begin = cudf::detail::make_counting_transform_iterator(0, [validity](auto idx) { + if (validity[idx] == 0) return std::string{}; char const s = static_cast('a' + (idx % 26)); return std::string{1, s}; }); - auto validity = cudf::detail::make_counting_transform_iterator( - 0, [](auto idx) -> bool { return (idx % 23) != 22; }); cudf::test::strings_column_wrapper col(data_begin, data_begin + row_count, validity); thrust::host_vector v(data_begin, data_begin + row_count); @@ -622,21 +621,28 @@ TEST_F(StructScanTest, StructScanMinMaxWithNulls) using cudf::test::iterators::null_at; using cudf::test::iterators::nulls_at; - // `null` means null at child column. - // `NULL` means null at parent column. 
auto const input = [] { auto child1 = STRINGS_CW{{"año", "bit", - "₹1" /*null*/, - "aaa" /*NULL*/, + "", // child null + "aaa", // parent null "zit", "bat", "aab", - "$1" /*null*/, - "€1" /*NULL*/, + "", // child null + "€1", // parent null "wut"}, nulls_at({2, 7})}; - auto child2 = INTS_CW{{1, 2, 3 /*null*/, 4 /*NULL*/, 5, 6, 7, 8 /*null*/, 9 /*NULL*/, 10}, + auto child2 = INTS_CW{{1, + 2, + 0, // child null + 4, // parent null + 5, + 6, + 7, + 0, // child null + 9, // parent null + 10}, nulls_at({2, 7})}; return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})}; }(); @@ -694,25 +700,25 @@ TEST_F(StructScanTest, StructScanMinMaxWithNulls) auto const expected = [] { auto child1 = STRINGS_CW{{"año", "año", - "" /*null*/, - "" /*NULL*/, - "" /*NULL*/, - "" /*NULL*/, - "" /*NULL*/, - "" /*NULL*/, - "" /*NULL*/, - "" /*NULL*/}, + "", // child null + "", // parent null + "", // parent null + "", // parent null + "", // parent null + "", // parent null + "", // parent null + ""}, // parent null null_at(2)}; auto child2 = INTS_CW{{1, 1, - 0 /*null*/, - 0 /*NULL*/, - 0 /*NULL*/, - 0 /*NULL*/, - 0 /*NULL*/, - 0 /*NULL*/, - 0 /*NULL*/, - 0 /*NULL*/}, + 0, // child null + 0, // parent null + 0, // parent null + 0, // parent null + 0, // parent null + 0, // parent null + 0, // parent null + 0}, // parent null null_at(2)}; return STRUCTS_CW{{child1, child2}, nulls_at({3, 4, 5, 6, 7, 8, 9})}; }(); diff --git a/cpp/tests/reductions/scan_tests.hpp b/cpp/tests/reductions/scan_tests.hpp index 858697d8ef5..c2cce4bbbfa 100644 --- a/cpp/tests/reductions/scan_tests.hpp +++ b/cpp/tests/reductions/scan_tests.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,9 +20,7 @@ #include #include -#include #include -#include #include #include @@ -30,7 +28,6 @@ #include #include -#include template struct TypeParam_to_host_type { diff --git a/cpp/tests/replace/clamp_test.cpp b/cpp/tests/replace/clamp_test.cpp index 239c9ce6ddd..e972ea35ed0 100644 --- a/cpp/tests/replace/clamp_test.cpp +++ b/cpp/tests/replace/clamp_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/replace/normalize_replace_tests.cpp b/cpp/tests/replace/normalize_replace_tests.cpp index 2de17388ee8..c35f385329a 100644 --- a/cpp/tests/replace/normalize_replace_tests.cpp +++ b/cpp/tests/replace/normalize_replace_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include // This is the main test fixture diff --git a/cpp/tests/replace/replace_nans_tests.cpp b/cpp/tests/replace/replace_nans_tests.cpp index 35232204db7..1b9fe92066a 100644 --- a/cpp/tests/replace/replace_nans_tests.cpp +++ b/cpp/tests/replace/replace_nans_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp index fcee27305f2..0c8ccea52a6 100644 --- a/cpp/tests/replace/replace_nulls_tests.cpp +++ b/cpp/tests/replace/replace_nulls_tests.cpp @@ -20,13 +20,11 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp index b12bf08520f..ae4041bcfaf 100644 --- a/cpp/tests/replace/replace_tests.cpp +++ b/cpp/tests/replace/replace_tests.cpp @@ -20,20 +20,16 @@ #include #include #include -#include #include #include #include #include -#include -#include #include #include #include #include -#include #include diff --git a/cpp/tests/reshape/byte_cast_tests.cpp b/cpp/tests/reshape/byte_cast_tests.cpp index b3d9b2e2f5f..59585c0e947 100644 --- a/cpp/tests/reshape/byte_cast_tests.cpp +++ 
b/cpp/tests/reshape/byte_cast_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/reshape/tile_tests.cpp b/cpp/tests/reshape/tile_tests.cpp index ed76b9d2ea5..25cfc5c5108 100644 --- a/cpp/tests/reshape/tile_tests.cpp +++ b/cpp/tests/reshape/tile_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ */ #include -#include #include #include #include diff --git a/cpp/tests/rolling/collect_ops_test.cpp b/cpp/tests/rolling/collect_ops_test.cpp index 165e0347785..e8a36d9ab48 100644 --- a/cpp/tests/rolling/collect_ops_test.cpp +++ b/cpp/tests/rolling/collect_ops_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/rolling/empty_input_test.cpp b/cpp/tests/rolling/empty_input_test.cpp index e7d1e3f0b10..2e1815671a9 100644 --- a/cpp/tests/rolling/empty_input_test.cpp +++ b/cpp/tests/rolling/empty_input_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,7 @@ */ #include -#include #include -#include #include #include diff --git a/cpp/tests/rolling/grouped_rolling_range_test.cpp b/cpp/tests/rolling/grouped_rolling_range_test.cpp index fcfbd0eee78..2cb9b60000b 100644 --- a/cpp/tests/rolling/grouped_rolling_range_test.cpp +++ b/cpp/tests/rolling/grouped_rolling_range_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,21 +17,16 @@ #include #include #include -#include #include #include #include #include -#include -#include #include #include #include #include -#include -#include #include #include diff --git a/cpp/tests/rolling/grouped_rolling_test.cpp b/cpp/tests/rolling/grouped_rolling_test.cpp index 78d5daf7e83..78b444bcd93 100644 --- a/cpp/tests/rolling/grouped_rolling_test.cpp +++ b/cpp/tests/rolling/grouped_rolling_test.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/rolling/lead_lag_test.cpp b/cpp/tests/rolling/lead_lag_test.cpp index de057e96320..6519b0ed4ee 100644 --- a/cpp/tests/rolling/lead_lag_test.cpp +++ b/cpp/tests/rolling/lead_lag_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -26,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/rolling/nth_element_test.cpp b/cpp/tests/rolling/nth_element_test.cpp index 2444992e68f..5f2b383ed55 100644 --- a/cpp/tests/rolling/nth_element_test.cpp +++ b/cpp/tests/rolling/nth_element_test.cpp @@ -17,22 +17,15 @@ #include #include #include -#include #include #include #include -#include -#include #include -#include - #include #include -#include - #include #include diff --git a/cpp/tests/rolling/offset_row_window_test.cpp b/cpp/tests/rolling/offset_row_window_test.cpp index 0eaab0c9f7a..dcaa47e722b 100644 --- a/cpp/tests/rolling/offset_row_window_test.cpp +++ b/cpp/tests/rolling/offset_row_window_test.cpp @@ -17,14 +17,10 @@ #include #include #include -#include #include #include -#include -#include #include -#include template using fwcw = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/rolling/range_rolling_window_test.cpp 
b/cpp/tests/rolling/range_rolling_window_test.cpp index 461c41025e9..daf5fcc1d96 100644 --- a/cpp/tests/rolling/range_rolling_window_test.cpp +++ b/cpp/tests/rolling/range_rolling_window_test.cpp @@ -17,22 +17,17 @@ #include #include #include -#include #include #include -#include #include #include -#include -#include #include #include #include #include -#include #include #include diff --git a/cpp/tests/rolling/range_window_bounds_test.cpp b/cpp/tests/rolling/range_window_bounds_test.cpp index b77451bf0bc..a67555280f4 100644 --- a/cpp/tests/rolling/range_window_bounds_test.cpp +++ b/cpp/tests/rolling/range_window_bounds_test.cpp @@ -15,9 +15,6 @@ */ #include -#include -#include -#include #include #include @@ -25,8 +22,6 @@ #include -#include - struct RangeWindowBoundsTest : public cudf::test::BaseFixture {}; template diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp index 6e0dc16dca9..72a511fd5f1 100644 --- a/cpp/tests/rolling/rolling_test.cpp +++ b/cpp/tests/rolling/rolling_test.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -30,7 +29,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/scalar/factories_test.cpp b/cpp/tests/scalar/factories_test.cpp index 5f132f3ace9..26987ea1b7b 100644 --- a/cpp/tests/scalar/factories_test.cpp +++ b/cpp/tests/scalar/factories_test.cpp @@ -22,11 +22,8 @@ #include #include -#include #include -#include - class ScalarFactoryTest : public cudf::test::BaseFixture {}; template diff --git a/cpp/tests/search/search_dictionary_test.cpp b/cpp/tests/search/search_dictionary_test.cpp index 78f79ccc648..a3bb1dfda10 100644 --- a/cpp/tests/search/search_dictionary_test.cpp +++ b/cpp/tests/search/search_dictionary_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include diff --git a/cpp/tests/search/search_list_test.cpp b/cpp/tests/search/search_list_test.cpp index 7584003e800..fb5d0fcc889 100644 --- 
a/cpp/tests/search/search_list_test.cpp +++ b/cpp/tests/search/search_list_test.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/search/search_struct_test.cpp b/cpp/tests/search/search_struct_test.cpp index c35d359e75c..05b9deb3463 100644 --- a/cpp/tests/search/search_struct_test.cpp +++ b/cpp/tests/search/search_struct_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/search/search_test.cpp b/cpp/tests/search/search_test.cpp index 7550cc27161..8d750be5677 100644 --- a/cpp/tests/search/search_test.cpp +++ b/cpp/tests/search/search_test.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/sort/is_sorted_tests.cpp b/cpp/tests/sort/is_sorted_tests.cpp index 109095192f9..e3c9f8d349e 100644 --- a/cpp/tests/sort/is_sorted_tests.cpp +++ b/cpp/tests/sort/is_sorted_tests.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include diff --git a/cpp/tests/sort/rank_test.cpp b/cpp/tests/sort/rank_test.cpp index e08a2105aea..ded46cb1f31 100644 --- a/cpp/tests/sort/rank_test.cpp +++ b/cpp/tests/sort/rank_test.cpp @@ -18,10 +18,8 @@ #include #include #include -#include #include -#include #include #include #include diff --git a/cpp/tests/sort/sort_nested_types_tests.cpp b/cpp/tests/sort/sort_nested_types_tests.cpp index 8ab23936ceb..ce4148a941e 100644 --- a/cpp/tests/sort/sort_nested_types_tests.cpp +++ b/cpp/tests/sort/sort_nested_types_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp index 6a35e977b46..e1505c7a474 100644 --- a/cpp/tests/sort/sort_test.cpp +++ b/cpp/tests/sort/sort_test.cpp @@ -28,7 +28,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/sort/stable_sort_tests.cpp b/cpp/tests/sort/stable_sort_tests.cpp index 655166e0d62..88de9d51523 100644 --- a/cpp/tests/sort/stable_sort_tests.cpp +++ b/cpp/tests/sort/stable_sort_tests.cpp @@ -25,9 +25,6 @@ #include #include -#include -#include - #include #include diff --git a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp index 6c0582fb846..1204b019739 100644 --- a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp +++ b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp @@ -20,9 +20,7 @@ #include #include #include -#include -#include #include #include #include @@ -31,8 +29,6 @@ #include #include -#include -#include #include struct ApplyBooleanMask : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp index a2dab649961..ee1bb3ead92 100644 --- a/cpp/tests/stream_compaction/distinct_count_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -15,16 +15,11 @@ */ #include -#include #include #include -#include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/stream_compaction/distinct_tests.cpp b/cpp/tests/stream_compaction/distinct_tests.cpp index 14d7d8789ac..c618ff68cbb 100644 --- a/cpp/tests/stream_compaction/distinct_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_tests.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -27,8 
+26,6 @@ #include #include -#include - auto constexpr null{0}; // null at current level auto constexpr XXX{0}; // null pushed down from parent level auto constexpr NaN = std::numeric_limits::quiet_NaN(); diff --git a/cpp/tests/stream_compaction/drop_nans_tests.cpp b/cpp/tests/stream_compaction/drop_nans_tests.cpp index bf72da5c840..71321361564 100644 --- a/cpp/tests/stream_compaction/drop_nans_tests.cpp +++ b/cpp/tests/stream_compaction/drop_nans_tests.cpp @@ -15,12 +15,9 @@ */ #include -#include #include #include -#include -#include #include #include #include diff --git a/cpp/tests/stream_compaction/drop_nulls_tests.cpp b/cpp/tests/stream_compaction/drop_nulls_tests.cpp index dbac1d58195..d3b45c2323e 100644 --- a/cpp/tests/stream_compaction/drop_nulls_tests.cpp +++ b/cpp/tests/stream_compaction/drop_nulls_tests.cpp @@ -15,12 +15,10 @@ */ #include -#include #include #include #include -#include #include #include #include diff --git a/cpp/tests/stream_compaction/stable_distinct_tests.cpp b/cpp/tests/stream_compaction/stable_distinct_tests.cpp index 6c6c53331d4..cc847da6340 100644 --- a/cpp/tests/stream_compaction/stable_distinct_tests.cpp +++ b/cpp/tests/stream_compaction/stable_distinct_tests.cpp @@ -15,20 +15,16 @@ */ #include -#include #include #include #include #include -#include #include #include #include #include -#include - auto constexpr null{0}; // null at current level auto constexpr XXX{0}; // null pushed down from parent level auto constexpr NaN = std::numeric_limits::quiet_NaN(); diff --git a/cpp/tests/stream_compaction/unique_count_tests.cpp b/cpp/tests/stream_compaction/unique_count_tests.cpp index 640d159fc4f..bad93e92712 100644 --- a/cpp/tests/stream_compaction/unique_count_tests.cpp +++ b/cpp/tests/stream_compaction/unique_count_tests.cpp @@ -15,16 +15,11 @@ */ #include -#include #include #include -#include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/stream_compaction/unique_tests.cpp 
b/cpp/tests/stream_compaction/unique_tests.cpp index d5b6915b520..e2b32b898b3 100644 --- a/cpp/tests/stream_compaction/unique_tests.cpp +++ b/cpp/tests/stream_compaction/unique_tests.cpp @@ -15,22 +15,16 @@ */ #include -#include #include #include #include -#include #include -#include #include #include #include #include -#include -#include - using cudf::nan_policy; using cudf::null_equality; using cudf::null_policy; diff --git a/cpp/tests/streams/binaryop_test.cpp b/cpp/tests/streams/binaryop_test.cpp index 2a7b52b1b6b..3dcc6f9e632 100644 --- a/cpp/tests/streams/binaryop_test.cpp +++ b/cpp/tests/streams/binaryop_test.cpp @@ -21,7 +21,6 @@ #include #include -#include #include class BinaryopTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/io/csv_test.cpp b/cpp/tests/streams/io/csv_test.cpp index 42894a0ebcb..a74ee64f8de 100644 --- a/cpp/tests/streams/io/csv_test.cpp +++ b/cpp/tests/streams/io/csv_test.cpp @@ -17,13 +17,9 @@ #include #include #include -#include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/streams/io/json_test.cpp b/cpp/tests/streams/io/json_test.cpp index f98e685ed0c..d352c6c3b2a 100644 --- a/cpp/tests/streams/io/json_test.cpp +++ b/cpp/tests/streams/io/json_test.cpp @@ -19,9 +19,7 @@ #include #include -#include #include -#include #include #include diff --git a/cpp/tests/streams/io/multibyte_split_test.cpp b/cpp/tests/streams/io/multibyte_split_test.cpp index b0eff1d3340..5bb17226029 100644 --- a/cpp/tests/streams/io/multibyte_split_test.cpp +++ b/cpp/tests/streams/io/multibyte_split_test.cpp @@ -17,7 +17,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/streams/io/orc_test.cpp b/cpp/tests/streams/io/orc_test.cpp index cc43bf15b5d..10722557e6a 100644 --- a/cpp/tests/streams/io/orc_test.cpp +++ b/cpp/tests/streams/io/orc_test.cpp @@ -17,19 +17,11 @@ #include #include #include -#include -#include #include #include -#include #include -#include -#include -#include 
-#include -#include #include #include diff --git a/cpp/tests/streams/io/parquet_test.cpp b/cpp/tests/streams/io/parquet_test.cpp index 9d2dec2d697..18bb80e64af 100644 --- a/cpp/tests/streams/io/parquet_test.cpp +++ b/cpp/tests/streams/io/parquet_test.cpp @@ -17,13 +17,9 @@ #include #include #include -#include -#include #include #include -#include -#include #include #include diff --git a/cpp/tests/streams/join_test.cpp b/cpp/tests/streams/join_test.cpp index 2811bb676fa..27bd7e080c9 100644 --- a/cpp/tests/streams/join_test.cpp +++ b/cpp/tests/streams/join_test.cpp @@ -19,11 +19,9 @@ #include #include -#include #include #include #include -#include #include #include diff --git a/cpp/tests/streams/null_mask_test.cpp b/cpp/tests/streams/null_mask_test.cpp index e96224003f4..ed37a72545f 100644 --- a/cpp/tests/streams/null_mask_test.cpp +++ b/cpp/tests/streams/null_mask_test.cpp @@ -14,15 +14,12 @@ * limitations under the License. */ -#include - #include #include #include #include #include -#include class NullMaskTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/partitioning_test.cpp b/cpp/tests/streams/partitioning_test.cpp new file mode 100644 index 00000000000..636c5c1f1f9 --- /dev/null +++ b/cpp/tests/streams/partitioning_test.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +using cudf::test::fixed_width_column_wrapper; +using cudf::test::strings_column_wrapper; + +class PartitionTest : public cudf::test::BaseFixture {}; + +TEST_F(PartitionTest, Struct) +{ + fixed_width_column_wrapper A({1, 2}, {0, 1}); + auto struct_col = cudf::test::structs_column_wrapper({A}, {0, 1}).release(); + auto table_to_partition = cudf::table_view{{*struct_col}}; + fixed_width_column_wrapper map{9, 2}; + + auto num_partitions = 12; + auto result = + cudf::partition(table_to_partition, map, num_partitions, cudf::test::get_default_stream()); +} + +TEST_F(PartitionTest, EmptyInput) +{ + auto const empty_column = fixed_width_column_wrapper{}; + auto const num_partitions = 5; + auto const start_partition = 0; + auto const [out_table, out_offsets] = + cudf::round_robin_partition(cudf::table_view{{empty_column}}, + num_partitions, + start_partition, + cudf::test::get_default_stream()); +} + +TEST_F(PartitionTest, ZeroPartitions) +{ + fixed_width_column_wrapper floats({1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f}); + fixed_width_column_wrapper integers({1, 2, 3, 4, 5, 6, 7, 8}); + strings_column_wrapper strings({"a", "bb", "ccc", "d", "ee", "fff", "gg", "h"}); + auto input = cudf::table_view({floats, integers, strings}); + + auto columns_to_hash = std::vector({2}); + + cudf::size_type const num_partitions = 0; + auto [output, offsets] = cudf::hash_partition(input, + columns_to_hash, + num_partitions, + cudf::hash_id::HASH_MURMUR3, + cudf::DEFAULT_HASH_SEED, + cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/reduction_test.cpp b/cpp/tests/streams/reduction_test.cpp index b4f013fc960..9ab972302e4 100644 --- a/cpp/tests/streams/reduction_test.cpp +++ b/cpp/tests/streams/reduction_test.cpp @@ -17,11 +17,8 @@ #include #include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/streams/rolling_test.cpp 
b/cpp/tests/streams/rolling_test.cpp index b352ad2c0d2..4d9899870b4 100644 --- a/cpp/tests/streams/rolling_test.cpp +++ b/cpp/tests/streams/rolling_test.cpp @@ -17,12 +17,10 @@ #include #include #include -#include #include #include #include -#include class RollingTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/round_test.cpp b/cpp/tests/streams/round_test.cpp new file mode 100644 index 00000000000..b8fda022db8 --- /dev/null +++ b/cpp/tests/streams/round_test.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include + +class RoundTest : public cudf::test::BaseFixture {}; + +TEST_F(RoundTest, RoundHalfToEven) +{ + std::vector vals = {1.729, 17.29, 172.9, 1729}; + cudf::test::fixed_width_column_wrapper input(vals.begin(), vals.end()); + cudf::round(input, 0, cudf::rounding_method::HALF_UP, cudf::test::get_default_stream()); +} + +TEST_F(RoundTest, RoundHalfAwayFromEven) +{ + std::vector vals = {1.5, 2.5, 1.35, 1.45, 15, 25}; + cudf::test::fixed_width_column_wrapper input(vals.begin(), vals.end()); + cudf::round(input, -1, cudf::rounding_method::HALF_EVEN, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/stream_compaction_test.cpp b/cpp/tests/streams/stream_compaction_test.cpp index 07b2d77cc04..e7b282601e1 100644 --- a/cpp/tests/streams/stream_compaction_test.cpp +++ b/cpp/tests/streams/stream_compaction_test.cpp @@ -15,20 +15,16 @@ */ #include -#include #include #include #include -#include #include #include #include #include #include -#include - auto constexpr NaN = std::numeric_limits::quiet_NaN(); auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; auto constexpr KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; diff --git a/cpp/tests/streams/strings/factory_test.cpp b/cpp/tests/streams/strings/factory_test.cpp index 36e595ab9fa..449e0830b0c 100644 --- a/cpp/tests/streams/strings/factory_test.cpp +++ b/cpp/tests/streams/strings/factory_test.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/tests/streams/strings/reverse_test.cpp b/cpp/tests/streams/strings/reverse_test.cpp index 4b4d0a7aff5..154e1c1b715 100644 --- a/cpp/tests/streams/strings/reverse_test.cpp +++ b/cpp/tests/streams/strings/reverse_test.cpp @@ -21,7 +21,6 @@ #include #include -#include class StringsReverseTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/text/subword_tokenize_test.cpp b/cpp/tests/streams/text/subword_tokenize_test.cpp new file mode 
100644 index 00000000000..9474e6b269c --- /dev/null +++ b/cpp/tests/streams/text/subword_tokenize_test.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +// Global environment for temporary files +auto const temp_env = static_cast( + ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); + +struct TextSubwordTest : public cudf::test::BaseFixture {}; + +// Create a fake hashed vocab text file for the tests in this source file. +// The vocab only includes the following words: +// 'this', 'is', 'a', 'test', 'tést' +// The period '.' character also has a token id. 
+void create_hashed_vocab(std::string const& hash_file) +{ + constexpr size_t coefsize = 23; + std::vector> coefficients(coefsize, {65559, 0}); + std::ofstream outfile(hash_file, std::ofstream::out); + outfile << "1\n0\n" << coefficients.size() << "\n"; + for (auto c : coefficients) { + outfile << c.first << " " << c.second << "\n"; + } + std::vector hash_table(coefsize, 0); + outfile << hash_table.size() << "\n"; + hash_table[0] = 3015668L; // based on values + hash_table[1] = 6205475701751155871L; // from the + hash_table[5] = 6358029; // bert_hash_table.txt + hash_table[16] = 451412625363L; // file for the test + hash_table[20] = 6206321707968235495L; // words above + for (auto h : hash_table) { + outfile << h << "\n"; + } + outfile << "100\n101\n102\n\n"; +} + +TEST(TextSubwordTest, Tokenize) +{ + uint32_t const nrows = 100; + std::vector h_strings(nrows, "This is a test. A test this is."); + cudf::test::strings_column_wrapper strings(h_strings.cbegin(), h_strings.cend()); + std::string const hash_file = temp_env->get_temp_filepath("hashed_vocab.txt"); + create_hashed_vocab(hash_file); + auto vocab = nvtext::load_vocabulary_file(hash_file, cudf::test::get_default_stream()); + + uint32_t const max_sequence_length = 16; + uint32_t const stride = 16; + + auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings}, + *vocab, + max_sequence_length, + stride, + true, // do_lower_case + false, // do_truncate + cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/transform_test.cpp b/cpp/tests/streams/transform_test.cpp index cf81dc6fb42..9f168abcb31 100644 --- a/cpp/tests/streams/transform_test.cpp +++ b/cpp/tests/streams/transform_test.cpp @@ -15,17 +15,11 @@ */ #include -#include #include #include -#include -#include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp index 9c0ecaa52c0..06b9c2fa3c1 100644 --- 
a/cpp/tests/strings/array_tests.cpp +++ b/cpp/tests/strings/array_tests.cpp @@ -23,10 +23,8 @@ #include #include #include -#include #include #include -#include #include #include diff --git a/cpp/tests/strings/combine/concatenate_tests.cpp b/cpp/tests/strings/combine/concatenate_tests.cpp index bb57d6f5e8a..e53adcf373a 100644 --- a/cpp/tests/strings/combine/concatenate_tests.cpp +++ b/cpp/tests/strings/combine/concatenate_tests.cpp @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/strings/combine/join_list_elements_tests.cpp b/cpp/tests/strings/combine/join_list_elements_tests.cpp index 00317146088..c92f1cfc8f8 100644 --- a/cpp/tests/strings/combine/join_list_elements_tests.cpp +++ b/cpp/tests/strings/combine/join_list_elements_tests.cpp @@ -22,7 +22,6 @@ #include #include #include -#include using namespace cudf::test::iterators; diff --git a/cpp/tests/strings/concatenate_tests.cpp b/cpp/tests/strings/concatenate_tests.cpp index 5cf4015b9e9..51dcc60d95e 100644 --- a/cpp/tests/strings/concatenate_tests.cpp +++ b/cpp/tests/strings/concatenate_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,7 +20,6 @@ #include #include -#include #include diff --git a/cpp/tests/strings/datetime_tests.cpp b/cpp/tests/strings/datetime_tests.cpp index b3dc3010c67..da0db0fc056 100644 --- a/cpp/tests/strings/datetime_tests.cpp +++ b/cpp/tests/strings/datetime_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 7e0338f1bf4..37b25d9b287 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 4821a7fa999..7eb4b32d078 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -28,8 +27,6 @@ #include -#include - struct StringsFindallTests : public cudf::test::BaseFixture {}; TEST_F(StringsFindallTests, FindallTest) diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index 79054551498..b788c05c152 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -23,8 +23,6 @@ #include #include -#include - struct StringsConvertTest : public cudf::test::BaseFixture {}; template diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index 26bcfe8028d..c08effdb969 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -24,9 +24,6 @@ #include #include -#include -#include - #include #include diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp index 219bd6d8b01..a34ff25cb69 100644 --- a/cpp/tests/structs/structs_column_tests.cpp +++ b/cpp/tests/structs/structs_column_tests.cpp @@ -17,28 +17,18 @@ #include #include #include -#include #include #include 
#include -#include #include -#include #include -#include -#include -#include -#include #include #include #include -#include #include -#include -#include #include #include diff --git a/cpp/tests/structs/utilities_tests.cpp b/cpp/tests/structs/utilities_tests.cpp index c33eedf9bd9..c0df2f01a63 100644 --- a/cpp/tests/structs/utilities_tests.cpp +++ b/cpp/tests/structs/utilities_tests.cpp @@ -14,21 +14,15 @@ * limitations under the License. */ -#include "cudf_test/default_stream.hpp" - #include #include #include -#include #include #include #include -#include -#include #include #include -#include #include #include diff --git a/cpp/tests/table/row_operators_tests.cpp b/cpp/tests/table/row_operators_tests.cpp index 5fa63c47cf0..216c4d7b6bb 100644 --- a/cpp/tests/table/row_operators_tests.cpp +++ b/cpp/tests/table/row_operators_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/table/table_tests.cpp b/cpp/tests/table/table_tests.cpp index 1637ba7d7d3..363f1a0ba5d 100644 --- a/cpp/tests/table/table_tests.cpp +++ b/cpp/tests/table/table_tests.cpp @@ -17,17 +17,14 @@ #include #include #include -#include #include #include #include -#include #include #include #include -#include template using column_wrapper = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index e23f3f6e7d8..ef35a4472cf 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -21,13 +21,9 @@ #include #include -#include #include -#include -#include - #include struct MinHashTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp index 1acb4fc4265..c72c7cfc80e 100644 --- a/cpp/tests/text/ngrams_tests.cpp +++ b/cpp/tests/text/ngrams_tests.cpp @@ -28,8 +28,6 @@ #include -#include - struct TextGenerateNgramsTest : public cudf::test::BaseFixture {}; TEST_F(TextGenerateNgramsTest, Ngrams) diff --git 
a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index b0d41004e7e..2515cc917fa 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/text/stemmer_tests.cpp b/cpp/tests/text/stemmer_tests.cpp index a343913411c..82c4bf53cfc 100644 --- a/cpp/tests/text/stemmer_tests.cpp +++ b/cpp/tests/text/stemmer_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp index a615780c02a..782551ad66e 100644 --- a/cpp/tests/text/subword_tests.cpp +++ b/cpp/tests/text/subword_tests.cpp @@ -19,13 +19,11 @@ #include #include -#include #include #include #include -#include #include // Global environment for temporary files diff --git a/cpp/tests/transform/bools_to_mask_test.cpp b/cpp/tests/transform/bools_to_mask_test.cpp index 2684123c08a..9437440f34d 100644 --- a/cpp/tests/transform/bools_to_mask_test.cpp +++ b/cpp/tests/transform/bools_to_mask_test.cpp @@ -20,10 +20,8 @@ #include #include -#include #include #include -#include #include diff --git a/cpp/tests/transform/nans_to_null_test.cpp b/cpp/tests/transform/nans_to_null_test.cpp index ba16c100e7a..42ca872a936 100644 --- a/cpp/tests/transform/nans_to_null_test.cpp +++ b/cpp/tests/transform/nans_to_null_test.cpp @@ -17,12 +17,10 @@ #include #include #include -#include #include #include #include -#include template struct NaNsToNullTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/transpose/transpose_test.cpp b/cpp/tests/transpose/transpose_test.cpp index 5a88c402b8c..7797b2b2cf8 100644 --- a/cpp/tests/transpose/transpose_test.cpp +++ b/cpp/tests/transpose/transpose_test.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/types/traits_test.cpp b/cpp/tests/types/traits_test.cpp index 0d9092c33da..46468af515d 100644 --- 
a/cpp/tests/types/traits_test.cpp +++ b/cpp/tests/types/traits_test.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp index 45b89b76070..ed4c1340dbb 100644 --- a/cpp/tests/unary/cast_tests.cpp +++ b/cpp/tests/unary/cast_tests.cpp @@ -20,18 +20,15 @@ #include #include -#include #include #include #include #include #include -#include #include #include -#include #include static auto const test_timestamps_D = std::vector{ diff --git a/cpp/tests/unary/math_ops_test.cpp b/cpp/tests/unary/math_ops_test.cpp index 5bfbf70d5f9..663a919f3f4 100644 --- a/cpp/tests/unary/math_ops_test.cpp +++ b/cpp/tests/unary/math_ops_test.cpp @@ -22,10 +22,6 @@ #include #include #include -#include -#include - -#include #include diff --git a/cpp/tests/unary/unary_ops_test.cpp b/cpp/tests/unary/unary_ops_test.cpp index e7477c34642..3c616461c74 100644 --- a/cpp/tests/unary/unary_ops_test.cpp +++ b/cpp/tests/unary/unary_ops_test.cpp @@ -23,7 +23,6 @@ #include #include -#include #include template diff --git a/cpp/tests/utilities/random_seed.cpp b/cpp/tests/utilities/random_seed.cpp index ab5a31ce161..555d89b7dc5 100644 --- a/cpp/tests/utilities/random_seed.cpp +++ b/cpp/tests/utilities/random_seed.cpp @@ -13,8 +13,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#include -#include +#include namespace cudf { namespace test { diff --git a/cpp/tests/utilities_tests/column_debug_tests.cpp b/cpp/tests/utilities_tests/column_debug_tests.cpp index 7aa05af4591..2a57d678d07 100644 --- a/cpp/tests/utilities_tests/column_debug_tests.cpp +++ b/cpp/tests/utilities_tests/column_debug_tests.cpp @@ -16,12 +16,9 @@ #include #include -#include #include #include -#include - #include #include diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp index 9d6d5ccb9b5..a13ce825d0b 100644 --- a/cpp/tests/utilities_tests/column_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp @@ -17,20 +17,16 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include -#include - template struct ColumnUtilitiesTest : public cudf::test::BaseFixture { cudf::test::UniformRandomGenerator random; diff --git a/cpp/tests/utilities_tests/column_wrapper_tests.cpp b/cpp/tests/utilities_tests/column_wrapper_tests.cpp index 479c6687e75..339678f3be8 100644 --- a/cpp/tests/utilities_tests/column_wrapper_tests.cpp +++ b/cpp/tests/utilities_tests/column_wrapper_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp b/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp index 5e3fda5e6f7..ff50dc39979 100644 --- a/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp +++ b/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/utilities_tests/type_check_tests.cpp b/cpp/tests/utilities_tests/type_check_tests.cpp index fecb896f95a..c1c5776be74 100644 --- a/cpp/tests/utilities_tests/type_check_tests.cpp +++ b/cpp/tests/utilities_tests/type_check_tests.cpp @@ -18,7 +18,6 @@ #include #include -#include #include #include #include diff 
--git a/cpp/tests/utilities_tests/type_list_tests.cpp b/cpp/tests/utilities_tests/type_list_tests.cpp index 849457056e4..6c3a84763a0 100644 --- a/cpp/tests/utilities_tests/type_list_tests.cpp +++ b/cpp/tests/utilities_tests/type_list_tests.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include using namespace cudf::test; // this will make reading code way easier @@ -23,6 +22,7 @@ namespace { // Work around to remove parentheses surrounding a type template struct argument_type; + template struct argument_type { using type = U; diff --git a/dependencies.yaml b/dependencies.yaml index ff97b67f0ce..90255ca674c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -232,7 +232,7 @@ files: key: cudf-pandas-tests includes: - test_python_cudf_pandas - py_rapids_build_cudf_polars: + py_build_cudf_polars: output: pyproject pyproject_dir: python/cudf_polars extras: @@ -399,21 +399,21 @@ dependencies: - output_types: conda packages: # Align nvcomp version with rapids-cmake - - nvcomp==4.0.1 + - nvcomp==4.1.0.6 specific: - output_types: [requirements, pyproject] matrices: - matrix: cuda: "12.*" packages: - - nvidia-nvcomp-cu12==4.0.1 + - nvidia-nvcomp-cu12==4.1.0.6 - matrix: cuda: "11.*" packages: - - nvidia-nvcomp-cu11==4.0.1 + - nvidia-nvcomp-cu11==4.1.0.6 - matrix: packages: - - nvidia-nvcomp==4.0.1 + - nvidia-nvcomp==4.1.0.6 rapids_build_skbuild: common: - output_types: [conda, requirements, pyproject] @@ -727,7 +727,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.8,<1.9 + - polars>=1.11,<1.13 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] @@ -828,6 +828,7 @@ dependencies: - pytest-benchmark - pytest-cases>=3.8.2 - scipy + - mmh3 - output_types: conda packages: - aiobotocore>=2.2.0 @@ -836,12 +837,14 @@ dependencies: - msgpack-python - moto>=4.0.8 - s3fs>=2022.3.0 - - output_types: pyproject + - python-xxhash + - output_types: [pyproject, requirements] packages: - msgpack - 
&tokenizers tokenizers==0.15.2 - &transformers transformers==4.39.3 - tzdata + - xxhash specific: - output_types: [conda, requirements] matrices: diff --git a/docs/cudf/source/developer_guide/contributing_guide.md b/docs/cudf/source/developer_guide/contributing_guide.md index 6fce268f309..f4d2c7319b3 100644 --- a/docs/cudf/source/developer_guide/contributing_guide.md +++ b/docs/cudf/source/developer_guide/contributing_guide.md @@ -15,8 +15,7 @@ Developers are strongly recommended to set up `pre-commit` prior to any developm The `.pre-commit-config.yaml` file at the root of the repo is the primary source of truth linting. Specifically, cuDF uses the following tools: -- [`ruff`](https://beta.ruff.rs/) checks for general code formatting compliance. -- [`isort`](https://pycqa.github.io/isort/) ensures imports are sorted consistently. +- [`ruff`](https://docs.astral.sh/ruff/) checks for general code formatting compliance. - [`mypy`](http://mypy-lang.org/) performs static type checking. In conjunction with [type hints](https://docs.python.org/3/library/typing.html), `mypy` can help catch various bugs that are otherwise difficult to find. 
diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index 95f5f9734dd..46221b6015b 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -38,10 +38,10 @@ "import os\n", "\n", "import cupy as cp\n", + "import dask_cudf\n", "import pandas as pd\n", "\n", "import cudf\n", - "import dask_cudf\n", "\n", "cp.random.seed(12)\n", "\n", diff --git a/docs/cudf/source/user_guide/api_docs/index.rst b/docs/cudf/source/user_guide/api_docs/index.rst index d05501f4a4a..f711327f9ed 100644 --- a/docs/cudf/source/user_guide/api_docs/index.rst +++ b/docs/cudf/source/user_guide/api_docs/index.rst @@ -19,7 +19,7 @@ This page provides a list of all publicly accessible modules, methods and classe general_utilities window io - subword_tokenize + tokenize_vocabulary string_handling list_handling struct_handling diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst new file mode 100644 index 00000000000..6bd1fbd821b --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst @@ -0,0 +1,6 @@ +======= +hashing +======= + +.. automodule:: pylibcudf.hashing + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 62e14a67ee5..997ece6d29c 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -19,6 +19,7 @@ This page provides API documentation for pylibcudf. 
filling gpumemoryview groupby + hashing interop join json diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/byte_pair_encode.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/byte_pair_encode.rst new file mode 100644 index 00000000000..908fcc4fde6 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/byte_pair_encode.rst @@ -0,0 +1,6 @@ +================ +byte_pair_encode +================ + +.. automodule:: pylibcudf.nvtext.byte_pair_encode + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst index e0735a197fd..9ba47fd8d70 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -8,7 +8,10 @@ nvtext generate_ngrams jaccard minhash + byte_pair_encode ngrams_tokenize normalize replace stemmer + subword_tokenize + tokenize diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/subword_tokenize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/subword_tokenize.rst new file mode 100644 index 00000000000..818714bec6a --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/subword_tokenize.rst @@ -0,0 +1,6 @@ +================ +subword_tokenize +================ + +.. automodule:: pylibcudf.nvtext.subword_tokenize + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/tokenize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/tokenize.rst new file mode 100644 index 00000000000..85c5a27b09d --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/tokenize.rst @@ -0,0 +1,6 @@ +======== +tokenize +======== + +.. 
automodule:: pylibcudf.nvtext.tokenize + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst index 9850ee10098..699e38ebbe5 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst @@ -1,6 +1,6 @@ -==== -find -==== +======= +findall +======= .. automodule:: pylibcudf.strings.findall :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst index e39ca18a12b..4de9bced86f 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst @@ -1,5 +1,5 @@ ===== -Table +table ===== .. automodule:: pylibcudf.table diff --git a/docs/cudf/source/user_guide/api_docs/string_handling.rst b/docs/cudf/source/user_guide/api_docs/string_handling.rst index ab0f085e1a6..91d3e33960b 100644 --- a/docs/cudf/source/user_guide/api_docs/string_handling.rst +++ b/docs/cudf/source/user_guide/api_docs/string_handling.rst @@ -60,6 +60,7 @@ strings and apply several methods to it. These can be accessed like isupper istimestamp istitle + jaccard_index join len like @@ -67,6 +68,7 @@ strings and apply several methods to it. These can be accessed like lower lstrip match + minhash ngrams ngrams_tokenize normalize_characters @@ -90,7 +92,6 @@ strings and apply several methods to it. These can be accessed like slice_from slice_replace split - rsplit startswith strip swapcase diff --git a/docs/cudf/source/user_guide/api_docs/subword_tokenize.rst b/docs/cudf/source/user_guide/api_docs/subword_tokenize.rst deleted file mode 100644 index cd240fe4db4..00000000000 --- a/docs/cudf/source/user_guide/api_docs/subword_tokenize.rst +++ /dev/null @@ -1,12 +0,0 @@ -================ -SubwordTokenizer -================ -.. 
currentmodule:: cudf.core.subword_tokenizer - -Constructor -~~~~~~~~~~~ -.. autosummary:: - :toctree: api/ - - SubwordTokenizer - SubwordTokenizer.__call__ diff --git a/docs/cudf/source/user_guide/api_docs/tokenize_vocabulary.rst b/docs/cudf/source/user_guide/api_docs/tokenize_vocabulary.rst new file mode 100644 index 00000000000..1b5c965f3c9 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/tokenize_vocabulary.rst @@ -0,0 +1,12 @@ +================== +TokenizeVocabulary +================== +.. currentmodule:: cudf.core.tokenize_vocabulary + +Constructor +~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + TokenizeVocabulary + TokenizeVocabulary.tokenize diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index 75eafcc5387..abfe5a1b178 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -101,6 +101,8 @@ "outputs": [], "source": [ "# define a scalar function\n", + "\n", + "\n", "def f(x):\n", " return x + 1" ] @@ -247,6 +249,8 @@ "outputs": [], "source": [ "# redefine the same function from above\n", + "\n", + "\n", "def f(x):\n", " return x + 1" ] @@ -1622,6 +1626,8 @@ "outputs": [], "source": [ "# a user defined aggregation function.\n", + "\n", + "\n", "def udaf(df):\n", " return df[\"b\"].max() - df[\"b\"].min() / 2" ] diff --git a/docs/cudf/source/user_guide/io/io.md b/docs/cudf/source/user_guide/io/io.md index 97b961b455b..62db062cc45 100644 --- a/docs/cudf/source/user_guide/io/io.md +++ b/docs/cudf/source/user_guide/io/io.md @@ -91,16 +91,28 @@ SDK is available for download [here](https://developer.nvidia.com/gpudirect-storage). GDS is also included in CUDA Toolkit 11.4 and higher. -Use of GPUDirect Storage in cuDF is enabled by default, but can be -disabled through the environment variable `LIBCUDF_CUFILE_POLICY`. 
+Use of GPUDirect Storage in cuDF is disabled by default, but can be +enabled through the environment variable `LIBCUDF_CUFILE_POLICY`. This variable also controls the GDS compatibility mode. There are four valid values for the environment variable: -- "GDS": Enable GDS use; GDS compatibility mode is *off*. -- "ALWAYS": Enable GDS use; GDS compatibility mode is *on*. -- "KVIKIO": Enable GDS through [KvikIO](https://github.com/rapidsai/kvikio). -- "OFF": Completely disable GDS use. +- "GDS": Enable GDS use. If the cuFile library cannot be properly loaded, +fall back to the GDS compatibility mode. +- "ALWAYS": Enable GDS use. If the cuFile library cannot be properly loaded, +throw an exception. +- "KVIKIO": Enable GDS compatibility mode through [KvikIO](https://github.com/rapidsai/kvikio). +Note that KvikIO also provides the environment variable `KVIKIO_COMPAT_MODE` for GDS +control that may alter the effect of "KVIKIO" option in cuDF: + - By default, `KVIKIO_COMPAT_MODE` is unset. In this case, cuDF enforces + the GDS compatibility mode, and the system configuration check for GDS I/O + is never performed. + - If `KVIKIO_COMPAT_MODE=ON`, this is the same with the above case. + - If `KVIKIO_COMPAT_MODE=OFF`, KvikIO enforces GDS I/O without system + configuration check, and will error out if GDS requirements are not met. The + only exceptional case is that if the system does not support files being + opened with the `O_DIRECT` flag, the GDS compatibility mode will be used. +- "OFF": Completely disable GDS and kvikIO use. If no value is set, behavior will be the same as the "KVIKIO" option. diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java index e4106574a19..bfb959b12c1 100644 --- a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -155,6 +155,16 @@ public static HostMemoryBuffer allocate(long bytes) { return allocate(bytes, defaultPreferPinned); } + /** + * Allocate host memory bypassing the default allocator. This is intended to only be used by other allocators. + * Pinned memory will not be used for these allocations. + * @param bytes size in bytes to allocate + * @return the newly created buffer + */ + public static HostMemoryBuffer allocateRaw(long bytes) { + return new HostMemoryBuffer(UnsafeMemoryAccessor.allocate(bytes), bytes); + } + /** * Create a host buffer that is memory-mapped to a file. * @param path path to the file to map into host memory @@ -245,8 +255,10 @@ public final void copyFromHostBuffer(long destOffset, HostMemoryBuffer srcData, * @param destOffset offset in bytes in this buffer to start copying to * @param in input stream to copy bytes from * @param byteLength number of bytes to copy + * @throws EOFException If there are not enough bytes in the stream to copy. + * @throws IOException If there is an error reading from the stream. 
*/ - final void copyFromStream(long destOffset, InputStream in, long byteLength) throws IOException { + public final void copyFromStream(long destOffset, InputStream in, long byteLength) throws IOException { addressOutOfBoundsCheck(address + destOffset, byteLength, "copy from stream"); byte[] arrayBuffer = new byte[(int) Math.min(1024 * 128, byteLength)]; long left = byteLength; @@ -254,7 +266,7 @@ final void copyFromStream(long destOffset, InputStream in, long byteLength) thro int amountToCopy = (int) Math.min(arrayBuffer.length, left); int amountRead = in.read(arrayBuffer, 0, amountToCopy); if (amountRead < 0) { - throw new EOFException(); + throw new EOFException("Unexpected end of stream, expected " + left + " more bytes"); } setBytes(destOffset, arrayBuffer, 0, amountRead); destOffset += amountRead; diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java b/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java deleted file mode 100644 index 72c2e659372..00000000000 --- a/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java +++ /dev/null @@ -1,32 +0,0 @@ -/* - * - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package ai.rapids.cudf; - -/** - * Represents some amount of host memory that has been reserved. A reservation guarantees that one - * or more allocations up to the reserved amount, minus padding for alignment will succeed. 
A - * reservation typically guarantees the amount can be allocated one, meaning when a buffer - * allocated from a reservation is freed it is not returned to the reservation, but to the pool of - * memory the reservation originally came from. If more memory is allocated from the reservation - * an OutOfMemoryError may be thrown, but it is not guaranteed to happen. - * - * When the reservation is closed any unused reservation will be returned to the pool of memory - * the reservation came from. - */ -public interface HostMemoryReservation extends HostMemoryAllocator, AutoCloseable {} diff --git a/java/src/main/java/ai/rapids/cudf/Schema.java b/java/src/main/java/ai/rapids/cudf/Schema.java index 76b2799aad6..ae8a0e17f9d 100644 --- a/java/src/main/java/ai/rapids/cudf/Schema.java +++ b/java/src/main/java/ai/rapids/cudf/Schema.java @@ -29,26 +29,52 @@ public class Schema { public static final Schema INFERRED = new Schema(); private final DType topLevelType; + + /** + * Default value for precision value, when it is not specified or the column type is not decimal. + */ + private static final int UNKNOWN_PRECISION = -1; + + /** + * Store precision for the top level column, only applicable if the column is a decimal type. + *

+ * This variable is not designed to be used by any libcudf's APIs since libcudf does not support + * precisions for fixed point numbers. + * Instead, it is used only to pass down the precision values from Spark's DecimalType to the + * JNI level, where some JNI functions require these values to perform their operations. + */ + private final int topLevelPrecision; + private final List childNames; private final List childSchemas; private boolean flattened = false; private String[] flattenedNames; private DType[] flattenedTypes; + private int[] flattenedPrecisions; private int[] flattenedCounts; private Schema(DType topLevelType, + int topLevelPrecision, List childNames, List childSchemas) { this.topLevelType = topLevelType; + this.topLevelPrecision = topLevelPrecision; this.childNames = childNames; this.childSchemas = childSchemas; } + private Schema(DType topLevelType, + List childNames, + List childSchemas) { + this(topLevelType, UNKNOWN_PRECISION, childNames, childSchemas); + } + /** * Inferred schema. 
*/ private Schema() { topLevelType = null; + topLevelPrecision = UNKNOWN_PRECISION; childNames = null; childSchemas = null; } @@ -104,14 +130,17 @@ private void flattenIfNeeded() { if (flatLen == 0) { flattenedNames = null; flattenedTypes = null; + flattenedPrecisions = null; flattenedCounts = null; } else { String[] names = new String[flatLen]; DType[] types = new DType[flatLen]; + int[] precisions = new int[flatLen]; int[] counts = new int[flatLen]; - collectFlattened(names, types, counts, 0); + collectFlattened(names, types, precisions, counts, 0); flattenedNames = names; flattenedTypes = types; + flattenedPrecisions = precisions; flattenedCounts = counts; } flattened = true; @@ -128,19 +157,20 @@ private int flattenedLength(int startingLength) { return startingLength; } - private int collectFlattened(String[] names, DType[] types, int[] counts, int offset) { + private int collectFlattened(String[] names, DType[] types, int[] precisions, int[] counts, int offset) { if (childSchemas != null) { for (int i = 0; i < childSchemas.size(); i++) { Schema child = childSchemas.get(i); names[offset] = childNames.get(i); types[offset] = child.topLevelType; + precisions[offset] = child.topLevelPrecision; if (child.childNames != null) { counts[offset] = child.childNames.size(); } else { counts[offset] = 0; } offset++; - offset = this.childSchemas.get(i).collectFlattened(names, types, counts, offset); + offset = this.childSchemas.get(i).collectFlattened(names, types, precisions, counts, offset); } } return offset; @@ -226,6 +256,22 @@ public int[] getFlattenedTypeScales() { return ret; } + /** + * Get decimal precisions of the columns' types flattened from all levels in schema by + * depth-first traversal. + *

+ * This is used to pass down the decimal precisions from Spark to only the JNI layer, where + * some JNI functions require precision values to perform their operations. + * Decimal precisions should not be consumed by any libcudf's APIs since libcudf does not + * support precisions for fixed point numbers. + * + * @return An array containing decimal precision of all columns in schema. + */ + public int[] getFlattenedDecimalPrecisions() { + flattenIfNeeded(); + return flattenedPrecisions; + } + /** * Get the types of the columns in schema flattened from all levels by depth-first traversal. * @return An array containing types of all columns in schema. @@ -307,11 +353,13 @@ public HostColumnVector.DataType asHostDataType() { public static class Builder { private final DType topLevelType; + private final int topLevelPrecision; private final List names; private final List types; - private Builder(DType topLevelType) { + private Builder(DType topLevelType, int topLevelPrecision) { this.topLevelType = topLevelType; + this.topLevelPrecision = topLevelPrecision; if (topLevelType == DType.STRUCT || topLevelType == DType.LIST) { // There can be children names = new ArrayList<>(); @@ -322,14 +370,19 @@ private Builder(DType topLevelType) { } } + private Builder(DType topLevelType) { + this(topLevelType, UNKNOWN_PRECISION); + } + /** * Add a new column * @param type the type of column to add * @param name the name of the column to add (Ignored for list types) + * @param precision the decimal precision, only applicable for decimal types * @return the builder for the new column. This should really only be used when the type * passed in is a LIST or a STRUCT. 
*/ - public Builder addColumn(DType type, String name) { + public Builder addColumn(DType type, String name, int precision) { if (names == null) { throw new IllegalStateException("A column of type " + topLevelType + " cannot have children"); @@ -340,21 +393,31 @@ public Builder addColumn(DType type, String name) { if (names.contains(name)) { throw new IllegalStateException("Cannot add duplicate names to a schema"); } - Builder ret = new Builder(type); + Builder ret = new Builder(type, precision); types.add(ret); names.add(name); return ret; } + public Builder addColumn(DType type, String name) { + return addColumn(type, name, UNKNOWN_PRECISION); + } + /** * Adds a single column to the current schema. addColumn is preferred as it can be used * to support nested types. * @param type the type of the column. * @param name the name of the column. + * @param precision the decimal precision, only applicable for decimal types. * @return this for chaining. */ + public Builder column(DType type, String name, int precision) { + addColumn(type, name, precision); + return this; + } + public Builder column(DType type, String name) { - addColumn(type, name); + addColumn(type, name, UNKNOWN_PRECISION); return this; } @@ -366,7 +429,7 @@ public Schema build() { children.add(b.build()); } } - return new Schema(topLevelType, names, children); + return new Schema(topLevelType, topLevelPrecision, names, children); } } } diff --git a/pyproject.toml b/pyproject.toml index 661c68ee62e..6933484f4e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,8 @@ select = [ "F", # pycodestyle Warning "W", + # isort + "I", # no-blank-line-before-function "D201", # one-blank-line-after-class diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py index 7b2b71cf216..0e4afadccf5 100644 --- a/python/cudf/benchmarks/conftest.py +++ b/python/cudf/benchmarks/conftest.py @@ -56,27 +56,23 @@ # into the main repo. 
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) -from config import cudf # noqa: W0611, E402, F401 -from utils import ( # noqa: E402 - OrderedSet, - collapse_fixtures, - column_generators, - make_fixture, -) - # Turn off isort until we upgrade to 5.8.0 # https://github.com/pycqa/isort/issues/1594 -# isort: off from config import ( # noqa: W0611, E402, F401 NUM_COLS, NUM_ROWS, collect_ignore, + cudf, # noqa: W0611, E402, F401 pytest_collection_modifyitems, pytest_sessionfinish, pytest_sessionstart, ) - -# isort: on +from utils import ( # noqa: E402 + OrderedSet, + collapse_fixtures, + column_generators, + make_fixture, +) @pytest_cases.fixture(params=[0, 1], ids=["AxisIndex", "AxisColumn"]) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 065655505b8..94dbdf5534d 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -688,15 +688,18 @@ cdef class Column: # special case for string column is_string_column = (cv.type().id() == libcudf_types.type_id.STRING) if is_string_column: - # get the size from offset child column (device to host copy) - offsets_column_index = 0 - offset_child_column = cv.child(offsets_column_index) - if offset_child_column.size() == 0: + if cv.num_children() == 0: base_nbytes = 0 else: - chars_size = get_element( - offset_child_column, offset_child_column.size()-1).value - base_nbytes = chars_size + # get the size from offset child column (device to host copy) + offsets_column_index = 0 + offset_child_column = cv.child(offsets_column_index) + if offset_child_column.size() == 0: + base_nbytes = 0 + else: + chars_size = get_element( + offset_child_column, offset_child_column.size()-1).value + base_nbytes = chars_size if data_ptr: if data_owner is None: diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 30353c4be6c..4221e745e65 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -4,7 
+4,7 @@ import pickle from libc.stdint cimport uint8_t, uintptr_t from libcpp cimport bool -from libcpp.memory cimport make_shared, shared_ptr, unique_ptr +from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector @@ -30,10 +30,6 @@ from libcpp.memory cimport make_unique cimport pylibcudf.libcudf.contiguous_split as cpp_contiguous_split from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.gather cimport ( - segmented_gather as cpp_segmented_gather, -) -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type @@ -339,26 +335,6 @@ def get_element(Column input_column, size_type index): ) -@acquire_spill_lock() -def segmented_gather(Column source_column, Column gather_map): - cdef shared_ptr[lists_column_view] source_LCV = ( - make_shared[lists_column_view](source_column.view()) - ) - cdef shared_ptr[lists_column_view] gather_map_LCV = ( - make_shared[lists_column_view](gather_map.view()) - ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_segmented_gather( - source_LCV.get()[0], gather_map_LCV.get()[0]) - ) - - result = Column.from_unique_ptr(move(c_result)) - return result - - cdef class _CPackedColumns: @staticmethod diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index 9b7ab0888d2..89309b36371 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -1,27 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from cudf.core.buffer import acquire_spill_lock +import pylibcudf as plc -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.hash cimport ( - md5, - murmurhash3_x86_32, - sha1, - sha224, - sha256, - sha384, - sha512, - xxhash_64, -) -from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.table cimport Table from cudf._lib.column cimport Column -from cudf._lib.utils cimport table_view_from_columns - -import pylibcudf as plc @acquire_spill_lock() @@ -37,32 +22,26 @@ def hash_partition(list source_columns, list columns_to_hash, @acquire_spill_lock() def hash(list source_columns, str method, int seed=0): - cdef table_view c_source_view = table_view_from_columns(source_columns) - cdef unique_ptr[column] c_result + cdef Table ctbl = Table( + [c.to_pylibcudf(mode="read") for c in source_columns] + ) if method == "murmur3": - with nogil: - c_result = move(murmurhash3_x86_32(c_source_view, seed)) + return Column.from_pylibcudf(plc.hashing.murmurhash3_x86_32(ctbl, seed)) + elif method == "xxhash64": + return Column.from_pylibcudf(plc.hashing.xxhash_64(ctbl, seed)) elif method == "md5": - with nogil: - c_result = move(md5(c_source_view)) + return Column.from_pylibcudf(plc.hashing.md5(ctbl)) elif method == "sha1": - with nogil: - c_result = move(sha1(c_source_view)) + return Column.from_pylibcudf(plc.hashing.sha1(ctbl)) elif method == "sha224": - with nogil: - c_result = move(sha224(c_source_view)) + return Column.from_pylibcudf(plc.hashing.sha224(ctbl)) elif method == "sha256": - with nogil: - c_result = move(sha256(c_source_view)) + return Column.from_pylibcudf(plc.hashing.sha256(ctbl)) elif method == "sha384": - with nogil: - c_result = move(sha384(c_source_view)) + return Column.from_pylibcudf(plc.hashing.sha384(ctbl)) elif method == "sha512": - with nogil: - c_result = move(sha512(c_source_view)) - elif 
method == "xxhash64": - with nogil: - c_result = move(xxhash_64(c_source_view, seed)) + return Column.from_pylibcudf(plc.hashing.sha512(ctbl)) else: - raise ValueError(f"Unsupported hash function: {method}") - return Column.from_unique_ptr(move(c_result)) + raise ValueError( + f"Unsupported hashing algorithm {method}." + ) diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 1dc586bb257..1c9d3a01b80 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -1,49 +1,22 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cpython cimport pycapsule -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - import pylibcudf -from pylibcudf.libcudf.interop cimport ( - DLManagedTensor, - from_dlpack as cpp_from_dlpack, - to_dlpack as cpp_to_dlpack, -) -from pylibcudf.libcudf.table.table cimport table -from pylibcudf.libcudf.table.table_view cimport table_view - -from cudf._lib.utils cimport ( - columns_from_pylibcudf_table, - columns_from_unique_ptr, - table_view_from_columns, -) +from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf.core.buffer import acquire_spill_lock from cudf.core.dtypes import ListDtype, StructDtype -def from_dlpack(dlpack_capsule): +def from_dlpack(object dlpack_capsule): """ Converts a DLPack Tensor PyCapsule into a list of columns. DLPack Tensor PyCapsule is expected to have the name "dltensor". 
""" - cdef DLManagedTensor* dlpack_tensor = pycapsule.\ - PyCapsule_GetPointer(dlpack_capsule, 'dltensor') - pycapsule.PyCapsule_SetName(dlpack_capsule, 'used_dltensor') - - cdef unique_ptr[table] c_result - - with nogil: - c_result = move( - cpp_from_dlpack(dlpack_tensor) - ) - - res = columns_from_unique_ptr(move(c_result)) - dlpack_tensor.deleter(dlpack_tensor) - return res + return columns_from_pylibcudf_table( + pylibcudf.interop.from_dlpack(dlpack_capsule) + ) def to_dlpack(list source_columns): @@ -52,39 +25,13 @@ def to_dlpack(list source_columns): DLPack Tensor PyCapsule will have the name "dltensor". """ - if any(column.null_count for column in source_columns): - raise ValueError( - "Cannot create a DLPack tensor with null values. \ - Input is required to have null count as zero." - ) - - cdef DLManagedTensor *dlpack_tensor - cdef table_view source_table_view = table_view_from_columns(source_columns) - - with nogil: - dlpack_tensor = cpp_to_dlpack( - source_table_view + return pylibcudf.interop.to_dlpack( + pylibcudf.Table( + [col.to_pylibcudf(mode="read") for col in source_columns] ) - - return pycapsule.PyCapsule_New( - dlpack_tensor, - 'dltensor', - dlmanaged_tensor_pycapsule_deleter ) -cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept: - cdef DLManagedTensor* dlpack_tensor = 0 - try: - dlpack_tensor = pycapsule.PyCapsule_GetPointer( - pycap_obj, 'used_dltensor') - return # we do not call a used capsule's deleter - except Exception: - dlpack_tensor = pycapsule.PyCapsule_GetPointer( - pycap_obj, 'dltensor') - dlpack_tensor.deleter(dlpack_tensor) - - def gather_metadata(object cols_dtypes): """ Generates a ColumnMetadata vector for each column. 
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 7e8710bedb6..12432ac6d5d 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -9,7 +9,7 @@ from pylibcudf.libcudf.types cimport null_order, size_type from cudf._lib.column cimport Column from cudf._lib.utils cimport columns_from_pylibcudf_table -import pylibcudf +import pylibcudf as plc from pylibcudf cimport Scalar @@ -17,7 +17,7 @@ from pylibcudf cimport Scalar @acquire_spill_lock() def count_elements(Column col): return Column.from_pylibcudf( - pylibcudf.lists.count_elements( + plc.lists.count_elements( col.to_pylibcudf(mode="read")) ) @@ -25,8 +25,8 @@ def count_elements(Column col): @acquire_spill_lock() def explode_outer(list source_columns, int explode_column_idx): return columns_from_pylibcudf_table( - pylibcudf.lists.explode_outer( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]), + plc.lists.explode_outer( + plc.Table([c.to_pylibcudf(mode="read") for c in source_columns]), explode_column_idx, ) ) @@ -35,7 +35,7 @@ def explode_outer(list source_columns, int explode_column_idx): @acquire_spill_lock() def distinct(Column col, bool nulls_equal, bool nans_all_equal): return Column.from_pylibcudf( - pylibcudf.lists.distinct( + plc.lists.distinct( col.to_pylibcudf(mode="read"), nulls_equal, nans_all_equal, @@ -46,7 +46,7 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal): @acquire_spill_lock() def sort_lists(Column col, bool ascending, str na_position): return Column.from_pylibcudf( - pylibcudf.lists.sort_lists( + plc.lists.sort_lists( col.to_pylibcudf(mode="read"), ascending, null_order.BEFORE if na_position == "first" else null_order.AFTER, @@ -58,7 +58,7 @@ def sort_lists(Column col, bool ascending, str na_position): @acquire_spill_lock() def extract_element_scalar(Column col, size_type index): return Column.from_pylibcudf( - pylibcudf.lists.extract_list_element( + plc.lists.extract_list_element( 
col.to_pylibcudf(mode="read"), index, ) @@ -68,7 +68,7 @@ def extract_element_scalar(Column col, size_type index): @acquire_spill_lock() def extract_element_column(Column col, Column index): return Column.from_pylibcudf( - pylibcudf.lists.extract_list_element( + plc.lists.extract_list_element( col.to_pylibcudf(mode="read"), index.to_pylibcudf(mode="read"), ) @@ -78,7 +78,7 @@ def extract_element_column(Column col, Column index): @acquire_spill_lock() def contains_scalar(Column col, py_search_key): return Column.from_pylibcudf( - pylibcudf.lists.contains( + plc.lists.contains( col.to_pylibcudf(mode="read"), py_search_key.device_value.c_value, ) @@ -88,7 +88,7 @@ def contains_scalar(Column col, py_search_key): @acquire_spill_lock() def index_of_scalar(Column col, object py_search_key): return Column.from_pylibcudf( - pylibcudf.lists.index_of( + plc.lists.index_of( col.to_pylibcudf(mode="read"), py_search_key.device_value.c_value, True, @@ -99,7 +99,7 @@ def index_of_scalar(Column col, object py_search_key): @acquire_spill_lock() def index_of_column(Column col, Column search_keys): return Column.from_pylibcudf( - pylibcudf.lists.index_of( + plc.lists.index_of( col.to_pylibcudf(mode="read"), search_keys.to_pylibcudf(mode="read"), True, @@ -110,8 +110,8 @@ def index_of_column(Column col, Column search_keys): @acquire_spill_lock() def concatenate_rows(list source_columns): return Column.from_pylibcudf( - pylibcudf.lists.concatenate_rows( - pylibcudf.Table([ + plc.lists.concatenate_rows( + plc.Table([ c.to_pylibcudf(mode="read") for c in source_columns ]) ) @@ -121,8 +121,18 @@ def concatenate_rows(list source_columns): @acquire_spill_lock() def concatenate_list_elements(Column input_column, dropna=False): return Column.from_pylibcudf( - pylibcudf.lists.concatenate_list_elements( + plc.lists.concatenate_list_elements( input_column.to_pylibcudf(mode="read"), dropna, ) ) + + +@acquire_spill_lock() +def segmented_gather(Column source_column, Column gather_map): + return 
Column.from_pylibcudf( + plc.lists.segmented_gather( + source_column.to_pylibcudf(mode="read"), + gather_map.to_pylibcudf(mode="read"), + ) + ) diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx index 0d768e24f39..2b2762eead2 100644 --- a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx +++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx @@ -3,49 +3,22 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.byte_pair_encode cimport ( - bpe_merge_pairs as cpp_bpe_merge_pairs, - byte_pair_encoding as cpp_byte_pair_encoding, - load_merge_pairs as cpp_load_merge_pairs, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - -cdef class BPEMergePairs: - cdef unique_ptr[cpp_bpe_merge_pairs] c_obj - - def __cinit__(self, Column merge_pairs): - cdef column_view c_pairs = merge_pairs.view() - with nogil: - self.c_obj = move(cpp_load_merge_pairs(c_pairs)) +from pylibcudf import nvtext +from pylibcudf.nvtext.byte_pair_encode import BPEMergePairs # no-cython-lint @acquire_spill_lock() def byte_pair_encoding( Column strings, - BPEMergePairs merge_pairs, + object merge_pairs, object separator ): - cdef column_view c_strings = strings.view() - cdef DeviceScalar d_separator = separator.device_value - cdef const string_scalar* c_separator = d_separator\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_byte_pair_encoding( - c_strings, - merge_pairs.c_obj.get()[0], - c_separator[0] - ) + return Column.from_pylibcudf( + nvtext.byte_pair_encode.byte_pair_encoding( + strings.to_pylibcudf(mode="read"), + merge_pairs, + separator.device_value.c_value ) - - return 
Column.from_unique_ptr(move(c_result)) + ) diff --git a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx index ee442ece5c6..5e0bfb74705 100644 --- a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx @@ -5,35 +5,16 @@ from libc.stdint cimport uint32_t from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.subword_tokenize cimport ( - hashed_vocabulary as cpp_hashed_vocabulary, - load_vocabulary_file as cpp_load_vocabulary_file, - move as tr_move, - subword_tokenize as cpp_subword_tokenize, - tokenizer_result as cpp_tokenizer_result, -) from cudf._lib.column cimport Column - -cdef class Hashed_Vocabulary: - cdef unique_ptr[cpp_hashed_vocabulary] c_obj - - def __cinit__(self, hash_file): - cdef string c_hash_file = str(hash_file).encode() - with nogil: - self.c_obj = move(cpp_load_vocabulary_file(c_hash_file)) +from pylibcudf import nvtext @acquire_spill_lock() def subword_tokenize_inmem_hash( Column strings, - Hashed_Vocabulary hashed_vocabulary, + object hashed_vocabulary, uint32_t max_sequence_length=64, uint32_t stride=48, bool do_lower=True, @@ -42,21 +23,16 @@ def subword_tokenize_inmem_hash( """ Subword tokenizes text series by using the pre-loaded hashed vocabulary """ - cdef column_view c_strings = strings.view() - cdef cpp_tokenizer_result c_result - with nogil: - c_result = tr_move( - cpp_subword_tokenize( - c_strings, - hashed_vocabulary.c_obj.get()[0], - max_sequence_length, - stride, - do_lower, - do_truncate, - ) - ) + result = nvtext.subword_tokenize.subword_tokenize( + strings.to_pylibcudf(mode="read"), + hashed_vocabulary, + max_sequence_length, + stride, + do_lower, + do_truncate, + ) # return the 3 tensor components - 
tokens = Column.from_unique_ptr(move(c_result.tensor_token_ids)) - masks = Column.from_unique_ptr(move(c_result.tensor_attention_mask)) - metadata = Column.from_unique_ptr(move(c_result.tensor_metadata)) + tokens = Column.from_pylibcudf(result[0]) + masks = Column.from_pylibcudf(result[1]) + metadata = Column.from_pylibcudf(result[2]) return tokens, masks, metadata diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx index a7e63f1e9ae..f473c48e2f7 100644 --- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/tokenize.pyx @@ -2,162 +2,85 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.tokenize cimport ( - character_tokenize as cpp_character_tokenize, - count_tokens as cpp_count_tokens, - detokenize as cpp_detokenize, - load_vocabulary as cpp_load_vocabulary, - tokenize as cpp_tokenize, - tokenize_vocabulary as cpp_tokenize_vocabulary, - tokenize_with_vocabulary as cpp_tokenize_with_vocabulary, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type +from pylibcudf.nvtext.tokenize import TokenizeVocabulary # no-cython-lint + from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf import nvtext @acquire_spill_lock() def _tokenize_scalar(Column strings, object py_delimiter): - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef column_view c_strings = strings.view() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_tokenize( - c_strings, - c_delimiter[0], - ) + return Column.from_pylibcudf( + nvtext.tokenize.tokenize_scalar( + strings.to_pylibcudf(mode="read"), + 
py_delimiter.device_value.c_value ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() def _tokenize_column(Column strings, Column delimiters): - cdef column_view c_strings = strings.view() - cdef column_view c_delimiters = delimiters.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_tokenize( - c_strings, - c_delimiters - ) + return Column.from_pylibcudf( + nvtext.tokenize.tokenize_column( + strings.to_pylibcudf(mode="read"), + delimiters.to_pylibcudf(mode="read"), ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() def _count_tokens_scalar(Column strings, object py_delimiter): - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef column_view c_strings = strings.view() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_count_tokens( - c_strings, - c_delimiter[0] - ) + return Column.from_pylibcudf( + nvtext.tokenize.count_tokens_scalar( + strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() def _count_tokens_column(Column strings, Column delimiters): - cdef column_view c_strings = strings.view() - cdef column_view c_delimiters = delimiters.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_count_tokens( - c_strings, - c_delimiters - ) + return Column.from_pylibcudf( + nvtext.tokenize.count_tokens_column( + strings.to_pylibcudf(mode="read"), + delimiters.to_pylibcudf(mode="read") ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() def character_tokenize(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_character_tokenize(c_strings) + return Column.from_pylibcudf( + nvtext.tokenize.character_tokenize( + strings.to_pylibcudf(mode="read") ) - 
- return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() def detokenize(Column strings, Column indices, object py_separator): - - cdef DeviceScalar separator = py_separator.device_value - - cdef column_view c_strings = strings.view() - cdef column_view c_indices = indices.view() - cdef const string_scalar* c_separator = separator\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_detokenize(c_strings, c_indices, c_separator[0]) + return Column.from_pylibcudf( + nvtext.tokenize.detokenize( + strings.to_pylibcudf(mode="read"), + indices.to_pylibcudf(mode="read"), + py_separator.device_value.c_value ) - - return Column.from_unique_ptr(move(c_result)) - - -cdef class TokenizeVocabulary: - cdef unique_ptr[cpp_tokenize_vocabulary] c_obj - - def __cinit__(self, Column vocab): - cdef column_view c_vocab = vocab.view() - with nogil: - self.c_obj = move(cpp_load_vocabulary(c_vocab)) + ) @acquire_spill_lock() def tokenize_with_vocabulary(Column strings, - TokenizeVocabulary vocabulary, + object vocabulary, object py_delimiter, size_type default_id): - - cdef DeviceScalar delimiter = py_delimiter.device_value - cdef column_view c_strings = strings.view() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_tokenize_with_vocabulary( - c_strings, - vocabulary.c_obj.get()[0], - c_delimiter[0], - default_id - ) + return Column.from_pylibcudf( + nvtext.tokenize.tokenize_with_vocabulary( + strings.to_pylibcudf(mode="read"), + vocabulary, + py_delimiter.device_value.c_value, + default_id ) - - return Column.from_unique_ptr(move(c_result)) + ) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 40d0c9eac3a..1589e23f716 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -7,20 +7,11 @@ from cudf.core._internals.expressions import parse_expression from 
cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.utils import cudautils -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -cimport pylibcudf.libcudf.transform as libcudf_transform from pylibcudf cimport transform as plc_transform from pylibcudf.expressions cimport Expression -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.expressions cimport expression -from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.utils cimport table_view_from_columns import pylibcudf as plc @@ -121,13 +112,8 @@ def compute_column(list columns, tuple column_names, expr: str): # At the end, all the stack contains is the expression to evaluate. cdef Expression cudf_expr = visitor.expression - cdef table_view tbl = table_view_from_columns(columns) - cdef unique_ptr[column] col - with nogil: - col = move( - libcudf_transform.compute_column( - tbl, - dereference(cudf_expr.c_obj.get()) - ) - ) - return Column.from_unique_ptr(move(col)) + result = plc_transform.compute_column( + plc.Table([col.to_pylibcudf(mode="read") for col in columns]), + cudf_expr, + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 6e8ad556b08..3b13cc258ab 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -1,8 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
import sys -from collections.abc import Callable -from typing import TYPE_CHECKING, Any, Dict, Iterable, TypeVar, Union +from collections.abc import Callable, Iterable +from typing import TYPE_CHECKING, Any, TypeVar, Union import numpy as np from pandas import Period, Timedelta, Timestamp @@ -42,7 +42,7 @@ SeriesOrSingleColumnIndex = Union["cudf.Series", "cudf.core.index.Index"] # Groupby aggregation -AggType = Union[str, Callable] -MultiColumnAggType = Union[ - AggType, Iterable[AggType], Dict[Any, Iterable[AggType]] +AggType = Union[str, Callable] # noqa: UP007 +MultiColumnAggType = Union[ # noqa: UP007 + AggType, Iterable[AggType], dict[Any, Iterable[AggType]] ] diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index caff019f575..ffa306bf93f 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -6,7 +6,7 @@ import pickle import weakref from types import SimpleNamespace -from typing import Any, Literal, Mapping +from typing import TYPE_CHECKING, Any, Literal import numpy from typing_extensions import Self @@ -18,6 +18,9 @@ from cudf.core.abc import Serializable from cudf.utils.string import format_bytes +if TYPE_CHECKING: + from collections.abc import Mapping + def host_memory_allocation(nbytes: int) -> memoryview: """Allocate host memory using NumPy diff --git a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py index 0bd8d6054b3..ecf9807cfc2 100644 --- a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py +++ b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py @@ -2,13 +2,16 @@ from __future__ import annotations -from typing import Literal, Mapping +from typing import TYPE_CHECKING, Literal from typing_extensions import Self import cudf from cudf.core.buffer.buffer import Buffer, BufferOwner +if TYPE_CHECKING: + from collections.abc import Mapping + class ExposureTrackedBuffer(Buffer): """An exposure 
tracked buffer. diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index 6ca64a0a2be..8d38a5f2272 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -2,9 +2,10 @@ from __future__ import annotations +import pylibcudf as plc + import cudf from cudf._lib.nvtext.byte_pair_encode import ( - BPEMergePairs as cpp_merge_pairs, byte_pair_encoding as cpp_byte_pair_encoding, ) @@ -25,7 +26,9 @@ class BytePairEncoder: """ def __init__(self, merges_pair: "cudf.Series"): - self.merge_pairs = cpp_merge_pairs(merges_pair._column) + self.merge_pairs = plc.nvtext.byte_pair_encode.BPEMergePairs( + merges_pair._column.to_pylibcudf(mode="read") + ) def __call__(self, text, separator: str = " ") -> cudf.Series: """ diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 06791df7dc0..a1e87d04bc9 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -29,4 +29,3 @@ Decimal128Column, DecimalBaseColumn, ) -from cudf.core.column.interval import IntervalColumn # noqa: F401 diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 864e87b5377..087d0ed65f5 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -4,7 +4,7 @@ import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, Mapping, Sequence, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd @@ -26,6 +26,7 @@ if TYPE_CHECKING: from collections import abc + from collections.abc import Mapping, Sequence import numba.cuda diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7674565e2c3..d2f9d208c77 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -4,10 +4,11 @@ 
import pickle from collections import abc +from collections.abc import MutableSequence, Sequence from functools import cached_property from itertools import chain from types import SimpleNamespace -from typing import TYPE_CHECKING, Any, Literal, MutableSequence, Sequence, cast +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numpy as np @@ -579,8 +580,8 @@ def _wrap_binop_normalization(self, other): if cudf.utils.utils.is_na_like(other): return cudf.Scalar(other, dtype=self.dtype) if isinstance(other, np.ndarray) and other.ndim == 0: - # Try and maintain the dtype - other = other.dtype.type(other.item()) + # Return numpy scalar + other = other[()] return self.normalize_binop_value(other) def _scatter_by_slice( diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 2c9b0baa9b6..b6dc250e64d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -8,7 +8,7 @@ import locale import re from locale import nl_langinfo -from typing import TYPE_CHECKING, Literal, Sequence, cast +from typing import TYPE_CHECKING, Literal, cast import numpy as np import pandas as pd @@ -31,6 +31,8 @@ from cudf.utils.utils import _all_bools_with_nulls if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ( ColumnBinaryOperand, DatetimeLikeScalar, diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 8803ebd6791..8ae06f72d1e 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -3,8 +3,9 @@ from __future__ import annotations import warnings +from collections.abc import Sequence from decimal import Decimal -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import cupy as cp import numpy as np diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index c6a39199e3b..6b25e568f00 
100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -3,7 +3,7 @@ from __future__ import annotations from functools import cached_property -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import numpy as np import pandas as pd @@ -11,7 +11,6 @@ from typing_extensions import Self import cudf -from cudf._lib.copying import segmented_gather from cudf._lib.lists import ( concatenate_list_elements, concatenate_rows, @@ -22,6 +21,7 @@ extract_element_scalar, index_of_column, index_of_scalar, + segmented_gather, sort_lists, ) from cudf._lib.strings.convert.convert_lists import format_list_column @@ -34,6 +34,8 @@ from cudf.core.missing import NA if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 05a0ab2e09a..a91c080fe21 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -2,9 +2,7 @@ from __future__ import annotations -from typing import Union, overload - -from typing_extensions import Literal +from typing import Literal, Union, overload import cudf import cudf.core.column diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 78d2814ed26..620cae65374 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -3,7 +3,7 @@ from __future__ import annotations import functools -from typing import TYPE_CHECKING, Any, Sequence, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd @@ -28,7 +28,7 @@ from .numerical_base import NumericalBaseColumn if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Sequence from cudf._typing import ( ColumnBinaryOperand, 
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index b25e486d855..856ce0f75de 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5,7 +5,7 @@ import re import warnings from functools import cached_property -from typing import TYPE_CHECKING, Sequence, cast, overload +from typing import TYPE_CHECKING, cast, overload import numpy as np import pandas as pd @@ -35,6 +35,8 @@ def str_to_boolean(column: StringColumn): if TYPE_CHECKING: + from collections.abc import Sequence + import cupy import numba.cuda diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 6b6f3e517a8..087d6474e7f 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -4,7 +4,7 @@ import datetime import functools -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import numpy as np import pandas as pd @@ -19,6 +19,8 @@ from cudf.utils.utils import _all_bools_with_nulls if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype _unit_to_nanoseconds_conversion = { diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index bc093fdaa9a..496e86ed709 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -5,8 +5,9 @@ import itertools import sys from collections import abc +from collections.abc import Mapping from functools import cached_property, reduce -from typing import TYPE_CHECKING, Any, Mapping, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7d4d34f5b04..bf1c39b23da 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -13,8 +13,8 @@ import 
textwrap import warnings from collections import abc, defaultdict -from collections.abc import Callable, Iterator -from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast +from collections.abc import Callable, Iterator, MutableMapping +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numba diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 5250a741d3d..aa601a2b322 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -3,7 +3,7 @@ import enum from collections import abc -from typing import Any, Iterable, Mapping, Sequence, Tuple, cast +from typing import TYPE_CHECKING, Any, cast import cupy as cp import numpy as np @@ -20,6 +20,9 @@ build_column, ) +if TYPE_CHECKING: + from collections.abc import Iterable, Mapping, Sequence + # Implementation of interchange protocol classes # ---------------------------------------------- @@ -61,7 +64,7 @@ class _MaskKind(enum.IntEnum): _DtypeKind.BOOL, _DtypeKind.STRING, } -ProtoDtype = Tuple[_DtypeKind, int, str, str] +ProtoDtype = tuple[_DtypeKind, int, str, str] class _CuDFBuffer: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 37ad6b8fabb..205edd91d9d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,7 @@ import pickle import warnings from collections import abc -from typing import TYPE_CHECKING, Any, Literal, MutableMapping +from typing import TYPE_CHECKING, Any, Literal # TODO: The `numpy` import is needed for typing purposes during doc builds # only, need to figure out why the `np` alias is insufficient then remove. 
@@ -36,6 +36,7 @@ from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf if TYPE_CHECKING: + from collections.abc import MutableMapping from types import ModuleType from cudf._typing import Dtype, ScalarLike diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 81b20488d8d..e59b948aba9 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -8,7 +8,7 @@ import warnings from collections import abc from functools import cached_property -from typing import TYPE_CHECKING, Any, Iterable, Literal +from typing import TYPE_CHECKING, Any, Literal import cupy as cp import numpy as np @@ -36,6 +36,8 @@ from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: + from collections.abc import Iterable + from cudf._typing import ( AggType, DataFrameOrSeries, @@ -479,6 +481,11 @@ def get_group(self, name, obj=None): "instead of ``gb.get_group(name, obj=df)``.", FutureWarning, ) + if is_list_like(self._by): + if isinstance(name, tuple) and len(name) == 1: + name = name[0] + else: + raise KeyError(name) return obj.iloc[self.indices[name]] @_performance_tracking diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index cd07c58c5d9..1b90e9f9df0 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -5,10 +5,10 @@ import operator import pickle import warnings -from collections.abc import Hashable +from collections.abc import Hashable, MutableMapping from functools import cache, cached_property from numbers import Number -from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numpy as np diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 5952815deef..e031f2a4e8e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -10,9 +10,7 @@ from typing 
import ( TYPE_CHECKING, Any, - Callable, Literal, - MutableMapping, TypeVar, cast, ) @@ -63,6 +61,8 @@ from cudf.utils.utils import _warn_no_dask_cudf if TYPE_CHECKING: + from collections.abc import Callable, MutableMapping + from cudf._typing import ( ColumnLike, DataFrameOrSeries, diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 8182e5cede2..ce6a5c960dd 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -3,9 +3,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Any, List, Union - -from typing_extensions import TypeAlias +from typing import Any, TypeAlias import cudf from cudf.api.types import _is_scalar_or_zero_d_array, is_integer @@ -46,11 +44,11 @@ class ScalarIndexer: key: GatherMap -IndexingSpec: TypeAlias = Union[ - EmptyIndexer, MapIndexer, MaskIndexer, ScalarIndexer, SliceIndexer -] +IndexingSpec: TypeAlias = ( + EmptyIndexer | MapIndexer | MaskIndexer | ScalarIndexer | SliceIndexer +) -ColumnLabels: TypeAlias = List[str] +ColumnLabels: TypeAlias = list[str] def destructure_iloc_key( diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 92d094d9de5..bfff62f0a89 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -8,7 +8,7 @@ import pickle import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, MutableMapping +from typing import TYPE_CHECKING, Any import cupy as cp import numpy as np @@ -36,7 +36,7 @@ from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name if TYPE_CHECKING: - from collections.abc import Generator, Hashable + from collections.abc import Generator, Hashable, MutableMapping from typing_extensions import Self diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 29ed18ac0ce..9b60424c924 100644 --- a/python/cudf/cudf/core/series.py +++ 
b/python/cudf/cudf/core/series.py @@ -9,7 +9,7 @@ import warnings from collections import abc from shutil import get_terminal_size -from typing import TYPE_CHECKING, Any, Literal, MutableMapping +from typing import TYPE_CHECKING, Any, Literal import cupy import numpy as np @@ -71,6 +71,8 @@ from cudf.utils.performance_tracking import _performance_tracking if TYPE_CHECKING: + from collections.abc import MutableMapping + import pyarrow as pa from cudf._typing import ( diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index 9e59b134b73..dda1f199078 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -6,8 +6,9 @@ import cupy as cp +import pylibcudf as plc + from cudf._lib.nvtext.subword_tokenize import ( - Hashed_Vocabulary as cpp_hashed_vocabulary, subword_tokenize_inmem_hash as cpp_subword_tokenize, ) @@ -50,7 +51,9 @@ class SubwordTokenizer: def __init__(self, hash_file: str, do_lower_case: bool = True): self.do_lower_case = do_lower_case - self.vocab_file = cpp_hashed_vocabulary(hash_file) + self.vocab_file = plc.nvtext.subword_tokenize.HashedVocabulary( + hash_file + ) def __call__( self, diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py index 99d85c0c5c0..1e31376cce8 100644 --- a/python/cudf/cudf/core/tokenize_vocabulary.py +++ b/python/cudf/cudf/core/tokenize_vocabulary.py @@ -2,9 +2,10 @@ from __future__ import annotations +import pylibcudf as plc + import cudf from cudf._lib.nvtext.tokenize import ( - TokenizeVocabulary as cpp_tokenize_vocabulary, tokenize_with_vocabulary as cpp_tokenize_with_vocabulary, ) @@ -20,7 +21,9 @@ class TokenizeVocabulary: """ def __init__(self, vocabulary: "cudf.Series"): - self.vocabulary = cpp_tokenize_vocabulary(vocabulary._column) + self.vocabulary = plc.nvtext.tokenize.TokenizeVocabulary( + vocabulary._column.to_pylibcudf(mode="read") + ) def tokenize( self, text, 
delimiter: str = "", default_id: int = -1 diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 68f34fa28ff..885e7b16644 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -4,7 +4,7 @@ import math import re import warnings -from typing import Literal, Sequence +from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd @@ -20,6 +20,9 @@ from cudf.core import column from cudf.core.index import ensure_index +if TYPE_CHECKING: + from collections.abc import Sequence + # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112 _unit_map = { "year": "year", diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index c364d55e677..73afde407db 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -10,9 +10,9 @@ import pickle import types import warnings -from collections.abc import Callable, Iterator +from collections.abc import Callable, Iterator, Mapping from enum import IntEnum -from typing import Any, Literal, Mapping +from typing import Any, Literal import numpy as np diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index f82e300e83d..38103a71908 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -17,7 +17,7 @@ from abc import abstractmethod from importlib._bootstrap import _ImportLockContext as ImportLock from types import ModuleType -from typing import Any, ContextManager, NamedTuple +from typing import Any, ContextManager, NamedTuple # noqa: UP035 from typing_extensions import Self diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py index 8870fbc5c28..bb2fc00d9fc 100644 --- 
a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py +++ b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py @@ -9,6 +9,7 @@ python analyze-test-failures.py Example: +------- python analyze-test-failures.py log.json frame/* """ diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index d12d2697729..59966a5ff0c 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -35,7 +35,7 @@ def null_assert_warnings(*args, **kwargs): @pytest.fixture(scope="session", autouse=True) # type: ignore def patch_testing_functions(): - tm.assert_produces_warning = null_assert_warnings + tm.assert_produces_warning = null_assert_warnings # noqa: F821 pytest.raises = replace_kwargs({"match": None})(pytest.raises) diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py index 4ea0b3b4413..a0ad872e4c7 100644 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py @@ -5,7 +5,8 @@ """ Summarizes the test results per module. 
-Examples: +Examples +-------- python summarize-test-results.py log.json python summarize-test-results.py log.json --output json python summarize-test-results.py log.json --output table diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 949fa909b5b..71b6bbd688d 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -3431,3 +3431,16 @@ def test_binop_eq_ne_index_series(data1, data2): expected = gi.to_pandas() != gs.to_pandas() assert_eq(expected, actual) + + +@pytest.mark.parametrize("scalar", [np.datetime64, np.timedelta64]) +def test_binop_lhs_numpy_datetimelike_scalar(scalar): + slr1 = scalar(1, "ms") + slr2 = scalar(1, "ns") + result = slr1 < cudf.Series([slr2]) + expected = slr1 < pd.Series([slr2]) + assert_eq(result, expected) + + result = slr2 < cudf.Series([slr1]) + expected = slr2 < pd.Series([slr1]) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 6b222841622..e4422e204bc 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -4059,3 +4059,19 @@ def test_ndim(): pgb = pser.groupby([0, 0, 1]) ggb = gser.groupby(cudf.Series([0, 0, 1])) assert pgb.ndim == ggb.ndim + + +@pytest.mark.skipif( + not PANDAS_GE_220, reason="pandas behavior applicable in >=2.2" +) +def test_get_group_list_like(): + df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + result = df.groupby(["a"]).get_group((1,)) + expected = df.to_pandas().groupby(["a"]).get_group((1,)) + assert_eq(result, expected) + + with pytest.raises(KeyError): + df.groupby(["a"]).get_group((1, 2)) + + with pytest.raises(KeyError): + df.groupby(["a"]).get_group([1]) diff --git a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py index a75a20a4681..63fd9601fc1 100644 --- a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py +++ 
b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py @@ -387,7 +387,8 @@ def test_dir_bound_method( ): """This test will fail because dir for bound methods is currently incorrect, but we have no way to fix it without materializing the slow - type, which is unnecessarily expensive.""" + type, which is unnecessarily expensive. + """ Fast, FastIntermediate = fast_and_intermediate_with_doc Slow, SlowIntermediate = slow_and_intermediate_with_doc diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index feab04ffadc..b6105c17b3e 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -53,6 +53,7 @@ test = [ "cramjam", "fastavro>=0.22.9", "hypothesis", + "mmh3", "msgpack", "pytest-benchmark", "pytest-cases>=3.8.2", @@ -63,6 +64,7 @@ test = [ "tokenizers==0.15.2", "transformers==4.39.3", "tzdata", + "xxhash", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
pandas-tests = [ "ipython", @@ -81,50 +83,6 @@ cudf-pandas-tests = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", - "pylibcudf" -] -known_first_party = [ - "cudf", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", -] - [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" @@ -174,3 +132,18 @@ wheel.packages = ["cudf"] provider = "scikit_build_core.metadata.regex" input = "cudf/VERSION" regex = "(?P.*)" + +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["cudf"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "pylibcudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 87e19a2bccf..667cd7b1db8 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -32,51 +32,20 @@ test = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - 
"dask_cuda", - "streamz", -] -known_rapids = [ - "rmm", - "cudf", - "dask_cudf", -] -known_first_party = [ - "cudf_kafka", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", -] +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["cudf_kafka"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda", "streamz"] +rapids = ["rmm", "cudf", "dask_cudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 2c195f6637c..08bc9d0ea3f 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -9,10 +9,11 @@ from typing import TYPE_CHECKING, cast import pyarrow as pa -import pylibcudf as plc import polars as pl +import pylibcudf as plc + from cudf_polars.containers import Column from cudf_polars.utils import dtypes diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py index 41b1defab39..2af9fdaacc5 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Any, ClassVar import pyarrow as pa + import pylibcudf as plc from cudf_polars.containers import Column diff --git 
a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py index 11a47e7ea51..245bdbefe88 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py @@ -8,10 +8,10 @@ from typing import TYPE_CHECKING, ClassVar -import pylibcudf as plc - from polars.polars import _expr_nodes as pl_expr +import pylibcudf as plc + from cudf_polars.containers import Column from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py index 9c14a8386f3..8db8172ebd1 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py @@ -10,10 +10,11 @@ from typing import TYPE_CHECKING, Any, ClassVar import pyarrow as pa -import pylibcudf as plc from polars.polars import _expr_nodes as pl_expr +import pylibcudf as plc + from cudf_polars.containers import Column from cudf_polars.dsl.expressions.base import ( ExecutionContext, diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py index 596e193d8fe..65fa4bfa62f 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -9,10 +9,11 @@ from typing import TYPE_CHECKING, Any, ClassVar import pyarrow as pa -import pylibcudf as plc from polars.polars import _expr_nodes as pl_expr +import pylibcudf as plc + from cudf_polars.containers import Column from cudf_polars.dsl.expressions.base import ExecutionContext, Expr diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py index c8aa993b994..c16313bf83c 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py 
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py @@ -9,6 +9,7 @@ from typing import TYPE_CHECKING, Any import pyarrow as pa + import pylibcudf as plc from cudf_polars.containers import Column diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py index 0247256e507..77d7d4c0d22 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py @@ -9,6 +9,7 @@ from typing import TYPE_CHECKING import pyarrow as pa + import pylibcudf as plc from cudf_polars.containers import Column diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index 62b54c63a8d..8b66c9d4676 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -10,11 +10,12 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc from polars.exceptions import InvalidOperationError from polars.polars import _expr_nodes as pl_expr +import pylibcudf as plc + from cudf_polars.containers import Column from cudf_polars.dsl.expressions.base import ExecutionContext, Expr from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index 53f6ed29239..6f22544c050 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, Any, ClassVar import pyarrow as pa + import pylibcudf as plc from cudf_polars.containers import Column diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index eb93929cf61..04aa74024cd 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ 
b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -20,14 +20,16 @@ from typing import TYPE_CHECKING, Any, ClassVar import pyarrow as pa -import pylibcudf as plc from typing_extensions import assert_never import polars as pl +import pylibcudf as plc + import cudf_polars.dsl.expr as expr from cudf_polars.containers import Column, DataFrame from cudf_polars.dsl.nodebase import Node +from cudf_polars.dsl.to_ast import to_parquet_filter from cudf_polars.utils import dtypes if TYPE_CHECKING: @@ -418,9 +420,14 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: colnames[0], ) elif self.typ == "parquet": + filters = None + if self.predicate is not None and self.row_index is None: + # Can't apply filters during read if we have a row index. + filters = to_parquet_filter(self.predicate.value) tbl_w_meta = plc.io.parquet.read_parquet( plc.io.SourceInfo(self.paths), columns=with_columns, + filters=filters, nrows=n_rows, skip_rows=self.skip_rows, ) @@ -429,6 +436,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: # TODO: consider nested column names? tbl_w_meta.column_names(include_children=False), ) + if filters is not None: + # Mask must have been applied. + return df elif self.typ == "ndjson": json_schema: list[tuple[str, str, list]] = [ (name, typ, []) for name, typ in self.schema.items() @@ -666,11 +676,11 @@ def __init__( raise NotImplementedError( "rolling window/groupby" ) # pragma: no cover; rollingwindow constructor has already raised + if self.options.dynamic: + raise NotImplementedError("dynamic group by") if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): raise NotImplementedError("Nested aggregations in groupby") self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] - if len(self.keys) == 0: - raise NotImplementedError("dynamic groupby") @staticmethod def check_agg(agg: expr.Expr) -> int: @@ -802,10 +812,10 @@ class Join(IR): right_on: tuple[expr.NamedExpr, ...] 
"""List of expressions used as keys in the right frame.""" options: tuple[ - Literal["inner", "left", "right", "full", "leftsemi", "leftanti", "cross"], + Literal["inner", "left", "right", "full", "semi", "anti", "cross"], bool, tuple[int, int] | None, - str | None, + str, bool, ] """ @@ -840,7 +850,7 @@ def __init__( @staticmethod @cache def _joiners( - how: Literal["inner", "left", "right", "full", "leftsemi", "leftanti"], + how: Literal["inner", "left", "right", "full", "semi", "anti"], ) -> tuple[ Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None ]: @@ -862,13 +872,13 @@ def _joiners( plc.copying.OutOfBoundsPolicy.NULLIFY, plc.copying.OutOfBoundsPolicy.NULLIFY, ) - elif how == "leftsemi": + elif how == "semi": return ( plc.join.left_semi_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - elif how == "leftanti": + elif how == "anti": return ( plc.join.left_anti_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, @@ -933,7 +943,6 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" left, right = (c.evaluate(cache=cache) for c in self.children) how, join_nulls, zlice, suffix, coalesce = self.options - suffix = "_right" if suffix is None else suffix if how == "cross": # Separate implementation, since cross_join returns the # result, not the gather maps @@ -955,7 +964,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: columns[left.num_columns :], right.column_names, strict=True ) ] - return DataFrame([*left_cols, *right_cols]) + return DataFrame([*left_cols, *right_cols]).slice(zlice) # TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184 left_on = DataFrame(broadcast(*(e.evaluate(left) for e in self.left_on))) right_on = DataFrame(broadcast(*(e.evaluate(right) for e in self.right_on))) diff --git a/python/cudf_polars/cudf_polars/dsl/to_ast.py b/python/cudf_polars/cudf_polars/dsl/to_ast.py new file mode 100644 index 
00000000000..9a0838631cc --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/to_ast.py @@ -0,0 +1,265 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Conversion of expression nodes to libcudf AST nodes.""" + +from __future__ import annotations + +from functools import partial, reduce, singledispatch +from typing import TYPE_CHECKING, TypeAlias + +from polars.polars import _expr_nodes as pl_expr + +import pylibcudf as plc +from pylibcudf import expressions as plc_expr + +from cudf_polars.dsl import expr +from cudf_polars.dsl.traversal import CachingVisitor +from cudf_polars.typing import GenericTransformer + +if TYPE_CHECKING: + from collections.abc import Mapping + +# Can't merge these op-mapping dictionaries because scoped enum values +# are exposed by cython with equality/hash based on their underlying +# representation type. So in a dict they are just treated as integers. +BINOP_TO_ASTOP = { + plc.binaryop.BinaryOperator.EQUAL: plc_expr.ASTOperator.EQUAL, + plc.binaryop.BinaryOperator.NULL_EQUALS: plc_expr.ASTOperator.NULL_EQUAL, + plc.binaryop.BinaryOperator.NOT_EQUAL: plc_expr.ASTOperator.NOT_EQUAL, + plc.binaryop.BinaryOperator.LESS: plc_expr.ASTOperator.LESS, + plc.binaryop.BinaryOperator.LESS_EQUAL: plc_expr.ASTOperator.LESS_EQUAL, + plc.binaryop.BinaryOperator.GREATER: plc_expr.ASTOperator.GREATER, + plc.binaryop.BinaryOperator.GREATER_EQUAL: plc_expr.ASTOperator.GREATER_EQUAL, + plc.binaryop.BinaryOperator.ADD: plc_expr.ASTOperator.ADD, + plc.binaryop.BinaryOperator.SUB: plc_expr.ASTOperator.SUB, + plc.binaryop.BinaryOperator.MUL: plc_expr.ASTOperator.MUL, + plc.binaryop.BinaryOperator.DIV: plc_expr.ASTOperator.DIV, + plc.binaryop.BinaryOperator.TRUE_DIV: plc_expr.ASTOperator.TRUE_DIV, + plc.binaryop.BinaryOperator.FLOOR_DIV: plc_expr.ASTOperator.FLOOR_DIV, + plc.binaryop.BinaryOperator.PYMOD: plc_expr.ASTOperator.PYMOD, + plc.binaryop.BinaryOperator.BITWISE_AND:
plc_expr.ASTOperator.BITWISE_AND, + plc.binaryop.BinaryOperator.BITWISE_OR: plc_expr.ASTOperator.BITWISE_OR, + plc.binaryop.BinaryOperator.BITWISE_XOR: plc_expr.ASTOperator.BITWISE_XOR, + plc.binaryop.BinaryOperator.LOGICAL_AND: plc_expr.ASTOperator.LOGICAL_AND, + plc.binaryop.BinaryOperator.LOGICAL_OR: plc_expr.ASTOperator.LOGICAL_OR, + plc.binaryop.BinaryOperator.NULL_LOGICAL_AND: plc_expr.ASTOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.NULL_LOGICAL_OR: plc_expr.ASTOperator.NULL_LOGICAL_OR, +} + +UOP_TO_ASTOP = { + plc.unary.UnaryOperator.SIN: plc_expr.ASTOperator.SIN, + plc.unary.UnaryOperator.COS: plc_expr.ASTOperator.COS, + plc.unary.UnaryOperator.TAN: plc_expr.ASTOperator.TAN, + plc.unary.UnaryOperator.ARCSIN: plc_expr.ASTOperator.ARCSIN, + plc.unary.UnaryOperator.ARCCOS: plc_expr.ASTOperator.ARCCOS, + plc.unary.UnaryOperator.ARCTAN: plc_expr.ASTOperator.ARCTAN, + plc.unary.UnaryOperator.SINH: plc_expr.ASTOperator.SINH, + plc.unary.UnaryOperator.COSH: plc_expr.ASTOperator.COSH, + plc.unary.UnaryOperator.TANH: plc_expr.ASTOperator.TANH, + plc.unary.UnaryOperator.ARCSINH: plc_expr.ASTOperator.ARCSINH, + plc.unary.UnaryOperator.ARCCOSH: plc_expr.ASTOperator.ARCCOSH, + plc.unary.UnaryOperator.ARCTANH: plc_expr.ASTOperator.ARCTANH, + plc.unary.UnaryOperator.EXP: plc_expr.ASTOperator.EXP, + plc.unary.UnaryOperator.LOG: plc_expr.ASTOperator.LOG, + plc.unary.UnaryOperator.SQRT: plc_expr.ASTOperator.SQRT, + plc.unary.UnaryOperator.CBRT: plc_expr.ASTOperator.CBRT, + plc.unary.UnaryOperator.CEIL: plc_expr.ASTOperator.CEIL, + plc.unary.UnaryOperator.FLOOR: plc_expr.ASTOperator.FLOOR, + plc.unary.UnaryOperator.ABS: plc_expr.ASTOperator.ABS, + plc.unary.UnaryOperator.RINT: plc_expr.ASTOperator.RINT, + plc.unary.UnaryOperator.BIT_INVERT: plc_expr.ASTOperator.BIT_INVERT, + plc.unary.UnaryOperator.NOT: plc_expr.ASTOperator.NOT, +} + +SUPPORTED_STATISTICS_BINOPS = { + plc.binaryop.BinaryOperator.EQUAL, + plc.binaryop.BinaryOperator.NOT_EQUAL, + 
plc.binaryop.BinaryOperator.LESS, + plc.binaryop.BinaryOperator.LESS_EQUAL, + plc.binaryop.BinaryOperator.GREATER, + plc.binaryop.BinaryOperator.GREATER_EQUAL, +} + +REVERSED_COMPARISON = { + plc.binaryop.BinaryOperator.EQUAL: plc.binaryop.BinaryOperator.EQUAL, + plc.binaryop.BinaryOperator.NOT_EQUAL: plc.binaryop.BinaryOperator.NOT_EQUAL, + plc.binaryop.BinaryOperator.LESS: plc.binaryop.BinaryOperator.GREATER, + plc.binaryop.BinaryOperator.LESS_EQUAL: plc.binaryop.BinaryOperator.GREATER_EQUAL, + plc.binaryop.BinaryOperator.GREATER: plc.binaryop.BinaryOperator.LESS, + plc.binaryop.BinaryOperator.GREATER_EQUAL: plc.binaryop.BinaryOperator.LESS_EQUAL, +} + + +Transformer: TypeAlias = GenericTransformer[expr.Expr, plc_expr.Expression] + + +@singledispatch +def _to_ast(node: expr.Expr, self: Transformer) -> plc_expr.Expression: + """ + Translate an expression to a pylibcudf Expression. + + Parameters + ---------- + node + Expression to translate. + self + Recursive transformer. The state dictionary should contain a + `for_parquet` key indicating if this transformation should + provide an expression suitable for use in parquet filters. + + If `for_parquet` is `False`, the dictionary should contain a + `name_to_index` mapping that maps column names to their + integer index in the table that will be used for evaluation of + the expression. + + Returns + ------- + pylibcudf Expression. + + Raises + ------ + NotImplementedError or KeyError if the expression cannot be translated. 
+ """ + raise NotImplementedError(f"Unhandled expression type {type(node)}") + + +@_to_ast.register +def _(node: expr.Col, self: Transformer) -> plc_expr.Expression: + if self.state["for_parquet"]: + return plc_expr.ColumnNameReference(node.name) + return plc_expr.ColumnReference(self.state["name_to_index"][node.name]) + + +@_to_ast.register +def _(node: expr.Literal, self: Transformer) -> plc_expr.Expression: + return plc_expr.Literal(plc.interop.from_arrow(node.value)) + + +@_to_ast.register +def _(node: expr.BinOp, self: Transformer) -> plc_expr.Expression: + if node.op == plc.binaryop.BinaryOperator.NULL_NOT_EQUALS: + return plc_expr.Operation( + plc_expr.ASTOperator.NOT, + self( + # Reconstruct and apply, rather than directly + # constructing the right expression so we get the + # handling of parquet special cases for free. + expr.BinOp( + node.dtype, plc.binaryop.BinaryOperator.NULL_EQUALS, *node.children + ) + ), + ) + if self.state["for_parquet"]: + op1_col, op2_col = (isinstance(op, expr.Col) for op in node.children) + if op1_col ^ op2_col: + op = node.op + if op not in SUPPORTED_STATISTICS_BINOPS: + raise NotImplementedError( + f"Parquet filter binop with column doesn't support {node.op!r}" + ) + op1, op2 = node.children + if op2_col: + (op1, op2) = (op2, op1) + op = REVERSED_COMPARISON[op] + if not isinstance(op2, expr.Literal): + raise NotImplementedError( + "Parquet filter binops must have form 'col binop literal'" + ) + return plc_expr.Operation(BINOP_TO_ASTOP[op], self(op1), self(op2)) + elif op1_col and op2_col: + raise NotImplementedError( + "Parquet filter binops must have one column reference not two" + ) + return plc_expr.Operation(BINOP_TO_ASTOP[node.op], *map(self, node.children)) + + +@_to_ast.register +def _(node: expr.BooleanFunction, self: Transformer) -> plc_expr.Expression: + if node.name == pl_expr.BooleanFunction.IsIn: + needles, haystack = node.children + if isinstance(haystack, expr.LiteralColumn) and len(haystack.value) < 16: + # 16 
is an arbitrary limit + needle_ref = self(needles) + values = [ + plc_expr.Literal(plc.interop.from_arrow(v)) for v in haystack.value + ] + return reduce( + partial(plc_expr.Operation, plc_expr.ASTOperator.LOGICAL_OR), + ( + plc_expr.Operation(plc_expr.ASTOperator.EQUAL, needle_ref, value) + for value in values + ), + ) + if self.state["for_parquet"] and isinstance(node.children[0], expr.Col): + raise NotImplementedError( + f"Parquet filters don't support {node.name} on columns" + ) + if node.name == pl_expr.BooleanFunction.IsNull: + return plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, self(node.children[0])) + elif node.name == pl_expr.BooleanFunction.IsNotNull: + return plc_expr.Operation( + plc_expr.ASTOperator.NOT, + plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, self(node.children[0])), + ) + elif node.name == pl_expr.BooleanFunction.Not: + return plc_expr.Operation(plc_expr.ASTOperator.NOT, self(node.children[0])) + raise NotImplementedError(f"AST conversion does not support {node.name}") + + +@_to_ast.register +def _(node: expr.UnaryFunction, self: Transformer) -> plc_expr.Expression: + if isinstance(node.children[0], expr.Col) and self.state["for_parquet"]: + raise NotImplementedError( + f"Parquet filters don't support {node.name} on columns" + ) + return plc_expr.Operation( + UOP_TO_ASTOP[node._OP_MAPPING[node.name]], self(node.children[0]) + ) + + +def to_parquet_filter(node: expr.Expr) -> plc_expr.Expression | None: + """ + Convert an expression to libcudf AST nodes suitable for parquet filtering. + + Parameters + ---------- + node + Expression to convert. + + Returns + ------- + pylibcudf Expression if conversion is possible, otherwise None.
+ """ + mapper = CachingVisitor(_to_ast, state={"for_parquet": True}) + try: + return mapper(node) + except (KeyError, NotImplementedError): + return None + + +def to_ast( + node: expr.Expr, *, name_to_index: Mapping[str, int] +) -> plc_expr.Expression | None: + """ + Convert an expression to libcudf AST nodes suitable for compute_column. + + Parameters + ---------- + node + Expression to convert. + name_to_index + Mapping from column names to their index in the table that + will be used for expression evaluation. + + Returns + ------- + pylibcudf Expression if conversion is possible, otherwise None. + """ + mapper = CachingVisitor( + _to_ast, state={"for_parquet": False, "name_to_index": name_to_index} + ) + try: + return mapper(node) + except (KeyError, NotImplementedError): + return None diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 522c4a6729c..5181214819e 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -5,23 +5,29 @@ from __future__ import annotations +import functools import json from contextlib import AbstractContextManager, nullcontext from functools import singledispatch -from typing import Any +from typing import TYPE_CHECKING, Any import pyarrow as pa -import pylibcudf as plc from typing_extensions import assert_never import polars as pl import polars.polars as plrs from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir +import pylibcudf as plc + from cudf_polars.dsl import expr, ir +from cudf_polars.dsl.traversal import make_recursive, reuse_if_unchanged from cudf_polars.typing import NodeTraverser from cudf_polars.utils import dtypes, sorting +if TYPE_CHECKING: + from cudf_polars.typing import ExprTransformer + __all__ = ["translate_ir", "translate_named_expr"] @@ -182,7 +188,71 @@ def _( with set_node(visitor, node.input_right): inp_right = translate_ir(visitor, n=None) right_on = 
[translate_named_expr(visitor, n=e) for e in node.right_on] - return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right) + if (how := node.options[0]) in { + "inner", + "left", + "right", + "full", + "cross", + "semi", + "anti", + }: + return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right) + else: + how, op1, op2 = how + if how != "ie_join": + raise NotImplementedError( + f"Unsupported join type {how}" + ) # pragma: no cover; asof joins not yet exposed + # No exposure of mixed/conditional joins in pylibcudf yet, so in + # the first instance, implement by doing a cross join followed by + # a filter. + _, join_nulls, zlice, suffix, coalesce = node.options + cross = ir.Join( + schema, + [], + [], + ("cross", join_nulls, None, suffix, coalesce), + inp_left, + inp_right, + ) + dtype = plc.DataType(plc.TypeId.BOOL8) + if op2 is None: + ops = [op1] + else: + ops = [op1, op2] + suffix = cross.options[3] + + # Column references in the right table refer to the post-join + # names, so with suffixes. 
+ def _rename(e: expr.Expr, rec: ExprTransformer) -> expr.Expr: + if isinstance(e, expr.Col) and e.name in inp_left.schema: + return type(e)(e.dtype, f"{e.name}{suffix}") + return reuse_if_unchanged(e, rec) + + mapper = make_recursive(_rename) + right_on = [ + expr.NamedExpr( + f"{old.name}{suffix}" if old.name in inp_left.schema else old.name, new + ) + for new, old in zip( + (mapper(e.value) for e in right_on), right_on, strict=True + ) + ] + mask = functools.reduce( + functools.partial( + expr.BinOp, dtype, plc.binaryop.BinaryOperator.LOGICAL_AND + ), + ( + expr.BinOp(dtype, expr.BinOp._MAPPING[op], left.value, right.value) + for op, left, right in zip(ops, left_on, right_on, strict=True) + ), + ) + filtered = ir.Filter(schema, expr.NamedExpr("mask", mask), cross) + if zlice is not None: + offset, length = zlice + return ir.Slice(schema, offset, length, filtered) + return filtered @_translate_ir.register @@ -319,8 +389,7 @@ def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: # IR is versioned with major.minor, minor is bumped for backwards # compatible changes (e.g. adding new nodes), major is bumped for # incompatible changes (e.g. renaming nodes). - # Polars 1.7 changes definition of the CSV reader options schema name. - if (version := visitor.version()) >= (3, 0): + if (version := visitor.version()) >= (4, 0): raise NotImplementedError( f"No support for polars IR {version=}" ) # pragma: no cover; no such version for now. 
diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index 7b6f3848fc4..7b45c1eaa06 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -151,7 +151,7 @@ def assert_collect_raises( collect_kwargs: dict[OptimizationArgs, bool] | None = None, polars_collect_kwargs: dict[OptimizationArgs, bool] | None = None, cudf_collect_kwargs: dict[OptimizationArgs, bool] | None = None, -): +) -> None: """ Assert that collecting the result of a query raises the expected exceptions. diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 05b76d76808..e01ccd05527 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -16,7 +16,7 @@ from collections.abc import Mapping -def pytest_addoption(parser: pytest.Parser): +def pytest_addoption(parser: pytest.Parser) -> None: """Add plugin-specific options.""" group = parser.getgroup( "cudf-polars", "Plugin to set GPU as default engine for polars tests" @@ -28,7 +28,7 @@ def pytest_addoption(parser: pytest.Parser): ) -def pytest_configure(config: pytest.Config): +def pytest_configure(config: pytest.Config) -> None: """Enable use of this module as a pytest plugin to enable GPU collection.""" no_fallback = config.getoption("--cudf-polars-no-fallback") collect = polars.LazyFrame.collect @@ -53,12 +53,34 @@ def pytest_configure(config: pytest.Config): "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-columns]": "Correctly raises but different error", + 
"tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "Correctly raises but different error", "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[False-False]": "Needs some variant of cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[True-False]": "Needs some variant of cudf#16394", "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception", "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-prefiltered]": "Mismatching column read cudf#16394", + 
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-columns]": "Mismatching column read cudf#16394", "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match", @@ 
-107,6 +129,14 @@ def pytest_configure(config: pytest.Config): "tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero", "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list", "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func0-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func1-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func2-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func3-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func0-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func1-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func2-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func3-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", 
"tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", @@ -124,13 +154,6 @@ def pytest_configure(config: pytest.Config): "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg", "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg", "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_by_monday_and_offset_5444": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[left-expected0]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[right-expected1]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[datapoint-expected2]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_rolling_dynamic_sortedness_check": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_validation": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_15225": "IR needs to expose groupby-dynamic information", "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", 
"tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", @@ -140,6 +163,7 @@ def pytest_configure(config: pytest.Config): "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match", "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match", "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852", + "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised", # Maybe flaky, order-dependent? "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", @@ -148,7 +172,7 @@ def pytest_configure(config: pytest.Config): def pytest_collection_modifyitems( session: pytest.Session, config: pytest.Config, items: list[pytest.Item] -): +) -> None: """Mark known failing tests.""" if config.getoption("--cudf-polars-no-fallback"): # Don't xfail tests if running without fallback diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index a27a3395c35..57c5fdaa7cf 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -8,10 +8,10 @@ from collections.abc import Hashable, Mapping from typing import TYPE_CHECKING, Any, Literal, Protocol, TypeVar, Union -import pylibcudf as plc - from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir +import pylibcudf as plc + if TYPE_CHECKING: from collections.abc import Callable from typing import TypeAlias diff --git 
a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 4154a404e98..1d0479802ca 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -8,11 +8,12 @@ from functools import cache import pyarrow as pa -import pylibcudf as plc from typing_extensions import assert_never import polars as pl +import pylibcudf as plc + __all__ = ["from_polars", "downcast_arrow_lists", "can_cast"] diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index 4a7ad6b3cf2..a119cab3b74 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -12,11 +12,12 @@ POLARS_VERSION = parse(__version__) -POLARS_VERSION_LT_18 = POLARS_VERSION < parse("1.8") +POLARS_VERSION_LT_111 = POLARS_VERSION < parse("1.11") +POLARS_VERSION_LT_112 = POLARS_VERSION < parse("1.12") def _ensure_polars_version(): - if POLARS_VERSION_LT_18: + if POLARS_VERSION_LT_111: raise ImportError( - "cudf_polars requires py-polars v1.8 or greater." + "cudf_polars requires py-polars v1.11 or greater." ) # pragma: no cover diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index a8bb634732f..a2c62ef9460 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.8,<1.9", + "polars>=1.11,<1.13", "pylibcudf==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ @@ -188,7 +188,7 @@ required-imports = ["from __future__ import annotations"] [tool.ruff.lint.isort.sections] polars = ["polars"] -rapids = ["rmm", "cudf"] +rapids = ["rmm", "pylibcudf"] [tool.ruff.format] docstring-code-format = true diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py index 1f26ab1af9f..95541b4ecc3 100644 --- a/python/cudf_polars/tests/containers/test_column.py +++ b/python/cudf_polars/tests/containers/test_column.py @@ -4,9 +4,10 @@ from __future__ import annotations import pyarrow -import pylibcudf as plc import pytest +import pylibcudf as plc + from cudf_polars.containers import Column diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py index 5c68fb8f0aa..d68c8d90163 100644 --- a/python/cudf_polars/tests/containers/test_dataframe.py +++ b/python/cudf_polars/tests/containers/test_dataframe.py @@ -3,11 +3,12 @@ from __future__ import annotations -import pylibcudf as plc import pytest import polars as pl +import pylibcudf as plc + from cudf_polars.containers import Column, DataFrame from cudf_polars.testing.asserts import assert_gpu_result_equal diff --git a/python/cudf_polars/tests/dsl/test_expr.py b/python/cudf_polars/tests/dsl/test_expr.py index 84e33262869..de8fec301fe 100644 --- a/python/cudf_polars/tests/dsl/test_expr.py +++ b/python/cudf_polars/tests/dsl/test_expr.py @@ -3,9 +3,10 @@ from __future__ import annotations -import pylibcudf as plc import pytest +import pylibcudf as plc + from cudf_polars.dsl import expr diff --git a/python/cudf_polars/tests/dsl/test_to_ast.py b/python/cudf_polars/tests/dsl/test_to_ast.py new file mode 100644 index 00000000000..57d794d4890 --- /dev/null +++ b/python/cudf_polars/tests/dsl/test_to_ast.py @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import polars as pl +from polars.testing import assert_frame_equal + +import pylibcudf as plc + +import cudf_polars.dsl.ir as ir_nodes +from cudf_polars import translate_ir +from cudf_polars.containers.dataframe import DataFrame, NamedColumn +from cudf_polars.dsl.to_ast import to_ast + + +@pytest.fixture(scope="module") +def df(): + return pl.LazyFrame( + { + "c": ["a", "b", "c", "d", "e", "f"], + "a": [1, 2, 3, None, 4, 5], + "b": pl.Series([None, None, 3, float("inf"), 4, 0], dtype=pl.Float64), + "d": [False, True, True, None, False, False], + } + ) + + +@pytest.mark.parametrize( + "expr", + [ + pl.col("a").is_in([0, 1]), + pl.col("a").is_between(0, 2), + (pl.col("a") < pl.col("b")).not_(), + pl.lit(2) > pl.col("a"), + pl.lit(2) >= pl.col("a"), + pl.lit(2) < pl.col("a"), + pl.lit(2) <= pl.col("a"), + pl.lit(0) == pl.col("a"), + pl.lit(1) != pl.col("a"), + (pl.col("b") < pl.lit(2, dtype=pl.Float64).sqrt()), + (pl.col("a") >= pl.lit(2)) & (pl.col("b") > 0), + pl.col("a").is_null(), + pl.col("a").is_not_null(), + pl.col("b").is_finite(), + pytest.param( + pl.col("a").sin(), + marks=pytest.mark.xfail(reason="Need to insert explicit casts"), + ), + pl.col("b").cos(), + pl.col("a").abs().is_between(0, 2), + pl.col("a").ne_missing(pl.lit(None, dtype=pl.Int64)), + [pl.col("a") * 2, pl.col("b") + pl.col("a")], + pl.col("d").not_(), + ], +) +def test_compute_column(expr, df): + q = df.select(expr) + ir = translate_ir(q._ldf.visit()) + + assert isinstance(ir, ir_nodes.Select) + table = ir.children[0].evaluate(cache={}) + name_to_index = {c.name: i for i, c in enumerate(table.columns)} + + def compute_column(e): + ast = to_ast(e.value, name_to_index=name_to_index) + if ast is not None: + return NamedColumn( + plc.transform.compute_column(table.table, ast), name=e.name + ) + return e.evaluate(table) + + got = DataFrame(map(compute_column, ir.exprs)).to_polars() + + expect = 
q.collect() + + assert_frame_equal(expect, got) diff --git a/python/cudf_polars/tests/dsl/test_traversal.py b/python/cudf_polars/tests/dsl/test_traversal.py index 6505a786855..15c644d7978 100644 --- a/python/cudf_polars/tests/dsl/test_traversal.py +++ b/python/cudf_polars/tests/dsl/test_traversal.py @@ -5,11 +5,11 @@ from functools import singledispatch -import pylibcudf as plc - import polars as pl from polars.testing import assert_frame_equal +import pylibcudf as plc + from cudf_polars import translate_ir from cudf_polars.dsl import expr, ir from cudf_polars.dsl.traversal import ( diff --git a/python/cudf_polars/tests/expressions/test_literal.py b/python/cudf_polars/tests/expressions/test_literal.py index ced49bdc254..52bc4a9ac71 100644 --- a/python/cudf_polars/tests/expressions/test_literal.py +++ b/python/cudf_polars/tests/expressions/test_literal.py @@ -2,11 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import pylibcudf as plc import pytest import polars as pl +import pylibcudf as plc + from cudf_polars.testing.asserts import ( assert_gpu_result_equal, assert_ir_translation_raises, diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py index 2a37683478b..62df8ce1498 100644 --- a/python/cudf_polars/tests/expressions/test_sort.py +++ b/python/cudf_polars/tests/expressions/test_sort.py @@ -4,11 +4,12 @@ import itertools -import pylibcudf as plc import pytest import polars as pl +import pylibcudf as plc + from cudf_polars import translate_ir from cudf_polars.testing.asserts import assert_gpu_result_equal diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 7d9ec98db97..8ca7a7b9264 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -2,14 +2,18 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +from contextlib import nullcontext + import pytest import 
polars as pl +from polars.testing import assert_frame_equal from cudf_polars.testing.asserts import ( assert_gpu_result_equal, assert_ir_translation_raises, ) +from cudf_polars.utils.versions import POLARS_VERSION_LT_112 @pytest.fixture(params=[False, True], ids=["nulls_not_equal", "nulls_equal"]) @@ -22,6 +26,11 @@ def how(request): return request.param +@pytest.fixture(params=[None, (1, 5), (1, None), (0, 2), (0, None)]) +def zlice(request): + return request.param + + @pytest.fixture def left(): return pl.LazyFrame( @@ -37,8 +46,9 @@ def left(): def right(): return pl.LazyFrame( { - "a": [1, 4, 3, 7, None, None], - "c": [2, 3, 4, 5, 6, 7], + "a": [1, 4, 3, 7, None, None, 1], + "c": [2, 3, 4, 5, 6, 7, 8], + "d": [6, None, 7, 8, -1, 2, 4], } ) @@ -70,11 +80,31 @@ def test_coalesce_join(left, right, how, join_nulls, join_expr): query = left.join( right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=True ) - assert_gpu_result_equal(query, check_row_order=False) + assert_gpu_result_equal(query, check_row_order=how == "left") -def test_cross_join(left, right): +def test_left_join_with_slice(left, right, join_nulls, zlice): + q = left.join(right, on="a", how="left", join_nulls=join_nulls, coalesce=True) + ctx = nullcontext() + if zlice is not None: + q_expect = q.collect().slice(*zlice) + q = q.slice(*zlice) + if POLARS_VERSION_LT_112 and (zlice == (1, 5) or zlice == (0, 2)): + # https://github.com/pola-rs/polars/issues/19403 + # https://github.com/pola-rs/polars/issues/19405 + ctx = pytest.raises(AssertionError) + assert_frame_equal( + q_expect, q.collect(engine=pl.GPUEngine(raise_on_fail=True)) + ) + + with ctx: + assert_gpu_result_equal(q) + + +def test_cross_join(left, right, zlice): q = left.join(right, how="cross") + if zlice is not None: + q = q.slice(*zlice) assert_gpu_result_equal(q) @@ -86,3 +116,26 @@ def test_join_literal_key_unsupported(left, right, left_on, right_on): q = left.join(right, left_on=left_on, right_on=right_on, how="inner") 
assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize( + "conditions", + [ + [pl.col("a") < pl.col("a_right")], + [pl.col("a_right") <= pl.col("a") * 2], + [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")], + [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")], + [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2], + ], +) +def test_join_where(left, right, conditions, zlice): + q = left.join_where(right, *conditions) + + assert_gpu_result_equal(q, check_row_order=False) + + if zlice is not None: + q_len = q.slice(*zlice).select(pl.len()) + # Can't compare result, since row order is not guaranteed and + # therefore we only check the length + + assert_gpu_result_equal(q_len) diff --git a/python/cudf_polars/tests/test_parquet_filters.py b/python/cudf_polars/tests/test_parquet_filters.py new file mode 100644 index 00000000000..545a89250fc --- /dev/null +++ b/python/cudf_polars/tests/test_parquet_filters.py @@ -0,0 +1,60 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl +from polars.testing import assert_frame_equal + + +@pytest.fixture(scope="module") +def df(): + return pl.DataFrame( + { + "c": ["a", "b", "c", "d", "e", "f"], + "a": [1, 2, 3, None, 4, 5], + "b": pl.Series([None, None, 3, float("inf"), 4, 0], dtype=pl.Float64), + "d": [-1, 2, -3, None, 4, -5], + } + ) + + +@pytest.fixture(scope="module") +def pq_file(tmp_path_factory, df): + tmp_path = tmp_path_factory.mktemp("parquet_filter") + df.write_parquet(tmp_path / "tmp.pq", row_group_size=3) + return pl.scan_parquet(tmp_path / "tmp.pq") + + +@pytest.mark.parametrize( + "expr", + [ + pl.col("a").is_in([0, 1]), + pl.col("a").is_between(0, 2), + (pl.col("a") < 2).not_(), + pl.lit(2) > pl.col("a"), + pl.lit(2) >= pl.col("a"), + pl.lit(2) < pl.col("a"), + pl.lit(2) <= pl.col("a"), + pl.lit(0) == pl.col("a"), + pl.lit(1) != pl.col("a"), + pl.col("a") == pl.col("d"), + (pl.col("b") < pl.lit(2, dtype=pl.Float64).sqrt()), + (pl.col("a") >= pl.lit(2)) & (pl.col("b") > 0), + pl.col("b").is_finite(), + pl.col("a").is_null(), + pl.col("a").is_not_null(), + pl.col("a").abs().is_between(0, 2), + pl.col("a").ne_missing(pl.lit(None, dtype=pl.Int64)), + ], +) +@pytest.mark.parametrize("selection", [["c", "b"], ["a"], ["a", "c"], ["b"], "c"]) +def test_scan_by_hand(expr, selection, pq_file): + df = pq_file.collect() + q = pq_file.filter(expr).select(*selection) + # Not using assert_gpu_result_equal because + # https://github.com/pola-rs/polars/issues/19238 + got = q.collect(engine=pl.GPUEngine(raise_on_fail=True)) + expect = df.filter(expr).select(*selection) + assert_frame_equal(got, expect) diff --git a/python/cudf_polars/tests/utils/test_broadcast.py b/python/cudf_polars/tests/utils/test_broadcast.py index e7770bfadac..3b3b4f0f8db 100644 --- a/python/cudf_polars/tests/utils/test_broadcast.py +++ b/python/cudf_polars/tests/utils/test_broadcast.py @@ -3,9 +3,10 @@ from 
__future__ import annotations -import pylibcudf as plc import pytest +import pylibcudf as plc + from cudf_polars.containers import Column from cudf_polars.dsl.ir import broadcast diff --git a/python/custreamz/custreamz/tests/conftest.py b/python/custreamz/custreamz/tests/conftest.py index 1cda9b71387..c5135bc6414 100644 --- a/python/custreamz/custreamz/tests/conftest.py +++ b/python/custreamz/custreamz/tests/conftest.py @@ -2,6 +2,7 @@ import socket import pytest + from custreamz import kafka diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index af45f49d9b4..a8ab05a3922 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -65,50 +65,20 @@ include = [ ] exclude = ["*tests*"] -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", - "cudf", - "dask_cudf", -] -known_first_party = [ - "streamz", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", -] +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["streamz"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "cudf", "dask_cudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index 
04c2ad65b99..f9df22cc436 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -7,15 +7,15 @@ # do anything for dask==2024.2.0) config.set({"dataframe.query-planning-warning": False}) -import dask.dataframe as dd -from dask.dataframe import from_delayed +import dask.dataframe as dd # noqa: E402 +from dask.dataframe import from_delayed # noqa: E402 -import cudf +import cudf # noqa: E402 -from . import backends -from ._version import __git_commit__, __version__ -from .core import concat, from_cudf, from_dask_dataframe -from .expr import QUERY_PLANNING_ON +from . import backends # noqa: E402, F401 +from ._version import __git_commit__, __version__ # noqa: E402, F401 +from .core import concat, from_cudf, from_dask_dataframe # noqa: E402 +from .expr import QUERY_PLANNING_ON # noqa: E402 def read_csv(*args, **kwargs): @@ -55,9 +55,9 @@ def inner_func(*args, **kwargs): to_orc = raise_not_implemented_error("to_orc") else: - from .core import DataFrame, Index, Series - from .groupby import groupby_agg - from .io import read_text, to_orc + from .core import DataFrame, Index, Series # noqa: F401 + from .groupby import groupby_agg # noqa: F401 + from .io import read_text, to_orc # noqa: F401 __all__ = [ diff --git a/python/dask_cudf/dask_cudf/expr/__init__.py b/python/dask_cudf/dask_cudf/expr/__init__.py index a76b655ef42..6dadadd5263 100644 --- a/python/dask_cudf/dask_cudf/expr/__init__.py +++ b/python/dask_cudf/dask_cudf/expr/__init__.py @@ -12,8 +12,8 @@ config.set({"dataframe.shuffle.method": "tasks"}) try: - import dask_cudf.expr._collection - import dask_cudf.expr._expr + import dask_cudf.expr._collection # noqa: F401 + import dask_cudf.expr._expr # noqa: F401 except ImportError as err: # Dask *should* raise an error before this. 
diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py index 76bb2ea99b4..0421bd755f4 100644 --- a/python/dask_cudf/dask_cudf/io/__init__.py +++ b/python/dask_cudf/dask_cudf/io/__init__.py @@ -1,11 +1,11 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. -from .csv import read_csv -from .json import read_json -from .orc import read_orc, to_orc -from .text import read_text +from .csv import read_csv # noqa: F401 +from .json import read_json # noqa: F401 +from .orc import read_orc, to_orc # noqa: F401 +from .text import read_text # noqa: F401 try: - from .parquet import read_parquet, to_parquet + from .parquet import read_parquet, to_parquet # noqa: F401 except ImportError: pass diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index a781b8242fe..39ac6474958 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -383,6 +383,12 @@ def write_metadata(parts, fmd, fs, path, append=False, **kwargs): metadata_path = fs.sep.join([path, "_metadata"]) _meta = [] if append and fmd is not None: + # Convert to bytes: + if isinstance(fmd, pq.FileMetaData): + with BytesIO() as myio: + fmd.write_metadata_file(myio) + myio.seek(0) + fmd = np.frombuffer(myio.read(), dtype="uint8") _meta = [fmd] _meta.extend([parts[i][0]["meta"] for i in range(len(parts))]) _meta = ( diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index ae5ca480e31..a29cf9a342a 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -644,3 +644,23 @@ def test_read_parquet_arrow_filesystem(tmpdir, min_part_size): dd.assert_eq(df, ddf, check_index=False) assert isinstance(ddf._meta, cudf.DataFrame) assert isinstance(ddf.compute(), cudf.DataFrame) + + 
+@pytest.mark.parametrize("write_metadata_file", [True, False]) +def test_to_parquet_append(tmpdir, write_metadata_file): + df = cudf.DataFrame({"a": [1, 2, 3]}) + ddf = dask_cudf.from_cudf(df, npartitions=1) + ddf.to_parquet( + tmpdir, + append=True, + write_metadata_file=write_metadata_file, + write_index=False, + ) + ddf.to_parquet( + tmpdir, + append=True, + write_metadata_file=write_metadata_file, + write_index=False, + ) + ddf2 = dask_cudf.read_parquet(tmpdir) + dd.assert_eq(cudf.concat([df, df]), ddf2) diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 705865d083b..862e8f36eaa 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -69,50 +69,17 @@ version = {file = "dask_cudf/VERSION"} [tool.setuptools.packages.find] exclude = ["*tests*"] -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true +[tool.ruff] +extend = "../../pyproject.toml" -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", - "cudf", -] -known_first_party = [ - "dask_cudf", -] +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["dask_cudf"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", -] +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "cudf"] [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index 84660cbc276..c6d9ae56467 100644 --- a/python/libcudf/pyproject.toml +++ 
b/python/libcudf/pyproject.toml @@ -38,7 +38,7 @@ classifiers = [ "Environment :: GPU :: NVIDIA CUDA", ] dependencies = [ - "nvidia-nvcomp==4.0.1", + "nvidia-nvcomp==4.1.0.6", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index 15dd2b4c34f..b1d9656afc2 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -26,6 +26,7 @@ set(cython_sources filling.pyx gpumemoryview.pyx groupby.pyx + hashing.pyx interop.pyx join.pyx json.pyx diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index aa67b4b1149..aa2ce957173 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -13,6 +13,8 @@ from . cimport ( expressions, filling, groupby, + hashing, + interop, join, json, labeling, @@ -62,6 +64,8 @@ __all__ = [ "filling", "gpumemoryview", "groupby", + "hashing", + "interop", "join", "json", "lists", diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 4033062b7e2..62a2170f83e 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -22,6 +22,7 @@ expressions, filling, groupby, + hashing, interop, io, join, @@ -73,6 +74,7 @@ "filling", "gpumemoryview", "groupby", + "hashing", "interop", "io", "join", diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx index 51b2b4cfaa3..eef73bf4e9d 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pyx +++ b/python/pylibcudf/pylibcudf/binaryop.pyx @@ -100,6 +100,7 @@ cpdef bool is_supported_operation( The right hand side data type. op : BinaryOperator The operation to check. 
+ Returns ------- bool diff --git a/python/pylibcudf/pylibcudf/datetime.pxd b/python/pylibcudf/pylibcudf/datetime.pxd index 72ce680ba7a..335ef435f9b 100644 --- a/python/pylibcudf/pylibcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/datetime.pxd @@ -1,15 +1,56 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from pylibcudf.libcudf.datetime cimport datetime_component +from pylibcudf.column cimport Column +from pylibcudf.libcudf.datetime cimport datetime_component, rounding_frequency +from pylibcudf.scalar cimport Scalar -from .column cimport Column +ctypedef fused ColumnOrScalar: + Column + Scalar +cpdef Column extract_millisecond_fraction( + Column input +) + +cpdef Column extract_microsecond_fraction( + Column input +) -cpdef Column extract_year( - Column col +cpdef Column extract_nanosecond_fraction( + Column input ) cpdef Column extract_datetime_component( - Column col, + Column input, datetime_component component ) + +cpdef Column ceil_datetimes( + Column input, + rounding_frequency freq +) + +cpdef Column floor_datetimes( + Column input, + rounding_frequency freq +) + +cpdef Column round_datetimes( + Column input, + rounding_frequency freq +) + +cpdef Column add_calendrical_months( + Column timestamps, + ColumnOrScalar months, +) + +cpdef Column day_of_year(Column input) + +cpdef Column is_leap_year(Column input) + +cpdef Column last_day_of_month(Column input) + +cpdef Column extract_quarter(Column input) + +cpdef Column days_in_month(Column input) diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index ac4335cca56..9e5e709d81d 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -3,41 +3,106 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.datetime cimport ( + add_calendrical_months as cpp_add_calendrical_months, + ceil_datetimes as cpp_ceil_datetimes, 
datetime_component, + day_of_year as cpp_day_of_year, + days_in_month as cpp_days_in_month, extract_datetime_component as cpp_extract_datetime_component, - extract_year as cpp_extract_year, + extract_microsecond_fraction as cpp_extract_microsecond_fraction, + extract_millisecond_fraction as cpp_extract_millisecond_fraction, + extract_nanosecond_fraction as cpp_extract_nanosecond_fraction, + extract_quarter as cpp_extract_quarter, + floor_datetimes as cpp_floor_datetimes, + is_leap_year as cpp_is_leap_year, + last_day_of_month as cpp_last_day_of_month, + round_datetimes as cpp_round_datetimes, + rounding_frequency, ) from pylibcudf.libcudf.datetime import \ datetime_component as DatetimeComponent # no-cython-lint +from pylibcudf.libcudf.datetime import \ + rounding_frequency as RoundingFrequency # no-cython-lint + +from cython.operator cimport dereference from .column cimport Column +cpdef Column extract_millisecond_fraction( + Column input +): + """ + Extract the millisecond from a datetime column. + + For details, see :cpp:func:`extract_millisecond_fraction`. + + Parameters + ---------- + input : Column + The column to extract the millisecond from. + + Returns + ------- + Column + Column with the extracted milliseconds. + """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_extract_millisecond_fraction(input.view()) + return Column.from_libcudf(move(result)) + +cpdef Column extract_microsecond_fraction( + Column input +): + """ + Extract the microsecond fraction from a datetime column. + + For details, see :cpp:func:`extract_microsecond_fraction`. + + Parameters + ---------- + input : Column + The column to extract the microsecond fraction from. + + Returns + ------- + Column + Column with the extracted microsecond fractions. 
+ """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_extract_microsecond_fraction(input.view()) + return Column.from_libcudf(move(result)) -cpdef Column extract_year( - Column values +cpdef Column extract_nanosecond_fraction( + Column input ): """ - Extract the year from a datetime column. + Extract the nanosecond fraction from a datetime column. + + For details, see :cpp:func:`extract_nanosecond_fraction`. Parameters ---------- - values : Column - The column to extract the year from. + input : Column + The column to extract the nanosecond fraction from. Returns ------- Column - Column with the extracted years. + Column with the extracted nanosecond fractions. """ cdef unique_ptr[column] result with nogil: - result = cpp_extract_year(values.view()) + result = cpp_extract_nanosecond_fraction(input.view()) return Column.from_libcudf(move(result)) cpdef Column extract_datetime_component( - Column values, + Column input, datetime_component component ): """ @@ -47,7 +112,7 @@ cpdef Column extract_datetime_component( Parameters ---------- - values : Column + input : Column The column to extract the component from. component : DatetimeComponent The datetime component to extract. @@ -60,5 +125,237 @@ cpdef Column extract_datetime_component( cdef unique_ptr[column] result with nogil: - result = cpp_extract_datetime_component(values.view(), component) + result = cpp_extract_datetime_component(input.view(), component) + return Column.from_libcudf(move(result)) + +cpdef Column ceil_datetimes( + Column input, + rounding_frequency freq +): + """ + Round datetimes up to the nearest multiple of the given frequency. + + For details, see :cpp:func:`ceil_datetimes`. + + Parameters + ---------- + input : Column + The column of input datetime values. + freq : rounding_frequency + The frequency to round up to. + + Returns + ------- + Column + Column of the same datetime resolution as the input column. 
+ """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_ceil_datetimes(input.view(), freq) + return Column.from_libcudf(move(result)) + +cpdef Column floor_datetimes( + Column input, + rounding_frequency freq +): + """ + Round datetimes down to the nearest multiple of the given frequency. + + For details, see :cpp:func:`floor_datetimes`. + + Parameters + ---------- + input : Column + The column of input datetime values. + freq : rounding_frequency + The frequency to round down to. + + Returns + ------- + Column + Column of the same datetime resolution as the input column. + """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_floor_datetimes(input.view(), freq) + return Column.from_libcudf(move(result)) + +cpdef Column round_datetimes( + Column input, + rounding_frequency freq +): + """ + Round datetimes to the nearest multiple of the given frequency. + + For details, see :cpp:func:`round_datetimes`. + + Parameters + ---------- + input : Column + The column of input datetime values. + freq : rounding_frequency + The frequency to round to. + + Returns + ------- + Column + Column of the same datetime resolution as the input column. + """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_round_datetimes(input.view(), freq) + return Column.from_libcudf(move(result)) + +cpdef Column add_calendrical_months( + Column input, + ColumnOrScalar months, +): + """ + Adds or subtracts a number of months from the datetime + type and returns a timestamp column that is of the same + type as the input timestamps column. + + For details, see :cpp:func:`add_calendrical_months`. + + Parameters + ---------- + input : Column + The column of input timestamp values. + months : ColumnOrScalar + The number of months to add. + + Returns + ------- + Column + Column of computed timestamps. 
+ """ + if not isinstance(months, (Column, Scalar)): + raise TypeError("Must pass a Column or Scalar") + + cdef unique_ptr[column] result + + with nogil: + result = cpp_add_calendrical_months( + input.view(), + months.view() if ColumnOrScalar is Column else + dereference(months.get()) + ) + return Column.from_libcudf(move(result)) + +cpdef Column day_of_year(Column input): + """ + Computes the day number since the start of + the year from the datetime. The value is between + [1, {365-366}]. + + For details, see :cpp:func:`day_of_year`. + + Parameters + ---------- + input : Column + The column of input datetime values. + + Returns + ------- + Column + Column of day numbers. + """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_day_of_year(input.view()) + return Column.from_libcudf(move(result)) + +cpdef Column is_leap_year(Column input): + """ + Check if the year of the given date is a leap year. + + For details, see :cpp:func:`is_leap_year`. + + Parameters + ---------- + input : Column + The column of input datetime values. + + Returns + ------- + Column + Column of bools indicating whether the given year + is a leap year. + """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_is_leap_year(input.view()) + return Column.from_libcudf(move(result)) + +cpdef Column last_day_of_month(Column input): + """ + Computes the last day of the month. + + For details, see :cpp:func:`last_day_of_month`. + + Parameters + ---------- + input : Column + The column of input datetime values. + + Returns + ------- + Column + Column of ``TIMESTAMP_DAYS`` representing the last day + of the month. + """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_last_day_of_month(input.view()) + return Column.from_libcudf(move(result)) + +cpdef Column extract_quarter(Column input): + """ + Returns the quarter (ie. a value from {1, 2, 3, 4}) + that the date is in. + + For details, see :cpp:func:`extract_quarter`. 
+ + Parameters + ---------- + input : Column + The column of input datetime values. + + Returns + ------- + Column + Column indicating which quarter the date is in. + """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_extract_quarter(input.view()) + return Column.from_libcudf(move(result)) + +cpdef Column days_in_month(Column input): + """ + Extract the number of days in the month. + + For details, see :cpp:func:`days_in_month`. + + Parameters + ---------- + input : Column + The column of input datetime values. + + Returns + ------- + Column + Column of the number of days in the given month. + """ + cdef unique_ptr[column] result + + with nogil: + result = cpp_days_in_month(input.view()) return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/expressions.pyx b/python/pylibcudf/pylibcudf/expressions.pyx index a44c9e25987..1535f68366b 100644 --- a/python/pylibcudf/pylibcudf/expressions.pyx +++ b/python/pylibcudf/pylibcudf/expressions.pyx @@ -5,7 +5,17 @@ from pylibcudf.libcudf.expressions import \ table_reference as TableReference # no-cython-lint from cython.operator cimport dereference -from libc.stdint cimport int32_t, int64_t +from libc.stdint cimport ( + int8_t, + int16_t, + int32_t, + int64_t, + uint8_t, + uint16_t, + uint32_t, + uint64_t, +) +from libcpp cimport bool from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -18,12 +28,14 @@ from pylibcudf.libcudf.scalar.scalar cimport ( ) from pylibcudf.libcudf.types cimport size_type, type_id from pylibcudf.libcudf.wrappers.durations cimport ( + duration_D, duration_ms, duration_ns, duration_s, duration_us, ) from pylibcudf.libcudf.wrappers.timestamps cimport ( + timestamp_D, timestamp_ms, timestamp_ns, timestamp_s, @@ -78,6 +90,34 @@ cdef class Literal(Expression): self.c_obj = move(make_unique[libcudf_exp.literal]( dereference(self.scalar.c_obj) )) + elif tid == type_id.INT16: + self.c_obj = 
move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.INT8: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.UINT64: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.UINT32: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.UINT16: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.UINT8: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.BOOL8: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) elif tid == type_id.FLOAT64: self.c_obj = move(make_unique[libcudf_exp.literal]( dereference(self.scalar.c_obj) @@ -110,6 +150,10 @@ cdef class Literal(Expression): self.c_obj = move(make_unique[libcudf_exp.literal]( dereference(self.scalar.c_obj) )) + elif tid == type_id.TIMESTAMP_DAYS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) elif tid == type_id.DURATION_NANOSECONDS: self.c_obj = move(make_unique[libcudf_exp.literal]( dereference(self.scalar.c_obj) @@ -130,6 +174,10 @@ cdef class Literal(Expression): self.c_obj = move(make_unique[libcudf_exp.literal]( dereference(self.scalar.c_obj) )) + elif tid == type_id.DURATION_DAYS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) else: raise NotImplementedError( f"Don't know how to make literal with type id {tid}" diff --git a/python/pylibcudf/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx index 0372e1132cc..a47004a1e42 100644 --- a/python/pylibcudf/pylibcudf/filling.pyx +++ b/python/pylibcudf/pylibcudf/filling.pyx @@ -77,6 +77,10 @@ cpdef void fill_in_place( The index at which to stop filling. 
value : Scalar The value to fill with. + + Returns + ------- + None """ with nogil: @@ -101,6 +105,7 @@ cpdef Column sequence(size_type size, Scalar init, Scalar step): The initial value of the sequence step : Scalar The step of the sequence + Returns ------- pylibcudf.Column diff --git a/python/pylibcudf/pylibcudf/hashing.pxd b/python/pylibcudf/pylibcudf/hashing.pxd new file mode 100644 index 00000000000..2d070ddda69 --- /dev/null +++ b/python/pylibcudf/pylibcudf/hashing.pxd @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libc.stdint cimport uint32_t, uint64_t + +from .column cimport Column +from .table cimport Table + + +cpdef Column murmurhash3_x86_32( + Table input, + uint32_t seed=* +) + +cpdef Table murmurhash3_x64_128( + Table input, + uint64_t seed=* +) + + +cpdef Column xxhash_64( + Table input, + uint64_t seed=* +) + +cpdef Column md5(Table input) +cpdef Column sha1(Table input) +cpdef Column sha224(Table input) +cpdef Column sha256(Table input) +cpdef Column sha384(Table input) +cpdef Column sha512(Table input) diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx new file mode 100644 index 00000000000..9ea3d4d1bda --- /dev/null +++ b/python/pylibcudf/pylibcudf/hashing.pyx @@ -0,0 +1,240 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from libc.stdint cimport uint32_t, uint64_t +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.hash cimport ( + DEFAULT_HASH_SEED, + md5 as cpp_md5, + murmurhash3_x64_128 as cpp_murmurhash3_x64_128, + murmurhash3_x86_32 as cpp_murmurhash3_x86_32, + sha1 as cpp_sha1, + sha224 as cpp_sha224, + sha256 as cpp_sha256, + sha384 as cpp_sha384, + sha512 as cpp_sha512, + xxhash_64 as cpp_xxhash_64, +) +from pylibcudf.libcudf.table.table cimport table + +from .column cimport Column +from .table cimport Table + +LIBCUDF_DEFAULT_HASH_SEED = DEFAULT_HASH_SEED + +cpdef Column murmurhash3_x86_32( + Table input, + uint32_t seed=DEFAULT_HASH_SEED +): + """Computes the MurmurHash3 32-bit hash value of each row in the given table. + + For details, see :cpp:func:`murmurhash3_x86_32`. + + Parameters + ---------- + input : Table + The table of columns to hash + seed : uint32_t + Optional seed value to use for the hash function + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_murmurhash3_x86_32( + input.view(), + seed + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Table murmurhash3_x64_128( + Table input, + uint64_t seed=DEFAULT_HASH_SEED +): + """Computes the MurmurHash3 64-bit hash value of each row in the given table. + + For details, see :cpp:func:`murmurhash3_x64_128`. 
+ + Parameters + ---------- + input : Table + The table of columns to hash + seed : uint64_t + Optional seed value to use for the hash function + + Returns + ------- + pylibcudf.Table + A table of two UINT64 columns + """ + cdef unique_ptr[table] c_result + with nogil: + c_result = cpp_murmurhash3_x64_128( + input.view(), + seed + ) + + return Table.from_libcudf(move(c_result)) + + +cpdef Column xxhash_64( + Table input, + uint64_t seed=DEFAULT_HASH_SEED +): + """Computes the xxHash 64-bit hash value of each row in the given table. + + For details, see :cpp:func:`xxhash_64`. + + Parameters + ---------- + input : Table + The table of columns to hash + seed : uint64_t + Optional seed value to use for the hash function + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_xxhash_64( + input.view(), + seed + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column md5(Table input): + """Computes the MD5 hash value of each row in the given table. + + For details, see :cpp:func:`md5`. + + Parameters + ---------- + input : Table + The table of columns to hash + + Returns + ------- + pylibcudf.Column + A column where each row is the md5 hash of a row from the input + + """ + + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_md5(input.view()) + return Column.from_libcudf(move(c_result)) + +cpdef Column sha1(Table input): + """Computes the SHA-1 hash value of each row in the given table. + + For details, see :cpp:func:`sha1`. 
+ + Parameters + ---------- + input : Table + The table of columns to hash + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_sha1(input.view()) + return Column.from_libcudf(move(c_result)) + + +cpdef Column sha224(Table input): + """Computes the SHA-224 hash value of each row in the given table. + + For details, see :cpp:func:`sha224`. + + Parameters + ---------- + input : Table + The table of columns to hash + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_sha224(input.view()) + return Column.from_libcudf(move(c_result)) + + +cpdef Column sha256(Table input): + """Computes the SHA-256 hash value of each row in the given table. + + For details, see :cpp:func:`sha256`. + + Parameters + ---------- + input : Table + The table of columns to hash + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_sha256(input.view()) + return Column.from_libcudf(move(c_result)) + + +cpdef Column sha384(Table input): + """Computes the SHA-384 hash value of each row in the given table. + + For details, see :cpp:func:`sha384`. + + Parameters + ---------- + input : Table + The table of columns to hash + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_sha384(input.view()) + return Column.from_libcudf(move(c_result)) + + +cpdef Column sha512(Table input): + """Computes the SHA-512 hash value of each row in the given table. + + For details, see :cpp:func:`sha512`. 
+ + Parameters + ---------- + input : Table + The table of columns to hash + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_sha512(input.view()) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/interop.pxd b/python/pylibcudf/pylibcudf/interop.pxd new file mode 100644 index 00000000000..2a0a8c15fdd --- /dev/null +++ b/python/pylibcudf/pylibcudf/interop.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.table cimport Table + + +cpdef Table from_dlpack(object managed_tensor) + +cpdef object to_dlpack(Table input) diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx index 642516a1b90..61e812353b7 100644 --- a/python/pylibcudf/pylibcudf/interop.pyx +++ b/python/pylibcudf/pylibcudf/interop.pyx @@ -1,6 +1,11 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. -from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_New +from cpython.pycapsule cimport ( + PyCapsule_GetPointer, + PyCapsule_IsValid, + PyCapsule_New, + PyCapsule_SetName, +) from libc.stdlib cimport free from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -16,11 +21,14 @@ from pylibcudf.libcudf.interop cimport ( ArrowArray, ArrowArrayStream, ArrowSchema, + DLManagedTensor, column_metadata, from_arrow_column as cpp_from_arrow_column, from_arrow_stream as cpp_from_arrow_stream, + from_dlpack as cpp_from_dlpack, to_arrow_host_raw, to_arrow_schema_raw, + to_dlpack as cpp_to_dlpack, ) from pylibcudf.libcudf.table.table cimport table @@ -315,3 +323,87 @@ def _to_arrow_scalar(cudf_object, metadata=None): # Note that metadata for scalars is primarily important for preserving # information on nested types since names are otherwise irrelevant. 
return to_arrow(Column.from_scalar(cudf_object, 1), metadata=metadata)[0] + + +cpdef Table from_dlpack(object managed_tensor): + """ + Convert a DLPack DLTensor into a cudf table. + + For details, see :cpp:func:`cudf::from_dlpack` + + Parameters + ---------- + managed_tensor : PyCapsule + A 1D or 2D column-major (Fortran order) tensor. + + Returns + ------- + Table + Table with a copy of the tensor data. + """ + if not PyCapsule_IsValid(managed_tensor, "dltensor"): + raise ValueError("Invalid PyCapsule object") + cdef unique_ptr[table] c_result + cdef DLManagedTensor* dlpack_tensor = PyCapsule_GetPointer( + managed_tensor, "dltensor" + ) + if dlpack_tensor is NULL: + raise ValueError("PyCapsule object contained a NULL pointer") + PyCapsule_SetName(managed_tensor, "used_dltensor") + + # Note: A copy is always performed when converting the dlpack + # data to a libcudf table. We also delete the dlpack_tensor pointer + # as the pointer is not deleted by libcudf's from_dlpack function. + # TODO: https://github.com/rapidsai/cudf/issues/10874 + # TODO: https://github.com/rapidsai/cudf/issues/10849 + with nogil: + c_result = cpp_from_dlpack(dlpack_tensor) + + cdef Table result = Table.from_libcudf(move(c_result)) + dlpack_tensor.deleter(dlpack_tensor) + return result + + +cpdef object to_dlpack(Table input): + """ + Convert a cudf table into a DLPack DLTensor. + + For details, see :cpp:func:`cudf::to_dlpack` + + Parameters + ---------- + input : Table + A 1D or 2D column-major (Fortran order) tensor. + + Returns + ------- + PyCapsule + 1D or 2D DLPack tensor with a copy of the table data, or nullptr. + """ + for col in input._columns: + if col.null_count(): + raise ValueError( + "Cannot create a DLPack tensor with null values. " + "Input is required to have null count as zero." 
+ ) + cdef DLManagedTensor *dlpack_tensor + + with nogil: + dlpack_tensor = cpp_to_dlpack(input.view()) + + return PyCapsule_New( + dlpack_tensor, + "dltensor", + dlmanaged_tensor_pycapsule_deleter + ) + + +cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept: + if PyCapsule_IsValid(pycap_obj, "used_dltensor"): + # we do not call a used capsule's deleter + return + cdef DLManagedTensor* dlpack_tensor = PyCapsule_GetPointer( + pycap_obj, "dltensor" + ) + if dlpack_tensor is not NULL: + dlpack_tensor.deleter(dlpack_tensor) diff --git a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd index 73cdfb96af5..8bbc120cff8 100644 --- a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libc.stdint cimport uint8_t +from libc.stdint cimport int32_t, uint8_t from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -41,14 +41,14 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: datetime_component component ) except + - ctypedef enum rounding_frequency "cudf::datetime::rounding_frequency": - DAY "cudf::datetime::rounding_frequency::DAY" - HOUR "cudf::datetime::rounding_frequency::HOUR" - MINUTE "cudf::datetime::rounding_frequency::MINUTE" - SECOND "cudf::datetime::rounding_frequency::SECOND" - MILLISECOND "cudf::datetime::rounding_frequency::MILLISECOND" - MICROSECOND "cudf::datetime::rounding_frequency::MICROSECOND" - NANOSECOND "cudf::datetime::rounding_frequency::NANOSECOND" + cpdef enum class rounding_frequency(int32_t): + DAY + HOUR + MINUTE + SECOND + MILLISECOND + MICROSECOND + NANOSECOND cdef unique_ptr[column] ceil_datetimes( const column_view& column, rounding_frequency freq @@ -64,6 +64,10 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: 
const column_view& timestamps, const column_view& months ) except + + cdef unique_ptr[column] add_calendrical_months( + const column_view& timestamps, + const scalar& months + ) except + cdef unique_ptr[column] day_of_year(const column_view& column) except + cdef unique_ptr[column] is_leap_year(const column_view& column) except + cdef unique_ptr[column] last_day_of_month( diff --git a/python/pylibcudf/pylibcudf/libcudf/hash.pxd b/python/pylibcudf/pylibcudf/libcudf/hash.pxd index 51678ba69d8..c4222bc9dc5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/hash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/hash.pxd @@ -3,6 +3,7 @@ from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view @@ -10,36 +11,44 @@ from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: - cdef unique_ptr[column] murmurhash3_x86_32 "cudf::hashing::murmurhash3_x86_32" ( + cdef unique_ptr[column] murmurhash3_x86_32( const table_view& input, const uint32_t seed - ) except + + ) except +libcudf_exception_handler - cdef unique_ptr[column] md5 "cudf::hashing::md5" ( + cdef unique_ptr[table] murmurhash3_x64_128( + const table_view& input, + const uint64_t seed + ) except +libcudf_exception_handler + + cdef unique_ptr[column] md5( const table_view& input - ) except + + ) except +libcudf_exception_handler - cdef unique_ptr[column] sha1 "cudf::hashing::sha1" ( + cdef unique_ptr[column] sha1( const table_view& input - ) except + + ) except +libcudf_exception_handler - cdef unique_ptr[column] sha224 "cudf::hashing::sha224" ( + cdef unique_ptr[column] sha224( const table_view& input - ) except + + ) except +libcudf_exception_handler - cdef unique_ptr[column] 
sha256 "cudf::hashing::sha256" ( + cdef unique_ptr[column] sha256( const table_view& input - ) except + + ) except +libcudf_exception_handler - cdef unique_ptr[column] sha384 "cudf::hashing::sha384" ( + cdef unique_ptr[column] sha384( const table_view& input - ) except + + ) except +libcudf_exception_handler - cdef unique_ptr[column] sha512 "cudf::hashing::sha512" ( + cdef unique_ptr[column] sha512( const table_view& input - ) except + + ) except +libcudf_exception_handler - cdef unique_ptr[column] xxhash_64 "cudf::hashing::xxhash_64" ( + cdef unique_ptr[column] xxhash_64( const table_view& input, const uint64_t seed - ) except + + ) except +libcudf_exception_handler + +cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil: + cdef uint32_t DEFAULT_HASH_SEED diff --git a/python/pylibcudf/pylibcudf/libcudf/hash.pyx b/python/pylibcudf/pylibcudf/libcudf/hash.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/interop.pxd b/python/pylibcudf/pylibcudf/libcudf/interop.pxd index 30b97fdec34..b75e9ca7001 100644 --- a/python/pylibcudf/pylibcudf/libcudf/interop.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/interop.pxd @@ -32,11 +32,13 @@ cdef extern from "cudf/interop.hpp" nogil: cdef extern from "cudf/interop.hpp" namespace "cudf" \ nogil: - cdef unique_ptr[table] from_dlpack(const DLManagedTensor* tensor - ) except + + cdef unique_ptr[table] from_dlpack( + const DLManagedTensor* managed_tensor + ) except + - DLManagedTensor* to_dlpack(table_view input_table - ) except + + DLManagedTensor* to_dlpack( + const table_view& input + ) except + cdef cppclass column_metadata: column_metadata() except + diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd index aabac0a617b..8dac86d688d 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd @@ -9,14 
+9,14 @@ from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil: - cdef cppclass tokenizer_result "nvtext::tokenizer_result": + cdef cppclass tokenizer_result: uint32_t nrows_tensor uint32_t sequence_length unique_ptr[column] tensor_token_ids unique_ptr[column] tensor_attention_mask unique_ptr[column] tensor_metadata - cdef struct hashed_vocabulary "nvtext::hashed_vocabulary": + cdef cppclass hashed_vocabulary: uint16_t first_token_id uint16_t separator_token_id uint16_t unknown_token_id @@ -26,6 +26,8 @@ cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil: unique_ptr[column] table unique_ptr[column] bin_coefficients unique_ptr[column] bin_offsets + unique_ptr[column] cp_metadata + unique_ptr[column] aux_cp_table cdef unique_ptr[hashed_vocabulary] load_vocabulary_file( const string &filename_hashed_vocabulary @@ -33,7 +35,7 @@ cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil: cdef tokenizer_result subword_tokenize( const column_view & strings, - hashed_vocabulary & hashed_vocablary_obj, + hashed_vocabulary & hashed_vocabulary_obj, uint32_t max_sequence_length, uint32_t stride, bool do_lower, diff --git a/python/pylibcudf/pylibcudf/libcudf/transform.pxd b/python/pylibcudf/pylibcudf/libcudf/transform.pxd index d21510bd731..47d79083b66 100644 --- a/python/pylibcudf/pylibcudf/libcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/transform.pxd @@ -27,6 +27,11 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: column_view input ) except + + cdef unique_ptr[column] compute_column( + table_view table, + expression expr + ) except + + cdef unique_ptr[column] transform( column_view input, string unary_udf, diff --git a/python/pylibcudf/pylibcudf/libcudf/types.pxd b/python/pylibcudf/pylibcudf/libcudf/types.pxd index eabae68bc90..60e293e5cdb 100644 --- a/python/pylibcudf/pylibcudf/libcudf/types.pxd +++ 
b/python/pylibcudf/pylibcudf/libcudf/types.pxd @@ -70,18 +70,19 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: TIMESTAMP_MILLISECONDS TIMESTAMP_MICROSECONDS TIMESTAMP_NANOSECONDS - DICTIONARY32 - STRING - LIST - STRUCT - NUM_TYPE_IDS + DURATION_DAYS DURATION_SECONDS DURATION_MILLISECONDS DURATION_MICROSECONDS DURATION_NANOSECONDS + DICTIONARY32 + STRING + LIST DECIMAL32 DECIMAL64 DECIMAL128 + STRUCT + NUM_TYPE_IDS cdef cppclass data_type: data_type() except + diff --git a/python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd index 7c648425eb5..c9c960d0a79 100644 --- a/python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd @@ -1,9 +1,10 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libc.stdint cimport int64_t +from libc.stdint cimport int32_t, int64_t cdef extern from "cudf/wrappers/durations.hpp" namespace "cudf" nogil: + ctypedef int32_t duration_D ctypedef int64_t duration_s ctypedef int64_t duration_ms ctypedef int64_t duration_us diff --git a/python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd index 50d37fd0a68..5dcd144529d 100644 --- a/python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd @@ -1,9 +1,10 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from libc.stdint cimport int64_t +from libc.stdint cimport int32_t, int64_t cdef extern from "cudf/wrappers/timestamps.hpp" namespace "cudf" nogil: + ctypedef int32_t timestamp_D ctypedef int64_t timestamp_s ctypedef int64_t timestamp_ms ctypedef int64_t timestamp_us diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index d97c0a73267..93e3fb15259 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -12,8 +12,9 @@ # the License. # ============================================================================= -set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx - ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx +set(cython_sources + edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx + replace.pyx stemmer.pyx tokenize.pyx byte_pair_encode.pyx subword_tokenize.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index a658e57018e..ef837167eb9 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from . cimport ( + byte_pair_encode, edit_distance, generate_ngrams, jaccard, @@ -9,6 +10,8 @@ from . 
cimport ( normalize, replace, stemmer, + subword_tokenize, + tokenize, ) __all__ = [ @@ -16,8 +19,11 @@ __all__ = [ "generate_ngrams", "jaccard", "minhash", + "byte_pair_encode", "ngrams_tokenize", "normalize", "replace", "stemmer", + "subword_tokenize", + "tokenize", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index 2c1feb089a2..4f125d3a733 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from . import ( + byte_pair_encode, edit_distance, generate_ngrams, jaccard, @@ -9,6 +10,8 @@ normalize, replace, stemmer, + subword_tokenize, + tokenize, ) __all__ = [ @@ -16,8 +19,11 @@ "generate_ngrams", "jaccard", "minhash", + "byte_pair_encode", "ngrams_tokenize", "normalize", "replace", "stemmer", + "subword_tokenize", + "tokenize", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pxd b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pxd new file mode 100644 index 00000000000..e4b93e96b9d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pxd @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.byte_pair_encode cimport bpe_merge_pairs +from pylibcudf.scalar cimport Scalar + + +cdef class BPEMergePairs: + cdef unique_ptr[bpe_merge_pairs] c_obj + +cpdef Column byte_pair_encoding( + Column input, + BPEMergePairs merge_pairs, + Scalar separator=* +) diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx new file mode 100644 index 00000000000..76caad276d4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx @@ -0,0 +1,70 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.byte_pair_encode cimport ( + byte_pair_encoding as cpp_byte_pair_encoding, + load_merge_pairs as cpp_load_merge_pairs, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.scalar cimport Scalar + + +cdef class BPEMergePairs: + """The table of merge pairs for the BPE encoder. + + For details, see :cpp:class:`cudf::nvtext::bpe_merge_pairs`. + """ + def __cinit__(self, Column merge_pairs): + cdef column_view c_pairs = merge_pairs.view() + with nogil: + self.c_obj = move(cpp_load_merge_pairs(c_pairs)) + +cpdef Column byte_pair_encoding( + Column input, + BPEMergePairs merge_pairs, + Scalar separator=None +): + """ + Byte pair encode the input strings. + + For details, see cpp:func:`cudf::nvtext::byte_pair_encoding` + + Parameters + ---------- + input : Column + Strings to encode. + merge_pairs : BPEMergePairs + Substrings to rebuild each string on. + separator : Scalar + String used to build the output after encoding. Default is a space. + + Returns + ------- + Column + An encoded column of strings. 
+ """ + cdef unique_ptr[column] c_result + + if separator is None: + separator = Scalar.from_libcudf( + cpp_make_string_scalar(" ".encode()) + ) + + with nogil: + c_result = move( + cpp_byte_pair_encoding( + input.view(), + dereference(merge_pairs.c_obj.get()), + dereference(separator.c_obj.get()), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pxd new file mode 100644 index 00000000000..091c7b897ac --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pxd @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libc.stdint cimport uint32_t +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.subword_tokenize cimport hashed_vocabulary + + +cdef class HashedVocabulary: + cdef unique_ptr[hashed_vocabulary] c_obj + +cpdef tuple[Column, Column, Column] subword_tokenize( + Column input, + HashedVocabulary vocabulary_table, + uint32_t max_sequence_length, + uint32_t stride, + bool do_lower_case, + bool do_truncate, +) diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx new file mode 100644 index 00000000000..04643d3bd84 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx @@ -0,0 +1,84 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+ +from cython.operator cimport dereference +from libc.stdint cimport uint32_t +from libcpp cimport bool +from libcpp.string cimport string +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.subword_tokenize cimport ( + load_vocabulary_file as cpp_load_vocabulary_file, + move as tr_move, + subword_tokenize as cpp_subword_tokenize, + tokenizer_result as cpp_tokenizer_result, +) + + +cdef class HashedVocabulary: + """The vocabulary data for use with the subword_tokenize function. + + For details, see :cpp:class:`cudf::nvtext::hashed_vocabulary`. + """ + def __cinit__(self, hash_file): + cdef string c_hash_file = str(hash_file).encode() + with nogil: + self.c_obj = move(cpp_load_vocabulary_file(c_hash_file)) + +cpdef tuple[Column, Column, Column] subword_tokenize( + Column input, + HashedVocabulary vocabulary_table, + uint32_t max_sequence_length, + uint32_t stride, + bool do_lower_case, + bool do_truncate, +): + """ + Creates a tokenizer that cleans the text, splits it into + tokens and returns token-ids from an input vocabulary. + + For details, see cpp:func:`subword_tokenize` + + Parameters + ---------- + input : Column + The input strings to tokenize. + vocabulary_table : HashedVocabulary + The vocabulary table pre-loaded into this object. + max_sequence_length : uint32_t + Limit of the number of token-ids per row in final tensor for each string. + stride : uint32_t + Each row in the output token-ids will replicate + ``max_sequence_length`` - ``stride`` the token-ids + from the previous row, unless it is the first string. + do_lower_case : bool + If true, the tokenizer will convert uppercase characters in the + input stream to lower-case and strip accents from those characters. + If false, accented and uppercase characters are not transformed. + do_truncate : bool + If true, the tokenizer will discard all the token-ids after + ``max_sequence_length`` for each input string. 
If false, it + will use a new row in the output token-ids to continue + generating the output. + + Returns + ------- + tuple[Column, Column, Column] + A tuple of three columns containing the + tokens, masks, and metadata. + """ + cdef cpp_tokenizer_result c_result + with nogil: + c_result = tr_move( + cpp_subword_tokenize( + input.view(), + dereference(vocabulary_table.c_obj.get()), + max_sequence_length, + stride, + do_lower_case, + do_truncate, + ) + ) + cdef Column tokens = Column.from_libcudf(move(c_result.tensor_token_ids)) + cdef Column masks = Column.from_libcudf(move(c_result.tensor_attention_mask)) + cdef Column metadata = Column.from_libcudf(move(c_result.tensor_metadata)) + return tokens, masks, metadata diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd new file mode 100644 index 00000000000..0aed9702d61 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr +from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.tokenize cimport tokenize_vocabulary +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + +cdef class TokenizeVocabulary: + cdef unique_ptr[tokenize_vocabulary] c_obj + +cpdef Column tokenize_scalar(Column input, Scalar delimiter=*) + +cpdef Column tokenize_column(Column input, Column delimiters) + +cpdef Column count_tokens_scalar(Column input, Scalar delimiter=*) + +cpdef Column count_tokens_column(Column input, Column delimiters) + +cpdef Column character_tokenize(Column input) + +cpdef Column detokenize(Column input, Column row_indices, Scalar separator=*) + +cpdef Column tokenize_with_vocabulary( + Column input, + TokenizeVocabulary vocabulary, + Scalar delimiter, + size_type default_id=* +) diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx new file mode 100644 index 00000000000..ec02e8ebf4e --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx @@ -0,0 +1,262 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.tokenize cimport ( + character_tokenize as cpp_character_tokenize, + count_tokens as cpp_count_tokens, + detokenize as cpp_detokenize, + load_vocabulary as cpp_load_vocabulary, + tokenize as cpp_tokenize, + tokenize_with_vocabulary as cpp_tokenize_with_vocabulary, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.types cimport size_type + + +cdef class TokenizeVocabulary: + """The Vocabulary object to be used with ``tokenize_with_vocabulary``. + + For details, see :cpp:class:`cudf::nvtext::tokenize_vocabulary`. + """ + def __cinit__(self, Column vocab): + cdef column_view c_vocab = vocab.view() + with nogil: + self.c_obj = move(cpp_load_vocabulary(c_vocab)) + +cpdef Column tokenize_scalar(Column input, Scalar delimiter=None): + """ + Returns a single column of strings by tokenizing the input + strings column using the provided characters as delimiters. 
+ + For details, see :cpp:func:`cudf::nvtext::tokenize` + + Parameters + ---------- + input : Column + Strings column to tokenize + delimiter : Scalar + String scalar used to separate individual strings into tokens + + Returns + ------- + Column + New strings columns of tokens + """ + cdef unique_ptr[column] c_result + + if delimiter is None: + delimiter = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + with nogil: + c_result = cpp_tokenize( + input.view(), + dereference(delimiter.c_obj.get()), + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column tokenize_column(Column input, Column delimiters): + """ + Returns a single column of strings by tokenizing the input + strings column using multiple strings as delimiters. + + For details, see :cpp:func:`cudf::nvtext::tokenize` + + Parameters + ---------- + input : Column + Strings column to tokenize + delimiters : Column + Strings column used to separate individual strings into tokens + + Returns + ------- + Column + New strings columns of tokens + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_tokenize( + input.view(), + delimiters.view(), + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column count_tokens_scalar(Column input, Scalar delimiter=None): + """ + Returns the number of tokens in each string of a strings column + using the provided characters as delimiters. 
+ + For details, see :cpp:func:`cudf::nvtext::count_tokens` + + Parameters + ---------- + input : Column + Strings column to count tokens + delimiter : Scalar + String scalar used to separate each string into tokens + + Returns + ------- + Column + New column of token counts + """ + cdef unique_ptr[column] c_result + + if delimiter is None: + delimiter = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + with nogil: + c_result = cpp_count_tokens( + input.view(), + dereference(delimiter.c_obj.get()), + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column count_tokens_column(Column input, Column delimiters): + """ + Returns the number of tokens in each string of a strings column + using multiple strings as delimiters. + + For details, see :cpp:func:`cudf::nvtext::count_tokens` + + Parameters + ---------- + input : Column + Strings column to count tokens + delimiters : Column + Strings column used to separate each string into tokens + + Returns + ------- + Column + New column of token counts + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_count_tokens( + input.view(), + delimiters.view(), + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column character_tokenize(Column input): + """ + Returns a single column of strings by converting + each character to a string. + + For details, see :cpp:func:`cudf::nvtext::character_tokenize` + + Parameters + ---------- + input : Column + Strings column to tokenize + + Returns + ------- + Column + New strings columns of tokens + """ + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_character_tokenize(input.view()) + + return Column.from_libcudf(move(c_result)) + +cpdef Column detokenize( + Column input, + Column row_indices, + Scalar separator=None +): + """ + Creates a strings column from a strings column of tokens + and an associated column of row ids. 
+ + For details, see :cpp:func:`cudf::nvtext::detokenize` + + Parameters + ---------- + input : Column + Strings column to detokenize + row_indices : Column + The relative output row index assigned for each token in the input column + separator : Scalar + String to append after concatenating each token to the proper output row + + Returns + ------- + Column + New strings columns of tokens + """ + cdef unique_ptr[column] c_result + + if separator is None: + separator = Scalar.from_libcudf( + cpp_make_string_scalar(" ".encode()) + ) + + with nogil: + c_result = cpp_detokenize( + input.view(), + row_indices.view(), + dereference(separator.c_obj.get()) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column tokenize_with_vocabulary( + Column input, + TokenizeVocabulary vocabulary, + Scalar delimiter, + size_type default_id=-1 +): + """ + Returns the token ids for the input string by looking + up each delimited token in the given vocabulary. + + For details, see :cpp:func:`cudf::nvtext::tokenize_with_vocabulary` + + Parameters + ---------- + input : Column + Strings column to tokenize + vocabulary : TokenizeVocabulary + Used to lookup tokens within ``input`` + delimiter : Scalar + Used to identify tokens within ``input`` + default_id : size_type + The token id to be used for tokens not found in the vocabulary; Default is -1 + + Returns + ------- + Column + Lists column of token ids + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_tokenize_with_vocabulary( + input.view(), + dereference(vocabulary.c_obj.get()), + dereference(delimiter.c_obj.get()), + default_id + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index 40fa8261905..fa7294c7dbd 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -32,6 +32,7 @@ "capitalize", "case", "char_types", + "combine", "contains", 
"convert", "extract", diff --git a/python/pylibcudf/pylibcudf/strings/regex_program.pyx b/python/pylibcudf/pylibcudf/strings/regex_program.pyx index f426b6888ae..91f585cd637 100644 --- a/python/pylibcudf/pylibcudf/strings/regex_program.pyx +++ b/python/pylibcudf/pylibcudf/strings/regex_program.pyx @@ -37,6 +37,10 @@ cdef class RegexProgram: flags : Uniont[int, RegexFlags] Regex flags for interpreting special characters in the pattern + Returns + ------- + RegexProgram + A new RegexProgram """ cdef unique_ptr[regex_program] c_prog cdef regex_flags c_flags diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyx b/python/pylibcudf/pylibcudf/strings/replace.pyx index 6db7f04fcbb..2b94f5e3fee 100644 --- a/python/pylibcudf/pylibcudf/strings/replace.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace.pyx @@ -136,6 +136,7 @@ cpdef Column replace_slice( Start position where repl will be added. stop : size_type, default -1 End position (exclusive) to use for replacement. + Returns ------- pylibcudf.Column diff --git a/python/pylibcudf/pylibcudf/tests/common/utils.py b/python/pylibcudf/pylibcudf/tests/common/utils.py index 9f389fa42c4..d95849ef371 100644 --- a/python/pylibcudf/pylibcudf/tests/common/utils.py +++ b/python/pylibcudf/pylibcudf/tests/common/utils.py @@ -7,10 +7,11 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from pyarrow.orc import write_table as orc_write_table from pyarrow.parquet import write_table as pq_write_table + +import pylibcudf as plc from pylibcudf.io.types import CompressionType diff --git a/python/pylibcudf/pylibcudf/tests/conftest.py b/python/pylibcudf/pylibcudf/tests/conftest.py index fdce6f353ca..5265e411c7f 100644 --- a/python/pylibcudf/pylibcudf/tests/conftest.py +++ b/python/pylibcudf/pylibcudf/tests/conftest.py @@ -8,8 +8,9 @@ import numpy as np import pyarrow as pa -import pylibcudf as plc import pytest + +import pylibcudf as plc from pylibcudf.io.types import 
CompressionType sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) @@ -17,13 +18,23 @@ from utils import ALL_PA_TYPES, DEFAULT_PA_TYPES, NUMERIC_PA_TYPES -# This fixture defines the standard set of types that all tests should default to +def _type_to_str(typ): + if isinstance(typ, pa.ListType): + return f"list[{_type_to_str(typ.value_type)}]" + elif isinstance(typ, pa.StructType): + return f"struct[{', '.join(_type_to_str(typ.field(i).type) for i in range(typ.num_fields))}]" + else: + return str(typ) + + +# This fixture defines the standard set of types that all tests should default to # running on. If there is a need for some tests to run on a different set of types, that # type list fixture should also be defined below here if it is likely to be reused # across modules. Otherwise it may be defined on a per-module basis. @pytest.fixture( scope="session", params=DEFAULT_PA_TYPES, + ids=_type_to_str, ) def pa_type(request): return request.param diff --git a/python/pylibcudf/pylibcudf/tests/io/test_avro.py b/python/pylibcudf/pylibcudf/tests/io/test_avro.py index 0cd5064a697..3d9d99ffa61 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_avro.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_avro.py @@ -5,10 +5,11 @@ import fastavro import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_table_and_meta_eq +import pylibcudf as plc + avro_dtype_pairs = [ ("boolean", pa.bool_()), ("int", pa.int32()), diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py index ab26f23418d..22c83acc47c 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py @@ -5,9 +5,7 @@ import pandas as pd import pyarrow as pa -import pylibcudf as plc import pytest -from pylibcudf.io.types import CompressionType from utils import ( _convert_types, assert_table_and_meta_eq, @@ -15,6 +13,9 @@ write_source_str, ) +import pylibcudf as plc 
+from pylibcudf.io.types import CompressionType + # Shared kwargs to pass to make_source _COMMON_CSV_SOURCE_KWARGS = { "format": "csv", diff --git a/python/pylibcudf/pylibcudf/tests/io/test_json.py b/python/pylibcudf/pylibcudf/tests/io/test_json.py index 9d976fedf00..453e5ce32a8 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_json.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_json.py @@ -3,9 +3,7 @@ import pandas as pd import pyarrow as pa -import pylibcudf as plc import pytest -from pylibcudf.io.types import CompressionType from utils import ( assert_table_and_meta_eq, make_source, @@ -13,6 +11,9 @@ write_source_str, ) +import pylibcudf as plc +from pylibcudf.io.types import CompressionType + # Shared kwargs to pass to make_source _COMMON_JSON_SOURCE_KWARGS = {"format": "json", "orient": "records"} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py index 42b14b1feff..5ed660ba6cf 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import _convert_types, assert_table_and_meta_eq, make_source +import pylibcudf as plc + # Shared kwargs to pass to make_source _COMMON_ORC_SOURCE_KWARGS = {"format": "orc"} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py index f6e843ccf66..41298601539 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py @@ -1,9 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from pyarrow.parquet import read_table +from utils import assert_table_and_meta_eq, make_source + +import pylibcudf as plc from pylibcudf.expressions import ( ASTOperator, ColumnNameReference, @@ -11,7 +13,6 @@ Literal, Operation, ) -from utils import assert_table_and_meta_eq, make_source # Shared kwargs to pass to make_source _COMMON_PARQUET_SOURCE_KWARGS = {"format": "parquet"} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py index 747f58ec8cf..0c43c363e55 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py @@ -2,9 +2,10 @@ import io -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo]) def io_class(request): diff --git a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py index 76b0424b2af..b3555013927 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import zoneinfo -import pylibcudf as plc import pytest +import pylibcudf as plc + def test_make_timezone_transition_table(): if len(zoneinfo.TZPATH) == 0: diff --git a/python/pylibcudf/pylibcudf/tests/test_binaryops.py b/python/pylibcudf/pylibcudf/tests/test_binaryops.py index f784cb3c191..bbb08e8b95a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_binaryops.py +++ b/python/pylibcudf/pylibcudf/tests/test_binaryops.py @@ -4,10 +4,11 @@ import numpy as np import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + def idfn(param): ltype, rtype, outtype, plc_op, _ = param diff --git a/python/pylibcudf/pylibcudf/tests/test_column_factories.py b/python/pylibcudf/pylibcudf/tests/test_column_factories.py index 8cedbc6d42f..e317362a76b 100644 --- a/python/pylibcudf/pylibcudf/tests/test_column_factories.py +++ b/python/pylibcudf/pylibcudf/tests/test_column_factories.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import DEFAULT_STRUCT_TESTING_TYPE, assert_column_eq +import pylibcudf as plc + EMPTY_COL_SIZE = 3 NUMERIC_TYPES = [ diff --git a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py index 0e129fdf0ef..24cd6b9e35f 100644 --- a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py +++ b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py @@ -1,12 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq import rmm +import pylibcudf as plc + VALID_TYPES = [ pa.int8(), pa.int16(), diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py index 7a5c1664eed..6d8b5993964 100644 --- a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py +++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_table_eq +import pylibcudf as plc + param_pyarrow_tables = [ pa.table([]), pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}), diff --git a/python/pylibcudf/pylibcudf/tests/test_copying.py b/python/pylibcudf/pylibcudf/tests/test_copying.py index 628682d0a66..c0a41b96b1a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_copying.py +++ b/python/pylibcudf/pylibcudf/tests/test_copying.py @@ -2,7 +2,6 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import ( DEFAULT_STRUCT_TESTING_TYPE, @@ -16,6 +15,8 @@ metadata_from_arrow_type, ) +import pylibcudf as plc + # TODO: consider moving this to conftest and "pairing" # it with pa_type, so that they don't get out of sync diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py index 75930d59058..f5f24ef28e2 100644 --- a/python/pylibcudf/pylibcudf/tests/test_datetime.py +++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py @@ -1,13 +1,15 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+import calendar import datetime import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module", params=["s", "ms", "us", "ns"]) def datetime_column(has_nulls, request): @@ -45,6 +47,21 @@ def component(request): return request.param +@pytest.fixture( + params=[ + ("day", plc.datetime.RoundingFrequency.DAY), + ("hour", plc.datetime.RoundingFrequency.HOUR), + ("minute", plc.datetime.RoundingFrequency.MINUTE), + ("second", plc.datetime.RoundingFrequency.SECOND), + ("millisecond", plc.datetime.RoundingFrequency.MILLISECOND), + ("microsecond", plc.datetime.RoundingFrequency.MICROSECOND), + ("nanosecond", plc.datetime.RoundingFrequency.NANOSECOND), + ] +) +def rounding_frequency(request): + return request.param + + def test_extract_datetime_component(datetime_column, component): attr, component = component kwargs = {} @@ -58,3 +75,139 @@ def test_extract_datetime_component(datetime_column, component): ).cast(pa.int16()) assert_column_eq(expect, got) + + +@pytest.mark.parametrize( + "datetime_func", + [ + "extract_millisecond_fraction", + "extract_microsecond_fraction", + "extract_nanosecond_fraction", + ], +) +def test_datetime_extracting_functions(datetime_column, datetime_func): + pa_col = plc.interop.to_arrow(datetime_column) + got = getattr(plc.datetime, datetime_func)(datetime_column) + kwargs = {} + attr = datetime_func.split("_")[1] + if attr == "weekday": + kwargs = {"count_from_zero": False} + attr = "day_of_week" + expect = getattr(pc, attr)(pa_col, **kwargs).cast(pa.int16()) + assert_column_eq(expect, got) + + +@pytest.mark.parametrize( + "op", + [ + ("ceil_temporal", "ceil_datetimes"), + ("floor_temporal", "floor_datetimes"), + ("round_temporal", "round_datetimes"), + ], +) +def test_rounding_operations(datetime_column, op, rounding_frequency): + got = getattr(plc.datetime, op[1])(datetime_column, rounding_frequency[1]) + pa_col = 
plc.interop.to_arrow(datetime_column) + pa_got = plc.interop.to_arrow(got) + expect = getattr(pc, op[0])( + pa_col, + unit=rounding_frequency[0], + ).cast(pa_got.type) + assert_column_eq(expect, got) + + +@pytest.mark.parametrize( + "months", + [ + pa.scalar(-3, pa.int32()), + pa.scalar(1, pa.int16()), + pa.array([1, -3, 2, 4, -1, 5], pa.int32()), + ], +) +def test_calendrical_months(datetime_column, months): + def add_calendrical_months(timestamps, months): + result = [] + if isinstance(months, pa.Array): + months_list = months.to_pylist() + else: + months_list = [months.as_py()] * len(timestamps) + for i, dt in enumerate(timestamps): + if dt.as_py() is not None: + year, month = dt.as_py().year, dt.as_py().month + new_month = month + months_list[i] + new_year = year + (new_month - 1) // 12 + result.append( + dt.as_py().replace( + year=new_year, month=(new_month - 1) % 12 + 1 + ) + ) + else: + result.append(None) + return pa.array(result) + + pa_col = plc.interop.to_arrow(datetime_column) + got = plc.datetime.add_calendrical_months( + datetime_column, plc.interop.from_arrow(months) + ) + pa_got = plc.interop.to_arrow(got) + expect = add_calendrical_months(pa_col, months).cast(pa_got.type) + assert_column_eq(expect, got) + + +def test_day_of_year(datetime_column): + got = plc.datetime.day_of_year(datetime_column) + pa_got = plc.interop.to_arrow(got) + pa_col = plc.interop.to_arrow(datetime_column) + expect = pa.array( + [ + d.as_py().timetuple().tm_yday if d.as_py() is not None else None + for d in pa_col + ], + type=pa_got.type, + ) + assert_column_eq(expect, got) + + +def test_is_leap_year(datetime_column): + got = plc.datetime.is_leap_year(datetime_column) + pa_col = plc.interop.to_arrow(datetime_column) + expect = pc.is_leap_year(pa_col) + assert_column_eq(expect, got) + + +def test_last_day_of_month(datetime_column): + def last_day_of_month(dates): + return [ + d.replace(day=calendar.monthrange(d.year, d.month)[1]) + if d is not None + else d + for d in 
dates.to_pylist() + ] + + got = plc.datetime.last_day_of_month(datetime_column) + pa_got = plc.interop.to_arrow(got) + pa_col = plc.interop.to_arrow(datetime_column) + expect = pa.array(last_day_of_month(pa_col), type=pa_got.type) + assert_column_eq(expect, got) + + +def test_extract_quarter(datetime_column): + got = plc.datetime.extract_quarter(datetime_column) + pa_col = plc.interop.to_arrow(datetime_column) + pa_got = plc.interop.to_arrow(got) + expect = pc.quarter(pa_col).cast(pa_got.type) + assert_column_eq(expect, got) + + +def test_days_in_month(datetime_column): + def days_in_month(dates): + return [ + calendar.monthrange(d.year, d.month)[1] if d is not None else None + for d in dates.to_pylist() + ] + + got = plc.datetime.days_in_month(datetime_column) + pa_col = plc.interop.to_arrow(datetime_column) + pa_got = plc.interop.to_arrow(got) + expect = pa.array(days_in_month(pa_col), type=pa_got.type) + assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_expressions.py b/python/pylibcudf/pylibcudf/tests/test_expressions.py index 5894ef4624c..52c81c49b9d 100644 --- a/python/pylibcudf/pylibcudf/tests/test_expressions.py +++ b/python/pylibcudf/pylibcudf/tests/test_expressions.py @@ -1,10 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc +import pyarrow.compute as pc import pytest +from utils import assert_column_eq -# We can't really evaluate these expressions, so just make sure -# construction works properly +import pylibcudf as plc def test_literal_construction_invalid(): @@ -22,7 +22,7 @@ def test_literal_construction_invalid(): ], ) def test_columnref_construction(tableref): - plc.expressions.ColumnReference(1.0, tableref) + plc.expressions.ColumnReference(1, tableref) def test_columnnameref_construction(): @@ -47,3 +47,35 @@ def test_columnnameref_construction(): ) def test_astoperation_construction(kwargs): plc.expressions.Operation(**kwargs) + + +def test_evaluation(): + table_h = pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + lit = pa.scalar(42, type=pa.int64()) + table = plc.interop.from_arrow(table_h) + # expr = abs(b * c - (a + 42)) + expr = plc.expressions.Operation( + plc.expressions.ASTOperator.ABS, + plc.expressions.Operation( + plc.expressions.ASTOperator.SUB, + plc.expressions.Operation( + plc.expressions.ASTOperator.MUL, + plc.expressions.ColumnReference(1), + plc.expressions.ColumnReference(2), + ), + plc.expressions.Operation( + plc.expressions.ASTOperator.ADD, + plc.expressions.ColumnReference(0), + plc.expressions.Literal(plc.interop.from_arrow(lit)), + ), + ), + ) + + expect = pc.abs( + pc.subtract( + pc.multiply(table_h["b"], table_h["c"]), pc.add(table_h["a"], lit) + ) + ) + got = plc.transform.compute_column(table, expr) + + assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py new file mode 100644 index 00000000000..83fb50fa4ef --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -0,0 +1,269 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import hashlib +import struct + +import mmh3 +import numpy as np +import pyarrow as pa +import pytest +import xxhash +from utils import assert_column_eq, assert_table_eq + +import pylibcudf as plc + +SEED = 0 +METHODS = ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"] + + +def scalar_to_binary(x): + if isinstance(x, str): + return x.encode() + elif isinstance(x, float): + return struct.pack("> 2))) + + +def uint_hash_combine_32(lhs, rhs): + return hash_combine_32(np.uint32(lhs), np.uint32(rhs)) + + +def libcudf_mmh3_x86_32(binary): + seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + hashval = mmh3.hash(binary, seed) + return hash_combine_32(seed, hashval) + + +@pytest.fixture(params=[pa.int64(), pa.float64(), pa.string(), pa.bool_()]) +def scalar_type(request): + return request.param + + +@pytest.fixture +def pa_scalar_input_column(scalar_type): + if pa.types.is_integer(scalar_type) or pa.types.is_floating(scalar_type): + return pa.array([1, 2, 3], type=scalar_type) + elif pa.types.is_string(scalar_type): + return pa.array(["a", "b", "c"], type=scalar_type) + elif pa.types.is_boolean(scalar_type): + return pa.array([True, True, False], type=scalar_type) + + +@pytest.fixture +def plc_scalar_input_tbl(pa_scalar_input_column): + return plc.interop.from_arrow( + pa.Table.from_arrays([pa_scalar_input_column], names=["data"]) + ) + + +@pytest.fixture(scope="module") +def list_struct_table(): + data = pa.Table.from_pydict( + { + "list": [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + "struct": [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}], + } + ) + return data + + +def python_hash_value(x, method): + if method == "murmurhash3_x86_32": + return libcudf_mmh3_x86_32(x) + elif method == "murmurhash3_x64_128": + hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) + hasher.update(x) + # libcudf returns a tuple of two 64-bit integers + return hasher.utupledigest() + elif method == "xxhash_64": + return xxhash.xxh64( + x, 
seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ).intdigest() + else: + return getattr(hashlib, method)(x).hexdigest() + + +@pytest.mark.parametrize( + "method", ["sha1", "sha224", "sha256", "sha384", "sha512", "md5"] +) +def test_hash_column_sha_md5( + pa_scalar_input_column, plc_scalar_input_tbl, method +): + plc_hasher = getattr(plc.hashing, method) + + def py_hasher(val): + return getattr(hashlib, method)(scalar_to_binary(val)).hexdigest() + + expect = pa.array( + [py_hasher(val) for val in pa_scalar_input_column.to_pylist()], + type=pa.string(), + ) + got = plc_hasher(plc_scalar_input_tbl) + assert_column_eq(got, expect) + + +def test_hash_column_xxhash64(pa_scalar_input_column, plc_scalar_input_tbl): + def py_hasher(val): + return xxhash.xxh64( + scalar_to_binary(val), seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ).intdigest() + + expect = pa.array( + [py_hasher(val) for val in pa_scalar_input_column.to_pylist()], + type=pa.uint64(), + ) + got = plc.hashing.xxhash_64(plc_scalar_input_tbl, 0) + + assert_column_eq(got, expect) + + +@pytest.mark.parametrize( + "method", ["sha1", "sha224", "sha256", "sha384", "sha512"] +) +@pytest.mark.parametrize("dtype", ["list", "struct"]) +def test_sha_list_struct_err(list_struct_table, dtype, method): + err_types = list_struct_table.select([dtype]) + plc_tbl = plc.interop.from_arrow(err_types) + plc_hasher = getattr(plc.hashing, method) + + with pytest.raises(TypeError): + plc_hasher(plc_tbl) + + +def test_md5_struct_err(list_struct_table): + err_types = list_struct_table.select(["struct"]) + plc_tbl = plc.interop.from_arrow(err_types) + + with pytest.raises(TypeError): + plc.hashing.md5(plc_tbl) + + +def test_murmurhash3_x86_32(pa_scalar_input_column, plc_scalar_input_tbl): + def py_hasher(val): + return libcudf_mmh3_x86_32(scalar_to_binary(val)) + + got = plc.hashing.murmurhash3_x86_32(plc_scalar_input_tbl, 0) + expect = pa.array( + [py_hasher(val) for val in pa_scalar_input_column.to_pylist()], + type=pa.uint32(), + ) + got 
= plc.hashing.murmurhash3_x86_32(plc_scalar_input_tbl, 0) + assert_column_eq(got, expect) + + +@pytest.mark.filterwarnings("ignore::RuntimeWarning") +def test_murmurhash3_x86_32_list(): + pa_tbl = pa.Table.from_pydict( + { + "list": pa.array( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], type=pa.list_(pa.uint32()) + ) + } + ) + plc_tbl = plc.interop.from_arrow(pa_tbl) + + def hash_list(list_): + hash_value = uint_hash_combine_32(0, hash_single_uint32(len(list_))) + + for element in list_: + hash_value = uint_hash_combine_32( + hash_value, + hash_single_uint32( + element, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ), + ) + + final = uint_hash_combine_32( + plc.hashing.LIBCUDF_DEFAULT_HASH_SEED, hash_value + ) + return final + + expect = pa.array( + [hash_list(val) for val in pa_tbl["list"].to_pylist()], + type=pa.uint32(), + ) + got = plc.hashing.murmurhash3_x86_32( + plc_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ) + assert_column_eq(got, expect) + + +@pytest.mark.filterwarnings("ignore::RuntimeWarning") +def test_murmurhash3_x86_32_struct(): + pa_tbl = pa.table( + { + "struct": pa.array( + [ + {"a": 1, "b": 2, "c": 3}, + {"a": 4, "b": 5, "c": 6}, + {"a": 7, "b": 8, "c": 9}, + ], + type=pa.struct( + [ + pa.field("a", pa.uint32()), + pa.field("b", pa.uint32(), pa.field("c", pa.uint32())), + ] + ), + ) + } + ) + plc_tbl = plc.interop.from_arrow(pa_tbl) + + def hash_struct(s): + seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + keys = list(s.keys()) + + combined_hash = hash_single_uint32(s[keys[0]], seed=seed) + combined_hash = uint_hash_combine_32(0, combined_hash) + combined_hash = uint_hash_combine_32(seed, combined_hash) + + for key in keys[1:]: + current_hash = hash_single_uint32(s[key], seed=seed) + combined_hash = uint_hash_combine_32(combined_hash, current_hash) + + return combined_hash + + got = plc.hashing.murmurhash3_x86_32( + plc_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ) + + expect = pa.array( + [hash_struct(val) for val in pa_tbl["struct"].to_pylist()], + 
type=pa.uint32(), + ) + assert_column_eq(got, expect) + + +def test_murmurhash3_x64_128(pa_scalar_input_column, plc_scalar_input_tbl): + def py_hasher(val): + hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) + hasher.update(val) + return hasher.utupledigest() + + tuples = [ + py_hasher(scalar_to_binary(val)) + for val in pa_scalar_input_column.to_pylist() + ] + expect = pa.Table.from_arrays( + [ + pa.array([np.uint64(t[0]) for t in tuples]), + pa.array([np.uint64(t[1]) for t in tuples]), + ], + names=["0", "1"], + ) + got = plc.hashing.murmurhash3_x64_128(plc_scalar_input_tbl, 0) + + assert_table_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_interop.py b/python/pylibcudf/pylibcudf/tests/test_interop.py index 01c998f16d4..af80b6e5978 100644 --- a/python/pylibcudf/pylibcudf/tests/test_interop.py +++ b/python/pylibcudf/pylibcudf/tests/test_interop.py @@ -1,8 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +import cupy as cp +import numpy as np import pyarrow as pa -import pylibcudf as plc import pytest +from utils import assert_table_eq + +import pylibcudf as plc def test_list_dtype_roundtrip(): @@ -66,3 +70,31 @@ def test_decimal_other(data_type): arrow_type = plc.interop.to_arrow(data_type, precision=precision) assert arrow_type == pa.decimal128(precision, 0) + + +def test_round_trip_dlpack_plc_table(): + expected = pa.table({"a": [1, 2, 3], "b": [5, 6, 7]}) + plc_table = plc.interop.from_arrow(expected) + result = plc.interop.from_dlpack(plc.interop.to_dlpack(plc_table)) + assert_table_eq(expected, result) + + +@pytest.mark.parametrize("array", [np.array, cp.array]) +def test_round_trip_dlpack_array(array): + arr = array([1, 2, 3]) + result = plc.interop.from_dlpack(arr.__dlpack__()) + expected = pa.table({"a": [1, 2, 3]}) + assert_table_eq(expected, result) + + +def test_to_dlpack_error(): + plc_table = plc.interop.from_arrow( + pa.table({"a": [1, None, 3], "b": [5, 6, 7]}) + ) + with pytest.raises(ValueError, 
match="Cannot create a DLPack tensor"): + plc.interop.from_dlpack(plc.interop.to_dlpack(plc_table)) + + +def test_from_dlpack_error(): + with pytest.raises(ValueError, match="Invalid PyCapsule object"): + plc.interop.from_dlpack(1) diff --git a/python/pylibcudf/pylibcudf/tests/test_join.py b/python/pylibcudf/pylibcudf/tests/test_join.py index 61e02f4d28d..f43a56046a4 100644 --- a/python/pylibcudf/pylibcudf/tests/test_join.py +++ b/python/pylibcudf/pylibcudf/tests/test_join.py @@ -2,9 +2,10 @@ import numpy as np import pyarrow as pa -import pylibcudf as plc from utils import assert_table_eq +import pylibcudf as plc + def test_cross_join(): left = pa.Table.from_arrays([[0, 1, 2], [3, 4, 5]], names=["a", "b"]) diff --git a/python/pylibcudf/pylibcudf/tests/test_json.py b/python/pylibcudf/pylibcudf/tests/test_json.py index 3d2955211f8..486a9524e92 100644 --- a/python/pylibcudf/pylibcudf/tests/test_json.py +++ b/python/pylibcudf/pylibcudf/tests/test_json.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def plc_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_labeling.py b/python/pylibcudf/pylibcudf/tests/test_labeling.py index f7fb7463b50..beacfc63ce5 100644 --- a/python/pylibcudf/pylibcudf/tests/test_labeling.py +++ b/python/pylibcudf/pylibcudf/tests/test_labeling.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize("left_inclusive", [True, False]) @pytest.mark.parametrize("right_inclusive", [True, False]) diff --git a/python/pylibcudf/pylibcudf/tests/test_lists.py b/python/pylibcudf/pylibcudf/tests/test_lists.py index 2353a6ff8f9..f3ef555f11d 100644 --- a/python/pylibcudf/pylibcudf/tests/test_lists.py +++ b/python/pylibcudf/pylibcudf/tests/test_lists.py @@ -3,10 +3,11 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture def test_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_null_mask.py b/python/pylibcudf/pylibcudf/tests/test_null_mask.py index 3edcae59edc..cd3da856de2 100644 --- a/python/pylibcudf/pylibcudf/tests/test_null_mask.py +++ b/python/pylibcudf/pylibcudf/tests/test_null_mask.py @@ -1,12 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest -from pylibcudf.null_mask import MaskState import rmm +import pylibcudf as plc +from pylibcudf.null_mask import MaskState + @pytest.fixture(params=[False, True]) def nullable(request): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_byte_pair_encode.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_byte_pair_encode.py new file mode 100644 index 00000000000..7d6718a959b --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_byte_pair_encode.py @@ -0,0 +1,46 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + return pa.array( + [ + "e n", + "i t", + "i s", + "e s", + "en t", + "c e", + "es t", + "en ce", + "t est", + "s ent", + ] + ) + + +@pytest.mark.parametrize( + "separator", [None, plc.interop.from_arrow(pa.scalar("e"))] +) +def test_byte_pair_encoding(input_col, separator): + plc_col = plc.interop.from_arrow( + pa.array(["test sentence", "thisis test"]) + ) + result = plc.nvtext.byte_pair_encode.byte_pair_encoding( + plc_col, + plc.nvtext.byte_pair_encode.BPEMergePairs( + plc.interop.from_arrow(input_col) + ), + separator, + ) + if separator is None: + expected = pa.array(["test sent ence", "t h is is test"]) + else: + expected = pa.array(["teste esenteence", "teheiseise etest"]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py index 7d93c471cc4..8b14e0db576 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def edit_distance_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py index 5cf9874d595..fae4685f81b 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def input_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py index d5a168426b1..05fe7b53c16 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def input_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py index 4e389a63f90..ead9ee094af 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) def minhash_input_data(request): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py index 283a009288d..84748b5597e 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def input_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py index fe28b83c09a..25b6d1389ec 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def norm_spaces_input_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py index 0fb54bb4ee1..65687f31c85 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def input_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py index 75d56f587a4..e7f4a971f08 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def input_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py new file mode 100644 index 00000000000..516d0f7f78d --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py @@ -0,0 +1,63 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture +def vocab_file(tmpdir): + hash_file = tmpdir.mkdir("nvtext").join("tmp_hashed_vocab.txt") + content = "1\n0\n10\n" + coefficients = [65559] * 10 + for c in coefficients: + content = content + str(c) + " 0\n" + table = [0] * 10 + table[0] = 3015668 + content = content + "10\n" + for v in table: + content = content + str(v) + "\n" + content = content + "100\n101\n102\n\n" + hash_file.write(content) + return str(hash_file) + + +@pytest.fixture +def column_input(): + return pa.array(["This is a test"]) + + +@pytest.mark.parametrize("max_sequence_length", [64, 128]) +@pytest.mark.parametrize("stride", [32, 64]) +@pytest.mark.parametrize("do_lower_case", [True, False]) +@pytest.mark.parametrize("do_truncate", [True, False]) +def test_subword_tokenize( + vocab_file, + column_input, + max_sequence_length, + stride, + do_lower_case, + do_truncate, +): + vocab = plc.nvtext.subword_tokenize.HashedVocabulary(vocab_file) + tokens, masks, metadata = plc.nvtext.subword_tokenize.subword_tokenize( + plc.interop.from_arrow(column_input), + vocab, + max_sequence_length, + stride, + do_lower_case, + do_truncate, + ) + expected_tokens = pa.array( + [100] * 4 + [0] * (max_sequence_length - 4), type=pa.uint32() + ) + expected_masks = pa.array( + [1] * 4 + [0] * (max_sequence_length - 4), type=pa.uint32() + ) + expected_metadata = pa.array([0, 0, 3], 
type=pa.uint32()) + + assert_column_eq(tokens, expected_tokens) + assert_column_eq(masks, expected_masks) + assert_column_eq(metadata, expected_metadata) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py new file mode 100644 index 00000000000..f1b4a5637e1 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py @@ -0,0 +1,96 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + return pa.array(["a", "b c", "d.e:f;"]) + + +@pytest.mark.parametrize( + "delimiter", [None, plc.interop.from_arrow(pa.scalar("."))] +) +def test_tokenize_scalar(input_col, delimiter): + result = plc.nvtext.tokenize.tokenize_scalar( + plc.interop.from_arrow(input_col), delimiter + ) + if delimiter is None: + expected = pa.array(["a", "b", "c", "d.e:f;"]) + else: + expected = pa.array(["a", "b c", "d", "e:f;"]) + assert_column_eq(result, expected) + + +def test_tokenize_column(input_col): + delimiters = pa.array([" ", ".", ":", ";"]) + result = plc.nvtext.tokenize.tokenize_column( + plc.interop.from_arrow(input_col), plc.interop.from_arrow(delimiters) + ) + expected = pa.array(["a", "b", "c", "d", "e", "f"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize( + "delimiter", [None, plc.interop.from_arrow(pa.scalar("."))] +) +def test_count_tokens_scalar(input_col, delimiter): + result = plc.nvtext.tokenize.count_tokens_scalar( + plc.interop.from_arrow(input_col), delimiter + ) + if delimiter is None: + expected = pa.array([1, 2, 1], type=pa.int32()) + else: + expected = pa.array([1, 1, 2], type=pa.int32()) + assert_column_eq(result, expected) + + +def test_count_tokens_column(input_col): + delimiters = pa.array([" ", ".", ":", ";"]) + result = plc.nvtext.tokenize.count_tokens_column( + plc.interop.from_arrow(input_col), 
plc.interop.from_arrow(delimiters) + ) + expected = pa.array([1, 2, 3], type=pa.int32()) + assert_column_eq(result, expected) + + +def test_character_tokenize(input_col): + result = plc.nvtext.tokenize.character_tokenize( + plc.interop.from_arrow(input_col) + ) + expected = pa.array(["a", "b", " ", "c", "d", ".", "e", ":", "f", ";"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize( + "delimiter", [None, plc.interop.from_arrow(pa.scalar("."))] +) +def test_detokenize(input_col, delimiter): + row_indices = pa.array([0, 0, 1]) + result = plc.nvtext.tokenize.detokenize( + plc.interop.from_arrow(input_col), plc.interop.from_arrow(row_indices) + ) + expected = pa.array(["a b c", "d.e:f;"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("default_id", [-1, 0]) +def test_tokenize_with_vocabulary(input_col, default_id): + result = plc.nvtext.tokenize.tokenize_with_vocabulary( + plc.interop.from_arrow(input_col), + plc.nvtext.tokenize.TokenizeVocabulary( + plc.interop.from_arrow(input_col) + ), + plc.interop.from_arrow(pa.scalar(" ")), + default_id, + ) + pa_result = plc.interop.to_arrow(result) + if default_id == -1: + expected = pa.array([[0], [-1, -1], [2]], type=pa_result.type) + else: + expected = pa.array([[0], [0, 0], [2]], type=pa_result.type) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_partitioning.py b/python/pylibcudf/pylibcudf/tests/test_partitioning.py index 444d0089d2c..c55e54cebc6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_partitioning.py +++ b/python/pylibcudf/pylibcudf/tests/test_partitioning.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_table_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def partitioning_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_quantiles.py b/python/pylibcudf/pylibcudf/tests/test_quantiles.py index bac56691306..e4a24fb1c98 100644 --- a/python/pylibcudf/pylibcudf/tests/test_quantiles.py +++ b/python/pylibcudf/pylibcudf/tests/test_quantiles.py @@ -3,10 +3,11 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq, assert_table_eq +import pylibcudf as plc + # Map pylibcudf interpolation options to pyarrow options interp_mapping = { plc.types.Interpolation.LINEAR: "linear", diff --git a/python/pylibcudf/pylibcudf/tests/test_regex_program.py b/python/pylibcudf/pylibcudf/tests/test_regex_program.py index 777315df538..52598f2c462 100644 --- a/python/pylibcudf/pylibcudf/tests/test_regex_program.py +++ b/python/pylibcudf/pylibcudf/tests/test_regex_program.py @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize("pat", ["(", "*", "\\"]) def test_regex_program_invalid(pat): diff --git a/python/pylibcudf/pylibcudf/tests/test_reshape.py b/python/pylibcudf/pylibcudf/tests/test_reshape.py index 01115bc363a..ef23e23766a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_reshape.py +++ b/python/pylibcudf/pylibcudf/tests/test_reshape.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq, assert_table_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def reshape_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_round.py b/python/pylibcudf/pylibcudf/tests/test_round.py index 0b30316b9a0..2526580bc13 100644 --- a/python/pylibcudf/pylibcudf/tests/test_round.py +++ b/python/pylibcudf/pylibcudf/tests/test_round.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(params=["float32", "float64"]) def column(request, has_nulls): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py index a1820def0b1..f461657281a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture() def str_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py index 176ccc55b96..3e31c75c38a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def str_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_case.py b/python/pylibcudf/pylibcudf/tests/test_string_case.py index 233cc253b14..08ac371fd96 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_case.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_case.py @@ -2,10 
+2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def string_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py index bcd030c019e..06b44210d74 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py @@ -2,9 +2,10 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_all_characters_of_type(): pa_array = pa.array(["1", "A"]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_combine.py b/python/pylibcudf/pylibcudf/tests/test_string_combine.py index 4a7007a0d6b..eea3ac68e84 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_combine.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_combine.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + def test_concatenate_scalar_seperator(): plc_table = plc.interop.from_arrow( diff --git a/python/pylibcudf/pylibcudf/tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py index 4e4dd7cbb00..ba9a4a7d3b8 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_contains.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_contains.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def target_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py index 69f7a0fdd33..3f3f452c4f6 100644 --- 
a/python/pylibcudf/pylibcudf/tests/test_string_convert.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture( scope="module", diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py index 117c59ff1b8..b391d2b290e 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_to_booleans(): pa_array = pa.array(["true", None, "True"]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py index f3e84286a36..c9368d858a4 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py @@ -3,10 +3,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture def fmt(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py index 6d704309bfd..2d3578e4e71 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py @@ -3,10 +3,11 @@ from datetime import datetime, timedelta import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture( params=[ diff --git 
a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py index b1c4d729604..012e722038e 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py @@ -2,9 +2,10 @@ import decimal import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_to_fixed_point(): typ = pa.decimal128(38, 2) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py index e9918fab559..8ee2b5075af 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_to_floats(): typ = pa.float32() diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py index 6d1d565af30..01192c2d1f8 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_to_integers(): typ = pa.int8() diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py index 4dc3e512624..b533809f106 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_ipv4_to_integers(): arr = pa.array(["123.45.67.890", None]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py index 8591732b39e..737036a4f0f 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.mark.parametrize("na_rep", [None, pa.scalar("")]) @pytest.mark.parametrize("separators", [None, pa.array([",", "[", "]"])]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py index fee8c3fb8f6..528736798c7 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py @@ -2,9 +2,10 @@ import urllib import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_url_encode(): data = ["/home/nfs", None] diff --git a/python/pylibcudf/pylibcudf/tests/test_string_extract.py b/python/pylibcudf/pylibcudf/tests/test_string_extract.py index 788b86423c4..e70edf4fb33 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_extract.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_extract.py @@ -2,6 +2,7 @@ import pyarrow as pa import pyarrow.compute as pc + import pylibcudf as plc diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find.py b/python/pylibcudf/pylibcudf/tests/test_string_find.py index db3b13a5aae..82ec18832a9 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_find.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_find.py @@ -2,10 +2,11 @@ import pyarrow as pa 
import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py index d6b37a388f0..fa9eee3594b 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_find_multiple(): arr = pa.array(["abc", "def"]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_findall.py b/python/pylibcudf/pylibcudf/tests/test_string_findall.py index debfad92d00..b73d812c898 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_findall.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_findall.py @@ -2,9 +2,10 @@ import re import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_findall(): arr = pa.array(["bunny", "rabbit", "hare", "dog"]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_padding.py b/python/pylibcudf/pylibcudf/tests/test_string_padding.py index 2ba775d17ae..79498132097 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_padding.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_padding.py @@ -2,6 +2,7 @@ import pyarrow as pa import pyarrow.compute as pc + import pylibcudf as plc diff --git a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py index 18b5d8bf4d0..c06c06be7c6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py @@ -2,9 +2,10 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest +import pylibcudf 
as plc + @pytest.mark.parametrize("repeats", [pa.array([2, 2]), 2]) def test_repeat_strings(repeats): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace.py b/python/pylibcudf/pylibcudf/tests/test_string_replace.py index 5a9c2007b73..2c7d25133de 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_replace.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_replace.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py index ff2ce348d3b..511f826441a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.mark.parametrize("max_replace_count", [-1, 1]) def test_replace_re_regex_program_scalar(max_replace_count): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_slice.py b/python/pylibcudf/pylibcudf/tests/test_string_slice.py index d9ce5591b98..1759f739e31 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_slice.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_slice.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def pa_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py index 80cae8d1c6b..4e80f19b814 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_table_eq +import pylibcudf as plc + @pytest.fixture def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py index 2aeffac8209..450b336ce65 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq, assert_table_eq +import pylibcudf as plc + @pytest.fixture def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_strip.py b/python/pylibcudf/pylibcudf/tests/test_string_strip.py index 005e5e4a405..5869e5f4920 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_strip.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_strip.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + data_strings = [ "AbC", "123abc", diff --git a/python/pylibcudf/pylibcudf/tests/test_string_translate.py b/python/pylibcudf/pylibcudf/tests/test_string_translate.py index 2ae893e69fb..84fd3354ac6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_translate.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_translate.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py index a1c820cd586..00442d866e9 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py @@ -2,9 +2,10 @@ import textwrap import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_wrap(): width = 12 diff --git a/python/pylibcudf/pylibcudf/tests/test_table.py b/python/pylibcudf/pylibcudf/tests/test_table.py index e822d6a97a8..ac39ef4c5c9 100644 --- a/python/pylibcudf/pylibcudf/tests/test_table.py +++ b/python/pylibcudf/pylibcudf/tests/test_table.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize( "arrow_tbl", diff --git a/python/pylibcudf/pylibcudf/tests/test_transform.py b/python/pylibcudf/pylibcudf/tests/test_transform.py index d5c618f07e4..49802fe64ac 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transform.py +++ b/python/pylibcudf/pylibcudf/tests/test_transform.py @@ -3,9 +3,10 @@ import math import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_nans_to_nulls(has_nans): if has_nans: diff --git a/python/pylibcudf/pylibcudf/tests/test_transpose.py b/python/pylibcudf/pylibcudf/tests/test_transpose.py index ac11123f680..b0c0bc72ead 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transpose.py +++ b/python/pylibcudf/pylibcudf/tests/test_transpose.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from packaging.version import parse +import pylibcudf as plc + @pytest.mark.skipif( parse(pa.__version__) < parse("16.0.0"), diff --git a/python/pylibcudf/pylibcudf/transform.pxd b/python/pylibcudf/pylibcudf/transform.pxd index b530f433c97..4fb623158f0 100644 --- a/python/pylibcudf/pylibcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/transform.pxd @@ -3,6 +3,7 @@ from libcpp cimport bool from pylibcudf.libcudf.types cimport bitmask_type, data_type from .column cimport Column +from .expressions cimport Expression from .gpumemoryview cimport gpumemoryview from .table cimport Table from .types cimport DataType @@ -10,6 +11,8 @@ from .types cimport DataType cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input) +cpdef Column compute_column(Table input, Expression expr) + cpdef tuple[gpumemoryview, int] bools_to_mask(Column input) cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit) diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index 
bce9702752a..e8d95cadb0c 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from cython.operator cimport dereference from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move, pair @@ -43,6 +44,32 @@ cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): ) +cpdef Column compute_column(Table input, Expression expr): + """Create a column by evaluating an expression on a table. + + For details see :cpp:func:`compute_column`. + + Parameters + ---------- + input : Table + Table used for expression evaluation + expr : Expression + Expression to evaluate + + Returns + ------- + Column of the evaluated expression + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_transform.compute_column( + input.view(), dereference(expr.c_obj.get()) + ) + + return Column.from_libcudf(move(c_result)) + + cpdef tuple[gpumemoryview, int] bools_to_mask(Column input): """Create a bitmask from a column of boolean elements diff --git a/python/pylibcudf/pylibcudf/types.pyx b/python/pylibcudf/pylibcudf/types.pyx index 58c7d97e9bc..a0c31f994a3 100644 --- a/python/pylibcudf/pylibcudf/types.pyx +++ b/python/pylibcudf/pylibcudf/types.pyx @@ -79,6 +79,16 @@ cpdef size_type size_of(DataType t): Only fixed-width types are supported. For details, see :cpp:func:`size_of`. + + Parameters + ---------- + t : DataType + The DataType to get the size of. + + Returns + ------- + int + Size in bytes of an element of the specified type. 
""" with nogil: return cpp_size_of(t.c_obj) diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index ea5b3065896..a80c85a1fa8 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -53,48 +53,20 @@ test = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", -] -known_first_party = [ - "cudf", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", -] +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["cudf"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] [tool.pytest.ini_options] # --import-mode=importlib because two test_json.py exists and tests directory is not a structured module