diff --git a/.env b/.env index 27474b2c73199..be35921f94c3a 100644 --- a/.env +++ b/.env @@ -89,13 +89,13 @@ TZ=UTC # Used through docker-compose.yml and serves as the default version for the # ci/scripts/install_vcpkg.sh script. Prefer to use short SHAs to keep the # docker tags more readable. -VCPKG="a42af01b72c28a8e1d7b48107b33e4f286a55ef6" # 2023.11.20 Release +VCPKG="943c5ef1c8f6b5e6ced092b242c8299caae2ff01" # 2024.04.26 Release # This must be updated when we update # ci/docker/python-wheel-windows-vs2019.dockerfile. # This is a workaround for our CI problem that "archery docker build" doesn't # use pulled built images in dev/tasks/python-wheels/github.windows.yml. -PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-04-09 +PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-06-18 # Use conanio/${CONAN_BASE}:{CONAN_VERSION} for "docker-compose run --rm conan". # See https://github.com/conan-io/conan-docker-tools#readme and diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index e7e544c2b0e62..e495bfd147de6 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -37,7 +37,7 @@ /go/ @zeroshade /java/ @lidavidm /js/ @domoritz @trxcllnt -/matlab/ @kevingurney @kou +/matlab/ @kevingurney @kou @sgilmore10 /python/pyarrow/_flight.pyx @lidavidm /python/pyarrow/**/*gandiva* @wjones127 /r/ @paleolimbot @thisisnic diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 1ea12b0a4d23d..5aec3638a8967 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -120,7 +120,7 @@ jobs: shell: bash run: | gem install test-unit - pip install "cython>=0.29.31" setuptools six pytest jira setuptools-scm + pip install "cython>=0.29.31" setuptools pytest jira setuptools-scm - name: Run Release Test env: ARROW_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 11dc29dcae54e..0d32628859fa0 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -306,8 +306,8 @@ jobs: run: ci/scripts/go_test.sh $(pwd) macos: - name: AMD64 macOS 11 Go ${{ matrix.go }} - runs-on: macos-latest + name: AMD64 macOS 12 Go ${{ matrix.go }} + runs-on: macos-12 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 60 strategy: @@ -364,8 +364,8 @@ jobs: macos-cgo: - name: AMD64 macOS 11 Go ${{ matrix.go }} - CGO - runs-on: macos-latest + name: AMD64 macOS 12 Go ${{ matrix.go }} - CGO + runs-on: macos-12 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 60 strategy: diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index e31f7a4fc4d27..08dbe7c8068c0 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -105,9 +105,9 @@ jobs: run: archery docker push ${{ matrix.image }} macos: - name: AMD64 macOS 11 Java JDK ${{ matrix.jdk }} - runs-on: macos-latest - if: github.event_name == 'push' + name: AMD64 macOS 12 Java JDK ${{ matrix.jdk }} + runs-on: macos-12 + if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 30 strategy: fail-fast: false diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml index 059a7430a38ce..ea5f8d694a9c6 100644 --- a/.github/workflows/java_jni.yml +++ b/.github/workflows/java_jni.yml @@ -54,7 +54,7 @@ jobs: name: AMD64 manylinux2014 Java JNI runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 90 + timeout-minutes: 240 steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index e03d0c2dadce0..c11c8254011f6 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -81,10 +81,10 @@ jobs: run: archery docker push debian-js macos: - name: AMD64 macOS 11 NodeJS ${{ matrix.node }} - runs-on: macos-latest - if: github.event_name == 'push' - timeout-minutes: 90 + name: AMD64 macOS 12 NodeJS ${{ matrix.node }} + runs-on: macos-12 + if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 30 strategy: fail-fast: false matrix: @@ -114,7 +114,8 @@ jobs: windows: name: AMD64 Windows NodeJS ${{ matrix.node }} runs-on: windows-latest - if: github.event_name == 'push' + if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 45 strategy: fail-fast: false matrix: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000000000..8d54979502430 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Release + +on: + push: + tags: + # Trigger workflow when a tag whose name matches the pattern + # pattern "apache-arrow-{MAJOR}.{MINOR}.{PATCH}" is pushed. + - "apache-arrow-[0-9]+.[0-9]+.[0-9]+" + +permissions: + contents: write + +env: + GH_TOKEN: ${{ github.token }} + +jobs: + publish: + name: Publish + runs-on: ubuntu-latest + timeout-minutes: 5 + steps: + - name: Get Tag Name of Latest Release Candidate + run: | + rc_tag=$(gh release list --repo apache/arrow | \ + cut -f3 | \ + grep -F "${GITHUB_REF_NAME}-rc" | \ + head -n1) + echo "Latest Release Candidate Tag: ${rc_tag}" + echo "RELEASE_CANDIDATE_TAG_NAME=${rc_tag}" >> ${GITHUB_ENV} + - name: Store Version and Release Candidate Number + run: | + version_with_rc=${RELEASE_CANDIDATE_TAG_NAME#apache-arrow-} + version=${version_with_rc%-rc*} + rc_num=${version_with_rc#${version}-rc} + echo "VERSION_WITH_RC=${version_with_rc}" >> ${GITHUB_ENV} + echo "VERSION=${version}" >> ${GITHUB_ENV} + echo "RC_NUM=${rc_num}" >> ${GITHUB_ENV} + - name: Download Release Candidate Artifacts + run: | + mkdir release_candidate_artifacts + gh release download ${RELEASE_CANDIDATE_TAG_NAME} --repo apache/arrow --dir release_candidate_artifacts + - name: Create Release Title + run: | + title="Apache Arrow ${VERSION}" + echo "RELEASE_TITLE=${title}" >> ${GITHUB_ENV} + # Set the release notes to "TODO" temporarily. After the release notes page + # (https://arrow.apache.org/release/{VERSION}.html) is published, use + # gh release edit to update the release notes to refer to the newly + # pushed web page. See dev/post/post-05-update-gh-release-notes.sh + - name: Create GitHub Release + run: | + gh release create ${GITHUB_REF_NAME} \ + --repo apache/arrow \ + --verify-tag \ + --title "${RELEASE_TITLE}" \ + --notes "TODO" \ + release_candidate_artifacts/* \ No newline at end of file diff --git a/.github/workflows/release_candidate.yml b/.github/workflows/release_candidate.yml new file mode 100644 index 0000000000000..ec732f0eb33e0 --- /dev/null +++ b/.github/workflows/release_candidate.yml @@ -0,0 +1,70 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Release + +on: + push: + tags: + # Trigger workflow when a tag whose name matches the pattern + # "apache-arrow-{MAJOR}.{MINOR}.{PATCH}-rc{RC_NUM}" is pushed. + - "apache-arrow-[0-9]+.[0-9]+.[0-9]+-rc[0-9]+" + +permissions: + contents: write + +env: + GH_TOKEN: ${{ github.token }} + +jobs: + publish: + name: Publish + runs-on: ubuntu-latest + timeout-minutes: 5 + steps: + - name: Checkout Arrow + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Store Version and Release Candidate Number + run: | + version_with_rc=${GITHUB_REF_NAME#apache-arrow-} + version=${version_with_rc%-rc*} + rc_num=${version_with_rc#${version}-rc} + echo "VERSION_WITH_RC=${version_with_rc}" >> ${GITHUB_ENV} + echo "VERSION=${version}" >> ${GITHUB_ENV} + echo "RC_NUM=${rc_num}" >> ${GITHUB_ENV} + - name: Create Release Candidate Title + run: | + title="Apache Arrow ${VERSION} RC${RC_NUM}" + echo "RELEASE_CANDIDATE_TITLE=${title}" >> ${GITHUB_ENV} + - name: Create Release Candidate Notes + run: | + release_notes="Release Candidate: ${VERSION} RC${RC_NUM}" + echo "RELEASE_CANDIDATE_NOTES=${release_notes}" >> ${GITHUB_ENV} + - name: Create Release tarball + run: | + cd dev/release/ && ./utils-create-release-tarball.sh ${VERSION} ${RC_NUM} + echo "RELEASE_TARBALL=apache-arrow-${VERSION}.tar.gz" >> ${GITHUB_ENV} + - name: Create GitHub Release + run: | + gh release create ${GITHUB_REF_NAME} \ + --verify-tag \ + --prerelease \ + --title "${RELEASE_CANDIDATE_TITLE}" \ + --notes "Release Notes: ${RELEASE_CANDIDATE_NOTES}" \ + dev/release/${RELEASE_TARBALL} diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index eb00bc5f92a8d..6a29ec8e72cab 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -323,6 +323,7 @@ jobs: ARROW_FLIGHT: ON ARROW_FLIGHT_SQL: ON ARROW_GANDIVA: OFF + ARROW_GLIB_VAPI: "false" ARROW_HDFS: OFF ARROW_HOME: "${{ github.workspace }}/dist" ARROW_JEMALLOC: OFF @@ -345,6 +346,7 @@ jobs: CMAKE_UNITY_BUILD: ON VCPKG_BINARY_SOURCES: 'clear;nuget,GitHub,readwrite' VCPKG_ROOT: "${{ github.workspace }}/vcpkg" + VCPKG_TRIPLET: x64-windows permissions: packages: write steps: @@ -405,7 +407,7 @@ jobs: -source "https://nuget.pkg.github.com/$GITHUB_REPOSITORY_OWNER/index.json" - name: Build C++ vcpkg dependencies run: | - vcpkg\vcpkg.exe install --triplet x64-windows --x-manifest-root cpp --x-install-root build\cpp\vcpkg_installed + vcpkg\vcpkg.exe install --triplet $env:VCPKG_TRIPLET --x-manifest-root cpp --x-install-root build\cpp\vcpkg_installed - name: Build C++ shell: cmd run: | diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 05bf8e54f9cdb..9bdd4f487bdec 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -34,7 +34,13 @@ repos: hooks: - id: hadolint-docker name: Docker Format - exclude: ^dev/.*$ + # We can enable this after we fix all existing lint failures. + # files: (/Dockerfile|\.dockerfile)$ + files: >- + ( + ?^ci/docker/python-wheel-windows-test-vs2019\.dockerfile$| + ) + types: [] - repo: https://github.com/pycqa/flake8 rev: 6.1.0 hooks: diff --git a/c_glib/arrow-dataset-glib/enums.h.template b/c_glib/arrow-dataset-glib/enums.h.template index b7d3c99c0bef8..8b89a8b031bdc 100644 --- a/c_glib/arrow-dataset-glib/enums.h.template +++ b/c_glib/arrow-dataset-glib/enums.h.template @@ -22,6 +22,8 @@ #include +#include + G_BEGIN_DECLS /*** END file-header ***/ @@ -31,6 +33,7 @@ G_BEGIN_DECLS /*** END file-production ***/ /*** BEGIN value-header ***/ +GADATASET_AVAILABLE_IN_ALL GType @enum_name@_get_type(void) G_GNUC_CONST; #define @ENUMPREFIX@_TYPE_@ENUMSHORT@ (@enum_name@_get_type()) /*** END value-header ***/ diff --git a/c_glib/arrow-dataset-glib/meson.build b/c_glib/arrow-dataset-glib/meson.build index 2d54efadfa230..3425efc5555c8 100644 --- a/c_glib/arrow-dataset-glib/meson.build +++ b/c_glib/arrow-dataset-glib/meson.build @@ -82,6 +82,7 @@ libarrow_dataset_glib = library('arrow-dataset-glib', implicit_include_directories: false, include_directories: base_include_directories, cpp_args: ['-DGADATASET_COMPILATION'], + c_args: ['-DGADATASET_COMPILATION'], soversion: so_version, version: library_version) arrow_dataset_glib = declare_dependency(link_with: libarrow_dataset_glib, diff --git a/c_glib/arrow-glib/enums.h.template b/c_glib/arrow-glib/enums.h.template index b7d3c99c0bef8..e49b717fb30db 100644 --- a/c_glib/arrow-glib/enums.h.template +++ b/c_glib/arrow-glib/enums.h.template @@ -22,6 +22,8 @@ #include +#include + G_BEGIN_DECLS /*** END file-header ***/ @@ -31,6 +33,7 @@ G_BEGIN_DECLS /*** END file-production ***/ /*** BEGIN value-header ***/ +GARROW_AVAILABLE_IN_ALL GType @enum_name@_get_type(void) G_GNUC_CONST; #define @ENUMPREFIX@_TYPE_@ENUMSHORT@ (@enum_name@_get_type()) /*** END value-header ***/ diff --git a/c_glib/arrow-glib/meson.build b/c_glib/arrow-glib/meson.build index fd32b35badcb1..36a8274513ed2 100644 --- a/c_glib/arrow-glib/meson.build +++ b/c_glib/arrow-glib/meson.build @@ -250,6 +250,7 @@ libarrow_glib = library('arrow-glib', implicit_include_directories: false, include_directories: base_include_directories, cpp_args: ['-DGARROW_COMPILATION'], + c_args: ['-DGARROW_COMPILATION'], soversion: so_version, version: library_version) arrow_glib = declare_dependency(link_with: libarrow_glib, diff --git a/c_glib/gandiva-glib/enums.h.template b/c_glib/gandiva-glib/enums.h.template index b7d3c99c0bef8..d362e14c1b2cb 100644 --- a/c_glib/gandiva-glib/enums.h.template +++ b/c_glib/gandiva-glib/enums.h.template @@ -22,6 +22,8 @@ #include +#include + G_BEGIN_DECLS /*** END file-header ***/ @@ -31,6 +33,7 @@ G_BEGIN_DECLS /*** END file-production ***/ /*** BEGIN value-header ***/ +GGANDIVA_AVAILABLE_IN_ALL GType @enum_name@_get_type(void) G_GNUC_CONST; #define @ENUMPREFIX@_TYPE_@ENUMSHORT@ (@enum_name@_get_type()) /*** END value-header ***/ diff --git a/c_glib/gandiva-glib/meson.build b/c_glib/gandiva-glib/meson.build index 8cd00b3805b91..94b923388b7f2 100644 --- a/c_glib/gandiva-glib/meson.build +++ b/c_glib/gandiva-glib/meson.build @@ -86,6 +86,7 @@ libgandiva_glib = library('gandiva-glib', implicit_include_directories: false, include_directories: base_include_directories, cpp_args: ['-DGGANDIVA_COMPILATION'], + c_args: ['-DGGANDIVA_COMPILATION'], soversion: so_version, version: library_version) gandiva_glib = declare_dependency(link_with: libgandiva_glib, diff --git a/c_glib/tool/generate-version-header.py b/c_glib/tool/generate-version-header.py index f2fc26132c143..7422432251ff1 100755 --- a/c_glib/tool/generate-version-header.py +++ b/c_glib/tool/generate-version-header.py @@ -84,7 +84,7 @@ def write_header( def generate_visibility_macros(library: str) -> str: - return f"""#if (defined(_WIN32) || defined(__CYGWIN__)) && defined(_MSVC_LANG) && \ + return f"""#if (defined(_WIN32) || defined(__CYGWIN__)) && defined(_MSC_VER) && \ !defined({library}_STATIC_COMPILATION) # define {library}_EXPORT __declspec(dllexport) # define {library}_IMPORT __declspec(dllimport) diff --git a/c_glib/vcpkg.json b/c_glib/vcpkg.json index 4a14a1e437ff6..e88d2b8fe30d5 100644 --- a/c_glib/vcpkg.json +++ b/c_glib/vcpkg.json @@ -3,6 +3,7 @@ "version-string": "17.0.0-SNAPSHOT", "dependencies": [ "glib", + "gobject-introspection", "pkgconf" ] } diff --git a/ci/docker/centos-7-cpp.dockerfile b/ci/docker/centos-7-cpp.dockerfile index 8c1893cbbb2ae..1f30eed694e4e 100644 --- a/ci/docker/centos-7-cpp.dockerfile +++ b/ci/docker/centos-7-cpp.dockerfile @@ -17,11 +17,25 @@ FROM centos:centos7 +# Update mirrors to use vault.centos.org as CentOS 7 +# is EOL since 2024-06-30 +RUN sed -i \ + -e 's/^mirrorlist/#mirrorlist/' \ + -e 's/^#baseurl/baseurl/' \ + -e 's/mirror\.centos\.org/vault.centos.org/' \ + /etc/yum.repos.d/*.repo + # devtoolset is required for C++17 RUN \ yum install -y \ centos-release-scl \ epel-release && \ + sed -i \ + -e 's/^mirrorlist/#mirrorlist/' \ + -e 's/^#baseurl/baseurl/' \ + -e 's/^# baseurl/baseurl/' \ + -e 's/mirror\.centos\.org/vault.centos.org/' \ + /etc/yum.repos.d/CentOS-SCLo-scl*.repo && \ yum install -y \ cmake3 \ curl \ diff --git a/ci/docker/conda-python-substrait.dockerfile b/ci/docker/conda-python-substrait.dockerfile index 191795f253000..36dd64e51e7ad 100644 --- a/ci/docker/conda-python-substrait.dockerfile +++ b/ci/docker/conda-python-substrait.dockerfile @@ -24,11 +24,19 @@ FROM ${repo}:${arch}-conda-python-${python} COPY ci/conda_env_python.txt \ ci/conda_env_sphinx.txt \ /arrow/ci/ + +# Note: openjdk is pinned to 17 because the +# substrait repo currently pins to jdk 17. +# Newer jdk versions are currently failing +# due to the recent upgrade to Gradle 8 via +# install_substrait_consumer.sh. +# https://github.com/substrait-io/substrait-java/issues/274 RUN mamba install -q -y \ --file arrow/ci/conda_env_python.txt \ --file arrow/ci/conda_env_sphinx.txt \ $([ "$python" == "3.9" ] && echo "pickle5") \ - python=${python} openjdk \ + python=${python} \ + openjdk=17 \ nomkl && \ mamba clean --all diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index 63fd7b1d46820..cb39667af1e10 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -25,6 +25,18 @@ ARG manylinux ENV MANYLINUX_VERSION=${manylinux} # Ensure dnf is installed, especially for the manylinux2014 base +RUN if [ "${MANYLINUX_VERSION}" = "2014" ]; then \ + sed -i \ + -e 's/^mirrorlist/#mirrorlist/' \ + -e 's/^#baseurl/baseurl/' \ + -e 's/mirror\.centos\.org/vault.centos.org/' \ + /etc/yum.repos.d/*.repo; \ + if [ "${arch}" != "amd64" ]; then \ + sed -i \ + -e 's,vault\.centos\.org/centos,vault.centos.org/altarch,' \ + /etc/yum.repos.d/CentOS-SCLo-scl-rh.repo; \ + fi; \ + fi RUN yum install -y dnf # Install basic dependencies @@ -39,8 +51,7 @@ ENV CPYTHON_VERSION=cp38 ENV PATH=/opt/python/${CPYTHON_VERSION}-${CPYTHON_VERSION}/bin:${PATH} # Install CMake -# AWS SDK doesn't work with CMake=3.22 due to https://gitlab.kitware.com/cmake/cmake/-/issues/22524 -ARG cmake=3.21.4 +ARG cmake=3.29.2 COPY ci/scripts/install_cmake.sh arrow/ci/scripts/ RUN /arrow/ci/scripts/install_cmake.sh ${arch} linux ${cmake} /usr/local diff --git a/ci/docker/python-wheel-windows-test-vs2019.dockerfile b/ci/docker/python-wheel-windows-test-vs2019.dockerfile index 819324a74e12a..32bbb55e82689 100644 --- a/ci/docker/python-wheel-windows-test-vs2019.dockerfile +++ b/ci/docker/python-wheel-windows-test-vs2019.dockerfile @@ -22,6 +22,8 @@ # contains choco and vs2019 preinstalled FROM abrarov/msvc-2019:2.11.0 +# hadolint shell=cmd.exe + # Add unix tools to path RUN setx path "%path%;C:\Program Files\Git\usr\bin" @@ -40,8 +42,8 @@ RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10" && setx PATH "%PATH%;C:\P (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PATH "%PATH%;C:\Python310;C:\Python310\Scripts") & \ (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.5" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") & \ (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.0" && setx PATH "%PATH%;C:\Python312;C:\Python312\Scripts") -RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION% -RUN python -m pip install -U pip setuptools # Install archiver to extract xz archives -RUN choco install --no-progress -r -y archiver +RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION% & \ + python -m pip install --no-cache-dir -U pip setuptools & \ + choco install --no-progress -r -y archiver diff --git a/ci/scripts/c_glib_build.sh b/ci/scripts/c_glib_build.sh index ee01bb220710e..059e45e2a1386 100755 --- a/ci/scripts/c_glib_build.sh +++ b/ci/scripts/c_glib_build.sh @@ -34,15 +34,22 @@ if [ -n "${MSYSTEM:-}" ]; then export ARROW_HOME="$(cygpath --unix "${ARROW_HOME}")" fi +export PATH="${ARROW_HOME}/bin:${PATH}" + meson_pkg_config_path="${ARROW_HOME}/lib/pkgconfig" mkdir -p ${build_dir} -if [ -n "${VCPKG_ROOT:-}" ]; then +if [ -n "${VCPKG_ROOT:-}" -a -n "${VCPKG_TRIPLET:-}" ]; then vcpkg_install_root="${build_root}/vcpkg_installed" $VCPKG_ROOT/vcpkg install --x-manifest-root=${source_dir} --x-install-root=${vcpkg_install_root} - export PKG_CONFIG="${vcpkg_install_root}/x64-windows/tools/pkgconf/pkgconf.exe" - meson_pkg_config_path="${vcpkg_install_root}/x64-windows/lib/pkgconfig:${meson_pkg_config_path}" + export PKG_CONFIG="${vcpkg_install_root}/${VCPKG_TRIPLET}/tools/pkgconf/pkgconf.exe" + meson_pkg_config_path="${vcpkg_install_root}/${VCPKG_TRIPLET}/lib/pkgconfig:${meson_pkg_config_path}" + # Configure PATH for libraries required by the gobject-introspection generated binary + cpp_vcpkg_install_root="${build_root}/cpp/vcpkg_installed" + PATH="${cpp_vcpkg_install_root}/${VCPKG_TRIPLET}/debug/bin:${PATH}" + PATH="${cpp_vcpkg_install_root}/${VCPKG_TRIPLET}/bin:${PATH}" + export PATH="${vcpkg_install_root}/${VCPKG_TRIPLET}/bin:${PATH}" fi if [ -n "${VCToolsInstallDir:-}" -a -n "${MSYSTEM:-}" ]; then diff --git a/ci/scripts/java_full_build.sh b/ci/scripts/java_full_build.sh index d914aa2d8472e..4beade50b4556 100755 --- a/ci/scripts/java_full_build.sh +++ b/ci/scripts/java_full_build.sh @@ -53,7 +53,8 @@ mvn clean \ -Parrow-c-data \ -Parrow-jni \ -Darrow.cpp.build.dir=$dist_dir \ - -Darrow.c.jni.dist.dir=$dist_dir + -Darrow.c.jni.dist.dir=$dist_dir \ + --no-transfer-progress # copy all jar, zip and pom files to the distribution folder find ~/.m2/repository/org/apache/arrow \ diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index 6e29ef58d2318..aa86494a9d47d 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -160,6 +160,26 @@ export CMAKE_PREFIX_PATH=/tmp/arrow-dist pushd /arrow/python python setup.py bdist_wheel +echo "=== Strip symbols from wheel ===" +mkdir -p dist/temp-fix-wheel +mv dist/pyarrow-*.whl dist/temp-fix-wheel + +pushd dist/temp-fix-wheel +wheel_name=$(ls pyarrow-*.whl) +# Unzip and remove old wheel +unzip $wheel_name +rm $wheel_name +for filename in $(ls pyarrow/*.so pyarrow/*.so.*); do + echo "Stripping debug symbols from: $filename"; + strip --strip-debug $filename +done +# Zip wheel again after stripping symbols +zip -r $wheel_name . +mv $wheel_name .. +popd + +rm -rf dist/temp-fix-wheel + echo "=== (${PYTHON_VERSION}) Tag the wheel with manylinux${MANYLINUX_VERSION} ===" auditwheel repair -L . dist/pyarrow-*.whl -w repaired_wheels popd diff --git a/ci/scripts/swift_test.sh b/ci/scripts/swift_test.sh index b523e3891d93c..aba90f31e50d5 100755 --- a/ci/scripts/swift_test.sh +++ b/ci/scripts/swift_test.sh @@ -34,10 +34,14 @@ popd source_dir=${1}/swift/Arrow pushd ${source_dir} +sed 's/\/\/ build://g' Package.swift > Package.swift.build +mv Package.swift.build Package.swift swift test popd source_dir=${1}/swift/ArrowFlight pushd ${source_dir} +sed 's/\/\/ build://g' Package.swift > Package.swift.build +mv Package.swift.build Package.swift swift test popd diff --git a/ci/vcpkg/ports.patch b/ci/vcpkg/ports.patch index 0d4fb540a2003..136b719ea72dd 100644 --- a/ci/vcpkg/ports.patch +++ b/ci/vcpkg/ports.patch @@ -1,11 +1,11 @@ diff --git a/ports/curl/portfile.cmake b/ports/curl/portfile.cmake -index bdc544e9e..53f6bbc3b 100644 +index 7cab6f726..697ab1bb4 100644 --- a/ports/curl/portfile.cmake +++ b/ports/curl/portfile.cmake -@@ -74,9 +74,12 @@ vcpkg_cmake_configure( - -DENABLE_MANUAL=OFF +@@ -84,9 +84,12 @@ vcpkg_cmake_configure( + -DBUILD_TESTING=OFF + -DENABLE_CURL_MANUAL=OFF -DCURL_CA_FALLBACK=ON - -DCURL_USE_LIBPSL=OFF + -DCURL_CA_PATH=none + -DCURL_CA_BUNDLE=none -DCMAKE_DISABLE_FIND_PACKAGE_Perl=ON @@ -15,6 +15,19 @@ index bdc544e9e..53f6bbc3b 100644 ) vcpkg_cmake_install() vcpkg_copy_pdbs() +diff --git a/ports/llvm/portfile.cmake b/ports/llvm/portfile.cmake +index a79c72a59..6b7fa6a66 100644 +--- a/ports/llvm/portfile.cmake ++++ b/ports/llvm/portfile.cmake +@@ -292,6 +292,8 @@ vcpkg_cmake_configure( + ${FEATURE_OPTIONS} + MAYBE_UNUSED_VARIABLES + COMPILER_RT_ENABLE_IOS ++ BOLT_TOOLS_INSTALL_DIR ++ LIBOMP_INSTALL_ALIASES + ) + + vcpkg_cmake_install(ADD_BIN_TO_PATH) diff --git a/ports/snappy/portfile.cmake b/ports/snappy/portfile.cmake index 0c7098082..c603c3653 100644 --- a/ports/snappy/portfile.cmake @@ -52,16 +65,3 @@ index 000000000..e839c93a4 + } + + static inline bool LeftShiftOverflows(uint8_t value, uint32_t shift) { -diff --git a/ports/llvm/portfile.cmake b/ports/llvm/portfile.cmake -index bf9397b66..c3112b673 100644 ---- a/ports/llvm/portfile.cmake -+++ b/ports/llvm/portfile.cmake -@@ -293,6 +293,8 @@ vcpkg_cmake_configure( - ${FEATURE_OPTIONS} - MAYBE_UNUSED_VARIABLES - COMPILER_RT_ENABLE_IOS -+ BOLT_TOOLS_INSTALL_DIR -+ LIBOMP_INSTALL_ALIASES - ) - - vcpkg_cmake_install(ADD_BIN_TO_PATH) diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index 13d1241990c31..cb4cdfc03ac82 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -220,6 +220,14 @@ "PARQUET_REQUIRE_ENCRYPTION": "ON" } }, + { + "name": "features-valgrind", + "hidden": true, + "cacheVariables": { + "ARROW_RUNTIME_SIMD_LEVEL": "AVX2", + "ARROW_TEST_MEMCHECK": "ON" + } + }, { "name": "ninja-debug-minimal", "inherits": [ @@ -331,6 +339,46 @@ "displayName": "Debug build with everything enabled (except benchmarks)", "cacheVariables": {} }, + { + "name": "ninja-debug-valgrind-basic", + "inherits": [ + "base-debug", + "features-basic", + "features-valgrind" + ], + "displayName": "Debug build for Valgrind with reduced dependencies", + "cacheVariables": {} + }, + { + "name": "ninja-debug-valgrind", + "inherits": [ + "base-debug", + "features-main", + "features-valgrind" + ], + "displayName": "Debug build for Valgrind with more optional components", + "cacheVariables": {} + }, + { + "name": "ninja-debug-valgrind-minimal", + "inherits": [ + "base-debug", + "features-minimal", + "features-valgrind" + ], + "displayName": "Debug build for Valgrind without anything enabled", + "cacheVariables": {} + }, + { + "name": "ninja-debug-valgrind-maximal", + "inherits": [ + "base-debug", + "features-maximal", + "features-valgrind" + ], + "displayName": "Debug build for Valgrind with everything enabled", + "cacheVariables": {} + }, { "name": "ninja-release-minimal", "inherits": [ @@ -491,4 +539,4 @@ } } ] -} \ No newline at end of file +} diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 3c58ba649c4dd..fe859a0121ca6 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -634,8 +634,10 @@ endif() if(DEFINED ENV{ARROW_CARES_URL}) set(CARES_SOURCE_URL "$ENV{ARROW_CARES_URL}") else() + string(REPLACE "." "_" ARROW_CARES_BUILD_VERSION_UNDERSCORES + ${ARROW_CARES_BUILD_VERSION}) set_urls(CARES_SOURCE_URL - "https://c-ares.haxx.se/download/c-ares-${ARROW_CARES_BUILD_VERSION}.tar.gz" + "https://github.com/c-ares/c-ares/releases/download/cares-${ARROW_CARES_BUILD_VERSION_UNDERSCORES}/c-ares-${ARROW_CARES_BUILD_VERSION}.tar.gz" "${THIRDPARTY_MIRROR_URL}/cares-${ARROW_CARES_BUILD_VERSION}.tar.gz") endif() @@ -4613,8 +4615,11 @@ macro(build_opentelemetry) set(_OPENTELEMETRY_LIBS common http_client_curl + logs + ostream_log_record_exporter ostream_span_exporter otlp_http_client + otlp_http_log_record_exporter otlp_http_exporter otlp_recordable proto @@ -4647,6 +4652,14 @@ macro(build_opentelemetry) set(_OPENTELEMETRY_STATIC_LIBRARY "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_exporter_otlp_http${CMAKE_STATIC_LIBRARY_SUFFIX}" ) + elseif(_OPENTELEMETRY_LIB STREQUAL "otlp_http_log_record_exporter") + set(_OPENTELEMETRY_STATIC_LIBRARY + "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_exporter_otlp_http_log${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + elseif(_OPENTELEMETRY_LIB STREQUAL "ostream_log_record_exporter") + set(_OPENTELEMETRY_STATIC_LIBRARY + "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_exporter_ostream_logs${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) else() set(_OPENTELEMETRY_STATIC_LIBRARY "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_${_OPENTELEMETRY_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}" @@ -4681,9 +4694,16 @@ macro(build_opentelemetry) IMPORTED_LOCATION) list(APPEND OPENTELEMETRY_CMAKE_ARGS - -DWITH_OTLP=ON -DWITH_OTLP_HTTP=ON -DWITH_OTLP_GRPC=OFF + # Disabled because it seemed to cause linking errors. May be worth a closer look. + -DWITH_FUNC_TESTS=OFF + # These options are slated for removal in v1.14 and their features are deemed stable + # as of v1.13. However, setting their corresponding ENABLE_* macros in headers seems + # finicky - resulting in build failures or ABI-related runtime errors during HTTP + # client initialization. There may still be a solution, but we disable them for now. + -DWITH_OTLP_HTTP_SSL_PREVIEW=OFF + -DWITH_OTLP_HTTP_SSL_TLS_PREVIEW=OFF "-DProtobuf_INCLUDE_DIR=${OPENTELEMETRY_PROTOBUF_INCLUDE_DIR}" "-DProtobuf_LIBRARY=${OPENTELEMETRY_PROTOBUF_INCLUDE_DIR}" "-DProtobuf_PROTOC_EXECUTABLE=${OPENTELEMETRY_PROTOC_EXECUTABLE}") @@ -4757,19 +4777,25 @@ macro(build_opentelemetry) target_link_libraries(opentelemetry-cpp::resources INTERFACE opentelemetry-cpp::common) target_link_libraries(opentelemetry-cpp::trace INTERFACE opentelemetry-cpp::common opentelemetry-cpp::resources) + target_link_libraries(opentelemetry-cpp::logs INTERFACE opentelemetry-cpp::common + opentelemetry-cpp::resources) target_link_libraries(opentelemetry-cpp::http_client_curl - INTERFACE opentelemetry-cpp::ext CURL::libcurl) + INTERFACE opentelemetry-cpp::common opentelemetry-cpp::ext + CURL::libcurl) target_link_libraries(opentelemetry-cpp::proto INTERFACE ${ARROW_PROTOBUF_LIBPROTOBUF}) target_link_libraries(opentelemetry-cpp::otlp_recordable - INTERFACE opentelemetry-cpp::trace opentelemetry-cpp::resources - opentelemetry-cpp::proto) + INTERFACE opentelemetry-cpp::logs opentelemetry-cpp::trace + opentelemetry-cpp::resources opentelemetry-cpp::proto) target_link_libraries(opentelemetry-cpp::otlp_http_client - INTERFACE opentelemetry-cpp::sdk opentelemetry-cpp::proto + INTERFACE opentelemetry-cpp::common opentelemetry-cpp::proto opentelemetry-cpp::http_client_curl nlohmann_json::nlohmann_json) target_link_libraries(opentelemetry-cpp::otlp_http_exporter INTERFACE opentelemetry-cpp::otlp_recordable opentelemetry-cpp::otlp_http_client) + target_link_libraries(opentelemetry-cpp::otlp_http_log_record_exporter + INTERFACE opentelemetry-cpp::otlp_recordable + opentelemetry-cpp::otlp_http_client) foreach(_OPENTELEMETRY_LIB ${_OPENTELEMETRY_LIBS}) add_dependencies(opentelemetry-cpp::${_OPENTELEMETRY_LIB} opentelemetry_ep) @@ -4791,7 +4817,11 @@ if(ARROW_WITH_OPENTELEMETRY) set(opentelemetry-cpp_SOURCE "AUTO") resolve_dependency(opentelemetry-cpp) set(ARROW_OPENTELEMETRY_LIBS - opentelemetry-cpp::trace opentelemetry-cpp::ostream_span_exporter + opentelemetry-cpp::trace + opentelemetry-cpp::logs + opentelemetry-cpp::otlp_http_log_record_exporter + opentelemetry-cpp::ostream_log_record_exporter + opentelemetry-cpp::ostream_span_exporter opentelemetry-cpp::otlp_http_exporter) get_target_property(OPENTELEMETRY_INCLUDE_DIR opentelemetry-cpp::api INTERFACE_INCLUDE_DIRECTORIES) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 5bcd4625b3b67..6dc8358f502f5 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -522,6 +522,7 @@ set(ARROW_UTIL_SRCS util/int_util.cc util/io_util.cc util/list_util.cc + util/logger.cc util/logging.cc util/key_value_metadata.cc util/memory.cc @@ -627,6 +628,17 @@ if(ARROW_WITH_ZSTD) endforeach() endif() +if(ARROW_WITH_OPENTELEMETRY) + arrow_add_object_library(ARROW_TELEMETRY telemetry/logging.cc) + + foreach(ARROW_TELEMETRY_TARGET ${ARROW_TELEMETRY_TARGETS}) + target_link_libraries(${ARROW_TELEMETRY_TARGET} PRIVATE ${ARROW_OPENTELEMETRY_LIBS}) + endforeach() +else() + set(ARROW_TELEMETRY_TARGET_SHARED) + set(ARROW_TELEMETRY_TARGET_STATIC) +endif() + set(ARROW_TESTING_SHARED_LINK_LIBS arrow_shared ${ARROW_GTEST_GTEST}) set(ARROW_TESTING_SHARED_PRIVATE_LINK_LIBS arrow::flatbuffers RapidJSON) set(ARROW_TESTING_STATIC_LINK_LIBS arrow::flatbuffers RapidJSON arrow_static @@ -1016,6 +1028,7 @@ add_arrow_lib(arrow ${ARROW_JSON_TARGET_SHARED} ${ARROW_MEMORY_POOL_TARGET_SHARED} ${ARROW_ORC_TARGET_SHARED} + ${ARROW_TELEMETRY_TARGET_SHARED} ${ARROW_UTIL_TARGET_SHARED} ${ARROW_VENDORED_TARGET_SHARED} ${ARROW_SHARED_PRIVATE_LINK_LIBS} @@ -1031,6 +1044,7 @@ add_arrow_lib(arrow ${ARROW_JSON_TARGET_STATIC} ${ARROW_MEMORY_POOL_TARGET_STATIC} ${ARROW_ORC_TARGET_STATIC} + ${ARROW_TELEMETRY_TARGET_STATIC} ${ARROW_UTIL_TARGET_STATIC} ${ARROW_VENDORED_TARGET_STATIC} ${ARROW_SYSTEM_LINK_LIBS} @@ -1260,6 +1274,10 @@ if(ARROW_SUBSTRAIT) add_subdirectory(engine) endif() +if(ARROW_WITH_OPENTELEMETRY) + add_subdirectory(telemetry) +endif() + if(ARROW_TENSORFLOW) add_subdirectory(adapters/tensorflow) endif() diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 215b1e4d21125..f7b442cc3c624 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -3220,7 +3220,7 @@ TEST(HashJoin, ManyJoins) { // stack), which is essentially the recursive usage of the temp vector stack. // A fair number of joins to guarantee temp vector stack overflow before GH-41335. - const int num_joins = 64; + const int num_joins = 16; // `ExecBatchBuilder::num_rows_max()` is the number of rows for swiss join to accumulate // before outputting. diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index 063b68706b313..3d18d5f967b72 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -1369,14 +1369,26 @@ TEST_F(TestMapArray, FromArrays) { ASSERT_RAISES(Invalid, MapArray::FromArrays(offsets1, keys_with_null, tmp_items, pool_)); - // With null_bitmap - ASSERT_OK_AND_ASSIGN(auto map7, MapArray::FromArrays(offsets1, keys, items, pool_, - offsets3->data()->buffers[0])); + // With null_bitmap and null_count=1 + auto null_bitmap_1 = ArrayFromJSON(boolean(), "[1, 0, 1]")->data()->buffers[1]; + ASSERT_OK_AND_ASSIGN(auto map7, + MapArray::FromArrays(offsets1, keys, items, pool_, null_bitmap_1)); ASSERT_OK(map7->Validate()); MapArray expected7(map_type, length, offsets1->data()->buffers[1], keys, items, - offsets3->data()->buffers[0], 1); + null_bitmap_1, 1); + ASSERT_EQ(map7->null_count(), 1); AssertArraysEqual(expected7, *map7); + // With null_bitmap and null_count=2 + auto null_bitmap_2 = ArrayFromJSON(boolean(), "[0, 1, 0]")->data()->buffers[1]; + ASSERT_OK_AND_ASSIGN(auto map8, + MapArray::FromArrays(offsets1, keys, items, pool_, null_bitmap_2)); + ASSERT_OK(map8->Validate()); + MapArray expected8(map_type, length, offsets1->data()->buffers[1], keys, items, + null_bitmap_2, 2); + ASSERT_EQ(map8->null_count(), 2); + AssertArraysEqual(expected8, *map8); + // Null bitmap and offset with null ASSERT_RAISES(Invalid, MapArray::FromArrays(offsets3, keys, items, pool_, offsets3->data()->buffers[0])); diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 2f6bca3d571ed..47c0fd35829a1 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -807,7 +807,7 @@ MapArray::MapArray(const std::shared_ptr& type, int64_t length, Result> MapArray::FromArraysInternal( std::shared_ptr type, const std::shared_ptr& offsets, const std::shared_ptr& keys, const std::shared_ptr& items, - MemoryPool* pool, const std::shared_ptr& null_bitmap) { + MemoryPool* pool, std::shared_ptr null_bitmap) { using offset_type = typename MapType::offset_type; using OffsetArrowType = typename CTypeTraits::ArrowType; @@ -836,7 +836,7 @@ Result> MapArray::FromArraysInternal( return Status::NotImplemented("Null bitmap with offsets slice not supported."); } - if (offsets->null_count() > 0) { + if (offsets->data()->MayHaveNulls()) { ARROW_ASSIGN_OR_RAISE(auto buffers, CleanListOffsets(NULLPTR, *offsets, pool)); return std::make_shared(type, offsets->length() - 1, std::move(buffers), @@ -847,30 +847,32 @@ Result> MapArray::FromArraysInternal( const auto& typed_offsets = checked_cast(*offsets); BufferVector buffers; - int64_t null_count; - if (null_bitmap != nullptr) { - buffers = BufferVector({std::move(null_bitmap), typed_offsets.values()}); - null_count = null_bitmap->size(); - } else { - buffers = BufferVector({null_bitmap, typed_offsets.values()}); - null_count = 0; + buffers.resize(2); + int64_t null_count = 0; + if (null_bitmap) { + buffers[0] = std::move(null_bitmap); + null_count = kUnknownNullCount; } + buffers[1] = typed_offsets.values(); return std::make_shared(type, offsets->length() - 1, std::move(buffers), keys, items, /*null_count=*/null_count, offsets->offset()); } -Result> MapArray::FromArrays( - const std::shared_ptr& offsets, const std::shared_ptr& keys, - const std::shared_ptr& items, MemoryPool* pool, - const std::shared_ptr& null_bitmap) { +Result> MapArray::FromArrays(const std::shared_ptr& offsets, + const std::shared_ptr& keys, + const std::shared_ptr& items, + MemoryPool* pool, + std::shared_ptr null_bitmap) { return FromArraysInternal(std::make_shared(keys->type(), items->type()), - offsets, keys, items, pool, null_bitmap); + offsets, keys, items, pool, std::move(null_bitmap)); } -Result> MapArray::FromArrays( - std::shared_ptr type, const std::shared_ptr& offsets, - const std::shared_ptr& keys, const std::shared_ptr& items, - MemoryPool* pool, const std::shared_ptr& null_bitmap) { +Result> MapArray::FromArrays(std::shared_ptr type, + const std::shared_ptr& offsets, + const std::shared_ptr& keys, + const std::shared_ptr& items, + MemoryPool* pool, + std::shared_ptr null_bitmap) { if (type->id() != Type::MAP) { return Status::TypeError("Expected map type, got ", type->ToString()); } @@ -881,7 +883,8 @@ Result> MapArray::FromArrays( if (!map_type.item_type()->Equals(items->type())) { return Status::TypeError("Mismatching map items type"); } - return FromArraysInternal(std::move(type), offsets, keys, items, pool, null_bitmap); + return FromArraysInternal(std::move(type), offsets, keys, items, pool, + std::move(null_bitmap)); } Status MapArray::ValidateChildData( diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index f96b6bd3b1346..a6d4977839ef1 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -537,13 +537,13 @@ class ARROW_EXPORT MapArray : public ListArray { static Result> FromArrays( const std::shared_ptr& offsets, const std::shared_ptr& keys, const std::shared_ptr& items, MemoryPool* pool = default_memory_pool(), - const std::shared_ptr& null_bitmap = NULLPTR); + std::shared_ptr null_bitmap = NULLPTR); static Result> FromArrays( std::shared_ptr type, const std::shared_ptr& offsets, const std::shared_ptr& keys, const std::shared_ptr& items, MemoryPool* pool = default_memory_pool(), - const std::shared_ptr& null_bitmap = NULLPTR); + std::shared_ptr null_bitmap = NULLPTR); const MapType* map_type() const { return map_type_; } @@ -563,7 +563,7 @@ class ARROW_EXPORT MapArray : public ListArray { static Result> FromArraysInternal( std::shared_ptr type, const std::shared_ptr& offsets, const std::shared_ptr& keys, const std::shared_ptr& items, - MemoryPool* pool, const std::shared_ptr& null_bitmap = NULLPTR); + MemoryPool* pool, std::shared_ptr null_bitmap = NULLPTR); private: const MapType* map_type_; diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 6089cf04d421f..1851ef9122274 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -642,6 +642,8 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { /// \brief Builder class for fixed-length list array value types class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { public: + using TypeClass = FixedSizeListType; + /// Use this constructor to define the built array's type explicitly. If value_builder /// has indeterminate type, this builder will also. FixedSizeListBuilder(MemoryPool* pool, diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index afb664c3bc258..eba575f4cf39c 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -528,7 +528,7 @@ namespace { struct ExportedArrayPrivateData : PoolAllocationMixin { // The buffers are owned by the ArrayData member SmallVector buffers_; - struct ArrowArray dictionary_; + struct ArrowArray dictionary_ {}; SmallVector children_; SmallVector child_pointers_; diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index eaec940556361..7c3bc46650e9f 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -341,7 +341,8 @@ static auto kMatchSubstringOptionsType = GetFunctionOptionsType( DataMember("nan_is_null", &NullOptions::nan_is_null)); static auto kPadOptionsType = GetFunctionOptionsType( - DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding)); + DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding), + DataMember("lean_left_on_odd_padding", &PadOptions::lean_left_on_odd_padding)); static auto kReplaceSliceOptionsType = GetFunctionOptionsType( DataMember("start", &ReplaceSliceOptions::start), DataMember("stop", &ReplaceSliceOptions::stop), @@ -480,10 +481,11 @@ NullOptions::NullOptions(bool nan_is_null) : FunctionOptions(internal::kNullOptionsType), nan_is_null(nan_is_null) {} constexpr char NullOptions::kTypeName[]; -PadOptions::PadOptions(int64_t width, std::string padding) +PadOptions::PadOptions(int64_t width, std::string padding, bool lean_left_on_odd_padding) : FunctionOptions(internal::kPadOptionsType), width(width), - padding(std::move(padding)) {} + padding(std::move(padding)), + lean_left_on_odd_padding(lean_left_on_odd_padding) {} PadOptions::PadOptions() : PadOptions(0, " ") {} constexpr char PadOptions::kTypeName[]; diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index bad34f4a37881..947474e5962d0 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -358,7 +358,8 @@ class ARROW_EXPORT StrftimeOptions : public FunctionOptions { class ARROW_EXPORT PadOptions : public FunctionOptions { public: - explicit PadOptions(int64_t width, std::string padding = " "); + explicit PadOptions(int64_t width, std::string padding = " ", + bool lean_left_on_odd_padding = true); PadOptions(); static constexpr char const kTypeName[] = "PadOptions"; @@ -366,6 +367,10 @@ class ARROW_EXPORT PadOptions : public FunctionOptions { int64_t width; /// What to pad the string with. Should be one codepoint (Unicode)/byte (ASCII). std::string padding; + /// What to do if there is an odd number of padding characters (in case of centered + /// padding). Defaults to aligning on the left (i.e. adding the extra padding character + /// on the right) + bool lean_left_on_odd_padding = true; }; class ARROW_EXPORT TrimOptions : public FunctionOptions { diff --git a/cpp/src/arrow/compute/expression.cc b/cpp/src/arrow/compute/expression.cc index b1d914ce873cc..33e5928c2865d 100644 --- a/cpp/src/arrow/compute/expression.cc +++ b/cpp/src/arrow/compute/expression.cc @@ -763,9 +763,7 @@ Result ExecuteScalarExpression(const Expression& expr, const ExecBatch& i for (size_t i = 0; i < arguments.size(); ++i) { ARROW_ASSIGN_OR_RAISE( arguments[i], ExecuteScalarExpression(call->arguments[i], input, exec_context)); - if (arguments[i].is_array()) { - all_scalar = false; - } + all_scalar &= arguments[i].is_scalar(); } int64_t input_length; diff --git a/cpp/src/arrow/compute/expression_test.cc b/cpp/src/arrow/compute/expression_test.cc index 30bd882b2c039..d94a17b6ffadf 100644 --- a/cpp/src/arrow/compute/expression_test.cc +++ b/cpp/src/arrow/compute/expression_test.cc @@ -909,6 +909,41 @@ TEST(Expression, ExecuteCallWithNoArguments) { EXPECT_EQ(actual.length(), kCount); } +TEST(Expression, ExecuteChunkedArray) { + // GH-41923: compute should generate the right result if input + // ExecBatch is `chunked_array`. + auto input_schema = struct_({field("a", struct_({ + field("a", float64()), + field("b", float64()), + }))}); + + auto chunked_array_input = ChunkedArrayFromJSON(input_schema, {R"([ + {"a": {"a": 6.125, "b": 3.375}}, + {"a": {"a": 0.0, "b": 1}} + ])", + R"([ + {"a": {"a": -1, "b": 4.75}} + ])"}); + + ASSERT_OK_AND_ASSIGN(auto table_input, + Table::FromChunkedStructArray(chunked_array_input)); + + auto expr = add(field_ref(FieldRef("a", "a")), field_ref(FieldRef("a", "b"))); + + ASSERT_OK_AND_ASSIGN(expr, expr.Bind(input_schema)); + std::vector inputs{table_input->column(0)}; + ExecBatch batch{inputs, 3}; + + ASSERT_OK_AND_ASSIGN(Datum res, ExecuteScalarExpression(expr, batch)); + + AssertDatumsEqual(res, ArrayFromJSON(float64(), + R"([ + 9.5, + 1, + 3.75 + ])")); +} + TEST(Expression, ExecuteDictionaryTransparent) { ExpectExecute( equal(field_ref("a"), field_ref("b")), diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index 66d38ecd64d49..c269de0763217 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -102,6 +102,7 @@ TEST(FunctionOptions, Equality) { #endif options.emplace_back(new PadOptions(5, " ")); options.emplace_back(new PadOptions(10, "A")); + options.emplace_back(new PadOptions(10, "A", false)); options.emplace_back(new TrimOptions(" ")); options.emplace_back(new TrimOptions("abc")); options.emplace_back(new SliceOptions(/*start=*/1)); diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index afb30996eac15..7c7b9c8b68d45 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -81,6 +81,7 @@ add_arrow_benchmark(scalar_boolean_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_cast_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_compare_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_if_else_benchmark PREFIX "arrow-compute") +add_arrow_benchmark(scalar_list_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_random_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_round_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_set_lookup_benchmark PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/kernels/gather_internal.h b/cpp/src/arrow/compute/kernels/gather_internal.h new file mode 100644 index 0000000000000..4c161533a7277 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/gather_internal.h @@ -0,0 +1,306 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/array/data.h" +#include "arrow/util/bit_block_counter.h" +#include "arrow/util/bit_run_reader.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_ops.h" +#include "arrow/util/macros.h" + +// Implementation helpers for kernels that need to load/gather fixed-width +// data from multiple, arbitrary indices. +// +// https://en.wikipedia.org/wiki/Gather/scatter_(vector_addressing) + +namespace arrow::internal { + +// CRTP [1] base class for Gather that provides a gathering loop in terms of +// Write*() methods that must be implemented by the derived class. +// +// [1] https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern +template +class GatherBaseCRTP { + public: + // Output offset is not supported by Gather and idx is supposed to have offset + // pre-applied. idx_validity parameters on functions can use the offset they + // carry to read the validity bitmap as bitmaps can't have pre-applied offsets + // (they might not align to byte boundaries). + + GatherBaseCRTP() = default; + ARROW_DISALLOW_COPY_AND_ASSIGN(GatherBaseCRTP); + ARROW_DEFAULT_MOVE_AND_ASSIGN(GatherBaseCRTP); + + protected: + ARROW_FORCE_INLINE int64_t ExecuteNoNulls(int64_t idx_length) { + auto* self = static_cast(this); + for (int64_t position = 0; position < idx_length; position++) { + self->WriteValue(position); + } + return idx_length; + } + + // See derived Gather classes below for the meaning of the parameters, pre and + // post-conditions. + // + // src_validity is not necessarily the source of the values that are being + // gathered (e.g. the source could be a nested fixed-size list array and the + // values being gathered are from the innermost buffer), so the ArraySpan is + // used solely to check for nulls in the source values and nothing else. + // + // idx_length is the number of elements in idx and consequently the number of + // bits that might be written to out_is_valid. Member `Write*()` functions will be + // called with positions from 0 to idx_length - 1. + // + // If `kOutputIsZeroInitialized` is true, then `WriteZero()` or `WriteZeroSegment()` + // doesn't have to be called for resulting null positions. A position is + // considered null if either the index or the source value is null at that + // position. + template + ARROW_FORCE_INLINE int64_t ExecuteWithNulls(const ArraySpan& src_validity, + int64_t idx_length, const IndexCType* idx, + const ArraySpan& idx_validity, + uint8_t* out_is_valid) { + auto* self = static_cast(this); + OptionalBitBlockCounter indices_bit_counter(idx_validity.buffers[0].data, + idx_validity.offset, idx_length); + int64_t position = 0; + int64_t valid_count = 0; + while (position < idx_length) { + BitBlockCount block = indices_bit_counter.NextBlock(); + if (!src_validity.MayHaveNulls()) { + // Source values are never null, so things are easier + valid_count += block.popcount; + if (block.popcount == block.length) { + // Fastest path: neither source values nor index nulls + bit_util::SetBitsTo(out_is_valid, position, block.length, true); + for (int64_t i = 0; i < block.length; ++i) { + self->WriteValue(position); + ++position; + } + } else if (block.popcount > 0) { + // Slow path: some indices but not all are null + for (int64_t i = 0; i < block.length; ++i) { + ARROW_COMPILER_ASSUME(idx_validity.buffers[0].data != nullptr); + if (idx_validity.IsValid(position)) { + // index is not null + bit_util::SetBit(out_is_valid, position); + self->WriteValue(position); + } else if constexpr (!kOutputIsZeroInitialized) { + self->WriteZero(position); + } + ++position; + } + } else { + self->WriteZeroSegment(position, block.length); + position += block.length; + } + } else { + // Source values may be null, so we must do random access into src_validity + if (block.popcount == block.length) { + // Faster path: indices are not null but source values may be + for (int64_t i = 0; i < block.length; ++i) { + ARROW_COMPILER_ASSUME(src_validity.buffers[0].data != nullptr); + if (src_validity.IsValid(idx[position])) { + // value is not null + self->WriteValue(position); + bit_util::SetBit(out_is_valid, position); + ++valid_count; + } else if constexpr (!kOutputIsZeroInitialized) { + self->WriteZero(position); + } + ++position; + } + } else if (block.popcount > 0) { + // Slow path: some but not all indices are null. Since we are doing + // random access in general we have to check the value nullness one by + // one. + for (int64_t i = 0; i < block.length; ++i) { + ARROW_COMPILER_ASSUME(src_validity.buffers[0].data != nullptr); + ARROW_COMPILER_ASSUME(idx_validity.buffers[0].data != nullptr); + if (idx_validity.IsValid(position) && src_validity.IsValid(idx[position])) { + // index is not null && value is not null + self->WriteValue(position); + bit_util::SetBit(out_is_valid, position); + ++valid_count; + } else if constexpr (!kOutputIsZeroInitialized) { + self->WriteZero(position); + } + ++position; + } + } else { + if constexpr (!kOutputIsZeroInitialized) { + self->WriteZeroSegment(position, block.length); + } + position += block.length; + } + } + } + return valid_count; + } +}; + +// A gather primitive for primitive fixed-width types with a integral byte width. If +// `kWithFactor` is true, the actual width is a runtime multiple of `kValueWidthInbits` +// (this can be useful for fixed-size list inputs and other input types with unusual byte +// widths that don't deserve value specialization). +template +class Gather : public GatherBaseCRTP> { + public: + static_assert(kValueWidthInBits >= 0 && kValueWidthInBits % 8 == 0); + static constexpr int kValueWidth = kValueWidthInBits / 8; + + private: + const int64_t src_length_; // number of elements of kValueWidth bytes in src_ + const uint8_t* src_; + const int64_t idx_length_; // number IndexCType elements in idx_ + const IndexCType* idx_; + uint8_t* out_; + int64_t factor_; + + public: + void WriteValue(int64_t position) { + if constexpr (kWithFactor) { + const int64_t scaled_factor = kValueWidth * factor_; + memcpy(out_ + position * scaled_factor, src_ + idx_[position] * scaled_factor, + scaled_factor); + } else { + memcpy(out_ + position * kValueWidth, src_ + idx_[position] * kValueWidth, + kValueWidth); + } + } + + void WriteZero(int64_t position) { + if constexpr (kWithFactor) { + const int64_t scaled_factor = kValueWidth * factor_; + memset(out_ + position * scaled_factor, 0, scaled_factor); + } else { + memset(out_ + position * kValueWidth, 0, kValueWidth); + } + } + + void WriteZeroSegment(int64_t position, int64_t length) { + if constexpr (kWithFactor) { + const int64_t scaled_factor = kValueWidth * factor_; + memset(out_ + position * scaled_factor, 0, length * scaled_factor); + } else { + memset(out_ + position * kValueWidth, 0, length * kValueWidth); + } + } + + public: + Gather(int64_t src_length, const uint8_t* src, int64_t zero_src_offset, + int64_t idx_length, const IndexCType* idx, uint8_t* out, int64_t factor) + : src_length_(src_length), + src_(src), + idx_length_(idx_length), + idx_(idx), + out_(out), + factor_(factor) { + assert(zero_src_offset == 0); + assert(src && idx && out); + assert((kWithFactor || factor == 1) && + "When kWithFactor is false, the factor is assumed to be 1 at compile time"); + } + + ARROW_FORCE_INLINE int64_t Execute() { return this->ExecuteNoNulls(idx_length_); } + + /// \pre If kOutputIsZeroInitialized, then this->out_ has to be zero initialized. + /// \pre Bits in out_is_valid have to always be zero initialized. + /// \post The bits for the valid elements (and only those) are set in out_is_valid. + /// \post If !kOutputIsZeroInitialized, then positions in this->_out containing null + /// elements have 0s written to them. This might be less efficient than + /// zero-initializing first and calling this->Execute() afterwards. + /// \return The number of valid elements in out. + template + ARROW_FORCE_INLINE int64_t Execute(const ArraySpan& src_validity, + const ArraySpan& idx_validity, + uint8_t* out_is_valid) { + assert(src_length_ == src_validity.length); + assert(idx_length_ == idx_validity.length); + assert(out_is_valid); + return this->template ExecuteWithNulls( + src_validity, idx_length_, idx_, idx_validity, out_is_valid); + } +}; + +// A gather primitive for boolean inputs. Unlike its counterpart above, +// this does not support passing a non-trivial factor parameter. +template +class Gather + : public GatherBaseCRTP> { + private: + const int64_t src_length_; // number of elements of bits bytes in src_ after offset + const uint8_t* src_; // the boolean array data buffer in bits + const int64_t src_offset_; // offset in bits + const int64_t idx_length_; // number IndexCType elements in idx_ + const IndexCType* idx_; + uint8_t* out_; // output boolean array data buffer in bits + + public: + Gather(int64_t src_length, const uint8_t* src, int64_t src_offset, int64_t idx_length, + const IndexCType* idx, uint8_t* out, int64_t factor) + : src_length_(src_length), + src_(src), + src_offset_(src_offset), + idx_length_(idx_length), + idx_(idx), + out_(out) { + assert(src && idx && out); + assert(factor == 1 && + "factor != 1 is not supported when Gather is used to gather bits/booleans"); + } + + void WriteValue(int64_t position) { + bit_util::SetBitTo(out_, position, + bit_util::GetBit(src_, src_offset_ + idx_[position])); + } + + void WriteZero(int64_t position) { bit_util::ClearBit(out_, position); } + + void WriteZeroSegment(int64_t position, int64_t block_length) { + bit_util::SetBitsTo(out_, position, block_length, false); + } + + ARROW_FORCE_INLINE int64_t Execute() { return this->ExecuteNoNulls(idx_length_); } + + /// \pre If kOutputIsZeroInitialized, then this->out_ has to be zero initialized. + /// \pre Bits in out_is_valid have to always be zero initialized. + /// \post The bits for the valid elements (and only those) are set in out_is_valid. + /// \post If !kOutputIsZeroInitialized, then positions in this->_out containing null + /// elements have 0s written to them. This might be less efficient than + /// zero-initializing first and calling this->Execute() afterwards. + /// \return The number of valid elements in out. + template + ARROW_FORCE_INLINE int64_t Execute(const ArraySpan& src_validity, + const ArraySpan& idx_validity, + uint8_t* out_is_valid) { + assert(src_length_ == src_validity.length); + assert(idx_length_ == idx_validity.length); + assert(out_is_valid); + return this->template ExecuteWithNulls( + src_validity, idx_length_, idx_, idx_validity, out_is_valid); + } +}; + +} // namespace arrow::internal diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_extension.cc b/cpp/src/arrow/compute/kernels/scalar_cast_extension.cc index c32a6ef6de93e..2a54d28c6fb64 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_extension.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_extension.cc @@ -56,8 +56,9 @@ Status CastToExtension(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou std::shared_ptr GetCastToExtension(std::string name) { auto func = std::make_shared(std::move(name), Type::EXTENSION); for (Type::type in_ty : AllTypeIds()) { - DCHECK_OK( - func->AddKernel(in_ty, {InputType(in_ty)}, kOutputTargetType, CastToExtension)); + DCHECK_OK(func->AddKernel(in_ty, {InputType(in_ty)}, kOutputTargetType, + CastToExtension, NullHandling::COMPUTED_NO_PREALLOCATE, + MemAllocation::NO_PREALLOCATE)); } return func; } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index a6d7f6097b59b..f60d8f2e19e98 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2281,7 +2281,7 @@ TEST(Cast, ListToPrimitive) { Cast(*ArrayFromJSON(list(binary()), R"([["1", "2"], ["3", "4"]])"), utf8())); } -using make_list_t = std::shared_ptr(const std::shared_ptr&); +using make_list_t = std::shared_ptr(std::shared_ptr); static const auto list_factories = std::vector{&list, &large_list}; diff --git a/cpp/src/arrow/compute/kernels/scalar_list_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_list_benchmark.cc new file mode 100644 index 0000000000000..8c5b43d55f756 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/scalar_list_benchmark.cc @@ -0,0 +1,153 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/compute/api_scalar.h" +#include "arrow/compute/exec.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/util/benchmark_util.h" +#include "benchmark/benchmark.h" + +namespace arrow::compute { + +constexpr auto kSeed = 0x94378165; + +const auto kSliceStart = 2; +const auto kSliceStop = 10; + +static void BenchmarkListSlice(benchmark::State& state, const ListSliceOptions& opts, + std::shared_ptr list_ty) { + RegressionArgs args(state, /*size_is_bytes=*/false); + auto rand = random::RandomArrayGenerator(kSeed); + auto array = rand.ArrayOf(std::move(list_ty), args.size, args.null_proportion); + auto ctx = default_exec_context(); + std::vector input_args = {std::move(array)}; + for (auto _ : state) { + ABORT_NOT_OK(CallFunction("list_slice", input_args, &opts, ctx).status()); + } +} + +template +static void ListSliceInt64List(benchmark::State& state) { + ListSliceOptions opts; + opts.start = kSliceStart; + BenchmarkListSlice(state, opts, std::make_shared(int64())); +} + +template +static void ListSliceStringList(benchmark::State& state) { + ListSliceOptions opts; + opts.start = kSliceStart; + BenchmarkListSlice(state, opts, std::make_shared(utf8())); +} + +template +static void ListSliceInt64ListWithStop(benchmark::State& state) { + ListSliceOptions opts; + opts.start = kSliceStart; + opts.stop = kSliceStop; + BenchmarkListSlice(state, opts, std::make_shared(int64())); +} + +template +static void ListSliceStringListWithStop(benchmark::State& state) { + ListSliceOptions opts; + opts.start = kSliceStart; + opts.stop = kSliceStop; + BenchmarkListSlice(state, opts, std::make_shared(utf8())); +} + +template +static void ListSliceInt64ListWithStepAndStop(benchmark::State& state) { + ListSliceOptions opts; + opts.start = kSliceStart; + opts.step = 2; + opts.stop = kSliceStop; + BenchmarkListSlice(state, opts, std::make_shared(int64())); +} + +template +static void ListSliceStringListWithStepAndStop(benchmark::State& state) { + ListSliceOptions opts; + opts.start = kSliceStart; + opts.step = 2; + opts.stop = kSliceStop; + BenchmarkListSlice(state, opts, std::make_shared(utf8())); +} + +static void ListSliceInt64ListView(benchmark::State& state) { + ListSliceInt64List(state); +} + +static void ListSliceStringListView(benchmark::State& state) { + ListSliceStringList(state); +} + +static void ListSliceInt64ListViewWithStop(benchmark::State& state) { + ListSliceInt64ListWithStop(state); +} + +static void ListSliceStringListViewWithStop(benchmark::State& state) { + ListSliceStringListWithStop(state); +} + +static void ListSliceInt64ListViewWithStepAndStop(benchmark::State& state) { + ListSliceInt64ListWithStepAndStop(state); +} + +static void ListSliceStringListViewWithStepAndStop(benchmark::State& state) { + ListSliceStringListWithStepAndStop(state); +} + +static void ListSliceInt64ListToFSL(benchmark::State& state) { + ListSliceOptions opts; + opts.start = kSliceStart; + opts.stop = kSliceStop; + opts.return_fixed_size_list = true; + BenchmarkListSlice(state, opts, std::make_shared(int64())); +} + +static void ListSliceStringListToFSL(benchmark::State& state) { + ListSliceOptions opts; + opts.start = kSliceStart; + opts.stop = kSliceStop; + opts.return_fixed_size_list = true; + BenchmarkListSlice(state, opts, std::make_shared(utf8())); +} + +BENCHMARK(ListSliceInt64List)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceStringList)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceInt64ListWithStop)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceStringListWithStop)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceInt64ListWithStepAndStop)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceStringListWithStepAndStop)->Apply(RegressionSetArgs); + +BENCHMARK(ListSliceInt64ListView)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceStringListView)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceInt64ListViewWithStop)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceStringListViewWithStop)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceInt64ListViewWithStepAndStop)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceStringListViewWithStepAndStop)->Apply(RegressionSetArgs); + +BENCHMARK(ListSliceInt64ListToFSL)->Apply(RegressionSetArgs); +BENCHMARK(ListSliceStringListToFSL)->Apply(RegressionSetArgs); + +} // namespace arrow::compute diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index b99f065a0b158..71e367153d9c7 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -28,6 +28,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_generate.h" #include "arrow/util/string.h" +#include "arrow/util/unreachable.h" namespace arrow { @@ -130,14 +131,56 @@ std::string ToString(const std::optional& o) { return o.has_value() ? ToChars(*o) : "(nullopt)"; } -template +/// \param stop User-provided stop or the length of the input list +int64_t ListSliceLength(int64_t start, int64_t step, int64_t stop) { + DCHECK_GE(step, 1); + const auto size = std::max(stop - start, 0); + return bit_util::CeilDiv(size, step); +} + +std::optional EffectiveSliceStop(const ListSliceOptions& opts, + const BaseListType& input_type) { + if (!opts.stop.has_value() && input_type.id() == Type::FIXED_SIZE_LIST) { + return checked_cast(input_type).list_size(); + } + return opts.stop; +} + +Result ListSliceOutputType(const ListSliceOptions& opts, + const BaseListType& input_list_type) { + const auto& value_type = input_list_type.field(0); + const bool is_fixed_size_list = input_list_type.id() == Type::FIXED_SIZE_LIST; + const auto return_fixed_size_list = + opts.return_fixed_size_list.value_or(is_fixed_size_list); + if (return_fixed_size_list) { + auto stop = EffectiveSliceStop(opts, input_list_type); + if (!stop.has_value()) { + return Status::Invalid( + "Unable to produce FixedSizeListArray from non-FixedSizeListArray without " + "`stop` being set."); + } + if (opts.step < 1) { + return Status::Invalid("`step` must be >= 1, got: ", opts.step); + } + const auto length = ListSliceLength(opts.start, opts.step, *stop); + return fixed_size_list(value_type, static_cast(length)); + } + if (is_fixed_size_list) { + return list(value_type); + } + return TypeHolder{&input_list_type}; +} + +template struct ListSlice { - using offset_type = typename Type::offset_type; + using offset_type = typename InListType::offset_type; static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - const auto opts = OptionsWrapper::Get(ctx); + const auto& opts = OptionsWrapper::Get(ctx); + const ArraySpan& list_array = batch[0].array; + const auto* list_type = checked_cast(list_array.type); - // Invariants + // Pre-conditions if (opts.start < 0 || (opts.stop.has_value() && opts.start >= opts.stop.value())) { // TODO(ARROW-18281): support start == stop which should give empty lists return Status::Invalid("`start`(", opts.start, @@ -148,128 +191,201 @@ struct ListSlice { return Status::Invalid("`step` must be >= 1, got: ", opts.step); } - const ArraySpan& list_array = batch[0].array; - const Type* list_type = checked_cast(list_array.type); - const auto value_type = list_type->field(0); - const auto return_fixed_size_list = opts.return_fixed_size_list.value_or( - list_type->id() == arrow::Type::FIXED_SIZE_LIST); - std::unique_ptr builder; - - // should have been checked in resolver - // if stop not set, then cannot return fixed size list without input being fixed size - // list b/c we cannot determine the max list element in type resolving. - DCHECK(opts.stop.has_value() || - (!opts.stop.has_value() && (!return_fixed_size_list || - list_type->id() == arrow::Type::FIXED_SIZE_LIST))); - - // construct array values - if (return_fixed_size_list) { - int32_t stop; - if (opts.stop.has_value()) { - stop = static_cast(opts.stop.value()); - } else { - DCHECK_EQ(list_type->id(), arrow::Type::FIXED_SIZE_LIST); - stop = reinterpret_cast(list_type)->list_size(); - } - const auto size = std::max(stop - static_cast(opts.start), 0); - const auto length = bit_util::CeilDiv(size, opts.step); - RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), - fixed_size_list(value_type, static_cast(length)), - &builder)); - RETURN_NOT_OK(BuildArray(batch, opts, *builder)); - } else { - if constexpr (std::is_same_v) { - RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), large_list(value_type), &builder)); - RETURN_NOT_OK(BuildArray(batch, opts, *builder)); - } else { - RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), list(value_type), &builder)); - RETURN_NOT_OK(BuildArray(batch, opts, *builder)); - } + auto* pool = ctx->memory_pool(); + ARROW_ASSIGN_OR_RAISE(auto output_type_holder, ListSliceOutputType(opts, *list_type)); + constexpr auto kInputTypeId = InListType::type_id; + auto output_type = output_type_holder.GetSharedPtr(); + switch (output_type->id()) { + // The various `if constexpr` guards below avoid generating + // ListSlice::BuildArray specializations + // that will never be invoked at runtime. + case Type::LIST: + DCHECK(kInputTypeId == Type::LIST || kInputTypeId == Type::FIXED_SIZE_LIST); + if constexpr (kInputTypeId == Type::LIST || + kInputTypeId == Type::FIXED_SIZE_LIST) { + return BuildArray(pool, opts, batch, output_type, out); + } + break; + case Type::LARGE_LIST: + DCHECK_EQ(kInputTypeId, Type::LARGE_LIST); + if constexpr (kInputTypeId == Type::LARGE_LIST) { + return BuildArray(pool, opts, batch, output_type, out); + } + break; + case Type::FIXED_SIZE_LIST: + // A fixed-size list can be produced from any list-like input + // if ListSliceOptions::return_fixed_size_list is set to true + return BuildArray(pool, opts, batch, output_type, out); + case Type::LIST_VIEW: + DCHECK_EQ(kInputTypeId, Type::LIST_VIEW); + if constexpr (kInputTypeId == Type::LIST_VIEW) { + return BuildArray(pool, opts, batch, output_type, out); + } + break; + case Type::LARGE_LIST_VIEW: + DCHECK_EQ(kInputTypeId, Type::LARGE_LIST_VIEW); + if constexpr (kInputTypeId == Type::LARGE_LIST_VIEW) { + return BuildArray(pool, opts, batch, output_type, out); + } + break; + default: + break; } - - // build output arrays and set result - ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish()); - out->value = std::move(result->data()); + Unreachable(); return Status::OK(); } + /// \brief Builds the array of list slices from the input list array template - static Status BuildArray(const ExecSpan& batch, const ListSliceOptions& opts, - ArrayBuilder& builder) { - if constexpr (std::is_same_v) { - RETURN_NOT_OK(BuildArrayFromFixedSizeListType(batch, opts, builder)); + static Status BuildArray(MemoryPool* pool, const ListSliceOptions& opts, + const ExecSpan& batch, + const std::shared_ptr& output_type, + ExecResult* out) { + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(pool, output_type, &builder)); + auto* list_builder = checked_cast(builder.get()); + RETURN_NOT_OK(list_builder->Resize(batch[0].array.length)); + if constexpr (std::is_same_v) { + RETURN_NOT_OK(BuildArrayFromFixedSizeListType(opts.start, opts.step, opts.stop, + batch, list_builder)); } else { - RETURN_NOT_OK(BuildArrayFromListType(batch, opts, builder)); + RETURN_NOT_OK(BuildArrayFromVarLenListLikeType(opts.start, opts.step, opts.stop, + batch, list_builder)); } + std::shared_ptr result; + RETURN_NOT_OK(list_builder->FinishInternal(&result)); + out->value = std::move(result); return Status::OK(); } template - static Status BuildArrayFromFixedSizeListType(const ExecSpan& batch, - const ListSliceOptions& opts, - ArrayBuilder& builder) { - const auto list_size = - checked_cast(*batch[0].type()).list_size(); + static Status BuildArrayFromFixedSizeListType(int64_t start, int64_t step, + std::optional stop, + const ExecSpan& batch, + BuilderType* out_list_builder) { + static_assert(std::is_same_v); + constexpr bool kIsFixedSizeOutput = std::is_same_v; + const auto& fsl_type = checked_cast(*batch[0].type()); const ArraySpan& list_array = batch[0].array; - const ArraySpan& list_values = list_array.child_data[0]; - - auto list_builder = checked_cast(&builder); - for (auto i = 0; i < list_array.length; ++i) { - auto offset = (i + list_array.offset) * list_size; - auto next_offset = offset + list_size; - if (list_array.IsNull(i)) { - RETURN_NOT_OK(list_builder->AppendNull()); + const ArraySpan& values_array = list_array.child_data[0]; + ArrayBuilder* value_builder = out_list_builder->value_builder(); + + auto* is_valid = list_array.GetValues(0, 0); + const auto list_size = static_cast(fsl_type.list_size()); + const int64_t effective_stop = stop.value_or(list_size); + int64_t slice_length, value_count; + int64_t null_padding = 0; + if constexpr (kIsFixedSizeOutput) { + if (list_size < effective_stop) { + slice_length = ListSliceLength(start, step, effective_stop); + value_count = ListSliceLength(start, step, list_size); + DCHECK_LE(value_count, slice_length); + null_padding = slice_length - value_count; } else { - RETURN_NOT_OK(SetValues(list_builder, offset, next_offset, &opts, - &list_values)); + slice_length = ListSliceLength(start, step, effective_stop); + value_count = slice_length; } + } else { + slice_length = ListSliceLength(start, step, std::min(list_size, effective_stop)); + value_count = slice_length; + } + int64_t offset = list_array.offset * list_size; + for (int64_t i = 0; i < list_array.length; ++i) { + if (is_valid && !bit_util::GetBit(is_valid, list_array.offset + i)) { + RETURN_NOT_OK(out_list_builder->AppendNull()); + } else { + int64_t start_offset = offset + start; + RETURN_NOT_OK(AppendListSliceDimensions(slice_length, + out_list_builder)); + RETURN_NOT_OK(AppendListSliceValues(start_offset, step, value_count, null_padding, + values_array, value_builder)); + } + offset += list_size; } return Status::OK(); } template - static Status BuildArrayFromListType(const ExecSpan& batch, - const ListSliceOptions& opts, - ArrayBuilder& builder) { + static Status BuildArrayFromVarLenListLikeType(int64_t start, int64_t step, + std::optional stop, + const ExecSpan& batch, + BuilderType* out_list_builder) { + constexpr bool kIsListViewInput = is_list_view(InListType::type_id); + constexpr bool kIsFixedSizeOutput = std::is_same_v; const ArraySpan& list_array = batch[0].array; - const offset_type* offsets = list_array.GetValues(1); - - const ArraySpan& list_values = list_array.child_data[0]; - - auto list_builder = checked_cast(&builder); - for (auto i = 0; i < list_array.length; ++i) { + const ArraySpan& values_array = list_array.child_data[0]; + ArrayBuilder* value_builder = out_list_builder->value_builder(); + + const auto* is_valid = list_array.GetValues(0, 0); + const auto* offsets = list_array.GetValues(1); + const offset_type* sizes = nullptr; + if constexpr (kIsListViewInput) { + sizes = list_array.GetValues(2); + } + for (int64_t i = 0; i < list_array.length; ++i) { const offset_type offset = offsets[i]; - const offset_type next_offset = offsets[i + 1]; - if (list_array.IsNull(i)) { - RETURN_NOT_OK(list_builder->AppendNull()); + const int64_t list_size = kIsListViewInput ? sizes[i] : offsets[i + 1] - offset; + if (is_valid && !bit_util::GetBit(is_valid, list_array.offset + i)) { + RETURN_NOT_OK(out_list_builder->AppendNull()); } else { - RETURN_NOT_OK(SetValues(list_builder, offset, next_offset, &opts, - &list_values)); + int64_t effective_stop = stop.value_or(list_size); + int64_t slice_length, value_count; + int64_t null_padding = 0; + if constexpr (kIsFixedSizeOutput) { + if (list_size < effective_stop) { + slice_length = ListSliceLength(start, step, effective_stop); + value_count = ListSliceLength(start, step, list_size); + DCHECK_LE(value_count, slice_length); + null_padding = slice_length - value_count; + } else { + slice_length = ListSliceLength(start, step, effective_stop); + value_count = slice_length; + } + } else { + slice_length = + ListSliceLength(start, step, std::min(list_size, effective_stop)); + value_count = slice_length; + } + RETURN_NOT_OK(AppendListSliceDimensions(slice_length, + out_list_builder)); + RETURN_NOT_OK(AppendListSliceValues(offset + start, step, value_count, + null_padding, values_array, value_builder)); } } return Status::OK(); } - template - static Status SetValues(BuilderType* list_builder, const offset_type offset, - const offset_type next_offset, const ListSliceOptions* opts, - const ArraySpan* list_values) { - auto value_builder = list_builder->value_builder(); - auto cursor = offset; - - RETURN_NOT_OK(list_builder->Append()); - const auto size = opts->stop.has_value() ? (opts->stop.value() - opts->start) - : ((next_offset - opts->start) - offset); - while (cursor < offset + size) { - if (cursor + opts->start >= next_offset) { - if constexpr (!std::is_same_v) { - break; // don't pad nulls for variable sized list output - } - RETURN_NOT_OK(value_builder->AppendNull()); - } else { + + template + static Status AppendListSliceDimensions(int64_t slice_length, + BuilderType* out_list_builder) { + if constexpr (kIsFixedSizeOutput) { + DCHECK_EQ(out_list_builder->type()->id(), Type::FIXED_SIZE_LIST); + return out_list_builder->Append(); + } else { + return out_list_builder->Append(/*is_valid=*/true, slice_length); + } + } + + /// \param value_count The pre-validated number of values to append starting + /// from `start_offset` with a step of `step` + /// \param null_padding The number of nulls to append after the values + static Status AppendListSliceValues(int64_t start_offset, int64_t step, + int64_t value_count, int64_t null_padding, + const ArraySpan& values_array, + ArrayBuilder* out_value_builder) { + if (step == 1) { + RETURN_NOT_OK( + out_value_builder->AppendArraySlice(values_array, start_offset, value_count)); + } else { + auto cursor_offset = start_offset; + for (int64_t i = 0; i < value_count; i++) { RETURN_NOT_OK( - value_builder->AppendArraySlice(*list_values, cursor + opts->start, 1)); + out_value_builder->AppendArraySlice(values_array, cursor_offset, 1)); + cursor_offset += step; } - cursor += static_cast(opts->step); + } + if (null_padding > 0) { + RETURN_NOT_OK(out_value_builder->AppendNulls(null_padding)); } return Status::OK(); } @@ -278,36 +394,8 @@ struct ListSlice { Result MakeListSliceResolve(KernelContext* ctx, const std::vector& types) { const auto& opts = OptionsWrapper::Get(ctx); - const auto list_type = checked_cast(types[0].type); - const auto value_type = list_type->field(0); - const auto return_fixed_size_list = - opts.return_fixed_size_list.value_or(list_type->id() == Type::FIXED_SIZE_LIST); - if (return_fixed_size_list) { - int32_t stop; - if (!opts.stop.has_value()) { - if (list_type->id() == Type::FIXED_SIZE_LIST) { - stop = checked_cast(list_type)->list_size(); - } else { - return Status::NotImplemented( - "Unable to produce FixedSizeListArray from non-FixedSizeListArray without " - "`stop` being set."); - } - } else { - stop = static_cast(opts.stop.value()); - } - const auto size = std::max(static_cast(stop - opts.start), 0); - if (opts.step < 1) { - return Status::Invalid("`step` must be >= 1, got: ", opts.step); - } - const auto length = bit_util::CeilDiv(size, opts.step); - return fixed_size_list(value_type, static_cast(length)); - } else { - // Returning large list if that's what we got in and didn't ask for fixed size - if (list_type->id() == Type::LARGE_LIST) { - return large_list(value_type); - } - return list(value_type); - } + const auto* list_type = checked_cast(types[0].type); + return ListSliceOutputType(opts, *list_type); } template @@ -325,6 +413,8 @@ void AddListSliceKernels(ScalarFunction* func) { AddListSliceKernels(func); AddListSliceKernels(func); AddListSliceKernels(func); + AddListSliceKernels(func); + AddListSliceKernels(func); } const FunctionDoc list_slice_doc( diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index 32bea8246954d..b6a6cac1b4382 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -128,33 +128,45 @@ TEST(TestScalarNested, ListElementInvalid) { Raises(StatusCode::Invalid)); } +using VarLenListLikeTypeFactory = + std::shared_ptr (*)(std::shared_ptr); +static const VarLenListLikeTypeFactory kVarLenListTypeFactories[] = { + list, + large_list, + list_view, + large_list_view, +}; + TEST(TestScalarNested, ListSliceVariableOutput) { const auto value_types = {float32(), int32()}; for (auto value_type : value_types) { - auto input = ArrayFromJSON(list(value_type), "[[1, 2, 3], [4, 5], [6], null]"); - ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1, - /*return_fixed_size_list=*/false); - auto expected = ArrayFromJSON(list(value_type), "[[1, 2], [4, 5], [6], null]"); - CheckScalarUnary("list_slice", input, expected, &args); + for (auto list_type_factory : kVarLenListTypeFactories) { + ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1, + /*return_fixed_size_list=*/false); + auto list_ty = list_type_factory(value_type); + auto input = ArrayFromJSON(list_ty, "[[1, 2, 3], [4, 5], [6], null]"); + auto expected = ArrayFromJSON(list_ty, "[[1, 2], [4, 5], [6], null]"); + CheckScalarUnary("list_slice", input, expected, &args); - args.start = 1; - expected = ArrayFromJSON(list(value_type), "[[2], [5], [], null]"); - CheckScalarUnary("list_slice", input, expected, &args); + args.start = 1; + expected = ArrayFromJSON(list_ty, "[[2], [5], [], null]"); + CheckScalarUnary("list_slice", input, expected, &args); - args.start = 2; - args.stop = 4; - expected = ArrayFromJSON(list(value_type), "[[3], [], [], null]"); - CheckScalarUnary("list_slice", input, expected, &args); + args.start = 2; + args.stop = 4; + expected = ArrayFromJSON(list_ty, "[[3], [], [], null]"); + CheckScalarUnary("list_slice", input, expected, &args); - args.start = 1; - args.stop = std::nullopt; - expected = ArrayFromJSON(list(value_type), "[[2, 3], [5], [], null]"); - CheckScalarUnary("list_slice", input, expected, &args); + args.start = 1; + args.stop = std::nullopt; + expected = ArrayFromJSON(list_ty, "[[2, 3], [5], [], null]"); + CheckScalarUnary("list_slice", input, expected, &args); - args.start = 0; - args.stop = 4; - args.step = 2; - expected = ArrayFromJSON(list(value_type), "[[1, 3], [4], [6], null]"); + args.start = 0; + args.stop = 4; + args.step = 2; + expected = ArrayFromJSON(list_ty, "[[1, 3], [4], [6], null]"); + } } // Verify passing `return_fixed_size_list=false` with fixed size input @@ -169,9 +181,13 @@ TEST(TestScalarNested, ListSliceVariableOutput) { TEST(TestScalarNested, ListSliceFixedOutput) { const auto value_types = {float32(), int32()}; for (auto value_type : value_types) { - auto inputs = {ArrayFromJSON(list(value_type), "[[1, 2, 3], [4, 5], [6], null]"), - ArrayFromJSON(fixed_size_list(value_type, 3), - "[[1, 2, 3], [4, 5, null], [6, null, null], null]")}; + const char* kVarLenListJSON = "[[1, 2, 3], [4, 5], [6], null]"; + const char* kFixedSizeListJSON = "[[1, 2, 3], [4, 5, null], [6, null, null], null]"; + std::vector> inputs; + for (auto list_type_factory : kVarLenListTypeFactories) { + inputs.push_back(ArrayFromJSON(list_type_factory(value_type), kVarLenListJSON)); + } + inputs.push_back(ArrayFromJSON(fixed_size_list(value_type, 3), kFixedSizeListJSON)); for (auto input : inputs) { ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1, /*return_fixed_size_list=*/true); @@ -198,7 +214,7 @@ TEST(TestScalarNested, ListSliceFixedOutput) { CheckScalarUnary("list_slice", input, expected, &args); } else { EXPECT_RAISES_WITH_MESSAGE_THAT( - NotImplemented, + Invalid, ::testing::HasSubstr("Unable to produce FixedSizeListArray from " "non-FixedSizeListArray without `stop` being set."), CallFunction("list_slice", {input}, &args)); @@ -264,22 +280,25 @@ TEST(TestScalarNested, ListSliceChildArrayOffset) { ASSERT_EQ(input->offset(), 0); ASSERT_EQ(input->values()->offset(), 2); - ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1, + ListSliceOptions args(/*start=*/0, /*stop=*/3, /*step=*/1, /*return_fixed_size_list=*/false); auto expected = ArrayFromJSON(list(int8()), "[[2], [3, 4]]"); CheckScalarUnary("list_slice", input, expected, &args); args.return_fixed_size_list = true; - expected = ArrayFromJSON(fixed_size_list(int8(), 2), "[[2, null], [3, 4]]"); + expected = ArrayFromJSON(fixed_size_list(int8(), 3), "[[2, null, null], [3, 4, null]]"); CheckScalarUnary("list_slice", input, expected, &args); } TEST(TestScalarNested, ListSliceOutputEqualsInputType) { + const char* kVarLenListJSON = "[[1, 2, 3], [4, 5], [6, null], null]"; + const char* kFixedLenListJSON = "[[1, 2], [4, 5], [6, null], null]"; // Default is to return same type as the one passed in. - auto inputs = { - ArrayFromJSON(list(int8()), "[[1, 2, 3], [4, 5], [6, null], null]"), - ArrayFromJSON(large_list(int8()), "[[1, 2, 3], [4, 5], [6, null], null]"), - ArrayFromJSON(fixed_size_list(int8(), 2), "[[1, 2], [4, 5], [6, null], null]")}; + std::vector> inputs; + for (auto list_type_factory : kVarLenListTypeFactories) { + inputs.push_back(ArrayFromJSON(list_type_factory(int8()), kVarLenListJSON)); + } + inputs.push_back(ArrayFromJSON(fixed_size_list(int8(), 2), kFixedLenListJSON)); for (auto input : inputs) { ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1); auto expected = ArrayFromJSON(input->type(), "[[1, 2], [4, 5], [6, null], null]"); @@ -316,10 +335,9 @@ TEST(TestScalarNested, ListSliceBadParameters) { // stop not set and FixedSizeList requested with variable sized input args.stop = std::nullopt; EXPECT_RAISES_WITH_MESSAGE_THAT( - NotImplemented, - ::testing::HasSubstr("NotImplemented: Unable to produce FixedSizeListArray from " - "non-FixedSizeListArray without " - "`stop` being set."), + Invalid, + ::testing::HasSubstr("Invalid: Unable to produce FixedSizeListArray from " + "non-FixedSizeListArray without `stop` being set."), CallFunction("list_slice", {input}, &args)); // Catch step must be >= 1 args.start = 0; diff --git a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc index 762b666c6a148..fecd57412b436 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc @@ -1142,9 +1142,13 @@ struct AsciiPadTransform : public StringTransformBase { int64_t left = 0; int64_t right = 0; if (PadLeft && PadRight) { - // If odd number of spaces, put the extra space on the right - left = spaces / 2; - right = spaces - left; + if (options_.lean_left_on_odd_padding) { + left = spaces / 2; + right = spaces - left; + } else { + right = spaces / 2; + left = spaces - right; + } } else if (PadLeft) { left = spaces; } else if (PadRight) { diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index c7dbdef2436c3..0a2261290846a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -2117,6 +2117,12 @@ TYPED_TEST(TestStringKernels, PadUTF8) { R"([null, "a\u2008\u2008\u2008\u2008", "bb\u2008\u2008\u2008", "b\u00E1r\u2008\u2008", "foobar"])", &options); + PadOptions options2{/*width=*/5, "\xe2\x80\x88", /*lean_left_on_odd_padding=*/false}; + this->CheckUnary( + "utf8_center", R"([null, "a", "bb", "b\u00E1r", "foobar"])", this->type(), + R"([null, "\u2008\u2008a\u2008\u2008", "\u2008\u2008bb\u2008", "\u2008b\u00E1r\u2008", "foobar"])", + &options2); + PadOptions options_bad{/*width=*/3, /*padding=*/"spam"}; auto input = ArrayFromJSON(this->type(), R"(["foo"])"); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, @@ -2459,6 +2465,10 @@ TYPED_TEST(TestStringKernels, PadAscii) { this->CheckUnary("ascii_rpad", R"([null, "a", "bb", "bar", "foobar"])", this->type(), R"([null, "a ", "bb ", "bar ", "foobar"])", &options); + PadOptions options2{/*width=*/5, " ", /*lean_left_on_odd_padding=*/false}; + this->CheckUnary("ascii_center", R"([null, "a", "bb", "bar", "foobar"])", this->type(), + R"([null, " a ", " bb ", " bar ", "foobar"])", &options2); + PadOptions options_bad{/*width=*/3, /*padding=*/"spam"}; auto input = ArrayFromJSON(this->type(), R"(["foo"])"); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, diff --git a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc index d720d4eee804f..42762ca8b116f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc @@ -930,9 +930,13 @@ struct Utf8PadTransform : public StringTransformBase { int64_t left = 0; int64_t right = 0; if (PadLeft && PadRight) { - // If odd number of spaces, put the extra space on the right - left = spaces / 2; - right = spaces - left; + if (options_.lean_left_on_odd_padding) { + left = spaces / 2; + right = spaces - left; + } else { + right = spaces / 2; + left = spaces - right; + } } else if (PadLeft) { left = spaces; } else if (PadRight) { diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 44bb7372c3f68..5067298858132 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -698,13 +698,12 @@ void AddHashKernels(VectorFunction* func, VectorKernel base, OutputType out_ty) DCHECK_OK(func->AddKernel(base)); } - // Example parametric types that we want to match only on Type::type - auto parametric_types = {time32(TimeUnit::SECOND), time64(TimeUnit::MICRO), - timestamp(TimeUnit::SECOND), duration(TimeUnit::SECOND), - fixed_size_binary(0)}; - for (const auto& ty : parametric_types) { - base.init = GetHashInit(ty->id()); - base.signature = KernelSignature::Make({ty->id()}, out_ty); + // Parametric types that we want matching to be dependent only on type id + auto parametric_types = {Type::TIME32, Type::TIME64, Type::TIMESTAMP, Type::DURATION, + Type::FIXED_SIZE_BINARY}; + for (const auto& type_id : parametric_types) { + base.init = GetHashInit(type_id); + base.signature = KernelSignature::Make({type_id}, out_ty); DCHECK_OK(func->AddKernel(base)); } diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index 8c77c261c6a98..955f9b8cbd14c 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -21,9 +21,16 @@ #include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" +#include "arrow/util/bit_run_reader.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/list_util.h" #include "arrow/visit_type_inline.h" namespace arrow { + +using internal::CountSetBits; +using list_util::internal::RangeOfValuesUsed; + namespace compute { namespace internal { namespace { @@ -76,6 +83,63 @@ struct ListParentIndicesArray { Status Visit(const LargeListType& type) { return VisitList(type); } + template + Status VisitListView(const Type&) { + ArraySpan list_view{*input}; + + const offset_type* offsets = list_view.GetValues(1); + const offset_type* sizes = list_view.GetValues(2); + int64_t values_offset; + int64_t values_length; + ARROW_ASSIGN_OR_RAISE(std::tie(values_offset, values_length), + RangeOfValuesUsed(list_view)); + + ARROW_ASSIGN_OR_RAISE(auto indices_validity, + AllocateEmptyBitmap(values_length, ctx->memory_pool())); + auto* out_indices_validity = indices_validity->mutable_data(); + int64_t total_pop_count = 0; + + ARROW_ASSIGN_OR_RAISE(auto indices, ctx->Allocate(values_length * sizeof(int64_t))); + auto* out_indices = indices->template mutable_data_as(); + memset(out_indices, -1, values_length * sizeof(int64_t)); + + const auto* validity = list_view.GetValues(0, 0); + RETURN_NOT_OK(arrow::internal::VisitSetBitRuns( + validity, list_view.offset, list_view.length, + [this, offsets, sizes, out_indices, out_indices_validity, values_offset, + &total_pop_count](int64_t run_start, int64_t run_length) { + for (int64_t i = run_start; i < run_start + run_length; ++i) { + auto validity_offset = offsets[i] - values_offset; + const int64_t pop_count = + CountSetBits(out_indices_validity, validity_offset, sizes[i]); + if (ARROW_PREDICT_FALSE(pop_count > 0)) { + return Status::Invalid( + "Function 'list_parent_indices' cannot produce parent indices for " + "values used by more than one list-view array element."); + } + bit_util::SetBitmap(out_indices_validity, validity_offset, sizes[i]); + total_pop_count += sizes[i]; + for (auto j = static_cast(offsets[i]); + j < static_cast(offsets[i]) + sizes[i]; ++j) { + out_indices[j - values_offset] = i + base_output_offset; + } + } + return Status::OK(); + })); + + DCHECK_LE(total_pop_count, values_length); + const int64_t null_count = values_length - total_pop_count; + BufferVector buffers{null_count > 0 ? std::move(indices_validity) : nullptr, + std::move(indices)}; + out = std::make_shared(int64(), values_length, std::move(buffers), + null_count); + return Status::OK(); + } + + Status Visit(const ListViewType& type) { return VisitListView(type); } + + Status Visit(const LargeListViewType& type) { return VisitListView(type); } + Status Visit(const FixedSizeListType& type) { using offset_type = typename FixedSizeListType::offset_type; const offset_type slot_length = type.list_size(); @@ -125,7 +189,7 @@ const FunctionDoc list_flatten_doc( const FunctionDoc list_parent_indices_doc( "Compute parent indices of nested list values", - ("`lists` must have a list-like type.\n" + ("`lists` must have a list-like or list-view type.\n" "For each value in each list of `lists`, the top-level list index\n" "is emitted."), {"lists"}); @@ -147,6 +211,7 @@ class ListParentIndicesFunction : public MetaFunction { int64_t base_output_offset = 0; ArrayVector out_chunks; + out_chunks.reserve(input->num_chunks()); for (const auto& chunk : input->chunks()) { ARROW_ASSIGN_OR_RAISE(auto out_chunk, ListParentIndicesArray::Exec(&kernel_ctx, chunk->data(), diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc index 56604ebd16cc0..da751fa5de403 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc @@ -183,8 +183,50 @@ TEST(TestVectorNested, ListFlattenFixedSizeListRecursively) { CheckVectorUnary("list_flatten", input, expected, &opts); } +template +void SwapListView(ArrayData* array, int64_t i, int64_t j) { + ASSERT_TRUE(is_list_view(array->type->id())); + ASSERT_EQ(array->type->id(), T::type_id); + ASSERT_LT(i, array->length); + ASSERT_LT(j, array->length); + auto* validity = array->GetMutableValues(0); + if (validity) { + const bool is_valid_i = bit_util::GetBit(validity, array->offset + i); + const bool is_valid_j = bit_util::GetBit(validity, array->offset + j); + if (is_valid_i ^ is_valid_j) { + bit_util::SetBitTo(validity, array->offset + i, is_valid_j); + bit_util::SetBitTo(validity, array->offset + j, is_valid_i); + } + } + auto* offsets = array->GetMutableValues(1); + auto* sizes = array->GetMutableValues(2); + std::swap(offsets[i], offsets[j]); + std::swap(sizes[i], sizes[j]); +} + +template +void SetListView(ArrayData* array, int64_t i, offset_type offset, offset_type size) { + ASSERT_TRUE(is_list_view(array->type->id())); + ASSERT_EQ(array->type->id(), T::type_id); + ASSERT_LT(i, array->length); + auto* validity = array->GetMutableValues(0); + if (validity) { + bit_util::SetBit(validity, array->offset + i); + } + auto* offsets = array->GetMutableValues(1); + auto* sizes = array->GetMutableValues(2); + offsets[i] = offset; + sizes[i] = size; +} + TEST(TestVectorNested, ListParentIndices) { - for (auto ty : {list(int16()), large_list(int16())}) { + const auto types = { + list(int16()), + large_list(int16()), + list_view(int16()), + large_list_view(int16()), + }; + for (auto ty : types) { auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]"); auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 2, 2, 4, 4]"); @@ -196,10 +238,47 @@ TEST(TestVectorNested, ListParentIndices) { auto tweaked = TweakValidityBit(input, 1, false); auto expected = ArrayFromJSON(int64(), "[0, 0, 0, 1, 1, 2, 2, 4, 4]"); CheckVectorUnary("list_parent_indices", tweaked, expected); + + { + // Construct a list-view with a non-empty null slot + auto input = + ArrayFromJSON(list_view(int16()), "[[0, null, 1], [0, 0], [2, 3], [], [4, 5]]"); + auto tweaked = TweakValidityBit(input, 1, false); + auto expected = ArrayFromJSON(int64(), "[0, 0, 0, null, null, 2, 2, 4, 4]"); + CheckVectorUnary("list_parent_indices", tweaked, expected); + + // Swap some list-view entries + auto swapped = tweaked->data()->Copy(); + SwapListView(swapped.get(), 0, 2); + SwapListView(swapped.get(), 1, 4); + AssertDatumsEqual( + swapped, + ArrayFromJSON(list_view(int16()), "[[2, 3], [4, 5], [0, null, 1], [], null]"), + /*verbose=*/true); + expected = ArrayFromJSON(int64(), "[2, 2, 2, null, null, 0, 0, 1, 1]"); + CheckVectorUnary("list_parent_indices", swapped, expected); + + // Make one view use values that are used by other list-views + SetListView(swapped.get(), 3, 1, 4); + AssertDatumsEqual( + swapped, + ArrayFromJSON(list_view(int16()), + "[[2, 3], [4, 5], [0, null, 1], [null, 1, 0, 0], null]"), + /*verbose=*/true); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("values used by more than one list-view"), + CallFunction("list_parent_indices", {input})); + } } TEST(TestVectorNested, ListParentIndicesChunkedArray) { - for (auto ty : {list(int16()), large_list(int16())}) { + const auto types = { + list(int16()), + large_list(int16()), + list_view(int16()), + large_list_view(int16()), + }; + for (auto ty : types) { auto input = ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], [], [4, 5]]"}); diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc index 64c3db204c9ee..b265673e23c86 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection.cc @@ -68,12 +68,10 @@ using TakeState = OptionsWrapper; // ---------------------------------------------------------------------- // DropNull Implementation -Result> GetDropNullFilter(const Array& values, - MemoryPool* memory_pool) { - auto bitmap_buffer = values.null_bitmap(); - std::shared_ptr out_array = std::make_shared( - values.length(), bitmap_buffer, nullptr, 0, values.offset()); - return out_array; +std::shared_ptr MakeDropNullFilter(const Array& values) { + auto& bitmap_buffer = values.null_bitmap(); + return std::make_shared(values.length(), bitmap_buffer, nullptr, 0, + values.offset()); } Result DropNullArray(const std::shared_ptr& values, ExecContext* ctx) { @@ -86,8 +84,7 @@ Result DropNullArray(const std::shared_ptr& values, ExecContext* c if (values->type()->id() == Type::type::NA) { return std::make_shared(0); } - ARROW_ASSIGN_OR_RAISE(auto drop_null_filter, - GetDropNullFilter(*values, ctx->memory_pool())); + auto drop_null_filter = Datum{MakeDropNullFilter(*values)}; return Filter(values, drop_null_filter, FilterOptions::Defaults(), ctx); } @@ -185,19 +182,16 @@ class DropNullMetaFunction : public MetaFunction { Result ExecuteImpl(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const override { - switch (args[0].kind()) { - case Datum::ARRAY: { - return DropNullArray(args[0].make_array(), ctx); - } break; - case Datum::CHUNKED_ARRAY: { - return DropNullChunkedArray(args[0].chunked_array(), ctx); - } break; - case Datum::RECORD_BATCH: { - return DropNullRecordBatch(args[0].record_batch(), ctx); - } break; - case Datum::TABLE: { - return DropNullTable(args[0].table(), ctx); - } break; + auto& values = args[0]; + switch (values.kind()) { + case Datum::ARRAY: + return DropNullArray(values.make_array(), ctx); + case Datum::CHUNKED_ARRAY: + return DropNullChunkedArray(values.chunked_array(), ctx); + case Datum::RECORD_BATCH: + return DropNullRecordBatch(values.record_batch(), ctx); + case Datum::TABLE: + return DropNullTable(values.table(), ctx); default: break; } diff --git a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc index 5e24331fe96f2..bf67a474f31e2 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc @@ -1101,6 +1101,8 @@ void PopulateFilterKernels(std::vector* out) { {InputType(Type::EXTENSION), plain_filter, ExtensionFilterExec}, {InputType(Type::LIST), plain_filter, ListFilterExec}, {InputType(Type::LARGE_LIST), plain_filter, LargeListFilterExec}, + {InputType(Type::LIST_VIEW), plain_filter, ListViewFilterExec}, + {InputType(Type::LARGE_LIST_VIEW), plain_filter, LargeListViewFilterExec}, {InputType(Type::FIXED_SIZE_LIST), plain_filter, FSLFilterExec}, {InputType(Type::DENSE_UNION), plain_filter, DenseUnionFilterExec}, {InputType(Type::SPARSE_UNION), plain_filter, SparseUnionFilterExec}, @@ -1119,6 +1121,8 @@ void PopulateFilterKernels(std::vector* out) { {InputType(Type::EXTENSION), ree_filter, ExtensionFilterExec}, {InputType(Type::LIST), ree_filter, ListFilterExec}, {InputType(Type::LARGE_LIST), ree_filter, LargeListFilterExec}, + {InputType(Type::LIST_VIEW), ree_filter, ListViewFilterExec}, + {InputType(Type::LARGE_LIST_VIEW), ree_filter, LargeListViewFilterExec}, {InputType(Type::FIXED_SIZE_LIST), ree_filter, FSLFilterExec}, {InputType(Type::DENSE_UNION), ree_filter, DenseUnionFilterExec}, {InputType(Type::SPARSE_UNION), ree_filter, SparseUnionFilterExec}, diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc index 2ba660e49ac38..7189d42850e79 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc @@ -547,39 +547,6 @@ struct VarBinarySelectionImpl : public Selection, T } }; -struct FSBSelectionImpl : public Selection { - using Base = Selection; - LIFT_BASE_MEMBERS(); - - TypedBufferBuilder data_builder; - - FSBSelectionImpl(KernelContext* ctx, const ExecSpan& batch, int64_t output_length, - ExecResult* out) - : Base(ctx, batch, output_length, out), data_builder(ctx->memory_pool()) {} - - template - Status GenerateOutput() { - FixedSizeBinaryArray typed_values(this->values.ToArrayData()); - int32_t value_size = typed_values.byte_width(); - - RETURN_NOT_OK(data_builder.Reserve(value_size * output_length)); - Adapter adapter(this); - return adapter.Generate( - [&](int64_t index) { - auto val = typed_values.GetView(index); - data_builder.UnsafeAppend(reinterpret_cast(val.data()), - value_size); - return Status::OK(); - }, - [&]() { - data_builder.UnsafeAppend(value_size, static_cast(0x00)); - return Status::OK(); - }); - } - - Status Finish() override { return data_builder.Finish(&out->buffers[1]); } -}; - template struct ListSelectionImpl : public Selection, Type> { using offset_type = typename Type::offset_type; @@ -645,6 +612,63 @@ struct ListSelectionImpl : public Selection, Type> { } }; +template +struct ListViewSelectionImpl : public Selection, Type> { + using offset_type = typename Type::offset_type; + + using Base = Selection, Type>; + LIFT_BASE_MEMBERS(); + + TypedBufferBuilder offsets_builder; + TypedBufferBuilder sizes_builder; + + ListViewSelectionImpl(KernelContext* ctx, const ExecSpan& batch, int64_t output_length, + ExecResult* out) + : Base(ctx, batch, output_length, out), + offsets_builder(ctx->memory_pool()), + sizes_builder(ctx->memory_pool()) {} + + template + Status GenerateOutput() { + auto* offsets = this->values.template GetValues(1); + auto* sizes = this->values.template GetValues(2); + + offset_type null_list_view_offset = 0; + Adapter adapter(this); + RETURN_NOT_OK(adapter.Generate( + [&](int64_t index) { + offset_type value_offset = offsets[index]; + offset_type value_length = sizes[index]; + offsets_builder.UnsafeAppend(value_offset); + sizes_builder.UnsafeAppend(value_length); + null_list_view_offset = value_offset + value_length; + return Status::OK(); + }, + [&]() { + // 0 could be appended here, but by adding the last offset, we keep + // the buffer compatible with how offsets behave in ListType as well. + // The invariant that `offsets[i] + sizes[i] <= values.length` is + // trivially maintained by having `sizes[i]` set to 0 here. + offsets_builder.UnsafeAppend(null_list_view_offset); + sizes_builder.UnsafeAppend(0); + return Status::OK(); + })); + return Status::OK(); + } + + Status Init() override { + RETURN_NOT_OK(offsets_builder.Reserve(output_length)); + return sizes_builder.Reserve(output_length); + } + + Status Finish() override { + RETURN_NOT_OK(offsets_builder.Finish(&out->buffers[1])); + RETURN_NOT_OK(sizes_builder.Finish(&out->buffers[2])); + out->child_data = {this->values.child_data[0].ToArrayData()}; + return Status::OK(); + } +}; + struct DenseUnionSelectionImpl : public Selection { using Base = Selection; @@ -891,6 +915,15 @@ Status LargeListFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult return FilterExec>(ctx, batch, out); } +Status ListViewFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + return FilterExec>(ctx, batch, out); +} + +Status LargeListViewFilterExec(KernelContext* ctx, const ExecSpan& batch, + ExecResult* out) { + return FilterExec>(ctx, batch, out); +} + Status FSLFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const ArraySpan& values = batch[0].array; @@ -939,23 +972,6 @@ Status LargeVarBinaryTakeExec(KernelContext* ctx, const ExecSpan& batch, return TakeExec>(ctx, batch, out); } -Status FSBTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - const ArraySpan& values = batch[0].array; - const auto byte_width = values.type->byte_width(); - // Use primitive Take implementation (presumably faster) for some byte widths - switch (byte_width) { - case 1: - case 2: - case 4: - case 8: - case 16: - case 32: - return PrimitiveTakeExec(ctx, batch, out); - default: - return TakeExec(ctx, batch, out); - } -} - Status ListTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { return TakeExec>(ctx, batch, out); } @@ -964,30 +980,24 @@ Status LargeListTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* return TakeExec>(ctx, batch, out); } +Status ListViewTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + return TakeExec>(ctx, batch, out); +} + +Status LargeListViewTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + return TakeExec>(ctx, batch, out); +} + Status FSLTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const ArraySpan& values = batch[0].array; // If a FixedSizeList wraps a fixed-width type we can, in some cases, use - // PrimitiveTakeExec for a fixed-size list array. + // FixedWidthTakeExec for a fixed-size list array. if (util::IsFixedWidthLike(values, /*force_null_count=*/true, /*exclude_bool_and_dictionary=*/true)) { - const auto byte_width = util::FixedWidthInBytes(*values.type); - // Additionally, PrimitiveTakeExec is only implemented for specific byte widths. - // TODO(GH-41301): Extend PrimitiveTakeExec for any fixed-width type. - switch (byte_width) { - case 1: - case 2: - case 4: - case 8: - case 16: - case 32: - return PrimitiveTakeExec(ctx, batch, out); - default: - break; // fallback to TakeExec - } + return FixedWidthTakeExec(ctx, batch, out); } - return TakeExec(ctx, batch, out); } diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.h b/cpp/src/arrow/compute/kernels/vector_selection_internal.h index a169f4b38a2b8..887bf08354120 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.h +++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.h @@ -67,16 +67,19 @@ void VisitPlainxREEFilterOutputSegments( Status PrimitiveFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status ListFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status LargeListFilterExec(KernelContext*, const ExecSpan&, ExecResult*); +Status ListViewFilterExec(KernelContext*, const ExecSpan&, ExecResult*); +Status LargeListViewFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status FSLFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status DenseUnionFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status MapFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status VarBinaryTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status LargeVarBinaryTakeExec(KernelContext*, const ExecSpan&, ExecResult*); -Status PrimitiveTakeExec(KernelContext*, const ExecSpan&, ExecResult*); -Status FSBTakeExec(KernelContext*, const ExecSpan&, ExecResult*); +Status FixedWidthTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status ListTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status LargeListTakeExec(KernelContext*, const ExecSpan&, ExecResult*); +Status ListViewTakeExec(KernelContext*, const ExecSpan&, ExecResult*); +Status LargeListViewTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status FSLTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status DenseUnionTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status SparseUnionTakeExec(KernelContext*, const ExecSpan&, ExecResult*); diff --git a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc index 1a9af0efcd700..c45cc552a2cc5 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "arrow/array/builder_primitive.h" @@ -27,8 +28,10 @@ #include "arrow/chunked_array.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/codegen_internal.h" +#include "arrow/compute/kernels/gather_internal.h" #include "arrow/compute/kernels/vector_selection_internal.h" #include "arrow/compute/kernels/vector_selection_take_internal.h" +#include "arrow/compute/registry.h" #include "arrow/memory_pool.h" #include "arrow/record_batch.h" #include "arrow/table.h" @@ -324,238 +327,79 @@ namespace { using TakeState = OptionsWrapper; // ---------------------------------------------------------------------- -// Implement optimized take for primitive types from boolean to 1/2/4/8/16/32-byte -// C-type based types. Use common implementation for every byte width and only -// generate code for unsigned integer indices, since after boundschecking to -// check for negative numbers in the indices we can safely reinterpret_cast -// signed integers as unsigned. - -/// \brief The Take implementation for primitive (fixed-width) types does not -/// use the logical Arrow type but rather the physical C type. This way we -/// only generate one take function for each byte width. +// Implement optimized take for primitive types from boolean to +// 1/2/4/8/16/32-byte C-type based types and fixed-size binary (0 or more +// bytes). +// +// Use one specialization for each of these primitive byte-widths so the +// compiler can specialize the memcpy to dedicated CPU instructions and for +// fixed-width binary use the 1-byte specialization but pass WithFactor=true +// that makes the kernel consider the factor parameter provided at runtime. +// +// Only unsigned index types need to be instantiated since after +// boundschecking to check for negative numbers in the indices we can safely +// reinterpret_cast signed integers as unsigned. + +/// \brief The Take implementation for primitive types and fixed-width binary. /// /// Also note that this function can also handle fixed-size-list arrays if /// they fit the criteria described in fixed_width_internal.h, so use the /// function defined in that file to access values and destination pointers /// and DO NOT ASSUME `values.type()` is a primitive type. /// +/// NOTE: Template parameters are types instead of values to let +/// `TakeIndexDispatch<>` forward `typename... Args` after the index type. +/// /// \pre the indices have been boundschecked -template -struct PrimitiveTakeImpl { - static constexpr int kValueWidth = ValueWidthConstant::value; - - static void Exec(const ArraySpan& values, const ArraySpan& indices, - ArrayData* out_arr) { - DCHECK_EQ(util::FixedWidthInBytes(*values.type), kValueWidth); - const auto* values_data = util::OffsetPointerOfFixedByteWidthValues(values); - const uint8_t* values_is_valid = values.buffers[0].data; - auto values_offset = values.offset; - - const auto* indices_data = indices.GetValues(1); - const uint8_t* indices_is_valid = indices.buffers[0].data; - auto indices_offset = indices.offset; - - DCHECK_EQ(out_arr->offset, 0); - auto* out = util::MutableFixedWidthValuesPointer(out_arr); - auto out_is_valid = out_arr->buffers[0]->mutable_data(); - - // If either the values or indices have nulls, we preemptively zero out the - // out validity bitmap so that we don't have to use ClearBit in each - // iteration for nulls. - if (values.null_count != 0 || indices.null_count != 0) { - bit_util::SetBitsTo(out_is_valid, 0, indices.length, false); - } - - auto WriteValue = [&](int64_t position) { - memcpy(out + position * kValueWidth, - values_data + indices_data[position] * kValueWidth, kValueWidth); - }; - - auto WriteZero = [&](int64_t position) { - memset(out + position * kValueWidth, 0, kValueWidth); - }; - - auto WriteZeroSegment = [&](int64_t position, int64_t length) { - memset(out + position * kValueWidth, 0, kValueWidth * length); - }; - - OptionalBitBlockCounter indices_bit_counter(indices_is_valid, indices_offset, - indices.length); - int64_t position = 0; - int64_t valid_count = 0; - while (position < indices.length) { - BitBlockCount block = indices_bit_counter.NextBlock(); - if (values.null_count == 0) { - // Values are never null, so things are easier - valid_count += block.popcount; - if (block.popcount == block.length) { - // Fastest path: neither values nor index nulls - bit_util::SetBitsTo(out_is_valid, position, block.length, true); - for (int64_t i = 0; i < block.length; ++i) { - WriteValue(position); - ++position; - } - } else if (block.popcount > 0) { - // Slow path: some indices but not all are null - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(indices_is_valid, indices_offset + position)) { - // index is not null - bit_util::SetBit(out_is_valid, position); - WriteValue(position); - } else { - WriteZero(position); - } - ++position; - } - } else { - WriteZeroSegment(position, block.length); - position += block.length; - } - } else { - // Values have nulls, so we must do random access into the values bitmap - if (block.popcount == block.length) { - // Faster path: indices are not null but values may be - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(values_is_valid, - values_offset + indices_data[position])) { - // value is not null - WriteValue(position); - bit_util::SetBit(out_is_valid, position); - ++valid_count; - } else { - WriteZero(position); - } - ++position; - } - } else if (block.popcount > 0) { - // Slow path: some but not all indices are null. Since we are doing - // random access in general we have to check the value nullness one by - // one. - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(indices_is_valid, indices_offset + position) && - bit_util::GetBit(values_is_valid, - values_offset + indices_data[position])) { - // index is not null && value is not null - WriteValue(position); - bit_util::SetBit(out_is_valid, position); - ++valid_count; - } else { - WriteZero(position); - } - ++position; - } - } else { - WriteZeroSegment(position, block.length); - position += block.length; - } - } - } - out_arr->null_count = out_arr->length - valid_count; - } -}; - -template -struct BooleanTakeImpl { - static void Exec(const ArraySpan& values, const ArraySpan& indices, - ArrayData* out_arr) { - const uint8_t* values_data = values.buffers[1].data; - const uint8_t* values_is_valid = values.buffers[0].data; - auto values_offset = values.offset; - - const auto* indices_data = indices.GetValues(1); - const uint8_t* indices_is_valid = indices.buffers[0].data; - auto indices_offset = indices.offset; - - auto out = out_arr->buffers[1]->mutable_data(); - auto out_is_valid = out_arr->buffers[0]->mutable_data(); - auto out_offset = out_arr->offset; - - // If either the values or indices have nulls, we preemptively zero out the - // out validity bitmap so that we don't have to use ClearBit in each - // iteration for nulls. - if (values.null_count != 0 || indices.null_count != 0) { - bit_util::SetBitsTo(out_is_valid, out_offset, indices.length, false); - } - // Avoid uninitialized data in values array - bit_util::SetBitsTo(out, out_offset, indices.length, false); - - auto PlaceDataBit = [&](int64_t loc, IndexCType index) { - bit_util::SetBitTo(out, out_offset + loc, - bit_util::GetBit(values_data, values_offset + index)); - }; - - OptionalBitBlockCounter indices_bit_counter(indices_is_valid, indices_offset, - indices.length); - int64_t position = 0; +template +struct FixedWidthTakeImpl { + static constexpr int kValueWidthInBits = ValueBitWidthConstant::value; + + static Status Exec(KernelContext* ctx, const ArraySpan& values, + const ArraySpan& indices, ArrayData* out_arr, int64_t factor) { +#ifndef NDEBUG + int64_t bit_width = util::FixedWidthInBits(*values.type); + DCHECK(WithFactor::value || (kValueWidthInBits == bit_width && factor == 1)); + DCHECK(!WithFactor::value || + (factor > 0 && kValueWidthInBits == 8 && // factors are used with bytes + static_cast(factor * kValueWidthInBits) == bit_width)); +#endif + const bool out_has_validity = values.MayHaveNulls() || indices.MayHaveNulls(); + + const uint8_t* src; + int64_t src_offset; + std::tie(src_offset, src) = util::OffsetPointerOfFixedBitWidthValues(values); + uint8_t* out = util::MutableFixedWidthValuesPointer(out_arr); int64_t valid_count = 0; - while (position < indices.length) { - BitBlockCount block = indices_bit_counter.NextBlock(); - if (values.null_count == 0) { - // Values are never null, so things are easier - valid_count += block.popcount; - if (block.popcount == block.length) { - // Fastest path: neither values nor index nulls - bit_util::SetBitsTo(out_is_valid, out_offset + position, block.length, true); - for (int64_t i = 0; i < block.length; ++i) { - PlaceDataBit(position, indices_data[position]); - ++position; - } - } else if (block.popcount > 0) { - // Slow path: some but not all indices are null - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(indices_is_valid, indices_offset + position)) { - // index is not null - bit_util::SetBit(out_is_valid, out_offset + position); - PlaceDataBit(position, indices_data[position]); - } - ++position; - } - } else { - position += block.length; - } - } else { - // Values have nulls, so we must do random access into the values bitmap - if (block.popcount == block.length) { - // Faster path: indices are not null but values may be - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(values_is_valid, - values_offset + indices_data[position])) { - // value is not null - bit_util::SetBit(out_is_valid, out_offset + position); - PlaceDataBit(position, indices_data[position]); - ++valid_count; - } - ++position; - } - } else if (block.popcount > 0) { - // Slow path: some but not all indices are null. Since we are doing - // random access in general we have to check the value nullness one by - // one. - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(indices_is_valid, indices_offset + position)) { - // index is not null - if (bit_util::GetBit(values_is_valid, - values_offset + indices_data[position])) { - // value is not null - PlaceDataBit(position, indices_data[position]); - bit_util::SetBit(out_is_valid, out_offset + position); - ++valid_count; - } - } - ++position; - } - } else { - position += block.length; - } - } + arrow::internal::Gather gather{ + /*src_length=*/values.length, + src, + src_offset, + /*idx_length=*/indices.length, + /*idx=*/indices.GetValues(1), + out, + factor}; + if (out_has_validity) { + DCHECK_EQ(out_arr->offset, 0); + // out_is_valid must be zero-initiliazed, because Gather::Execute + // saves time by not having to ClearBit on every null element. + auto out_is_valid = out_arr->GetMutableValues(0); + memset(out_is_valid, 0, bit_util::BytesForBits(out_arr->length)); + valid_count = gather.template Execute( + /*src_validity=*/values, /*idx_validity=*/indices, out_is_valid); + } else { + valid_count = gather.Execute(); } out_arr->null_count = out_arr->length - valid_count; + return Status::OK(); } }; template