GH-38432: [C++][Parquet] Try to fix performance regression in the DictByteArrayDecoderImpl #41643
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Display name of the workflow.
name: R

# Run on pushes and pull requests that change any of the listed paths.
# The two path lists are intentionally identical.
on:
  push:
    paths:
      - ".github/workflows/r.yml"
      - "ci/scripts/r_*.sh"
      - "ci/scripts/cpp_*.sh"
      - "ci/scripts/PKGBUILD"
      - "ci/etc/rprofile"
      - "ci/docker/**"
      - "cpp/**"
      - "docker-compose.yml"
      - "r/**"
  pull_request:
    paths:
      - ".github/workflows/r.yml"
      - "ci/scripts/r_*.sh"
      - "ci/scripts/cpp_*.sh"
      - "ci/scripts/PKGBUILD"
      - "ci/etc/rprofile"
      - "ci/docker/**"
      - "cpp/**"
      - "docker-compose.yml"
      - "r/**"
# Runs are grouped by repository, head ref (falling back to the commit SHA
# when there is no head ref) and workflow name; starting a new run in a group
# cancels the one already in progress.
concurrency:
  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
  cancel-in-progress: true

# The workflow token only gets read access to the repository contents.
permissions:
  contents: read

# Environment variables shared by every job in this workflow.
env:
  DOCKER_VOLUME_PREFIX: ".docker/"
jobs:
  # Runs the `ubuntu-r` target through `archery docker run` and, on pushes to
  # `main` of apache/arrow, pushes the resulting image.
  ubuntu:
    name: AMD64 Ubuntu ${{ matrix.ubuntu }} R ${{ matrix.r }} Force-Tests ${{ matrix.force-tests }}
    runs-on: ubuntu-latest
    # Skip pull requests whose title contains "WIP".
    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
    timeout-minutes: 75
    strategy:
      fail-fast: false
      matrix:
        # Version-like values are quoted so they are never parsed as floats.
        r: ["4.2"]
        ubuntu: ["20.04"]
        force-tests: ["true"]
    env:
      R: ${{ matrix.r }}
      UBUNTU: ${{ matrix.ubuntu }}
    steps:
      - name: Checkout Arrow
        uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0
        with:
          fetch-depth: 0
          submodules: recursive
      - name: Cache Docker Volumes
        uses: actions/cache@88522ab9f39a2ea568f7027eddc7d8d8bc9d59c8 # v3.3.1
        with:
          path: .docker
          # As this key is identical on both matrix builds only one will be able to successfully cache,
          # this is fine as there are no differences in the build.
          # The hash covers both the C++ sources (*.cc) and the headers (*.h).
          key: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h') }}-${{ github.run_id }}
          restore-keys: |
            ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h') }}-
            ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-
      - name: Setup Python
        uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
        with:
          python-version: "3.8"
      - name: Setup Archery
        run: pip install -e dev/archery[docker]
      - name: Execute Docker Build
        env:
          ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
          ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
        run: |
          # Name core dumps after the executable and PID, and allow them to be written.
          sudo sysctl -w kernel.core_pattern="core.%e.%p"
          ulimit -c unlimited
          # Use a non-default, unlikely time zone (Marquesas, French Polynesia):
          # it has a non-whole-hour UTC offset and very few people live there.
          archery docker run -e TZ=MART -e ARROW_R_FORCE_TESTS=${{ matrix.force-tests }} ubuntu-r
      # The log and artifact steps run even when an earlier step failed.
      - name: Dump install logs
        if: always()
        run: cat r/check/arrow.Rcheck/00install.out
      - name: Dump test logs
        if: always()
        run: cat r/check/arrow.Rcheck/tests/testthat.Rout*
      - name: Save the test output
        if: always()
        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2
        with:
          # NOTE(review): the `bundled` job uploads under the same artifact name;
          # confirm that sharing one artifact between the two jobs is intended.
          name: test-output
          path: r/check/arrow.Rcheck/tests/testthat.Rout*
      - name: Docker Push
        # Only push the image from successful runs on apache/arrow's main branch.
        if: >-
          success() &&
          github.event_name == 'push' &&
          github.repository == 'apache/arrow' &&
          github.ref_name == 'main'
        env:
          ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
          ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
        # A failed push must not fail the job.
        continue-on-error: true
        run: archery docker push ubuntu-r

  # Runs the `r` target through `archery docker run`; the matrix entry is
  # exposed to the build as R_ORG / R_IMAGE / R_TAG / DEVTOOLSET_VERSION.
  bundled:
    name: "${{ matrix.config.org }}/${{ matrix.config.image }}:${{ matrix.config.tag }}"
    runs-on: ubuntu-latest
    # Skip pull requests whose title contains "WIP".
    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
    timeout-minutes: 60
    strategy:
      fail-fast: false
      matrix:
        config:
          - org: "rhub"
            image: "debian-gcc-devel"
            tag: "latest"
            devtoolset: ""
    env:
      R_ORG: ${{ matrix.config.org }}
      R_IMAGE: ${{ matrix.config.image }}
      R_TAG: ${{ matrix.config.tag }}
      DEVTOOLSET_VERSION: ${{ matrix.config.devtoolset }}
    steps:
      - name: Checkout Arrow
        uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0
        with:
          fetch-depth: 0
          submodules: recursive
      - name: Setup Python
        uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
        with:
          python-version: "3.8"
      - name: Setup Archery
        run: pip install -e dev/archery[docker]
      - name: Execute Docker Build
        env:
          ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
          ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
        run: |
          # Name core dumps after the executable and PID, and allow them to be written.
          sudo sysctl -w kernel.core_pattern="core.%e.%p"
          ulimit -c unlimited
          # Don't set a TZ here to test that case. These builds will have the following warning in them:
          #   System has not been booted with systemd as init system (PID 1). Can't operate.
          #   Failed to connect to bus: Host is down
          archery docker run -e TZ="" r
      # The log and artifact steps run even when an earlier step failed.
      - name: Dump install logs
        if: always()
        run: cat r/check/arrow.Rcheck/00install.out
      - name: Dump test logs
        if: always()
        run: cat r/check/arrow.Rcheck/tests/testthat.Rout*
      - name: Save the test output
        if: always()
        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2
        with:
          # NOTE(review): the `ubuntu` job uploads under the same artifact name;
          # confirm that sharing one artifact between the two jobs is intended.
          name: test-output
          path: r/check/arrow.Rcheck/tests/testthat.Rout*
      - name: Docker Push
        # Only push the image from successful runs on apache/arrow's main branch.
        if: >-
          success() &&
          github.event_name == 'push' &&
          github.repository == 'apache/arrow' &&
          github.ref_name == 'main'
        env:
          ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
          ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
        # A failed push must not fail the job.
        continue-on-error: true
        run: archery docker push r

  # Builds Arrow C++ on Windows with ci/scripts/r_windows_build.sh and uploads
  # the resulting libarrow zip as an artifact for the `windows-r` job.
  windows-cpp:
    name: AMD64 Windows C++ RTools ${{ matrix.config.rtools }} ${{ matrix.config.arch }}
    runs-on: windows-2019
    # Skip pull requests whose title contains "WIP".
    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
    timeout-minutes: 90
    strategy:
      fail-fast: false
      matrix:
        config:
          - rtools: 40
            arch: "ucrt64"
    steps:
      - run: git config --global core.autocrlf false
      - name: Checkout Arrow
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Setup ccache
        shell: bash
        run: |
          ci/scripts/ccache_setup.sh
          # Export the ccache directory as an absolute Windows path for later steps.
          echo "CCACHE_DIR=$(cygpath --absolute --windows ccache)" >> $GITHUB_ENV
      - name: Cache ccache
        uses: actions/cache@v3
        with:
          path: ccache
          # The hash covers both the C++ sources (*.cc) and the headers (*.h).
          key: r-${{ matrix.config.rtools }}-ccache-mingw-${{ matrix.config.arch }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h') }}-${{ github.run_id }}
          restore-keys: |
            r-${{ matrix.config.rtools }}-ccache-mingw-${{ matrix.config.arch }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h') }}-
            r-${{ matrix.config.rtools }}-ccache-mingw-${{ matrix.config.arch }}-
      - uses: r-lib/actions/setup-r@v2
        with:
          r-version: "4.1"
          rtools-version: 40
          Ncpus: 2
      - name: Build Arrow C++
        shell: bash
        env:
          MINGW_ARCH: ${{ matrix.config.arch }}
        run: ci/scripts/r_windows_build.sh
      - name: Rename libarrow.zip
        # So that they're unique when multiple are downloaded in the next step
        shell: bash
        run: mv libarrow.zip libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip
      - uses: actions/upload-artifact@v3
        with:
          name: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip
          path: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip

  # Checks the R package on Windows against the libarrow artifact produced by
  # the `windows-cpp` job, then runs lintr on the R 4.2 matrix entry.
  windows-r:
    needs: [windows-cpp]
    name: AMD64 Windows R ${{ matrix.config.rversion }} RTools ${{ matrix.config.rtools }}
    runs-on: windows-2019
    # Skip pull requests whose title contains "WIP".
    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
    timeout-minutes: 75
    strategy:
      fail-fast: false
      matrix:
        config:
          - rtools: 42
            rversion: "4.2"
          - rtools: 42
            rversion: "devel"
    env:
      ARROW_R_CXXFLAGS: "-Werror"
      _R_CHECK_TESTS_NLINES_: "0"
    steps:
      - run: git config --global core.autocrlf false
      - name: Checkout Arrow
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - run: mkdir r/windows
      - name: Download artifacts
        if: ${{ matrix.config.rtools == 42 }}
        uses: actions/download-artifact@v3
        with:
          # Must match the artifact name uploaded by `windows-cpp`
          # (its matrix entry is rtools 40 / ucrt64).
          name: libarrow-rtools40-ucrt64.zip
          path: r/windows
      - name: Unzip and rezip libarrows
        shell: bash
        run: |
          cd r/windows
          ls *.zip | xargs -n 1 unzip -uo
          rm -rf *.zip
      - uses: r-lib/actions/setup-r@v2
        with:
          r-version: ${{ matrix.config.rversion }}
          rtools-version: ${{ matrix.config.rtools }}
          Ncpus: 2
      - uses: r-lib/actions/setup-r-dependencies@v2
        env:
          GITHUB_PAT: "${{ github.token }}"
        with:
          # For some arcane reason caching does not work on the windows runners
          # most likely due to https://github.com/actions/cache/issues/815
          cache: false
          working-directory: "r"
          extra-packages: |
            any::rcmdcheck
      - name: Install MinIO
        shell: bash
        run: |
          mkdir -p "$HOME/.local/bin"
          curl \
            --output "$HOME/.local/bin/minio.exe" \
            https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z
          chmod +x "$HOME/.local/bin/minio.exe"
          echo "$HOME/.local/bin" >> $GITHUB_PATH
      # TODO(ARROW-17149): figure out why the GCS tests are hanging on Windows
      # - name: Install Google Cloud Storage Testbench
      #   shell: bash
      #   run: ci/scripts/install_gcs_testbench.sh default
      - name: Check
        shell: Rscript {0}
        run: |
          # Because we do R CMD build and r/windows is in .Rbuildignore,
          # assemble the libarrow.zip file and pass it as an env var
          setwd("r/windows")
          zip("libarrow.zip", ".")
          setwd("..")
          Sys.setenv(
            RWINLIB_LOCAL = file.path(Sys.getenv("GITHUB_WORKSPACE"), "r", "windows", "libarrow.zip"),
            MAKEFLAGS = paste0("-j", parallel::detectCores()),
            ARROW_R_DEV = TRUE,
            "_R_CHECK_FORCE_SUGGESTS_" = FALSE,
            "_R_CHECK_STOP_ON_INVALID_NUMERIC_VERSION_INPUTS_" = TRUE
          )
          rcmdcheck::rcmdcheck(".",
            build_args = '--no-build-vignettes',
            args = c('--no-manual', '--as-cran', '--ignore-vignettes', '--run-donttest'),
            error_on = 'warning',
            check_dir = 'check',
            timeout = 3600
          )
      - name: Run lintr
        # Lint only once, on the R 4.2 matrix entry.
        if: ${{ matrix.config.rversion == '4.2' }}
        env:
          NOT_CRAN: "true"
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        shell: Rscript {0}
        working-directory: r
        run: |
          Sys.setenv(
            RWINLIB_LOCAL = file.path(Sys.getenv("GITHUB_WORKSPACE"), "r", "windows", "libarrow.zip"),
            MAKEFLAGS = paste0("-j", parallel::detectCores()),
            ARROW_R_DEV = TRUE,
            "_R_CHECK_FORCE_SUGGESTS_" = FALSE
          )
          # we use pak for package installation since it is faster, safer and more convenient
          pak::local_install()
          pak::pak("lintr")
          lintr::expect_lint_free()
      # The log steps run even when an earlier step failed.
      - name: Dump install logs
        if: always()
        # NOTE(review): this relies on a `cat` executable being on PATH under
        # cmd on the runner; confirm.
        shell: cmd
        run: cat r/check/arrow.Rcheck/00install.out
      - name: Dump test logs
        if: always()
        shell: bash
        # `|| true` keeps this step green when no test log can be printed.
        run: find r/check -name 'testthat.Rout*' -exec cat '{}' \; || true