Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-37941: [R][CI][Release] Add checksum verification for pre-compiled binaries #38115

Merged
merged 31 commits into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
8974e19
add script to fetch checksum files from artifactory
assignUser Oct 6, 2023
357f62a
remove rwinlib
assignUser Oct 6, 2023
b80ee6d
validate windows binary checksum
assignUser Oct 6, 2023
f0cfd43
fix error with lineendings on windows
assignUser Oct 6, 2023
eceadb4
use versioned filename to match checksum file
assignUser Oct 6, 2023
f1219f2
add checksum validation for nixlibs.R
assignUser Oct 6, 2023
cc0d876
use same logic for checksum for *libs.R
assignUser Oct 7, 2023
0beda0e
allow setting checksum path via envvar
assignUser Oct 7, 2023
16ae261
add checksum validation to nightly builds
assignUser Oct 7, 2023
9fbd82a
add sudo to overcome docker permission
assignUser Oct 7, 2023
271574c
fix typo
assignUser Oct 7, 2023
b4e324f
chown docker build files to prevent permission issues
assignUser Oct 7, 2023
0428ad9
use absolute checksum path
assignUser Oct 7, 2023
28ce9e7
fix log message and checksum command
assignUser Oct 7, 2023
5e01e62
chown needs sudo
assignUser Oct 7, 2023
ed758f6
validate checksum with linux binary
assignUser Oct 7, 2023
6ef1495
fix log msg
assignUser Oct 7, 2023
3aad002
fix file name
assignUser Oct 7, 2023
cec5269
actually test checksum with linux binary
assignUser Oct 7, 2023
88dde98
use versioned filename to match checksum in nixlibs
assignUser Oct 7, 2023
3aaecf6
fall back to sha512sum if shasum not available
assignUser Oct 7, 2023
924286e
fix typo
assignUser Oct 7, 2023
7477b23
fix use of RWINLIB_LOCAL
assignUser Oct 8, 2023
add36ed
don't relly on env, use args explicitly
assignUser Oct 8, 2023
4a9619e
unnest & refactor
assignUser Oct 8, 2023
4dae43c
fix return
assignUser Oct 8, 2023
7a6c452
allow checksum bypass via envvar
assignUser Oct 11, 2023
4ff5d4c
add checksum download to release checklist
assignUser Oct 11, 2023
77fe890
Merge branch 'main' into add-checksum-rbinaries
assignUser Oct 11, 2023
4c8511e
fix name of update script
assignUser Oct 11, 2023
dd292b6
update usage
assignUser Oct 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 26 additions & 8 deletions dev/tasks/r/github.packages.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,12 +102,16 @@ jobs:
run: |
cd arrow/r/libarrow/dist
zip -r $PKG_FILE lib/ include/

- name: Create Checksum
shell: bash
run: |
cd arrow/r/libarrow/dist
shasum -a 512 arrow-*.zip > arrow-{{ '${{ needs.source.outputs.pkg_version }}' }}.zip.sha512
- name: Upload binary artifact
uses: actions/upload-artifact@v3
with:
name: r-lib__libarrow__bin__darwin-{{ '${{ matrix.platform.arch }}' }}-openssl-{{ '${{ matrix.openssl }}' }}
path: arrow/r/libarrow/dist/arrow-*.zip
path: arrow/r/libarrow/dist/arrow-*.zip*

linux-cpp:
name: C++ Binary Linux OpenSSL {{ '${{ matrix.openssl }}' }}
Expand Down Expand Up @@ -149,15 +153,21 @@ jobs:
PKG_FILE: arrow-{{ '${{ needs.source.outputs.pkg_version }}' }}.zip
VERSION: {{ '${{ needs.source.outputs.pkg_version }}' }}
run: |
cd arrow/r/libarrow/dist
# These files were created by the docker user so we have to sudo to get them
sudo -E zip -r $PKG_FILE lib/ include/
# These files were created by the docker user so we have to chown them
sudo chown -R $USER:$USER arrow/r/libarrow

cd arrow/r/libarrow/dist
zip -r $PKG_FILE lib/ include/
- name: Create Checksum
shell: bash
run: |
cd arrow/r/libarrow/dist
shasum -a 512 arrow-*.zip > arrow-{{ '${{ needs.source.outputs.pkg_version }}' }}.zip.sha512
- name: Upload binary artifact
uses: actions/upload-artifact@v3
with:
name: r-lib__libarrow__bin__linux-openssl-{{ '${{ matrix.openssl }}' }}
path: arrow/r/libarrow/dist/arrow-*.zip
path: arrow/r/libarrow/dist/arrow-*.zip*

windows-cpp:
name: C++ Binary Windows RTools (40 only)
Expand All @@ -181,11 +191,16 @@ jobs:
ARROW_HOME: "arrow"
{{ macros.github_set_sccache_envvars()|indent(8) }}
run: arrow/ci/scripts/r_windows_build.sh
- name: Create Checksum
shell: bash
run: |
cd build
sha512sum arrow-*.zip > arrow-{{ '${{ needs.source.outputs.pkg_version }}' }}.zip.sha512
- name: Upload binary artifact
uses: actions/upload-artifact@v3
with:
name: r-lib__libarrow__bin__windows
path: build/arrow-*.zip
path: build/arrow-*.zip*

r-packages:
needs: [source, windows-cpp, macos-cpp]
Expand Down Expand Up @@ -222,7 +237,6 @@ jobs:
rig system add-pak
{{ macros.github_setup_local_r_repo(false, true, true)|indent }}
- name: Prepare Dependency Installation

shell: bash
run: |
tar -xzf repo/src/contrib/arrow_*.tar.gz arrow/DESCRIPTION
Expand All @@ -244,6 +258,8 @@ jobs:
NOT_CRAN: "false" # actions/setup-r sets this implicitly
ARROW_R_DEV: "true"
LIBARROW_BINARY: "true" # has to be set as long as allowlist not updated
ARROW_R_ENFORCE_CHECKSUM: "true"
ARROW_R_CHECKSUM_PATH: "{{ '${{ github.workspace }}' }}/repo/libarrow/bin"
run: |
on_windows <- tolower(Sys.info()[["sysname"]]) == "windows"

Expand Down Expand Up @@ -335,6 +351,8 @@ jobs:
ARROW_R_DEV: "TRUE"
LIBARROW_BUILD: "FALSE"
LIBARROW_BINARY: {{ '${{ matrix.config.libarrow_binary }}' }}
ARROW_R_ENFORCE_CHECKSUM: "true"
ARROW_R_CHECKSUM_PATH: "{{ '${{ github.workspace }}' }}/repo/libarrow/bin"
shell: bash
run: |
Rscript -e '
Expand Down
4 changes: 4 additions & 0 deletions r/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ arrow_*.tgz
extra-tests/files
.deps

# Checksums for the precompiled binaries will be added just before CRAN submission
# use `tools/update-checksums.R` to download them.
assignUser marked this conversation as resolved.
Show resolved Hide resolved
/tools/checksums/

# C++ sources for an offline build. They're copied from the ../cpp directory, so ignore them here.
/tools/cpp/
# cmake expects dotenv, NOTICE.txt, and LICENSE.txt to be available one level up
Expand Down
2 changes: 1 addition & 1 deletion r/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -67,5 +67,5 @@ clean:
-rm src/Makevars.win
-rm -rf arrow.Rcheck/
-rm -rf libarrow/
-rm -rf tools/cpp/ tools/.env tools/NOTICE.txt tools/LICENSE.txt
-rm -rf tools/cpp/ tools/.env tools/NOTICE.txt tools/LICENSE.txt tools/checksums
-find . -name "*.orig" -delete
1 change: 1 addition & 0 deletions r/PACKAGING.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ Wait for the release candidate to be cut:
- [ ] Create a PR entitled `WIP: [R] Verify CRAN release-10.0.1-rc0`. Add
a comment `@github-actions crossbow submit --group r` to run all R crossbow
jobs against the CRAN-specific release branch.
- [ ] Run `Rscript tools/update-checksums.R <libarrow version>` to download the checksums for the pre-compiled binaries from the ASF artifactory into the tools directory.
assignUser marked this conversation as resolved.
Show resolved Hide resolved
- [ ] Regenerate arrow_X.X.X.tar.gz (i.e., `make build`)

Ensure linux binary packages are available:
Expand Down
43 changes: 40 additions & 3 deletions r/tools/nixlibs.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,12 @@ if (test_mode && is.na(VERSION)) {
}

dev_version <- package_version(VERSION)[1, 4]
is_release <- is.na(dev_version) || dev_version < "100"
on_macos <- tolower(Sys.info()[["sysname"]]) == "darwin"

checksum_path <- Sys.getenv("ARROW_R_CHECKSUM_PATH", "tools/checksums")

# Small dev versions are added for R-only changes during CRAN submission.
if (is.na(dev_version) || dev_version < "100") {
if (is_release) {
VERSION <- package_version(VERSION)[1, 1:3]
arrow_repo <- paste0(getOption("arrow.repo", sprintf("https://apache.jfrog.io/artifactory/arrow/r/%s", VERSION)), "/libarrow/")
} else {
Expand Down Expand Up @@ -88,7 +89,7 @@ thirdparty_dependency_dir <- Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR", "tool


download_binary <- function(lib) {
libfile <- tempfile()
libfile <- paste0("arrow-", VERSION, ".zip")
binary_url <- paste0(arrow_repo, "bin/", lib, "/arrow-", VERSION, ".zip")
if (try_download(binary_url, libfile)) {
if (!quietly) {
Expand All @@ -103,6 +104,42 @@ download_binary <- function(lib) {
}
libfile <- NULL
}
# Explicitly setting the env var to "false" will skip checksum validation
# e.g. in case the included checksums are stale.
skip_checksum <- env_is("ARROW_R_ENFORCE_CHECKSUM", "false")
enforce_checksum <- env_is("ARROW_R_ENFORCE_CHECKSUM", "true")
# validate binary checksum for CRAN release only
if (!skip_checksum && dir.exists(checksum_path) && is_release ||
enforce_checksum) {
paleolimbot marked this conversation as resolved.
Show resolved Hide resolved
checksum_file <- sub(".+/bin/(.+\\.zip)", "\\1\\.sha512", binary_url)
assignUser marked this conversation as resolved.
Show resolved Hide resolved
checksum_file <- file.path(checksum_path, checksum_file)
checksum_cmd <- "shasum"
checksum_args <- c("--status", "-a", "512", "-c", checksum_file)

# shasum is not available on all linux versions
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use sys.which() to see if it's present?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah nice 👍

status_shasum <- try(
suppressWarnings(
system2("shasum", args = c("--help"), stdout = FALSE, stderr = FALSE)
),
silent = TRUE
)

if (inherits(status_shasum, "try-error") || is.integer(status_shasum) && status_shasum != 0) {
checksum_cmd <- "sha512sum"
checksum_args <- c("--status", "-c", checksum_file)
}

checksum_ok <- system2(checksum_cmd, args = checksum_args)
assignUser marked this conversation as resolved.
Show resolved Hide resolved

if (checksum_ok != 0) {
cat("*** Checksum validation failed for libarrow binary: ", libfile, "\n")
unlink(libfile)
libfile <- NULL
} else {
cat("*** Checksum validated successfully for libarrow binary: ", libfile, "\n")
}
}

libfile
}

Expand Down
67 changes: 67 additions & 0 deletions r/tools/update-checksums.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Run this script AFTER the release was voted and the artifacts
# are moved into the final dir. This script will download the checksum
# files and save them to the tools/checksums directory mirroring the
# artifactory layout. *libs.R uses these files to validate the downloaded
# binaries when installing the package.
#
# Run this script from the r/ directory of the arrow repo with the version
# as the first argument, e.g.: Rscript tools/update-checksums.R 14.0.0

# Parse the command line. Exactly one argument is accepted: the libarrow
# version whose checksum files should be fetched. The count is checked before
# the argument is read so that VERSION is never NA.
args <- commandArgs(TRUE)

if (length(args) != 1) {
  stop("Usage: Rscript tools/update-checksums.R <version>")
}

VERSION <- args[1]

# This relative path only resolves when the working directory is r/.
tasks_yml <- "../dev/tasks/tasks.yml"

if (!file.exists(tasks_yml)) {
  stop("Run this script from the r/ directory of the arrow repo")
}

# Get the libarrow binary paths from the tasks.yml file:
#   1. keep the lines that mention an "r-lib__libarrow" artifact
#   2. drop everything up to and including "r-lib__libarrow__bin__"
#   3. substitute the "{no_rc_r_version}" placeholder with VERSION
#   4. turn the first "__" into "/" (<platform dir>/<zip name>)
#   5. replace the literal, regex-escaped "\.zip" with ".zip"
binary_paths <- readLines(tasks_yml) |>
  grep("r-lib__libarrow", x = _, value = TRUE) |>
  sub(".+r-lib__libarrow__bin__(.+\\.zip)", "\\1", x = _) |>
  sub("{no_rc_r_version}", VERSION, fixed = TRUE, x = _) |>
  sub("__", "/", x = _) |>
  sub("\\.zip", ".zip", fixed = TRUE, x = _)

# Without this guard the script would finish "successfully" while downloading
# nothing, e.g. after the artifact names in tasks.yml were changed.
if (length(binary_paths) == 0) {
  stop("No libarrow binary artifacts found in ", tasks_yml)
}

# sprintf() template: first %s is the version, second %s the checksum path
# below libarrow/bin/.
artifactory_root <- "https://apache.jfrog.io/artifactory/arrow/r/%s/libarrow/bin/%s"

# Get the checksum files from the artifactory
assignUser marked this conversation as resolved.
Show resolved Hide resolved
# Download the .sha512 file for every libarrow binary into tools/checksums,
# creating the per-platform sub-directories as needed. download.file()
# signals an error if a checksum file cannot be fetched, which aborts the
# script.
for (path in binary_paths) {
  sha_path <- paste0(path, ".sha512")
  # `dest` rather than `file`, so that base::file() is not shadowed.
  dest <- file.path("tools/checksums", sha_path)
  dir.create(dirname(dest), recursive = TRUE, showWarnings = FALSE)

  url <- sprintf(artifactory_root, VERSION, sha_path)
  download.file(url, dest, quiet = TRUE, cacheOK = FALSE)

  if (grepl("windows", path)) {
    # Strip carriage returns so the Windows checksum file has LF-only line
    # endings. NOTE(review): the original comment says the line endings cause
    # errors with msys2 sha512sum -- confirm which tool/version is affected.
    # This is done in R instead of `sed -i`, because BSD/macOS sed treats the
    # argument after -i as a backup suffix and sed may not be installed at all.
    checksum_bytes <- readBin(dest, what = "raw", n = file.size(dest))
    writeBin(checksum_bytes[checksum_bytes != as.raw(0x0d)], dest)
  }
}
119 changes: 74 additions & 45 deletions r/tools/winlibs.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,55 +17,84 @@

args <- commandArgs(TRUE)
VERSION <- args[1]
if (!file.exists(sprintf("windows/arrow-%s/include/arrow/api.h", VERSION))) {
if (length(args) > 1) {
# Arg 2 would be the path/to/lib.zip
localfile <- args[2]
cat(sprintf("*** Using RWINLIB_LOCAL %s\n", localfile))
if (!file.exists(localfile)) {
cat(sprintf("*** %s does not exist; build will fail\n", localfile))
}
file.copy(localfile, "lib.zip")
} else {
# Download static arrow from the apache artifactory
quietly <- !identical(tolower(Sys.getenv("ARROW_R_DEV")), "true")
get_file <- function(template, version) {
try(
suppressWarnings(
download.file(sprintf(template, version), "lib.zip", quiet = quietly)
),
silent = quietly
)
}
dev_version <- package_version(VERSION)[1, 4]
# Small dev versions are added for R-only changes during CRAN submission
is_release <- is.na(dev_version) || dev_version < "100"
# TRUE when the environment variable `var`, lower-cased, is exactly `value`.
# An unset variable reads as "" and therefore only matches value = "".
env_is <- function(var, value) {
  current <- tolower(Sys.getenv(var))
  identical(current, value)
}
# Log a message in the style of the configure script ("*** <text>") rather
# than raising an R error. The arguments are passed to sprintf() unchanged.
# Callers follow the log call with `return()` to exit the script.
lg <- function(...) {
  msg <- sprintf(...)
  cat("*** ", msg, "\n")
}

# URL templates
nightly <- paste0(
getOption("arrow.dev_repo", "https://nightlies.apache.org/arrow/r"),
"/libarrow/bin/windows/arrow-%s.zip"
)
# %1$s uses the first variable for both substitutions
artifactory <- paste0(
getOption("arrow.repo", "https://apache.jfrog.io/artifactory/arrow/r/%1$s"),
"/libarrow/bin/windows/arrow-%1$s.zip"
)
rwinlib <- "https://github.com/rwinlib/arrow/archive/v%s.zip"
if (is_release) {
# This is a release version, so we need to use the major.minor.patch version without
# the CRAN suffix/dev_version
VERSION <- package_version(VERSION)[1, 1:3]
# %1$s uses the first variable for both substitutions
url_template <- paste0(
getOption("arrow.repo", "https://apache.jfrog.io/artifactory/arrow/r/%1$s"),
"/libarrow/bin/windows/arrow-%1$s.zip"
)
} else {
url_template <- paste0(
assignUser marked this conversation as resolved.
Show resolved Hide resolved
getOption("arrow.dev_repo", "https://nightlies.apache.org/arrow/r"),
"/libarrow/bin/windows/arrow-%s.zip"
)
}

dev_version <- package_version(VERSION)[1, 4]
if (file.exists(sprintf("windows/arrow-%s/include/arrow/api.h", VERSION))) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was previously before the truncation of the version string so theoretically having a local 13.0.0 version of arrow would not have worked with package 13.0.0.1. Likely never happened but still ^^

lg("Found local Arrow %s!", VERSION)
return()
}

# Small dev versions are added for R-only changes during CRAN submission.
if (is.na(dev_version) || dev_version < "100") {
VERSION <- package_version(VERSION)[1, 1:3]
get_file(rwinlib, VERSION)
zip_file <- sprintf("arrow-%s.zip", VERSION)

# If not found, fall back to apache artifactory
if (!file.exists("lib.zip")) {
get_file(artifactory, VERSION)
}
} else {
get_file(nightly, VERSION)
if (length(args) > 1) {
# Arg 2 would be the path/to/lib.zip
localfile <- args[2]
if (!file.exists(localfile)) {
lg("RWINLIB_LOCAL '%s' does not exist. Build will fail.", localfile)
return()
} else {
lg("Using RWINLIB_LOCAL %s", localfile)
}
file.copy(localfile, zip_file)
} else {
quietly <- !identical(tolower(Sys.getenv("ARROW_R_DEV")), "true")
binary_url <- sprintf(url_template, VERSION)
try(
suppressWarnings(
download.file(binary_url, zip_file, quiet = quietly)
),
silent = quietly
)

if (!file.exists(zip_file) || file.size(zip_file) == 0) {
lg("Failed to download libarrow binary from %s. Build will fail.", binary_url)
return()
}

checksum_path <- Sys.getenv("ARROW_R_CHECKSUM_PATH", "tools/checksums")
# Explicitly setting the env var to "false" will skip checksum validation
# e.g. in case the included checksums are stale.
skip_checksum <- env_is("ARROW_R_ENFORCE_CHECKSUM", "false")
enforce_checksum <- env_is("ARROW_R_ENFORCE_CHECKSUM", "true")
# validate binary checksum for CRAN release only
if (!skip_checksum && dir.exists(checksum_path) && is_release ||
enforce_checksum) {
checksum_file <- sprintf("%s/windows/arrow-%s.zip.sha512", checksum_path, VERSION)
# rtools does not have shasum with default config
checksum_ok <- system2("sha512sum", args = c("--status", "-c", checksum_file))

if (checksum_ok != 0) {
lg("Checksum validation failed for libarrow binary: %s", zip_file)
return()
}
lg("Checksum validated successfully for libarrow binary: %s", zip_file)
}
dir.create("windows", showWarnings = FALSE)
unzip("lib.zip", exdir = "windows")
unlink("lib.zip")
}

dir.create("windows", showWarnings = FALSE)
unzip(zip_file, exdir = "windows")
unlink(zip_file)