Skip to content

Commit

Permalink
apacheGH-37941: [R][CI][Release] Add checksum verification for pre-co…
Browse files Browse the repository at this point in the history
…mpiled binaries (apache#38115)

### Rationale for this change

This change restores parity with the previous solution on macOS (brew performs checksum validation) and improves security on Windows and Linux. It also aligns with CRAN policy.

### What changes are included in this PR?

This PR adds a script that can be run after the arrow release (once all files have been pushed to the artifactory) and before the CRAN submission to download the checksum files for the pre-compiled binaries, which are already added through the usual release. `nixlibs.R` and `winlibs.R` have been extended to use these checksum files to validate the downloaded binaries.

### Are these changes tested?

The r-binary-packages nightlies generate checksums and use them when building binary packages, this way the code path is tested. They do not modify the actual src package though.

### Are there any user-facing changes?
no (outside of log messages)
* Closes: apache#37941

Authored-by: Jacob Wujciak-Jens <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
  • Loading branch information
assignUser authored and Jeremy Aguilon committed Oct 23, 2023
1 parent 6281110 commit ba7b240
Show file tree
Hide file tree
Showing 7 changed files with 213 additions and 57 deletions.
34 changes: 26 additions & 8 deletions dev/tasks/r/github.packages.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,12 +102,16 @@ jobs:
run: |
cd arrow/r/libarrow/dist
zip -r $PKG_FILE lib/ include/
- name: Create Checksum
shell: bash
run: |
cd arrow/r/libarrow/dist
shasum -a 512 arrow-*.zip > arrow-{{ '${{ needs.source.outputs.pkg_version }}' }}.zip.sha512
- name: Upload binary artifact
uses: actions/upload-artifact@v3
with:
name: r-lib__libarrow__bin__darwin-{{ '${{ matrix.platform.arch }}' }}-openssl-{{ '${{ matrix.openssl }}' }}
path: arrow/r/libarrow/dist/arrow-*.zip
path: arrow/r/libarrow/dist/arrow-*.zip*

linux-cpp:
name: C++ Binary Linux OpenSSL {{ '${{ matrix.openssl }}' }}
Expand Down Expand Up @@ -149,15 +153,21 @@ jobs:
PKG_FILE: arrow-{{ '${{ needs.source.outputs.pkg_version }}' }}.zip
VERSION: {{ '${{ needs.source.outputs.pkg_version }}' }}
run: |
cd arrow/r/libarrow/dist
# These files were created by the docker user so we have to sudo to get them
sudo -E zip -r $PKG_FILE lib/ include/
# These files were created by the docker user so we have to chown them
sudo chown -R $USER:$USER arrow/r/libarrow
cd arrow/r/libarrow/dist
zip -r $PKG_FILE lib/ include/
- name: Create Checksum
shell: bash
run: |
cd arrow/r/libarrow/dist
shasum -a 512 arrow-*.zip > arrow-{{ '${{ needs.source.outputs.pkg_version }}' }}.zip.sha512
- name: Upload binary artifact
uses: actions/upload-artifact@v3
with:
name: r-lib__libarrow__bin__linux-openssl-{{ '${{ matrix.openssl }}' }}
path: arrow/r/libarrow/dist/arrow-*.zip
path: arrow/r/libarrow/dist/arrow-*.zip*

windows-cpp:
name: C++ Binary Windows RTools (40 only)
Expand All @@ -181,11 +191,16 @@ jobs:
ARROW_HOME: "arrow"
{{ macros.github_set_sccache_envvars()|indent(8) }}
run: arrow/ci/scripts/r_windows_build.sh
- name: Create Checksum
shell: bash
run: |
cd build
sha512sum arrow-*.zip > arrow-{{ '${{ needs.source.outputs.pkg_version }}' }}.zip.sha512
- name: Upload binary artifact
uses: actions/upload-artifact@v3
with:
name: r-lib__libarrow__bin__windows
path: build/arrow-*.zip
path: build/arrow-*.zip*

r-packages:
needs: [source, windows-cpp, macos-cpp]
Expand Down Expand Up @@ -222,7 +237,6 @@ jobs:
rig system add-pak
{{ macros.github_setup_local_r_repo(false, true, true)|indent }}
- name: Prepare Dependency Installation

shell: bash
run: |
tar -xzf repo/src/contrib/arrow_*.tar.gz arrow/DESCRIPTION
Expand All @@ -244,6 +258,8 @@ jobs:
NOT_CRAN: "false" # actions/setup-r sets this implicitly
ARROW_R_DEV: "true"
LIBARROW_BINARY: "true" # has to be set as long as allowlist not updated
ARROW_R_ENFORCE_CHECKSUM: "true"
ARROW_R_CHECKSUM_PATH: "{{ '${{ github.workspace }}' }}/repo/libarrow/bin"
run: |
on_windows <- tolower(Sys.info()[["sysname"]]) == "windows"
Expand Down Expand Up @@ -335,6 +351,8 @@ jobs:
ARROW_R_DEV: "TRUE"
LIBARROW_BUILD: "FALSE"
LIBARROW_BINARY: {{ '${{ matrix.config.libarrow_binary }}' }}
ARROW_R_ENFORCE_CHECKSUM: "true"
ARROW_R_CHECKSUM_PATH: "{{ '${{ github.workspace }}' }}/repo/libarrow/bin"
shell: bash
run: |
Rscript -e '
Expand Down
4 changes: 4 additions & 0 deletions r/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ arrow_*.tgz
extra-tests/files
.deps

# Checksums for the precompiled binaries will be added just before CRAN submission
# use `tools/update-checksums.R` to download them.
/tools/checksums/

# C++ sources for an offline build. They're copied from the ../cpp directory, so ignore them here.
/tools/cpp/
# cmake expects dotenv, NOTICE.txt, and LICENSE.txt to be available one level up
Expand Down
2 changes: 1 addition & 1 deletion r/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -67,5 +67,5 @@ clean:
-rm src/Makevars.win
-rm -rf arrow.Rcheck/
-rm -rf libarrow/
-rm -rf tools/cpp/ tools/.env tools/NOTICE.txt tools/LICENSE.txt
-rm -rf tools/cpp/ tools/.env tools/NOTICE.txt tools/LICENSE.txt tools/checksums
-find . -name "*.orig" -delete
1 change: 1 addition & 0 deletions r/PACKAGING.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ Wait for the release candidate to be cut:
- [ ] Create a PR entitled `WIP: [R] Verify CRAN release-10.0.1-rc0`. Add
a comment `@github-actions crossbow submit --group r` to run all R crossbow
jobs against the CRAN-specific release branch.
- [ ] Run `Rscript tools/update-checksums.R <libarrow version>` to download the checksums for the pre-compiled binaries from the ASF artifactory into the `tools/checksums` directory.
- [ ] Regenerate arrow_X.X.X.tar.gz (i.e., `make build`)

Ensure linux binary packages are available:
Expand Down
43 changes: 40 additions & 3 deletions r/tools/nixlibs.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,12 @@ if (test_mode && is.na(VERSION)) {
}

dev_version <- package_version(VERSION)[1, 4]
is_release <- is.na(dev_version) || dev_version < "100"
on_macos <- tolower(Sys.info()[["sysname"]]) == "darwin"

checksum_path <- Sys.getenv("ARROW_R_CHECKSUM_PATH", "tools/checksums")

# Small dev versions are added for R-only changes during CRAN submission.
if (is.na(dev_version) || dev_version < "100") {
if (is_release) {
VERSION <- package_version(VERSION)[1, 1:3]
arrow_repo <- paste0(getOption("arrow.repo", sprintf("https://apache.jfrog.io/artifactory/arrow/r/%s", VERSION)), "/libarrow/")
} else {
Expand Down Expand Up @@ -88,7 +89,7 @@ thirdparty_dependency_dir <- Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR", "tool


download_binary <- function(lib) {
libfile <- tempfile()
libfile <- paste0("arrow-", VERSION, ".zip")
binary_url <- paste0(arrow_repo, "bin/", lib, "/arrow-", VERSION, ".zip")
if (try_download(binary_url, libfile)) {
if (!quietly) {
Expand All @@ -103,6 +104,42 @@ download_binary <- function(lib) {
}
libfile <- NULL
}
# Explicitly setting the env var to "false" will skip checksum validation
# e.g. in case the included checksums are stale.
skip_checksum <- env_is("ARROW_R_ENFORCE_CHECKSUM", "false")
enforce_checksum <- env_is("ARROW_R_ENFORCE_CHECKSUM", "true")
# validate binary checksum for CRAN release only
if (!skip_checksum && dir.exists(checksum_path) && is_release ||
enforce_checksum) {
checksum_file <- sub(".+/bin/(.+\\.zip)", "\\1\\.sha512", binary_url)
checksum_file <- file.path(checksum_path, checksum_file)
checksum_cmd <- "shasum"
checksum_args <- c("--status", "-a", "512", "-c", checksum_file)

# shasum is not available on all linux versions
status_shasum <- try(
suppressWarnings(
system2("shasum", args = c("--help"), stdout = FALSE, stderr = FALSE)
),
silent = TRUE
)

if (inherits(status_shasum, "try-error") || is.integer(status_shasum) && status_shasum != 0) {
checksum_cmd <- "sha512sum"
checksum_args <- c("--status", "-c", checksum_file)
}

checksum_ok <- system2(checksum_cmd, args = checksum_args)

if (checksum_ok != 0) {
cat("*** Checksum validation failed for libarrow binary: ", libfile, "\n")
unlink(libfile)
libfile <- NULL
} else {
cat("*** Checksum validated successfully for libarrow binary: ", libfile, "\n")
}
}

libfile
}

Expand Down
67 changes: 67 additions & 0 deletions r/tools/update-checksums.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Run this script AFTER the release was voted and the artifacts
# are moved into the final dir. This script will download the checksum
# files and save them to the tools/checksums directory mirroring the
# artifactory layout. *libs.R uses these files to validate the downloaded
# binaries when installing the package.
#
# Run this script from the r/ directory of the arrow repo with the version
# as the first argument:
#   $ Rscript tools/update-checksums.R 14.0.0

args <- commandArgs(TRUE)

# Exactly one argument is accepted: the libarrow version whose checksum
# files should be fetched.
if (length(args) != 1) {
  stop("Usage: Rscript tools/update-checksums.R <version>")
}

VERSION <- args[1]

# Relative path: only resolves when the working directory is r/.
tasks_yml <- "../dev/tasks/tasks.yml"

if (!file.exists(tasks_yml)) {
  stop("Run this script from the r/ directory of the arrow repo")
}

# Get the libarrow binary paths from the tasks.yml file:
#   1. keep only the lines that mention an "r-lib__libarrow" artifact,
#   2. keep what follows "r-lib__libarrow__bin__" up to and including ".zip",
#   3. substitute the literal "{no_rc_r_version}" placeholder with VERSION,
#   4. turn the first "__" into the "/" that separates directory and file,
#   5. replace a literal "\.zip" (backslash included) with ".zip".
binary_paths <- readLines(tasks_yml) |>
  grep("r-lib__libarrow", x = _, value = TRUE) |>
  sub(".+r-lib__libarrow__bin__(.+\\.zip)", "\\1", x = _) |>
  sub("{no_rc_r_version}", VERSION, fixed = TRUE, x = _) |>
  sub("__", "/", x = _) |>
  sub("\\.zip", ".zip", fixed = TRUE, x = _)

# Without this guard the loop below would do nothing and the script would
# report success although no checksum had been downloaded.
if (length(binary_paths) == 0) {
  stop("No libarrow binary artifacts found in ", tasks_yml)
}

artifactory_root <- "https://apache.jfrog.io/artifactory/arrow/r/%s/libarrow/bin/%s"

# Get the checksum files from the artifactory
for (path in binary_paths) {
  sha_path <- paste0(path, ".sha512")
  checksum_file <- file.path("tools/checksums", sha_path)
  dir.create(dirname(checksum_file), recursive = TRUE, showWarnings = FALSE)

  url <- sprintf(artifactory_root, VERSION, sha_path)
  status <- download.file(url, checksum_file, quiet = TRUE, cacheOK = FALSE)
  if (status != 0) {
    stop("Failed to download ", url, ". Exit code: ", status)
  }

  if (grepl("windows", path)) {
    # Carriage returns (Windows style line endings) cause errors with the
    # msys2 sha512sum, so drop every CR byte. This is done in R rather than
    # with `sed -i`, whose syntax differs between GNU and BSD/macOS sed and
    # whose unquoted script would be rewritten by the shell.
    bytes <- readBin(checksum_file, what = "raw", n = file.size(checksum_file))
    writeBin(bytes[bytes != as.raw(0x0d)], checksum_file)
  }
}
119 changes: 74 additions & 45 deletions r/tools/winlibs.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,55 +17,84 @@

args <- commandArgs(TRUE)
VERSION <- args[1]
if (!file.exists(sprintf("windows/arrow-%s/include/arrow/api.h", VERSION))) {
if (length(args) > 1) {
# Arg 2 would be the path/to/lib.zip
localfile <- args[2]
cat(sprintf("*** Using RWINLIB_LOCAL %s\n", localfile))
if (!file.exists(localfile)) {
cat(sprintf("*** %s does not exist; build will fail\n", localfile))
}
file.copy(localfile, "lib.zip")
} else {
# Download static arrow from the apache artifactory
quietly <- !identical(tolower(Sys.getenv("ARROW_R_DEV")), "true")
get_file <- function(template, version) {
try(
suppressWarnings(
download.file(sprintf(template, version), "lib.zip", quiet = quietly)
),
silent = quietly
)
}
# Fourth component of the package version; may be NA (checked just below).
dev_version <- package_version(VERSION)[1, 4]
# Small dev versions are added for R-only changes during CRAN submission
# A missing fourth component or one below 100 is treated as a release.
is_release <- is.na(dev_version) || dev_version < "100"
# TRUE when the lower-cased value of environment variable `var` is identical
# to `value`, so callers should pass `value` in lower case.
env_is <- function(var, value) identical(tolower(Sys.getenv(var)), value)
# We want to log the message in the style of the configure script
# not as an R error. Use `return` to exit the script after logging.
# All arguments are forwarded to sprintf(); the result is written with cat()
# after a "*** " prefix and followed by a newline.
lg <- function(...) {
cat("*** ", sprintf(...), "\n")
}

# URL templates
nightly <- paste0(
getOption("arrow.dev_repo", "https://nightlies.apache.org/arrow/r"),
"/libarrow/bin/windows/arrow-%s.zip"
)
# %1$s uses the first variable for both substitutions
artifactory <- paste0(
getOption("arrow.repo", "https://apache.jfrog.io/artifactory/arrow/r/%1$s"),
"/libarrow/bin/windows/arrow-%1$s.zip"
)
rwinlib <- "https://github.com/rwinlib/arrow/archive/v%s.zip"
if (is_release) {
# This is a release version, so we need to use the major.minor.patch version without
# the CRAN suffix/dev_version
VERSION <- package_version(VERSION)[1, 1:3]
# %1$s uses the first variable for both substitutions
url_template <- paste0(
getOption("arrow.repo", "https://apache.jfrog.io/artifactory/arrow/r/%1$s"),
"/libarrow/bin/windows/arrow-%1$s.zip"
)
} else {
url_template <- paste0(
getOption("arrow.dev_repo", "https://nightlies.apache.org/arrow/r"),
"/libarrow/bin/windows/arrow-%s.zip"
)
}

dev_version <- package_version(VERSION)[1, 4]
if (file.exists(sprintf("windows/arrow-%s/include/arrow/api.h", VERSION))) {
lg("Found local Arrow %s!", VERSION)
return()
}

# Small dev versions are added for R-only changes during CRAN submission.
if (is.na(dev_version) || dev_version < "100") {
VERSION <- package_version(VERSION)[1, 1:3]
get_file(rwinlib, VERSION)
zip_file <- sprintf("arrow-%s.zip", VERSION)

# If not found, fall back to apache artifactory
if (!file.exists("lib.zip")) {
get_file(artifactory, VERSION)
}
} else {
get_file(nightly, VERSION)
if (length(args) > 1) {
# Arg 2 would be the path/to/lib.zip
localfile <- args[2]
if (!file.exists(localfile)) {
lg("RWINLIB_LOCAL '%s' does not exist. Build will fail.", localfile)
return()
} else {
lg("Using RWINLIB_LOCAL %s", localfile)
}
file.copy(localfile, zip_file)
} else {
quietly <- !identical(tolower(Sys.getenv("ARROW_R_DEV")), "true")
binary_url <- sprintf(url_template, VERSION)
try(
suppressWarnings(
download.file(binary_url, zip_file, quiet = quietly)
),
silent = quietly
)

if (!file.exists(zip_file) || file.size(zip_file) == 0) {
lg("Failed to download libarrow binary from %s. Build will fail.", binary_url)
return()
}

checksum_path <- Sys.getenv("ARROW_R_CHECKSUM_PATH", "tools/checksums")
# Explicitly setting the env var to "false" will skip checksum validation
# e.g. in case the included checksums are stale.
skip_checksum <- env_is("ARROW_R_ENFORCE_CHECKSUM", "false")
enforce_checksum <- env_is("ARROW_R_ENFORCE_CHECKSUM", "true")
# validate binary checksum for CRAN release only
if (!skip_checksum && dir.exists(checksum_path) && is_release ||
enforce_checksum) {
checksum_file <- sprintf("%s/windows/arrow-%s.zip.sha512", checksum_path, VERSION)
# rtools does not have shasum with default config
checksum_ok <- system2("sha512sum", args = c("--status", "-c", checksum_file))

if (checksum_ok != 0) {
lg("Checksum validation failed for libarrow binary: %s", zip_file)
return()
}
lg("Checksum validated successfully for libarrow binary: %s", zip_file)
}
dir.create("windows", showWarnings = FALSE)
unzip("lib.zip", exdir = "windows")
unlink("lib.zip")
}

dir.create("windows", showWarnings = FALSE)
unzip(zip_file, exdir = "windows")
unlink(zip_file)

0 comments on commit ba7b240

Please sign in to comment.