From bffdfb1fc193456af8e2e12f6b866e3b140116c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Mon, 5 Aug 2024 20:57:31 +0200 Subject: [PATCH] Implement own `df_n_col()` to avoid calling `ncol()` (and `dim()`) (#7049) * Implement own `ncol()` to avoid calling `dim()` * Introduce `df_n_col()`, block usage of `ncol()` --------- Co-authored-by: Davis Vaughan --- NEWS.md | 3 +++ R/all-equal.R | 2 +- R/arrange.R | 2 +- R/data-storms.R | 2 +- R/generics.R | 2 +- R/n-col.R | 25 +++++++++++++++++++++++++ R/rows.R | 2 +- R/sets.R | 6 +++--- R/slice.R | 2 +- tests/testthat/test-across.R | 10 +++++----- tests/testthat/test-colwise-mutate.R | 6 +++--- tests/testthat/test-deprec-context.R | 4 ++-- tests/testthat/test-deprec-do.R | 8 ++++---- tests/testthat/test-distinct.R | 2 +- tests/testthat/test-select.R | 4 ++-- 15 files changed, 54 insertions(+), 26 deletions(-) create mode 100644 R/n-col.R diff --git a/NEWS.md b/NEWS.md index 89cfee901f..abb7240cc9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # dplyr (development version) +* Fixed an issue where duckplyr's ALTREP data frames were being materialized + early due to internal usage of `ncol()` (#7049). + * R >=3.6.0 is now explicitly required (#7026). # dplyr 1.1.4 diff --git a/R/all-equal.R b/R/all-equal.R index 1d4e57f582..127c38e6be 100644 --- a/R/all-equal.R +++ b/R/all-equal.R @@ -60,7 +60,7 @@ equal_data_frame <- function(x, y, ignore_col_order = TRUE, ignore_row_order = T return("Different number of rows.") } - if (ncol(x) == 0L) { + if (df_n_col(x) == 0L) { return(TRUE) } diff --git a/R/arrange.R b/R/arrange.R index 573541ea52..c0ec1af1f7 100644 --- a/R/arrange.R +++ b/R/arrange.R @@ -225,7 +225,7 @@ sort_key_generator <- function(locale) { # ------------------------------------------------------------------------------ dplyr_order_legacy <- function(data, direction = "asc") { - if (ncol(data) == 0L) { + if (df_n_col(data) == 0L) { # Work around `order(!!!list())` returning `NULL` return(seq_len(nrow(data))) } diff --git a/R/data-storms.R b/R/data-storms.R index 8f1500b67e..11eada64c4 100644 --- a/R/data-storms.R +++ b/R/data-storms.R @@ -10,7 +10,7 @@ #' #' #' @format A tibble with `r format(nrow(storms), big.mark = ",")` observations -#' and `r ncol(storms)` variables: +#' and `r df_n_col(storms)` variables: #' \describe{ #' \item{name}{Storm Name} #' \item{year,month,day}{Date of report} diff --git a/R/generics.R b/R/generics.R index cfb440e838..1843c923d9 100644 --- a/R/generics.R +++ b/R/generics.R @@ -221,7 +221,7 @@ dplyr_reconstruct.rowwise_df <- function(data, template) { } dplyr_col_select <- function(.data, loc, error_call = caller_env()) { - loc <- vec_as_location(loc, n = ncol(.data), names = names(.data)) + loc <- vec_as_location(loc, n = df_n_col(.data), names = names(.data)) out <- .data[loc] if (!inherits(out, "data.frame")) { diff --git a/R/n-col.R b/R/n-col.R new file mode 100644 index 0000000000..e40af28cec --- /dev/null +++ b/R/n-col.R @@ -0,0 +1,25 @@ +# Masks `ncol()` to avoid accidentally materializing ALTREP duckplyr +# data frames. +ncol <- function(x) { + abort("Use `df_n_col()` or `mat_n_col()` instead.") +} + +# Alternative to `ncol()` which avoids `dim()`. +# +# `dim()` also requires knowing the number of rows, +# which forces ALTREP duckplyr data frames to materialize. +# +# This function makes the same assertion as vctrs about data frame structure, +# i.e. if `x` inherits from `"data.frame"`, then it is a VECSXP with length +# equal to the number of columns. +df_n_col <- function(x) { + x <- unclass(x) + obj_check_list(x) + length(x) +} + +# In a few places we call `ncol()` on matrices, and in those +# cases we want to continue using the base version. +mat_n_col <- function(x) { + base::ncol(x) +} diff --git a/R/rows.R b/R/rows.R index b42ed56e9a..58402d73da 100644 --- a/R/rows.R +++ b/R/rows.R @@ -467,7 +467,7 @@ rows_check_by <- function(by, y, ..., error_call = caller_env()) { check_dots_empty() if (is.null(by)) { - if (ncol(y) == 0L) { + if (df_n_col(y) == 0L) { abort("`y` must have at least one column.", call = error_call) } diff --git a/R/sets.R b/R/sets.R index f35d59ae8c..7393af33b3 100644 --- a/R/sets.R +++ b/R/sets.R @@ -165,10 +165,10 @@ is_compatible <- function(x, y, ignore_col_order = TRUE, convert = TRUE) { return("`y` must be a data frame.") } - nc <- ncol(x) - if (nc != ncol(y)) { + nc <- df_n_col(x) + if (nc != df_n_col(y)) { return( - c(x = glue("Different number of columns: {nc} vs {ncol(y)}.")) + c(x = glue("Different number of columns: {nc} vs {df_n_col(y)}.")) ) } diff --git a/R/slice.R b/R/slice.R index 36118d5511..7059bc0c83 100644 --- a/R/slice.R +++ b/R/slice.R @@ -366,7 +366,7 @@ slice_eval <- function(mask, slice_idx <- ...elt(i) - if (is.matrix(slice_idx) && ncol(slice_idx) == 1) { + if (is.matrix(slice_idx) && mat_n_col(slice_idx) == 1) { lifecycle::deprecate_warn( when = "1.1.0", what = I("Slicing with a 1-column matrix"), diff --git a/tests/testthat/test-across.R b/tests/testthat/test-across.R index c644669a11..d2116c2c22 100644 --- a/tests/testthat/test-across.R +++ b/tests/testthat/test-across.R @@ -167,15 +167,15 @@ test_that("across() result locations are aligned with column names (#4967)", { test_that("across() works sequentially (#4907)", { df <- tibble(a = 1) expect_equal( - mutate(df, x = ncol(across(where(is.numeric), identity)), y = ncol(across(where(is.numeric), identity))), + mutate(df, x = df_n_col(across(where(is.numeric), identity)), y = df_n_col(across(where(is.numeric), identity))), tibble(a = 1, x = 1L, y = 2L) ) expect_equal( - mutate(df, a = "x", y = ncol(across(where(is.numeric), identity))), + mutate(df, a = "x", y = df_n_col(across(where(is.numeric), identity))), tibble(a = "x", y = 0L) ) expect_equal( - mutate(df, x = 1, y = ncol(across(where(is.numeric), identity))), + mutate(df, x = 1, y = df_n_col(across(where(is.numeric), identity))), tibble(a = 1, x = 1, y = 2L) ) }) @@ -282,7 +282,7 @@ test_that("across() gives meaningful messages", { test_that("monitoring cache - across() can be used twice in the same expression", { df <- tibble(a = 1, b = 2) expect_equal( - mutate(df, x = ncol(across(where(is.numeric), identity)) + ncol(across(a, identity))), + mutate(df, x = df_n_col(across(where(is.numeric), identity)) + df_n_col(across(a, identity))), tibble(a = 1, b = 2, x = 3) ) }) @@ -290,7 +290,7 @@ test_that("monitoring cache - across() can be used twice in the same expression" test_that("monitoring cache - across() can be used in separate expressions", { df <- tibble(a = 1, b = 2) expect_equal( - mutate(df, x = ncol(across(where(is.numeric), identity)), y = ncol(across(a, identity))), + mutate(df, x = df_n_col(across(where(is.numeric), identity)), y = df_n_col(across(a, identity))), tibble(a = 1, b = 2, x = 2, y = 1) ) }) diff --git a/tests/testthat/test-colwise-mutate.R b/tests/testthat/test-colwise-mutate.R index 27bf2ac756..c3d86536ff 100644 --- a/tests/testthat/test-colwise-mutate.R +++ b/tests/testthat/test-colwise-mutate.R @@ -181,13 +181,13 @@ test_that("summarise_at with multiple columns AND unnamed functions works (#4119 res <- storms %>% summarise_at(vars(wind, pressure), list(mean, median)) - expect_equal(ncol(res), 4L) + expect_equal(df_n_col(res), 4L) expect_equal(names(res), c("wind_fn1", "pressure_fn1", "wind_fn2", "pressure_fn2")) res <- storms %>% summarise_at(vars(wind, pressure), list(n = length, mean, median)) - expect_equal(ncol(res), 6L) + expect_equal(df_n_col(res), 6L) expect_equal(names(res), c("wind_n", "pressure_n", "wind_fn1", "pressure_fn1", "wind_fn2", "pressure_fn2")) }) @@ -195,7 +195,7 @@ test_that("mutate_at with multiple columns AND unnamed functions works (#4119)", res <- storms %>% mutate_at(vars(wind, pressure), list(mean, median)) - expect_equal(ncol(res), ncol(storms) + 4L) + expect_equal(df_n_col(res), df_n_col(storms) + 4L) expect_equal( names(res), c(names(storms), c("wind_fn1", "pressure_fn1", "wind_fn2", "pressure_fn2")) diff --git a/tests/testthat/test-deprec-context.R b/tests/testthat/test-deprec-context.R index 2a0ab6e417..ba8d3c395b 100644 --- a/tests/testthat/test-deprec-context.R +++ b/tests/testthat/test-deprec-context.R @@ -54,13 +54,13 @@ test_that("cur_data() and cur_data_all() work sequentially", { df <- tibble(a = 1) expect_equal( - mutate(df, x = ncol(cur_data()), y = ncol(cur_data())), + mutate(df, x = df_n_col(cur_data()), y = df_n_col(cur_data())), tibble(a = 1, x = 1, y = 2) ) gf <- tibble(a = 1, b = 2) %>% group_by(a) expect_equal( - mutate(gf, x = ncol(cur_data_all()), y = ncol(cur_data_all())), + mutate(gf, x = df_n_col(cur_data_all()), y = df_n_col(cur_data_all())), group_by(tibble(a = 1, b = 2, x = 2, y = 3), a) ) }) diff --git a/tests/testthat/test-deprec-do.R b/tests/testthat/test-deprec-do.R index 22b9a71a47..428caa1607 100644 --- a/tests/testthat/test-deprec-do.R +++ b/tests/testthat/test-deprec-do.R @@ -15,7 +15,7 @@ test_that("unnamed results bound together by row", { }) test_that("named argument become list columns", { - out <- df %>% do(nrow = nrow(.), ncol = ncol(.)) + out <- df %>% do(nrow = nrow(.), ncol = df_n_col(.)) expect_equal(out$nrow, list(1, 2, 3)) # includes grouping columns expect_equal(out$ncol, list(3, 3, 3)) @@ -121,7 +121,7 @@ test_that("empty data frames give consistent outputs", { vapply(pillar::type_sum, character(1)) %>% expect_equal(c(x = "dbl", g = "chr", y = "int")) dat %>% - do(y = ncol(.)) %>% + do(y = df_n_col(.)) %>% vapply(pillar::type_sum, character(1)) %>% expect_equal(c(y = "list")) @@ -144,7 +144,7 @@ test_that("empty data frames give consistent outputs", { vapply(pillar::type_sum, character(1)) %>% expect_equal(c(x = "dbl", g = "chr", y = "int")) grp %>% - do(y = ncol(.)) %>% + do(y = df_n_col(.)) %>% vapply(pillar::type_sum, character(1)) %>% expect_equal(c(g = "chr", y = "list")) @@ -166,7 +166,7 @@ test_that("empty data frames give consistent outputs", { vapply(pillar::type_sum, character(1)) %>% expect_equal(c(x = "dbl", g = "chr", y = "int")) emt %>% - do(y = ncol(.)) %>% + do(y = df_n_col(.)) %>% vapply(pillar::type_sum, character(1)) %>% expect_equal(c(g = "chr", y = "list")) }) diff --git a/tests/testthat/test-distinct.R b/tests/testthat/test-distinct.R index 1fa80e0f97..f56fec6e4a 100644 --- a/tests/testthat/test-distinct.R +++ b/tests/testthat/test-distinct.R @@ -22,7 +22,7 @@ test_that("distinct for single column works as expected (#1937)", { test_that("distinct works for 0-sized columns (#1437)", { df <- tibble(x = 1:10) %>% select(-x) ddf <- distinct(df) - expect_equal(ncol(ddf), 0L) + expect_equal(df_n_col(ddf), 0L) }) test_that("if no variables specified, uses all", { diff --git a/tests/testthat/test-select.R b/tests/testthat/test-select.R index bd84efb604..15bf0ab550 100644 --- a/tests/testthat/test-select.R +++ b/tests/testthat/test-select.R @@ -46,11 +46,11 @@ test_that("select doesn't fail if some names missing", { test_that("select with no args returns nothing", { empty <- select(mtcars) - expect_equal(ncol(empty), 0) + expect_equal(df_n_col(empty), 0) expect_equal(nrow(empty), 32) empty <- select(mtcars, !!!list()) - expect_equal(ncol(empty), 0) + expect_equal(df_n_col(empty), 0) expect_equal(nrow(empty), 32) })