From bffdfb1fc193456af8e2e12f6b866e3b140116c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kirill=20M=C3=BCller?= <krlmlr@users.noreply.github.com>
Date: Mon, 5 Aug 2024 20:57:31 +0200
Subject: [PATCH] Implement own `df_n_col()` to avoid calling `ncol()` (and
 `dim()`) (#7049)

* Implement own `ncol()` to avoid calling `dim()`

* Introduce `df_n_col()`, block usage of `ncol()`

---------

Co-authored-by: Davis Vaughan <davis@posit.co>
---
 NEWS.md                              |  3 +++
 R/all-equal.R                        |  2 +-
 R/arrange.R                          |  2 +-
 R/data-storms.R                      |  2 +-
 R/generics.R                         |  2 +-
 R/n-col.R                            | 25 +++++++++++++++++++++++++
 R/rows.R                             |  2 +-
 R/sets.R                             |  6 +++---
 R/slice.R                            |  2 +-
 tests/testthat/test-across.R         | 10 +++++-----
 tests/testthat/test-colwise-mutate.R |  6 +++---
 tests/testthat/test-deprec-context.R |  4 ++--
 tests/testthat/test-deprec-do.R      |  8 ++++----
 tests/testthat/test-distinct.R       |  2 +-
 tests/testthat/test-select.R         |  4 ++--
 15 files changed, 54 insertions(+), 26 deletions(-)
 create mode 100644 R/n-col.R

diff --git a/NEWS.md b/NEWS.md
index 89cfee901f..abb7240cc9 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,8 @@
 # dplyr (development version)
 
+* Fixed an issue where duckplyr's ALTREP data frames were being materialized
+  early due to internal usage of `ncol()` (#7049).
+
 * R >=3.6.0 is now explicitly required (#7026).
 
 # dplyr 1.1.4
diff --git a/R/all-equal.R b/R/all-equal.R
index 1d4e57f582..127c38e6be 100644
--- a/R/all-equal.R
+++ b/R/all-equal.R
@@ -60,7 +60,7 @@ equal_data_frame <- function(x, y, ignore_col_order = TRUE, ignore_row_order = T
     return("Different number of rows.")
   }
 
-  if (ncol(x) == 0L) {
+  if (df_n_col(x) == 0L) {
     return(TRUE)
   }
 
diff --git a/R/arrange.R b/R/arrange.R
index 573541ea52..c0ec1af1f7 100644
--- a/R/arrange.R
+++ b/R/arrange.R
@@ -225,7 +225,7 @@ sort_key_generator <- function(locale) {
 # ------------------------------------------------------------------------------
 
 dplyr_order_legacy <- function(data, direction = "asc") {
-  if (ncol(data) == 0L) {
+  if (df_n_col(data) == 0L) {
     # Work around `order(!!!list())` returning `NULL`
     return(seq_len(nrow(data)))
   }
diff --git a/R/data-storms.R b/R/data-storms.R
index 8f1500b67e..11eada64c4 100644
--- a/R/data-storms.R
+++ b/R/data-storms.R
@@ -10,7 +10,7 @@
 #'   <https://github.com/tidyverse/dplyr/blob/main/data-raw/storms.R>
 #'
 #' @format A tibble with `r format(nrow(storms), big.mark = ",")` observations
-#'   and `r ncol(storms)` variables:
+#'   and `r df_n_col(storms)` variables:
 #' \describe{
 #' \item{name}{Storm Name}
 #' \item{year,month,day}{Date of report}
diff --git a/R/generics.R b/R/generics.R
index cfb440e838..1843c923d9 100644
--- a/R/generics.R
+++ b/R/generics.R
@@ -221,7 +221,7 @@ dplyr_reconstruct.rowwise_df <- function(data, template) {
 }
 
 dplyr_col_select <- function(.data, loc, error_call = caller_env()) {
-  loc <- vec_as_location(loc, n = ncol(.data), names = names(.data))
+  loc <- vec_as_location(loc, n = df_n_col(.data), names = names(.data))
 
   out <- .data[loc]
   if (!inherits(out, "data.frame")) {
diff --git a/R/n-col.R b/R/n-col.R
new file mode 100644
index 0000000000..e40af28cec
--- /dev/null
+++ b/R/n-col.R
@@ -0,0 +1,25 @@
+# Masks `ncol()` to avoid accidentally materializing ALTREP duckplyr
+# data frames.
+ncol <- function(x) {
+  abort("Use `df_n_col()` or `mat_n_col()` instead.")
+}
+
+# Alternative to `ncol()` which avoids `dim()`.
+#
+# `dim()` also requires knowing the number of rows,
+# which forces ALTREP duckplyr data frames to materialize.
+#
+# This function makes the same assertion as vctrs about data frame structure,
+# i.e. if `x` inherits from `"data.frame"`, then it is a VECSXP with length
+# equal to the number of columns.
+df_n_col <- function(x) {
+  x <- unclass(x)
+  obj_check_list(x)
+  length(x)
+}
+
+# In a few places we call `ncol()` on matrices, and in those
+# cases we want to continue using the base version.
+mat_n_col <- function(x) {
+  base::ncol(x)
+}
diff --git a/R/rows.R b/R/rows.R
index b42ed56e9a..58402d73da 100644
--- a/R/rows.R
+++ b/R/rows.R
@@ -467,7 +467,7 @@ rows_check_by <- function(by, y, ..., error_call = caller_env()) {
   check_dots_empty()
 
   if (is.null(by)) {
-    if (ncol(y) == 0L) {
+    if (df_n_col(y) == 0L) {
       abort("`y` must have at least one column.", call = error_call)
     }
 
diff --git a/R/sets.R b/R/sets.R
index f35d59ae8c..7393af33b3 100644
--- a/R/sets.R
+++ b/R/sets.R
@@ -165,10 +165,10 @@ is_compatible <- function(x, y, ignore_col_order = TRUE, convert = TRUE) {
     return("`y` must be a data frame.")
   }
 
-  nc <- ncol(x)
-  if (nc != ncol(y)) {
+  nc <- df_n_col(x)
+  if (nc != df_n_col(y)) {
     return(
-      c(x = glue("Different number of columns: {nc} vs {ncol(y)}."))
+      c(x = glue("Different number of columns: {nc} vs {df_n_col(y)}."))
     )
   }
 
diff --git a/R/slice.R b/R/slice.R
index 36118d5511..7059bc0c83 100644
--- a/R/slice.R
+++ b/R/slice.R
@@ -366,7 +366,7 @@ slice_eval <- function(mask,
 
       slice_idx <- ...elt(i)
 
-      if (is.matrix(slice_idx) && ncol(slice_idx) == 1) {
+      if (is.matrix(slice_idx) && mat_n_col(slice_idx) == 1) {
         lifecycle::deprecate_warn(
           when = "1.1.0",
           what = I("Slicing with a 1-column matrix"),
diff --git a/tests/testthat/test-across.R b/tests/testthat/test-across.R
index c644669a11..d2116c2c22 100644
--- a/tests/testthat/test-across.R
+++ b/tests/testthat/test-across.R
@@ -167,15 +167,15 @@ test_that("across() result locations are aligned with column names (#4967)", {
 test_that("across() works sequentially (#4907)", {
   df <- tibble(a = 1)
   expect_equal(
-    mutate(df, x = ncol(across(where(is.numeric), identity)), y = ncol(across(where(is.numeric), identity))),
+    mutate(df, x = df_n_col(across(where(is.numeric), identity)), y = df_n_col(across(where(is.numeric), identity))),
     tibble(a = 1, x = 1L, y = 2L)
   )
   expect_equal(
-    mutate(df, a = "x", y = ncol(across(where(is.numeric), identity))),
+    mutate(df, a = "x", y = df_n_col(across(where(is.numeric), identity))),
     tibble(a = "x", y = 0L)
   )
   expect_equal(
-    mutate(df, x = 1, y = ncol(across(where(is.numeric), identity))),
+    mutate(df, x = 1, y = df_n_col(across(where(is.numeric), identity))),
     tibble(a = 1, x = 1, y = 2L)
   )
 })
@@ -282,7 +282,7 @@ test_that("across() gives meaningful messages", {
 test_that("monitoring cache - across() can be used twice in the same expression", {
   df <- tibble(a = 1, b = 2)
   expect_equal(
-    mutate(df, x = ncol(across(where(is.numeric), identity)) + ncol(across(a, identity))),
+    mutate(df, x = df_n_col(across(where(is.numeric), identity)) + df_n_col(across(a, identity))),
     tibble(a = 1, b = 2, x = 3)
   )
 })
@@ -290,7 +290,7 @@ test_that("monitoring cache - across() can be used twice in the same expression"
 test_that("monitoring cache - across() can be used in separate expressions", {
   df <- tibble(a = 1, b = 2)
   expect_equal(
-    mutate(df, x = ncol(across(where(is.numeric), identity)), y = ncol(across(a, identity))),
+    mutate(df, x = df_n_col(across(where(is.numeric), identity)), y = df_n_col(across(a, identity))),
     tibble(a = 1, b = 2, x = 2, y = 1)
   )
 })
diff --git a/tests/testthat/test-colwise-mutate.R b/tests/testthat/test-colwise-mutate.R
index 27bf2ac756..c3d86536ff 100644
--- a/tests/testthat/test-colwise-mutate.R
+++ b/tests/testthat/test-colwise-mutate.R
@@ -181,13 +181,13 @@ test_that("summarise_at with multiple columns AND unnamed functions works (#4119
   res <- storms %>%
     summarise_at(vars(wind, pressure), list(mean, median))
 
-  expect_equal(ncol(res), 4L)
+  expect_equal(df_n_col(res), 4L)
   expect_equal(names(res), c("wind_fn1", "pressure_fn1", "wind_fn2", "pressure_fn2"))
 
   res <- storms %>%
     summarise_at(vars(wind, pressure), list(n = length, mean, median))
 
-  expect_equal(ncol(res), 6L)
+  expect_equal(df_n_col(res), 6L)
   expect_equal(names(res), c("wind_n", "pressure_n", "wind_fn1", "pressure_fn1", "wind_fn2", "pressure_fn2"))
 })
 
@@ -195,7 +195,7 @@ test_that("mutate_at with multiple columns AND unnamed functions works (#4119)",
   res <- storms %>%
     mutate_at(vars(wind, pressure), list(mean, median))
 
-  expect_equal(ncol(res), ncol(storms) + 4L)
+  expect_equal(df_n_col(res), df_n_col(storms) + 4L)
   expect_equal(
     names(res),
     c(names(storms), c("wind_fn1", "pressure_fn1", "wind_fn2", "pressure_fn2"))
diff --git a/tests/testthat/test-deprec-context.R b/tests/testthat/test-deprec-context.R
index 2a0ab6e417..ba8d3c395b 100644
--- a/tests/testthat/test-deprec-context.R
+++ b/tests/testthat/test-deprec-context.R
@@ -54,13 +54,13 @@ test_that("cur_data() and cur_data_all() work sequentially", {
 
   df <- tibble(a = 1)
   expect_equal(
-    mutate(df, x = ncol(cur_data()), y = ncol(cur_data())),
+    mutate(df, x = df_n_col(cur_data()), y = df_n_col(cur_data())),
     tibble(a = 1, x = 1, y = 2)
   )
 
   gf <- tibble(a = 1, b = 2) %>% group_by(a)
   expect_equal(
-    mutate(gf, x = ncol(cur_data_all()), y = ncol(cur_data_all())),
+    mutate(gf, x = df_n_col(cur_data_all()), y = df_n_col(cur_data_all())),
     group_by(tibble(a = 1, b = 2, x = 2, y = 3), a)
   )
 })
diff --git a/tests/testthat/test-deprec-do.R b/tests/testthat/test-deprec-do.R
index 22b9a71a47..428caa1607 100644
--- a/tests/testthat/test-deprec-do.R
+++ b/tests/testthat/test-deprec-do.R
@@ -15,7 +15,7 @@ test_that("unnamed results bound together by row", {
 })
 
 test_that("named argument become list columns", {
-  out <- df %>% do(nrow = nrow(.), ncol = ncol(.))
+  out <- df %>% do(nrow = nrow(.), ncol = df_n_col(.))
   expect_equal(out$nrow, list(1, 2, 3))
   # includes grouping columns
   expect_equal(out$ncol, list(3, 3, 3))
@@ -121,7 +121,7 @@ test_that("empty data frames give consistent outputs", {
     vapply(pillar::type_sum, character(1)) %>%
     expect_equal(c(x = "dbl", g = "chr", y = "int"))
   dat %>%
-    do(y = ncol(.)) %>%
+    do(y = df_n_col(.)) %>%
     vapply(pillar::type_sum, character(1)) %>%
     expect_equal(c(y = "list"))
 
@@ -144,7 +144,7 @@ test_that("empty data frames give consistent outputs", {
     vapply(pillar::type_sum, character(1)) %>%
     expect_equal(c(x = "dbl", g = "chr", y = "int"))
   grp %>%
-    do(y = ncol(.)) %>%
+    do(y = df_n_col(.)) %>%
     vapply(pillar::type_sum, character(1)) %>%
     expect_equal(c(g = "chr", y = "list"))
 
@@ -166,7 +166,7 @@ test_that("empty data frames give consistent outputs", {
     vapply(pillar::type_sum, character(1)) %>%
     expect_equal(c(x = "dbl", g = "chr", y = "int"))
   emt %>%
-    do(y = ncol(.)) %>%
+    do(y = df_n_col(.)) %>%
     vapply(pillar::type_sum, character(1)) %>%
     expect_equal(c(g = "chr", y = "list"))
 })
diff --git a/tests/testthat/test-distinct.R b/tests/testthat/test-distinct.R
index 1fa80e0f97..f56fec6e4a 100644
--- a/tests/testthat/test-distinct.R
+++ b/tests/testthat/test-distinct.R
@@ -22,7 +22,7 @@ test_that("distinct for single column works as expected (#1937)", {
 test_that("distinct works for 0-sized columns (#1437)", {
   df <- tibble(x = 1:10) %>% select(-x)
   ddf <- distinct(df)
-  expect_equal(ncol(ddf), 0L)
+  expect_equal(df_n_col(ddf), 0L)
 })
 
 test_that("if no variables specified, uses all", {
diff --git a/tests/testthat/test-select.R b/tests/testthat/test-select.R
index bd84efb604..15bf0ab550 100644
--- a/tests/testthat/test-select.R
+++ b/tests/testthat/test-select.R
@@ -46,11 +46,11 @@ test_that("select doesn't fail if some names missing", {
 
 test_that("select with no args returns nothing", {
   empty <- select(mtcars)
-  expect_equal(ncol(empty), 0)
+  expect_equal(df_n_col(empty), 0)
   expect_equal(nrow(empty), 32)
 
   empty <- select(mtcars, !!!list())
-  expect_equal(ncol(empty), 0)
+  expect_equal(df_n_col(empty), 0)
   expect_equal(nrow(empty), 32)
 })