Skip to content

Commit

Permalink
feat: add $with_columns() and $with_columns_seq() for DataFrame
Browse files Browse the repository at this point in the history
  • Loading branch information
etiennebacher authored Feb 14, 2025
1 parent f8cdd07 commit b6c082f
Show file tree
Hide file tree
Showing 5 changed files with 278 additions and 30 deletions.
86 changes: 74 additions & 12 deletions R/dataframe-frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -297,28 +297,90 @@ dataframe__select <- function(...) {
#' @inherit lazyframe__with_columns description params
#' @inherit as_polars_df return
#' @examples
#' as_polars_df(iris)$with_columns(
#' abs_SL = pl$col("Sepal.Length")$abs(),
#' add_2_SL = pl$col("Sepal.Length") + 2
#' # Pass an expression to add it as a new column.
#' df <- pl$DataFrame(
#' a = 1:4,
#' b = c(0.5, 4, 10, 13),
#' c = c(TRUE, TRUE, FALSE, TRUE),
#' )
#' df$with_columns((pl$col("a")^2)$alias("a^2"))
#'
#' # Added columns will replace existing columns with the same name.
#' df$with_columns(a = pl$col("a")$cast(pl$Float64))
#'
#' # same query
#' l_expr <- list(
#' pl$col("Sepal.Length")$abs()$alias("abs_SL"),
#' (pl$col("Sepal.Length") + 2)$alias("add_2_SL")
#' # Multiple columns can be added
#' df$with_columns(
#' (pl$col("a")^2)$alias("a^2"),
#' (pl$col("b") / 2)$alias("b/2"),
#' (pl$col("c")$not())$alias("not c"),
#' )
#' as_polars_df(iris)$with_columns(l_expr)
#'
#' as_polars_df(iris)$with_columns(
#' SW_add_2 = (pl$col("Sepal.Width") + 2),
#' # unnamed expr will keep name "Sepal.Length"
#' pl$col("Sepal.Length")$abs()
#' # Name expression instead of `$alias()`
#' df$with_columns(
#' `a^2` = pl$col("a")^2,
#' `b/2` = pl$col("b") / 2,
#' `not c` = pl$col("c")$not(),
#' )
#'
#' # Expressions with multiple outputs can automatically be instantiated
#' # as Structs by enabling the experimental setting `POLARS_AUTO_STRUCTIFY`:
#' if (requireNamespace("withr", quietly = TRUE)) {
#' withr::with_envvar(c(POLARS_AUTO_STRUCTIFY = "1"), {
#' df$drop("c")$with_columns(
#' diffs = pl$col("a", "b")$diff()$name$suffix("_diff"),
#' )
#' })
#' }
dataframe__with_columns <- function(...) {
self$lazy()$with_columns(...)$collect(`_eager` = TRUE) |>
wrap()
}

#' Modify/append column(s) of a DataFrame
#'
#' @inherit lazyframe__with_columns_seq description
#' @inherit pl__DataFrame return
#' @inheritParams lazyframe__select
#' @examples
#' # Pass an expression to add it as a new column.
#' df <- pl$DataFrame(
#' a = 1:4,
#' b = c(0.5, 4, 10, 13),
#' c = c(TRUE, TRUE, FALSE, TRUE),
#' )
#' df$with_columns_seq((pl$col("a")^2)$alias("a^2"))
#'
#' # Added columns will replace existing columns with the same name.
#' df$with_columns_seq(a = pl$col("a")$cast(pl$Float64))
#'
#' # Multiple columns can be added
#' df$with_columns_seq(
#' (pl$col("a")^2)$alias("a^2"),
#' (pl$col("b") / 2)$alias("b/2"),
#' (pl$col("c")$not())$alias("not c"),
#' )
#'
#' # Name expression instead of `$alias()`
#' df$with_columns_seq(
#' `a^2` = pl$col("a")^2,
#' `b/2` = pl$col("b") / 2,
#' `not c` = pl$col("c")$not(),
#' )
#'
#' # Expressions with multiple outputs can automatically be instantiated
#' # as Structs by enabling the experimental setting `POLARS_AUTO_STRUCTIFY`:
#' if (requireNamespace("withr", quietly = TRUE)) {
#' withr::with_envvar(c(POLARS_AUTO_STRUCTIFY = "1"), {
#' df$drop("c")$with_columns_seq(
#' diffs = pl$col("a", "b")$diff()$name$suffix("_diff"),
#' )
#' })
#' }
dataframe__with_columns_seq <- function(...) {
self$lazy()$with_columns_seq(...)$collect(`_eager` = TRUE) |>
wrap()
}

# TODO-REWRITE: before release, add in news that param idx was renamed "index"
# and mention that it errors if out of bounds
#' Select column as Series at index location
Expand Down
41 changes: 29 additions & 12 deletions man/dataframe__with_columns.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

66 changes: 66 additions & 0 deletions man/dataframe__with_columns_seq.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

33 changes: 27 additions & 6 deletions tests/testthat/helper-expections.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,43 @@
#' Inspired by `compare_dplyr_binding` of the arrow package.
#' @param object A polars query, must be started with `.input`.
#' See the examples for details.
#' @param input R object will be converted to a DataFrame or LazyFrame
#' by `as_polars_df` or `as_polars_lf`.
#' @param expected A polars DataFrame, the expected result of the query.
#' @param ... Dynamic dots taking the various inputs specified in `object`. The
#' last element MUST be the expected output.
#' @examples
#' expect_query_equal(
#' .input$select("foo"),
#' pl$DataFrame(foo = NULL, bar = NULL),
#' pl$DataFrame(foo = NULL)
#' )
#'
#' a <- pl$DataFrame(x = 1:2, y = 3:4)
#' b <- pl$DataFrame(x = 2, z = 5)
#' expect_query_equal(
#' .input$join(.input2, on = "x", how = "inner"),
#' .input = a, .input2 = b,
#' pl$DataFrame(a = 2, y = 4, z = 5)
#' )
#' @noRd
expect_query_equal <- function(object, input, expected) {
expect_query_equal <- function(object, ...) {
query <- rlang::enquo(object)
out_lazy <- rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(.input = as_polars_lf(input))))$collect()
out_eager <- rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(.input = as_polars_df(input))))
inputs <- list2(...)

# Otherwise the expected output needs to be named in all expect_query_equal()
expected <- inputs[[length(inputs)]]
inputs[[length(inputs)]] <- NULL

# Just a convenience to avoid naming `.input` when it's the only input in the
# query
inputs_lazy <- lapply(inputs, \(x) x$lazy())
if (length(inputs) == 1 && is.null(names(inputs))) {
names(inputs) <- ".input"
names(inputs_lazy) <- ".input"
}

out_lazy <- rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(!!!inputs_lazy)))$collect()
expect_equal(out_lazy, expected)

out_eager <- rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(!!!inputs)))
expect_equal(out_eager, expected)

invisible(NULL)
Expand Down
82 changes: 82 additions & 0 deletions tests/testthat/test-lazyframe-frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,85 @@ test_that("slice/head/tail works lazy/eager", {
r"(-4\.0 is out of range that can be safely converted to u32)"
)
})

test_that("with_columns: basic usage", {
df <- pl$DataFrame(x = 1:2)

expect_query_equal(
.input$with_columns(y = 1 + pl$col("x"), z = pl$col("x")^2),
df,
pl$DataFrame(x = 1:2, y = c(2, 3), z = c(1, 4))
)

# cannot reuse defined variable in same statement
expect_query_error(
.input$with_columns(y = 1 + pl$col("x"), z = pl$col("y")^2),
df,
"Column(s) not found: y",
fixed = TRUE
)

# chaining multiple with_columns works
expect_query_equal(
.input$with_columns(y = 1 + pl$col("x"))$with_columns(z = pl$col("y")^2),
df,
pl$DataFrame(x = 1:2, y = c(2, 3), z = c(4, 9))
)
})

test_that("with_columns can create list variables", {
df <- pl$DataFrame(x = 1:2)

expect_query_equal(
.input$with_columns(y = list(1:2, 3:4)),
df,
pl$DataFrame(x = 1:2, y = list(1:2, 3:4))
)

expect_query_equal(
.input$with_columns(y = list(1:2, 3:4), z = list(c("a", "b"), c("c", "d"))),
df,
pl$DataFrame(x = 1:2, y = list(1:2, 3:4), z = list(c("a", "b"), c("c", "d")))
)
})

test_that("with_columns_seq: basic usage", {
df <- pl$DataFrame(x = 1:2)

expect_query_equal(
.input$with_columns_seq(y = 1 + pl$col("x"), z = pl$col("x")^2),
df,
pl$DataFrame(x = 1:2, y = c(2, 3), z = c(1, 4))
)

# cannot reuse defined variable in same statement
expect_query_error(
.input$with_columns_seq(y = 1 + pl$col("x"), z = pl$col("y")^2),
df,
"Column(s) not found: y",
fixed = TRUE
)

# chaining multiple with_columns_seq works
expect_query_equal(
.input$with_columns_seq(y = 1 + pl$col("x"))$with_columns_seq(z = pl$col("y")^2),
df,
pl$DataFrame(x = 1:2, y = c(2, 3), z = c(4, 9))
)
})

test_that("with_columns_seq can create list variables", {
df <- pl$DataFrame(x = 1:2)

expect_query_equal(
.input$with_columns_seq(y = list(1:2, 3:4)),
df,
pl$DataFrame(x = 1:2, y = list(1:2, 3:4))
)

expect_query_equal(
.input$with_columns_seq(y = list(1:2, 3:4), z = list(c("a", "b"), c("c", "d"))),
df,
pl$DataFrame(x = 1:2, y = list(1:2, 3:4), z = list(c("a", "b"), c("c", "d")))
)
})

0 comments on commit b6c082f

Please sign in to comment.