Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New param no_collapse_above_absolute for PipeOpCollapseFactors #840

Merged
merged 12 commits into from
Nov 19, 2024
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# mlr3pipelines 0.7.9000

* New parameter `no_collapse_above_absolute` in `PipeOpCollapseFactors` / `po("collapse_factors")`.
* Fix: `PipeOpCollapseFactors` now correctly collapses levels of ordered factors.

# mlr3pipelines 0.7.1

* Compatibility fix for upcoming `mlr3`
Expand Down
62 changes: 52 additions & 10 deletions R/PipeOpCollapseFactors.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@
#' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @description
#' Collapses factors of type `factor`, `ordered`: Collapses the rarest factors in the
#' training samples, until `target_level_count` levels remain. Levels that have prevalence above `no_collapse_above_prevalence`
#' are retained, however. For `factor` variables, these are collapsed to the next larger level, for `ordered` variables,
#' rare variables are collapsed to the neighbouring class, whichever has fewer samples.
#' Collapses factors of type `factor`, `ordered`: Collapses the rarest factors in the training samples, until `target_level_count`
#' levels remain. Levels that have prevalence strictly above `no_collapse_above_prevalence` or absolute count strictly above `no_collapse_above_absolute`
#' are retained, however. For `factor` variables, these are collapsed to the next larger level, for `ordered` variables, rare variables
#' are collapsed to the neighbouring class, whichever has fewer samples.
#' In case both `no_collapse_above_prevalence` and `no_collapse_above_absolute` are given, the less strict threshold of the two will be used, i.e. if
#' `no_collapse_above_prevalence` is 1 and `no_collapse_above_absolute` is 10 for a task with 100 samples, levels that are seen more than 10 times
#' will not be collapsed.
#'
#' Levels not seen during training are not touched during prediction; Therefore it is useful to combine this with the
#' [`PipeOpFixFactors`].
Expand Down Expand Up @@ -39,6 +42,9 @@
#' * `no_collapse_above_prevalence` :: `numeric(1)` \cr
#' Fraction of samples below which factor levels get collapsed. Default is 1, which causes all levels
#' to be collapsed until `target_level_count` remain.
#' * `no_collapse_above_absolute` :: `integer(1)` \cr
#' Number of samples below which factor levels get collapsed. Default is `Inf`, which causes all levels
#' to be collapsed until `target_level_count` remain.
#' * `target_level_count` :: `integer(1)` \cr
#' Number of levels to retain. Default is 2.
#'
Expand All @@ -55,15 +61,41 @@
#' @export
#' @examples
#' library("mlr3")
#' op = PipeOpCollapseFactors$new()
#'
#' # Create example training task
#' df = data.frame(
#' target = runif(100),
#' fct = factor(rep(LETTERS[1:6], times = c(25, 30, 5, 15, 5, 20))),
#' ord = factor(rep(1:6, times = c(20, 25, 30, 5, 5, 15)), ordered = TRUE)
#' )
#' task = TaskRegr$new(df, target = "target", id = "example_train")
#'
#' # Training
#' train_task_collapsed = op$train(list(task))[[1]]
#' train_task_collapsed$levels(c("fct", "ord"))
#'
#' # Create example prediction task
#' df_pred = data.frame(
#' target = runif(7),
#' fct = factor(LETTERS[1:7]),
#' ord = factor(1:7, ordered = TRUE)
#' )
#' pred_task = TaskRegr$new(df_pred, target = "target", id = "example_pred")
#'
#' # Prediction
#' pred_task_collapsed = op$predict(list(pred_task))[[1]]
#' pred_task_collapsed$levels(c("fct", "ord"))
PipeOpCollapseFactors = R6Class("PipeOpCollapseFactors",
inherit = PipeOpTaskPreprocSimple,
public = list(
initialize = function(id = "collapsefactors", param_vals = list()) {
ps = ps(
no_collapse_above_prevalence = p_dbl(0, 1, tags = c("train", "predict")),
no_collapse_above_absolute = p_int(1, special_vals = list(Inf), tags = c("train", "predict")),
target_level_count = p_int(2, tags = c("train", "predict"))
)
ps$values = list(no_collapse_above_prevalence = 1, target_level_count = 2)
ps$values = list(no_collapse_above_prevalence = 1, no_collapse_above_absolute = Inf, target_level_count = 2)
super$initialize(id, param_set = ps, param_vals = param_vals, feature_types = c("factor", "ordered"))
}
),
Expand All @@ -74,6 +106,7 @@ PipeOpCollapseFactors = R6Class("PipeOpCollapseFactors",
dt = task$data(cols = private$.select_cols(task))

keep_fraction = self$param_set$values$no_collapse_above_prevalence
keep_absolute = self$param_set$values$no_collapse_above_absolute
target_count = self$param_set$values$target_level_count

collapse_map = sapply(dt, function(d) {
Expand All @@ -83,22 +116,30 @@ PipeOpCollapseFactors = R6Class("PipeOpCollapseFactors",
if (length(levels(d)) <= target_count) {
return(NULL)
}

dtable = table(d)
fractions = sort(dtable, decreasing = TRUE) / sum(!is.na(d))
keep_fraction = names(fractions)[fractions >= keep_fraction]

absolutes = sort(dtable, decreasing = TRUE)
keep_absolute = names(absolutes)[absolutes > keep_absolute]

fractions = absolutes / sum(!is.na(d))
keep_fraction = names(fractions)[fractions > keep_fraction]

keep_count = names(fractions)[seq_len(target_count)] # at this point we know there are more levels than target_count
keep = union(keep_fraction, keep_count)

keep = union(keep_fraction, union(keep_count, keep_absolute))
dont_keep = setdiff(levels(d), keep)

if (is.ordered(d)) {
cmap = stats::setNames(as.list(levels(d)), levels(d))
for (eliminating in dont_keep) {
position = match(eliminating, names(cmap))
if (position == 1) {
cmap[[2]] = c(cmap[[2]], eliminating)
} else if (position == length(cmap) || dtable[position - 1] < dtable[position + 1]) {
cmap[[position - 1]] = c(cmap[[position - 1]], eliminating)
cmap[[position - 1]] = c(cmap[[position - 1]], cmap[[eliminating]])
} else {
cmap[[position + 1]] = c(cmap[[position + 1]], eliminating)
cmap[[position + 1]] = c(cmap[[position + 1]], cmap[[eliminating]])
}
dtable = dtable[-position]
cmap[[position]] = NULL
Expand All @@ -108,6 +149,7 @@ PipeOpCollapseFactors = R6Class("PipeOpCollapseFactors",
lowest_kept = keep[length(keep)]
cmap[[lowest_kept]] = c(lowest_kept, dont_keep)
}

cmap
}, simplify = FALSE)

Expand Down
39 changes: 35 additions & 4 deletions man/mlr_pipeops_collapsefactors.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

83 changes: 83 additions & 0 deletions tests/testthat/test_pipeop_collapsefactors.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
context("PipeOpCollapseFactors")

test_that("PipeOpCollapseFactors - basic properties", {
task = mlr_tasks$get("penguins")

expect_datapreproc_pipeop_class(PipeOpCollapseFactors, task = task)
})

test_that("PipeOpCollapseFactors - train and predict work", {
op = PipeOpCollapseFactors$new()
df = data.frame(
target = runif(100),
fct = factor(rep(LETTERS[1:6], times = c(25, 30, 5, 15, 5, 20))),
ord = factor(rep(1:6, times = c(20, 25, 30, 5, 5, 15)), ordered = TRUE)
)
task = TaskRegr$new(df, target = "target", id = "test")

# test (default): levels are reduced to target_count, correct levels are chosen for this
train_out = op$train(list(task))[[1]]
expect_equal(train_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "A"), times = c(25, 30, 45))))
expect_equal(train_out$data(cols = c("ord"))[[1]], factor(rep(c("2", "3"), times = c(45, 55)), ordered = TRUE))

predict_out = op$predict(list(task))[[1]]
expect_equal(predict_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "A"), times = c(25, 30, 45))))
expect_equal(predict_out$data(cols = c("ord"))[[1]], factor(rep(c("2", "3"), times = c(45, 55)), ordered = TRUE))

# test: target_count works
op$param_set$values$target_level_count = 4
train_out = op$train(list(task))[[1]]
expect_equal(train_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "D", "F"), times = c(25, 30, 25, 20))))
expect_equal(train_out$data(cols = c("ord"))[[1]], factor(rep(c("1", "2", "3", "6"), times = c(20, 25, 30, 25)), ordered = TRUE))

predict_out = op$predict(list(task))[[1]]
expect_equal(predict_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "D", "F"), times = c(25, 30, 25, 20))))
expect_equal(predict_out$data(cols = c("ord"))[[1]], factor(rep(c("1", "2", "3", "6"), times = c(20, 25, 30, 25)), ordered = TRUE))
op$param_set$values$target_level_count = 2

# test: absolute works
op$param_set$values$no_collapse_above_absolute = 15
train_out = op$train(list(task))[[1]]
expect_equal(train_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "F"), times = c(25, 30, 45))))
expect_equal(train_out$data(cols = c("ord"))[[1]], factor(rep(c("1", "2", "3"), times = c(20, 25, 55)), ordered = TRUE))

predict_out = op$predict(list(task))[[1]]
expect_equal(predict_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "F"), times = c(25, 30, 45))))
expect_equal(predict_out$data(cols = c("ord"))[[1]], factor(rep(c("1", "2", "3"), times = c(20, 25, 55)), ordered = TRUE))
op$param_set$values$no_collapse_above_absolute = Inf

# test: prevalence works
op$param_set$values$no_collapse_above_prevalence = 0.15
train_out = op$train(list(task))[[1]]
expect_equal(train_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "F"), times = c(25, 30, 45))))
expect_equal(train_out$data(cols = c("ord"))[[1]], factor(rep(c("1", "2", "3"), times = c(20, 25, 55)), ordered = TRUE))

predict_out = op$predict(list(task))[[1]]
expect_equal(predict_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "F"), times = c(25, 30, 45))))
expect_equal(predict_out$data(cols = c("ord"))[[1]], factor(rep(c("1", "2", "3"), times = c(20, 25, 55)), ordered = TRUE))

# test: if given both, does as documented (i.e. lower one is used since we are using union)
op$param_set$values$no_collapse_above_absolute = 10
train_out = op$train(list(task))[[1]]
expect_equal(train_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "D", "F"), times = c(25, 30, 25, 20))))
expect_equal(train_out$data(cols = c("ord"))[[1]], factor(rep(c("1", "2", "3", "6"), times = c(20, 25, 30, 25)), ordered = TRUE))

predict_out = op$predict(list(task))[[1]]
expect_equal(predict_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "D", "F"), times = c(25, 30, 25, 20))))
expect_equal(predict_out$data(cols = c("ord"))[[1]], factor(rep(c("1", "2", "3", "6"), times = c(20, 25, 30, 25)), ordered = TRUE))

# test: unseen levels are not touched in predict
op$param_set$values$no_collapse_above_absolute = Inf
op$param_set$values$no_collapse_above_prevalence = 1
df_pred = data.frame(
target = runif(7),
fct = factor(LETTERS[1:7]),
ord = factor(1:7, ordered = TRUE)
)
pred_task = TaskRegr$new(df_pred, target = "target", id = "test_pred")
op$train(list(task))
predict_out = op$predict(list(pred_task))[[1]]

expect_equal(predict_out$data(cols = c("fct"))[[1]], factor(c("A", "B", "A", "A", "A", "A", "G")))
expect_equal(predict_out$data(cols = c("ord"))[[1]], factor(c("2", "2", "3", "3", "3", "3", "7"), ordered = TRUE))
})