Skip to content

Commit

Permalink
Merge pull request #109 from mlr-org/mtry.ratio
Browse files Browse the repository at this point in the history
Support mtry.ratio and sampsize.ratio
  • Loading branch information
RaphaelS1 authored Sep 12, 2021
2 parents ce33c22 + 7c93fa7 commit 48d206d
Show file tree
Hide file tree
Showing 20 changed files with 288 additions and 92 deletions.
5 changes: 5 additions & 0 deletions .ignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
man/
attic/
pkgdown/
revdep/
docs/
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: mlr3extralearners
Title: Extra Learners For mlr3
Version: 0.5.6
Version: 0.5.7
Authors@R:
c(person(given = "Raphael",
family = "Sonabend",
Expand Down Expand Up @@ -44,7 +44,7 @@ Imports:
data.table,
methods,
mlr3 (>= 0.6.0),
mlr3misc,
mlr3misc (>= 0.9.4),
paradox,
R6
Suggests:
Expand Down Expand Up @@ -108,4 +108,4 @@ Config/testthat/edition: 3
Encoding: UTF-8
NeedsCompilation: no
Roxygen: list(markdown = TRUE, r6 = TRUE)
RoxygenNote: 7.1.1
RoxygenNote: 7.1.2
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
# mlr3extralearners 0.5.7

* Introduced new custom hyperparameters for `randomForestSRC::rfsrc()`,
`partykit::cforest()` and `obliqueRSF::ORSF()` to conveniently tune
hyperparameters whose upper limit depends on data dimensions.

# mlr3extralearners 0.5.6

* Fix learners requiring distr6. distr6 1.6.0 now forced and param6 added to suggests


# mlr3extralearners 0.5.5

* Bugfix `regr.gausspr`
Expand Down
63 changes: 63 additions & 0 deletions R/bibentries.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Central registry of bibliography entries referenced from the roxygen
# documentation of the learners (via `format_bib("<key>")`).
# Each element is a `utils::bibentry()` named by its citation key.
# Page ranges consistently use the BibTeX en-dash form ("--").
bibentries = c( # nolint start
  breiman_2001 = bibentry("article",
    title = "Random Forests",
    author = "Breiman, Leo",
    year = "2001",
    journal = "Machine Learning",
    volume = "45",
    number = "1",
    pages = "5--32",
    doi = "10.1023/A:1010933404324",
    issn = "1573-0565"
  ),

  ishwaran_2008 = bibentry("article",
    doi = "10.1214/08-aoas169",
    url = "https://doi.org/10.1214/08-aoas169",
    year = "2008",
    month = "9",
    publisher = "Institute of Mathematical Statistics",
    volume = "2",
    number = "3",
    author = "Hemant Ishwaran and Udaya B. Kogalur and Eugene H. Blackstone and Michael S. Lauer",
    title = "Random survival forests",
    journal = "The Annals of Applied Statistics"
  ),

  hothorn_2015 = bibentry("article",
    author = "Torsten Hothorn and Achim Zeileis",
    title = "partykit: A Modular Toolkit for Recursive Partytioning in R",
    journal = "Journal of Machine Learning Research",
    year = "2015",
    volume = "16",
    number = "118",
    # en-dash form, consistent with the other entries
    pages = "3905--3909",
    url = "http://jmlr.org/papers/v16/hothorn15a.html"
  ),

  hothorn_2006 = bibentry("article",
    doi = "10.1198/106186006x133933",
    url = "https://doi.org/10.1198/106186006x133933",
    year = "2006",
    month = "9",
    publisher = "Informa {UK} Limited",
    volume = "15",
    number = "3",
    pages = "651--674",
    author = "Torsten Hothorn and Kurt Hornik and Achim Zeileis",
    title = "Unbiased Recursive Partitioning: A Conditional Inference Framework",
    journal = "Journal of Computational and Graphical Statistics"
  ),

  jaeger_2019 = bibentry("article",
    doi = "10.1214/19-aoas1261",
    year = "2019",
    month = "9",
    publisher = "Institute of Mathematical Statistics",
    volume = "13",
    number = "3",
    # page range carried over from the previous hand-written reference
    pages = "1847--1883",
    author = "Byron C. Jaeger and D. Leann Long and Dustin M. Long and Mario Sims and Jeff M. Szychowski and Yuan-I Min and Leslie A. Mcclure and George Howard and Noah Simon",
    title = "Oblique random survival forests",
    journal = "The Annals of Applied Statistics"
  )
) # nolint end
37 changes: 37 additions & 0 deletions R/helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,40 @@ pprob_to_matrix <- function(pp, task) {
colnames(y) <- task$class_names
y
}

#' @title Convert a Ratio Hyperparameter
#'
#' @description
#' Translates a ratio hyperparameter into its count-valued counterpart.
#' Given the named list `pv` (values of a [ParamSet]):
#'
#' * if only `ratio` is present, `target` is set to
#'   `max(ceiling(pv[[ratio]] * n), 1)` and `ratio` is dropped from `pv`;
#' * if `ratio` is absent, `pv` is returned unchanged (whether or not
#'   `target` is present);
#' * if both `target` and `ratio` are present, an exception is thrown.
#'
#' @param pv (named `list()`)\cr
#'   Hyperparameter values.
#' @param target (`character(1)`)\cr
#'   Name of the count hyperparameter to fill in (e.g. `"mtry"`).
#' @param ratio (`character(1)`)\cr
#'   Name of the ratio hyperparameter to convert (e.g. `"mtry.ratio"`).
#' @param n (`integer(1)`)\cr
#'   Total count the ratio refers to, e.g. the number of features or the
#'   number of observations of the task.
#'
#' @return (named `list()`) with new hyperparameter settings.
#' @noRd
convert_ratio = function(pv, target, ratio, n) {
  found = c(target, ratio) %in% names(pv)
  has_target = found[1L]
  has_ratio = found[2L]

  # both given: the caller has to decide for one of them
  if (has_target && has_ratio) {
    stopf("Hyperparameters '%s' and '%s' are mutually exclusive", target, ratio)
  }

  # no ratio given: nothing to convert, regardless of whether target is set
  if (!has_ratio) {
    return(pv)
  }

  # only the ratio is given: derive the count (at least 1) and drop the ratio
  pv[[target]] = max(ceiling(pv[[ratio]] * n), 1)
  remove_named(pv, ratio)
}
13 changes: 8 additions & 5 deletions R/learner_obliqueRSF_surv_obliqueRSF.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@
#' - Actual default: `TRUE`
#' - Adjusted default: `FALSE`
#' - Reason for change: mlr3 already has its own verbose set to `TRUE` by default
#' - `mtry`:
#' - This hyperparameter can alternatively be set via the added hyperparameter `mtry_ratio`
#' as `mtry = max(ceiling(mtry_ratio * n_features), 1)`.
#' Note that `mtry` and `mtry_ratio` are mutually exclusive.
#'
#' @references
#' Jaeger BC, Long DL, Long DM, Sims M, Szychowski JM, Min Y, Mcclure LA, Howard G, Simon N (2019).
#' “Oblique random survival forests.” The Annals of Applied Statistics, 13(3), 1847–1883.
#' ISSN 1932-6157, 1941-7330, doi: 10.1214/19-AOAS1261,
#' https://projecteuclid.org/euclid.aoas/1571277776.
#' `r format_bib("jaeger_2019")`
#'
#' @template seealso_learner
#' @template example
Expand All @@ -42,6 +43,7 @@ LearnerSurvObliqueRSF = R6Class("LearnerSurvObliqueRSF",
max_pval_to_split_node = p_dbl(lower = 0, upper = 1, default = 0.5,
tags = "train"),
mtry = p_int(lower = 1, tags = "train"),
mtry_ratio = p_dbl(0, 1, tags = "train"),
dfmax = p_int(lower = 1, tags = "train"),
use.cv = p_lgl(default = FALSE, tags = "train"),
verbose = p_lgl(default = TRUE, tags = "train"),
Expand Down Expand Up @@ -76,11 +78,12 @@ LearnerSurvObliqueRSF = R6Class("LearnerSurvObliqueRSF",
private = list(
.train = function(task) {
pv = self$param_set$get_values(tags = "train")
pv = convert_ratio(pv, "mtry", "mtry_ratio", length(task$feature_names))
targets = task$target_names

mlr3misc::invoke(
obliqueRSF::ORSF,
data = as.data.frame(task$data()),
data = data.table::setDF(task$data()),
time = targets[1L],
status = targets[2L],
.args = pv
Expand Down
18 changes: 9 additions & 9 deletions R/learner_partykit_classif_cforest.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,14 @@
#' @templateVar id classif.cforest
#' @templateVar caller cforest
#'
#' @references
#' Hothorn T, Zeileis A (2015).
#' “partykit: A Modular Toolkit for Recursive Partytioning in R.”
#' Journal of Machine Learning Research, 16(118), 3905-3909.
#' \url{http://jmlr.org/papers/v16/hothorn15a.html}
#' @section Custom mlr3 defaults:
#' - `mtry`:
#' - This hyperparameter can alternatively be set via the added hyperparameter `mtryratio`
#' as `mtry = max(ceiling(mtryratio * n_features), 1)`.
#' Note that `mtry` and `mtryratio` are mutually exclusive.
#'
#' Hothorn T, Hornik K, Zeileis A (2006).
#' “Unbiased Recursive Partitioning: A Conditional Inference Framework.”
#' Journal of Computational and Graphical Statistics, 15(3), 651–674.
#' \doi{10.1198/106186006x133933}
#' @references
#' `r format_bib(c("hothorn_2015", "hothorn_2006"))`
#'
#' @export
#' @template seealso_learner
Expand All @@ -37,6 +35,7 @@ LearnerClassifCForest = R6Class("LearnerClassifCForest",
tags = "train"),
mtry = p_int(lower = 0L, special_vals = list(Inf),
tags = "train"), # default actually "ceiling(sqrt(nvar))"
mtryratio = p_dbl(lower = 0, upper = 1, tags = "train"),
applyfun = p_uty(tags = c("train", "importance")),
cores = p_int(default = NULL, special_vals = list(NULL),
tags = c("train", "importance")),
Expand Down Expand Up @@ -167,6 +166,7 @@ LearnerClassifCForest = R6Class("LearnerClassifCForest",
.train = function(task) {

pars = self$param_set$get_values(tags = "train")
pars = convert_ratio(pars, "mtry", "mtryratio", length(task$feature_names))
pars_control = pars[which(names(pars) %in%
setdiff(methods::formalArgs(partykit::ctree_control),
c("mtry", "applyfun", "cores")
Expand Down
14 changes: 5 additions & 9 deletions R/learner_partykit_regr_cforest.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,10 @@
#' @templateVar id regr.cforest
#' @templateVar caller cforest
#'
#' @references
#' Hothorn T, Zeileis A (2015).
#' “partykit: A Modular Toolkit for Recursive Partytioning in R.”
#' Journal of Machine Learning Research, 16(118), 3905-3909.
#' \url{http://jmlr.org/papers/v16/hothorn15a.html}
#' @inheritSection mlr_learners_classif.cforest Custom mlr3 defaults
#'
#' Hothorn T, Hornik K, Zeileis A (2006).
#' “Unbiased Recursive Partitioning: A Conditional Inference Framework.”
#' Journal of Computational and Graphical Statistics, 15(3), 651–674.
#' \doi{10.1198/106186006x133933}
#' @references
#' `r format_bib(c("hothorn_2015", "hothorn_2006"))`
#'
#' @export
#' @template seealso_learner
Expand All @@ -37,6 +31,7 @@ LearnerRegrCForest = R6Class("LearnerRegrCForest",
tags = "train"),
mtry = p_int(lower = 0L, special_vals = list(Inf),
tags = "train"), # default actually "ceiling(sqrt(nvar))"
mtryratio = p_dbl(lower = 0, upper = 1, tags = "train"),
applyfun = p_uty(tags = c("train", "importance")),
cores = p_int(default = NULL, special_vals = list(NULL),
tags = c("train", "importance")),
Expand Down Expand Up @@ -163,6 +158,7 @@ LearnerRegrCForest = R6Class("LearnerRegrCForest",
.train = function(task) {

pars = self$param_set$get_values(tags = "train")
pars = convert_ratio(pars, "mtry", "mtryratio", length(task$feature_names))
pars_control = pars[which(names(pars) %in%
setdiff(methods::formalArgs(partykit::ctree_control),
c("mtry", "applyfun", "cores")
Expand Down
14 changes: 5 additions & 9 deletions R/learner_partykit_surv_cforest.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,10 @@
#' @templateVar id surv.cforest
#' @templateVar caller cforest
#'
#' @references
#' Hothorn T, Zeileis A (2015).
#' “partykit: A Modular Toolkit for Recursive Partytioning in R.”
#' Journal of Machine Learning Research, 16(118), 3905-3909.
#' \url{http://jmlr.org/papers/v16/hothorn15a.html}
#' @inheritSection mlr_learners_classif.cforest Custom mlr3 defaults
#'
#' Hothorn T, Hornik K, Zeileis A (2006).
#' “Unbiased Recursive Partitioning: A Conditional Inference Framework.”
#' Journal of Computational and Graphical Statistics, 15(3), 651–674.
#' \doi{10.1198/106186006x133933}
#' @references
#' `r format_bib(c("hothorn_2015", "hothorn_2006"))`
#'
#' @export
#' @template seealso_learner
Expand All @@ -34,6 +28,7 @@ LearnerSurvCForest = R6Class("LearnerSurvCForest",
tags = c("train", "perturb")),
mtry = p_int(lower = 0L, special_vals = list(Inf),
tags = "train"), # default actually "ceiling(sqrt(nvar))"
mtryratio = p_dbl(lower = 0, upper = 1, tags = "train"),
applyfun = p_uty(tags = c("train", "importance")),
cores = p_int(default = NULL, special_vals = list(NULL),
tags = c("train", "importance")),
Expand Down Expand Up @@ -127,6 +122,7 @@ LearnerSurvCForest = R6Class("LearnerSurvCForest",
.train = function(task) {

pars = self$param_set$get_values(tags = "train")
pars = convert_ratio(pars, "mtry", "mtryratio", length(task$feature_names))

if ("weights" %in% task$properties) {
pars$weights = task$weights$weight
Expand Down
15 changes: 13 additions & 2 deletions R/learner_randomForestSRC_classif_rfsrc.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,17 @@
#' - Actual default: Auto-detecting the number of cores
#' - Adjusted default: 1
#' - Reason for change: Threading conflicts with explicit parallelization via \CRANpkg{future}.
#' - `mtry`:
#' - This hyperparameter can alternatively be set via the added hyperparameter `mtry.ratio`
#' as `mtry = max(ceiling(mtry.ratio * n_features), 1)`.
#' Note that `mtry` and `mtry.ratio` are mutually exclusive.
#' - `sampsize`:
#' - This hyperparameter can alternatively be set via the added hyperparameter `sampsize.ratio`
#' as `sampsize = max(ceiling(sampsize.ratio * n_obs), 1)`.
#' Note that `sampsize` and `sampsize.ratio` are mutually exclusive.
#'
#' @references
#' Breiman L (2001). “Random Forests.”
#' Machine Learning, 45(1), 5–32. ISSN 1573-0565, doi: 10.1023/A:1010933404324.
#' `r format_bib("breiman_2001")`
#'
#' @template seealso_learner
#' @template example
Expand All @@ -29,6 +36,7 @@ LearnerClassifRandomForestSRC = R6Class("LearnerClassifRandomForestSRC",
ps = ps(
ntree = p_int(default = 1000, lower = 1L, tags = c("train", "predict")),
mtry = p_int(lower = 1L, tags = "train"),
mtry.ratio = p_dbl(lower = 0, upper = 1, tags = "train"),
nodesize = p_int(default = 15L, lower = 1L, tags = "train"),
nodedepth = p_int(lower = 1L, tags = "train"),
splitrule = p_fct(
Expand All @@ -52,6 +60,7 @@ LearnerClassifRandomForestSRC = R6Class("LearnerClassifRandomForestSRC",
samp = p_uty(tags = "train"),
membership = p_lgl(default = FALSE, tags = c("train", "predict")),
sampsize = p_uty(tags = "train"),
sampsize.ratio = p_dbl(0, 1, tags = "train"),
na.action = p_fct(
default = "na.omit", levels = c("na.omit", "na.impute"),
tags = c("train", "predict")),
Expand Down Expand Up @@ -140,6 +149,8 @@ LearnerClassifRandomForestSRC = R6Class("LearnerClassifRandomForestSRC",
private = list(
.train = function(task) {
pv = self$param_set$get_values(tags = "train")
pv = convert_ratio(pv, "mtry", "mtry.ratio", length(task$feature_names))
pv = convert_ratio(pv, "sampsize", "sampsize.ratio", task$nrow)
cores = pv$cores %??% 1L

if ("weights" %in% task$properties) {
Expand Down
13 changes: 6 additions & 7 deletions R/learner_randomForestSRC_regr_rfsrc.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,10 @@
#' @templateVar id regr.rfsrc
#' @templateVar caller rfsrc
#'
#' @section Custom mlr3 defaults:
#' - `cores`:
#' - Actual default: Auto-detecting the number of cores
#' - Adjusted default: 1
#' - Reason for change: Threading conflicts with explicit parallelization via \CRANpkg{future}.
#' @inheritSection mlr_learners_classif.rfsrc Custom mlr3 defaults
#'
#' @references
#' Breiman L (2001). “Random Forests.”
#' Machine Learning, 45(1), 5–32. ISSN 1573-0565, \doi{10.1023/A:1010933404324}
#' `r format_bib("breiman_2001")`
#'
#' @template seealso_learner
#' @template example
Expand All @@ -29,6 +24,7 @@ LearnerRegrRandomForestSRC = R6Class("LearnerRegrRandomForestSRC",
ps = ps(
ntree = p_int(default = 1000, lower = 1L, tags = c("train", "predict")),
mtry = p_int(lower = 1L, tags = "train"),
mtry.ratio = p_dbl(lower = 0, upper = 1, tags = "train"),
nodesize = p_int(default = 15L, lower = 1L, tags = "train"),
nodedepth = p_int(lower = 1L, tags = "train"),
splitrule = p_fct(
Expand All @@ -52,6 +48,7 @@ LearnerRegrRandomForestSRC = R6Class("LearnerRegrRandomForestSRC",
samp = p_uty(tags = "train"),
membership = p_lgl(default = FALSE, tags = c("train", "predict")),
sampsize = p_uty(tags = "train"),
sampsize.ratio = p_dbl(0, 1, tags = "train"),
na.action = p_fct(
default = "na.omit", levels = c("na.omit", "na.impute"),
tags = c("train", "predict")),
Expand Down Expand Up @@ -137,6 +134,8 @@ LearnerRegrRandomForestSRC = R6Class("LearnerRegrRandomForestSRC",
private = list(
.train = function(task) {
pv = self$param_set$get_values(tags = "train")
pv = convert_ratio(pv, "mtry", "mtry.ratio", length(task$feature_names))
pv = convert_ratio(pv, "sampsize", "sampsize.ratio", task$nrow)
cores = pv$cores %??% 1L

if ("weights" %in% task$properties) {
Expand Down
Loading

0 comments on commit 48d206d

Please sign in to comment.