aorsf classification and regression engines (#78)

--------- Co-authored-by: Simon P. Couch <[email protected]>
tidymodels · May 14, 2024 · 87da1c9 · 87da1c9
1 parent a91e80a
commit 87da1c9
Show file tree

Hide file tree

Showing 6 changed files with 462 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -38,6 +38,7 @@ Suggests:
     covr,
     knitr,
     lightgbm,
+    aorsf (>= 0.1.3),
     modeldata,
     partykit,
     rmarkdown,

diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,8 @@
 
 v0.2.1.9000 is a developmental version of the bonsai package.
 
+* Introduced support for accelerated oblique random forests for the `"classification"` and `"regression"` modes using the new [`"aorsf"` engine](https://github.com/ropensci/aorsf) (#78 by `@bcjaeger`). 
+
 * Enabled passing [Dataset Parameters](https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters) to the `"lightgbm"` engine. To pass an argument that would be usually passed as an element to the `param` argument in `lightgbm::lgb.Dataset()`, pass the argument directly through the ellipses in `set_engine()`, e.g. `boost_tree() %>% set_engine("lightgbm", linear_tree = TRUE)` (#77).
 
 * Enabled case weights with the `"lightgbm"` engine (#72 by `@p-schaefer`).

diff --git a/R/aorsf_data.R b/R/aorsf_data.R
@@ -0,0 +1,225 @@
+# nocov start
+
+# Register the "aorsf" engine for `parsnip::rand_forest()`.
+#
+# Declares the engine and its package dependency for the classification and
+# regression modes, maps the main tuning arguments onto their `aorsf::orsf()`
+# names, and registers the fit, encoding and prediction modules.
+# Called only for its side effects: the body is all `parsnip::set_*()` calls.
+make_rand_forest_aorsf <- function() {
+  parsnip::set_model_engine("rand_forest", "classification", "aorsf")
+  parsnip::set_model_engine("rand_forest", "regression", "aorsf")
+  parsnip::set_dependency("rand_forest", "aorsf", "aorsf", mode = "classification")
+  parsnip::set_dependency("rand_forest", "aorsf", "aorsf", mode = "regression")
+
+  # main arguments: parsnip name -> aorsf::orsf() name (each registered once)
+  parsnip::set_model_arg(
+    model = "rand_forest",
+    eng = "aorsf",
+    parsnip = "mtry",
+    original = "mtry",
+    func = list(pkg = "dials", fun = "mtry"),
+    has_submodel = FALSE
+  )
+
+  parsnip::set_model_arg(
+    model = "rand_forest",
+    eng = "aorsf",
+    parsnip = "trees",
+    original = "n_tree",
+    func = list(pkg = "dials", fun = "trees"),
+    has_submodel = FALSE
+  )
+
+  parsnip::set_model_arg(
+    model = "rand_forest",
+    eng = "aorsf",
+    parsnip = "min_n",
+    original = "leaf_min_obs",
+    func = list(pkg = "dials", fun = "min_n"),
+    has_submodel = FALSE
+  )
+
+  parsnip::set_fit(
+    model = "rand_forest",
+    eng = "aorsf",
+    mode = "classification",
+    value = list(
+      interface = "formula",
+      protect = c("formula", "data", "weights"),
+      func = c(pkg = "aorsf", fun = "orsf"),
+      defaults =
+        list(
+          n_thread = 1,
+          verbose_progress = FALSE
+        )
+    )
+  )
+
+  parsnip::set_encoding(
+    model = "rand_forest",
+    eng = "aorsf",
+    mode = "classification",
+    options = list(
+      predictor_indicators = "none",
+      compute_intercept = FALSE,
+      remove_intercept = FALSE,
+      allow_sparse_x = FALSE
+    )
+  )
+
+  parsnip::set_fit(
+    model = "rand_forest",
+    eng = "aorsf",
+    mode = "regression",
+    value = list(
+      interface = "formula",
+      protect = c("formula", "data", "weights"),
+      func = c(pkg = "aorsf", fun = "orsf"),
+      defaults =
+        list(
+          n_thread = 1,
+          verbose_progress = FALSE
+        )
+    )
+  )
+
+  parsnip::set_encoding(
+    model = "rand_forest",
+    eng = "aorsf",
+    mode = "regression",
+    options = list(
+      predictor_indicators = "none",
+      compute_intercept = FALSE,
+      remove_intercept = FALSE,
+      allow_sparse_x = FALSE
+    )
+  )
+
+  parsnip::set_pred(
+    model = "rand_forest",
+    eng = "aorsf",
+    mode = "classification",
+    type = "class",
+    value = list(
+      pre = NULL,
+      # makes prob preds consistent with class ones.
+      # note: the class predict method in aorsf uses the standard 'each tree
+      # gets one vote' approach, which is usually consistent with probability
+      # but not all the time. I opted to make predicted probability totally
+      # consistent with predicted class in the parsnip bindings for aorsf b/c
+      # I think it's really confusing when predicted probs do not align with
+      # predicted classes. I'm fine with this in aorsf but in bonsai I want
+      # to minimize confusion (#78).
+      post = function(results, object) {
+
+        missings <- apply(results, 1, function(x) any(is.na(x)))
+
+        if (!any(missings)) {
+          return(colnames(results)[apply(results, 1, which.max)])
+        }
+
+        obs <- which(!missings)
+
+        out <- rep(NA_character_, nrow(results))
+        # `drop = FALSE` keeps a matrix when only one row is complete;
+        # otherwise `apply()` would fail on the resulting bare vector.
+        out[obs] <- colnames(results)[apply(results[obs, , drop = FALSE], 1, which.max)]
+        out
+
+      },
+      func = c(fun = "predict"),
+      args =
+        list(
+          object = quote(object$fit),
+          new_data = quote(new_data),
+          pred_type = "prob",
+          verbose_progress = FALSE,
+          na_action = "pass"
+        )
+    )
+  )
+
+  parsnip::set_pred(
+    model = "rand_forest",
+    eng = "aorsf",
+    mode = "classification",
+    type = "prob",
+    value = list(
+      pre = NULL,
+      post = function(x, object) {
+        as_tibble(x)
+      },
+      func = c(fun = "predict"),
+      args =
+        list(
+          object = quote(object$fit),
+          new_data = quote(new_data),
+          pred_type = "prob",
+          verbose_progress = FALSE,
+          na_action = "pass"
+        )
+    )
+  )
+
+  parsnip::set_pred(
+    model = "rand_forest",
+    eng = "aorsf",
+    mode = "classification",
+    type = "raw",
+    value = list(
+      pre = NULL,
+      post = NULL,
+      func = c(fun = "predict"),
+      args =
+        list(
+          object = quote(object$fit),
+          new_data = quote(new_data),
+          verbose_progress = FALSE,
+          na_action = "pass"
+        )
+    )
+  )
+
+  parsnip::set_pred(
+    model = "rand_forest",
+    eng = "aorsf",
+    mode = "regression",
+    type = "numeric",
+    value = list(
+      pre = NULL,
+      post = as.numeric,
+      func = c(fun = "predict"),
+      args =
+        list(
+          object = quote(object$fit),
+          new_data = quote(new_data),
+          pred_type = "mean",
+          verbose_progress = FALSE,
+          na_action = "pass"
+        )
+    )
+  )
+
+  parsnip::set_pred(
+    model = "rand_forest",
+    eng = "aorsf",
+    mode = "regression",
+    type = "raw",
+    value = list(
+      pre = NULL,
+      post = as.numeric,
+      func = c(fun = "predict"),
+      args =
+        list(
+          object = quote(object$fit),
+          new_data = quote(new_data),
+          pred_type = "mean",
+          verbose_progress = FALSE,
+          na_action = "pass"
+        )
+    )
+  )
+}
+
+# nocov end
diff --git a/R/zzz.R b/R/zzz.R
@@ -9,6 +9,8 @@
 
   make_decision_tree_partykit()
   make_rand_forest_partykit()
+
+  make_rand_forest_aorsf()
 }
 
 

diff --git a/README.md b/README.md
@@ -51,6 +51,8 @@ following table:
 | decision_tree | partykit | classification |
 | rand_forest   | partykit | regression     |
 | rand_forest   | partykit | classification |
+| rand_forest   | aorsf    | classification |
+| rand_forest   | aorsf    | regression     |
 
 ## Code of Conduct
-Original file line number
+Diff line change
@@ Expand Up / @@ -9,6 +9,8 @@ @@
       make_decision_tree_partykit()
       make_rand_forest_partykit()
+      make_rand_forest_aorsf()
     }
@@ Expand Down @@