feat: new "cluster_sd" criteria for model selection
laresbernardo committed Dec 19, 2024
1 parent 2e1b195 commit d3d8856
Showing 6 changed files with 63 additions and 59 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
 Package: lares
 Type: Package
 Title: Analytics & Machine Learning Sidekick
-Version: 5.2.10.9001
+Version: 5.2.10.9002
 Authors@R: c(
     person("Bernardo", "Lares", , "[email protected]", c("aut", "cre")))
 Maintainer: Bernardo Lares <[email protected]>
104 changes: 51 additions & 53 deletions R/robyn.R
@@ -138,9 +138,11 @@ robyn_hypsbuilder <- function(
 #' amount of models per cluster, "baseline_dist" for the difference between
 #' the model's baseline and \code{baseline_ref} value, "certainty" metric
 #' to minimize the channels' distance to their cluster's mean performance,
-#' weighted by spends \code{spend_wt = TRUE}. Additionally, you can use the
-#' standard MOO errors: "nrmse", "decomp.rssd", and "mape" (the lowest the
-#' error, the highest the score; same for "baseline_dist").
+#' weighted by spends \code{spend_wt = TRUE}, "cluster_sd" metric to score
+#' based on the paid channels' performance standard deviations in clusters.
+#' Additionally, you can use the standard MOO errors:
+#' "nrmse", "decomp.rssd", and "mape" (the lowest the error, the highest
+#' the score; same for "baseline_dist" and "cluster_sd").
 #' @param wt Vector. Weight for each of the normalized \code{metrics} selected,
 #' to calculate the score and rank models. Must have the same order and length
 #' of \code{metrics} parameter input.
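
For orientation, a minimal usage sketch of how these arguments fit together (not part of the commit; assumes InputCollect and OutputCollect objects from a finished Robyn run are available):

  # Hypothetical call: rank models by fit, performance, and the new criterion
  res <- robyn_modelselector(
    InputCollect, OutputCollect,
    metrics = c("rsq_train", "performance", "cluster_sd"),
    wt = c(2, 1, 1), # must match metrics in order and length
    top = 4
  )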
@@ -164,10 +166,11 @@ robyn_modelselector <- function(
     metrics = c(
       "rsq_train", "performance",
       "potential_improvement",
-      "non_zeroes", "incluster_models",
-      "baseline_dist", "certainty"
+      "non_zeroes",
+      "incluster_models", "cluster_sd",
+      "certainty", "baseline_dist"
     ),
-    wt = c(2, 1, 0, 1, 0.1, 0, 1),
+    wt = c(2, 0.5, 0, 1, 0.1, 0, 0, 0),
     baseline_ref = 0,
     top = 4,
     n_per_cluster = 5,
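
For reference, the new default weights pair positionally with the reordered metrics as follows (annotation only, not part of the diff):

  # rsq_train = 2, performance = 0.5, potential_improvement = 0, non_zeroes = 1,
  # incluster_models = 0.1, cluster_sd = 0, certainty = 0, baseline_dist = 0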
@@ -185,7 +188,7 @@ robyn_modelselector <- function(
     class(ret) <- c("robyn_modelselector", class(ret))
     return(ret)
   }
-
+  if (length(metrics) == 1) wt <- 1
   stopifnot(length(wt) == length(metrics))
   stopifnot(length(allocator_limits) == 2)
   stopifnot(baseline_ref >= 0 && baseline_ref <= 1)
@@ -195,17 +198,19 @@ robyn_modelselector <- function(
     metric = c(
       "rsq_train", "performance", "potential_improvement",
       "non_zeroes", "incluster_models",
-      "baseline_dist", "certainty",
+      "baseline_dist", "certainty", "cluster_sd",
       "nrmse", "decomp.rssd", "mape"
     ),
     metric_name = c(
       "R^2", ifelse(InputCollect$dep_var_type == "revenue", "High ROAS", "Low CPA"),
       "Potential Boost", "Non-Zero Betas", "Models in Cluster",
       sprintf("Baseline Distance [%.0f%%]", signif(baseline_ref * 100, 2)),
-      "Certainty within Cluster",
+      "Certainty in Cluster", "Cluster Mean Std Dev",
       "1 - NRMSE", "1 - DECOMP.RSSD", "1 - MAPE"
     )
   )
+  # The following criteria are inverted because the smaller, the better
+  invert_criteria <- c("nrmse", "decomp.rssd", "mape")
   check_opts(metrics, metrics_df$metric)

   # Metrics Used
@@ -276,13 +281,20 @@ robyn_modelselector <- function(

   # Count models per cluster
   if (!"clusters" %in% names(OutputCollect)) {
-    OutputCollect$clusters$data <- data.frame(solID = sols, cluster = "None", top_sol = TRUE)
+    OutputCollect$clusters$data <- data.frame(solID = sols, cluster = "None")
     OutputCollect$clusters$clusters_means <- data.frame(cluster = "None", n = length(sols))
+    OutputCollect$clusters$df_cluster_ci <- data.frame(cluster = "None", sd = 0)
   }
+  clus_mean_sd <- OutputCollect$clusters$df_cluster_ci %>%
+    group_by(.data$cluster) %>%
+    summarise(mean_sd = mean(.data$sd, na.rm = TRUE), .groups = "drop") %>%
+    mutate(cluster_sd = normalize(-.data$mean_sd, range = c(0.01, 1)),
+           cluster = as.character(cluster))
   temp <- OutputCollect$clusters$data %>%
-    select(.data$solID, .data$cluster, .data$top_sol) %>%
+    select(.data$solID, .data$cluster) %>%
     mutate(cluster = as.character(.data$cluster)) %>%
     left_join(select(OutputCollect$clusters$clusters_means, .data$cluster, .data$n), "cluster") %>%
+    left_join(clus_mean_sd, "cluster") %>%
     rename(incluster_models = "n")

   # Calculate baselines
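
Note the sign trick above: normalizing the negated mean SD sends the tightest cluster to the top of the (0.01, 1] range. A toy check with hypothetical values, assuming normalize() rescales linearly into the given range:

  mean_sd <- c(0.10, 0.25, 0.40) # within-cluster mean standard deviations
  normalize(-mean_sd, range = c(0.01, 1))
  # 1.000 0.505 0.010 -> smallest deviation earns the highest cluster_sd score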
@@ -295,13 +307,15 @@ robyn_modelselector <- function(
     ungroup() %>%
     filter(.data$rn == "baseline") %>%
     arrange(abs(.data$baseline)) %>%
-    mutate(baseline_dist = abs(baseline_ref - .data$baseline)) %>%
+    mutate(baseline_dist_real = abs(baseline_ref - .data$baseline),
+           baseline_dist = normalize(-.data$baseline_dist_real, range = c(
+             0, 1 - min(.data$baseline_dist_real) / max(.data$baseline_dist_real)))) %>%
     select(c("solID", "baseline", "baseline_dist")) %>%
-    arrange(desc(.data$baseline_dist))
+    arrange(.data$baseline_dist)

   # Certainty Criteria: distance to cluster's mean weighted by spend
   certainty <- certainty_score(InputCollect, OutputCollect, ...) %>%
-    select("solID", "certainty" = "certainty")
+    select("solID", "certainty")

   # Gather everything up
   dfa <- OutputCollect$allPareto$resultHypParam %>%
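
With this change, baseline_dist becomes a "higher is closer" score rather than a raw distance, which is why it no longer appears in the inverted-criteria set. A toy check (hypothetical distances, same linear-rescaling assumption as above):

  baseline_dist_real <- c(0.05, 0.10, 0.20) # distances to baseline_ref
  normalize(-baseline_dist_real, range = c(0, 1 - 0.05 / 0.20))
  # 0.75 0.50 0.00 -> the model closest to the reference scores highest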
@@ -315,43 +329,22 @@ robyn_modelselector <- function(
     ungroup() %>%
     left_join(baselines, "solID")

-  # The following criteria are inverted because the smaller, the better
-  inv <- c("baseline_dist", "nrmse", "decomp.rssd", "mape")
-
-  # Calculate normalized and weighted scores
-  scores <- list(
-    rsq_train = normalize(dfa$rsq_train) * ifelse(
-      !"rsq_train" %in% metrics, 0, wt[which(metrics == "rsq_train")]
-    ) * ifelse("rsq_train" %in% inv, -1, 1),
-    performance = normalize(dfa$performance, na.rm = TRUE) * ifelse(
-      !"performance" %in% metrics, 0, wt[which(metrics == "performance")]
-    ) * ifelse("performance" %in% inv, -1, 1),
-    potential_improvement = normalize(dfa$potential_improvement) * ifelse(
-      !"potential_improvement" %in% metrics, 0, wt[which(metrics == "potential_improvement")]
-    ) * ifelse("potential_improvement" %in% inv, -1, 1),
-    non_zeroes = normalize(dfa$non_zeroes) * ifelse(
-      !"non_zeroes" %in% metrics, 0, wt[which(metrics == "non_zeroes")]
-    ) * ifelse("non_zeroes" %in% inv, -1, 1),
-    incluster_models = normalize(dfa$incluster_models) * ifelse(
-      !"incluster_models" %in% metrics, 0, wt[which(metrics == "incluster_models")]
-    ) * ifelse("incluster_models" %in% inv, -1, 1),
-    certainty = normalize(dfa$certainty) * ifelse(
-      !"certainty" %in% metrics, 0, wt[which(metrics == "certainty")]
-    ) * ifelse("certainty" %in% inv, -1, 1),
-    # The following are negative/inverted criteria when scoring
-    baseline_dist = normalize(dfa$baseline_dist) * ifelse(
-      !"baseline_dist" %in% metrics, 0, wt[which(metrics == "baseline_dist")]
-    ) * ifelse("baseline_dist" %in% inv, -1, 1),
-    nrmse = normalize(dfa$nrmse) * ifelse(
-      !"nrmse" %in% metrics, 0, wt[which(metrics == "nrmse")]
-    ) * ifelse("nrmse" %in% inv, -1, 1),
-    decomp.rssd = normalize(dfa$decomp.rssd) * ifelse(
-      !"decomp.rssd" %in% metrics, 0, wt[which(metrics == "decomp.rssd")]
-    ) * ifelse("decomp.rssd" %in% inv, -1, 1),
-    mape = normalize(dfa$mape) * ifelse(
-      !"mape" %in% metrics, 0, wt[which(metrics == "mape")]
-    ) * ifelse("mape" %in% inv, -1, 1)
-  )
+  # Helper function to calculate normalized and weighted scores
+  calculate_score <- function(metric_name, data, metrics, weights, invert_criteria) {
+    if (metric_name %in% metrics) {
+      normalized_value <- normalize(data[[metric_name]], na.rm = TRUE)
+      weight <- weights[which(metrics == metric_name)]
+      sign <- ifelse(metric_name %in% invert_criteria, -1, 1)
+      return(normalized_value * weight * sign)
+    }
+    return(0)
+  }
+
+  # Calculate scores
+  scores <- list()
+  for (metric in metrics_df$metric) {
+    scores[[metric]] <- calculate_score(metric, dfa, metrics, wt, invert_criteria)
+  }
   dfa <- dfa %>%
     mutate(
       score = normalize(rowSums(bind_cols(scores))),
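
To see the helper's sign inversion in isolation, a hypothetical one-metric call (toy data, not from the commit; calculate_score is an internal helper, shown here purely for illustration):

  toy <- data.frame(nrmse = c(0.1, 0.2, 0.4))
  calculate_score("nrmse", toy, metrics = "nrmse", weights = 1, invert_criteria = "nrmse")
  # 0.000 -0.333 -1.000 -> the lowest error keeps the highest (least negative) score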
@@ -362,7 +355,7 @@ robyn_modelselector <- function(
         rep("*", (top + 1) - .data$aux),
         collapse = ""
       ), "")) %>%
-    select(-.data$top_sol, -.data$aux) %>%
+    select(-.data$aux) %>%
     arrange(desc(.data$score), desc(3), desc(4))
   if (!quiet) message("Recommended considering these models first: ", v2t(head(dfa$solID, top)))

@@ -382,7 +375,7 @@ robyn_modelselector <- function(
     unique()
   pdat <- dfa %>%
     # So that inverted variables have larger relative bars (darker blue)
-    mutate_at(all_of(inv), function(x) 1 - x) %>%
+    mutate_at(all_of(invert_criteria), function(x) 1 - x) %>%
     mutate(
       cluster = factor(sprintf("%s (%s)", .data$cluster, .data$incluster_models), levels = sorting),
       incluster_models = .data$incluster_models / max(dfa$incluster_models, na.rm = TRUE)
@@ -447,6 +440,11 @@ robyn_modelselector <- function(
 certainty_score <- function(
     InputCollect, OutputCollect,
     penalization = 2, spend_wt = TRUE, ...) {
+  if (!"clusters" %in% names(OutputCollect)) {
+    return(data.frame(
+      solID = unique(OutputCollect$xDecompAgg$solID),
+      certainty = 0))
+  }
   clusters <- OutputCollect$clusters$df_cluster_ci
   perfs <- OutputCollect$xDecompAgg %>%
     filter(!is.na(.data$mean_spend)) %>% # get rid of organic
1 change: 1 addition & 0 deletions R/wrangling.R
@@ -118,6 +118,7 @@ zerovar <- function(df) {
 #' This function normalizes numerical values into a specified range,
 #' defaulting to the 0 to 1 range.
 #'
+#' @inheritParams ohse
 #' @family Data Wrangling
 #' @param x Numeric vector. The numbers to be normalized.
 #' @param range A numeric vector of length 2 specifying the desired range
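
Since the new scoring code leans on normalize() throughout, a quick sketch of its documented behavior (illustrative values only, assuming linear rescaling into the target range):

  normalize(c(0, 5, 10))                     # 0.000 0.500 1.000
  normalize(c(0, 5, 10), range = c(0.01, 1)) # 0.010 0.505 1.000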
1 change: 1 addition & 0 deletions lares.Rproj
@@ -1,4 +1,5 @@
 Version: 1.0
+ProjectId: 31f77b89-5802-419f-aa4a-2c6b21055c89

 RestoreWorkspace: Default
 SaveWorkspace: Default
2 changes: 2 additions & 0 deletions man/normalize.Rd


12 changes: 7 additions & 5 deletions man/robyn_modelselector.Rd

