diff --git a/DESCRIPTION b/DESCRIPTION
index 1ae39ab1..e5375475 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: lares
 Type: Package
 Title: Analytics & Machine Learning Sidekick
-Version: 5.2.10.9001
+Version: 5.2.10.9002
 Authors@R: c(
     person("Bernardo", "Lares", , "laresbernardo@gmail.com", c("aut", "cre")))
 Maintainer: Bernardo Lares <laresbernardo@gmail.com>
diff --git a/R/robyn.R b/R/robyn.R
index 29832e28..1bb64501 100644
--- a/R/robyn.R
+++ b/R/robyn.R
@@ -138,9 +138,11 @@ robyn_hypsbuilder <- function(
 #' amount of models per cluster, "baseline_dist" for the difference between
 #' the model's baseline and \code{baseline_ref} value, "certainty" metric
 #' to minimize the channels' distance to their cluster's mean performance,
-#' weighted by spends \code{spend_wt = TRUE}. Additionally, you can use the
-#' standard MOO errors: "nrmse", "decomp.rssd", and "mape" (the lowest the
-#' error, the highest the score; same for "baseline_dist").
+#' weighted by spends when \code{spend_wt = TRUE}, and "cluster_sd" to score
+#' models by the standard deviation of paid channels' performance within clusters.
+#' Additionally, you can use the standard MOO errors:
+#' "nrmse", "decomp.rssd", and "mape" (the lower the error, the higher
+#' the score; same for "baseline_dist" and "cluster_sd").
 #' @param wt Vector. Weight for each of the normalized \code{metrics} selected,
 #' to calculate the score and rank models. Must have the same order and length
 #' of \code{metrics} parameter input.
@@ -164,10 +166,11 @@ robyn_modelselector <- function(
     metrics = c(
       "rsq_train", "performance",
       "potential_improvement",
-      "non_zeroes", "incluster_models",
-      "baseline_dist", "certainty"
+      "non_zeroes",
+      "incluster_models", "cluster_sd",
+      "certainty", "baseline_dist"
     ),
-    wt = c(2, 1, 0, 1, 0.1, 0, 1),
+    wt = c(2, 0.5, 0, 1, 0.1, 0, 0, 0),
     baseline_ref = 0,
     top = 4,
     n_per_cluster = 5,
@@ -185,7 +188,7 @@ robyn_modelselector <- function(
     class(ret) <- c("robyn_modelselector", class(ret))
     return(ret)
   }
-
+  if (length(metrics) == 1) wt <- 1
   stopifnot(length(wt) == length(metrics))
   stopifnot(length(allocator_limits) == 2)
   stopifnot(baseline_ref >= 0 && baseline_ref <= 1)
@@ -195,17 +198,19 @@ robyn_modelselector <- function(
     metric = c(
       "rsq_train", "performance", "potential_improvement",
       "non_zeroes", "incluster_models",
-      "baseline_dist", "certainty",
+      "baseline_dist", "certainty", "cluster_sd",
       "nrmse", "decomp.rssd", "mape"
     ),
     metric_name = c(
       "R^2", ifelse(InputCollect$dep_var_type == "revenue", "High ROAS", "Low CPA"),
       "Potential Boost", "Non-Zero Betas", "Models in Cluster",
       sprintf("Baseline Distance [%.0f%%]", signif(baseline_ref * 100, 2)),
-      "Certainty within Cluster",
+      "Certainty in Cluster", "Cluster Mean Std Dev",
       "1 - NRMSE", "1 - DECOMP.RSSD", "1 - MAPE"
     )
   )
+  # The following criteria are inverted because the smaller, the better
+  invert_criteria <- c("nrmse", "decomp.rssd", "mape")
   check_opts(metrics, metrics_df$metric)

   # Metrics Used
@@ -276,13 +281,20 @@ robyn_modelselector <- function(

   # Count models per cluster
   if (!"clusters" %in% names(OutputCollect)) {
-    OutputCollect$clusters$data <- data.frame(solID = sols, cluster = "None", top_sol = TRUE)
+    OutputCollect$clusters$data <- data.frame(solID = sols, cluster = "None")
     OutputCollect$clusters$clusters_means <- data.frame(cluster = "None", n = length(sols))
+    OutputCollect$clusters$df_cluster_ci <- data.frame(cluster = "None", sd = 0)
   }
+  clus_mean_sd <- OutputCollect$clusters$df_cluster_ci %>%
+    group_by(.data$cluster) %>%
+    summarise(mean_sd = mean(.data$sd, na.rm = TRUE), .groups = "drop") %>%
+    mutate(cluster_sd = normalize(-.data$mean_sd, range = c(0.01, 1)),
+           cluster = as.character(cluster))
   temp <- OutputCollect$clusters$data %>%
-    select(.data$solID, .data$cluster, .data$top_sol) %>%
+    select(.data$solID, .data$cluster) %>%
     mutate(cluster = as.character(.data$cluster)) %>%
     left_join(select(OutputCollect$clusters$clusters_means, .data$cluster, .data$n), "cluster") %>%
+    left_join(clus_mean_sd, "cluster") %>%
     rename(incluster_models = "n")

   # Calculate baselines
@@ -295,13 +307,15 @@ robyn_modelselector <- function(
     ungroup() %>%
     filter(.data$rn == "baseline") %>%
     arrange(abs(.data$baseline)) %>%
-    mutate(baseline_dist = abs(baseline_ref - .data$baseline)) %>%
+    mutate(baseline_dist_real = abs(baseline_ref - .data$baseline),
+           baseline_dist = normalize(-.data$baseline_dist_real, range = c(
+             0, 1 - min(.data$baseline_dist_real) / max(.data$baseline_dist_real)))) %>%
     select(c("solID", "baseline", "baseline_dist")) %>%
-    arrange(desc(.data$baseline_dist))
+    arrange(.data$baseline_dist)

   # Certainty Criteria: distance to cluster's mean weighted by spend
   certainty <- certainty_score(InputCollect, OutputCollect, ...) %>%
-    select("solID", "certainty" = "certainty")
+    select("solID", "certainty")

   # Gather everything up
   dfa <- OutputCollect$allPareto$resultHypParam %>%
@@ -315,43 +329,22 @@ robyn_modelselector <- function(
     ungroup() %>%
     left_join(baselines, "solID")

-  # The following criteria are inverted because the smaller, the better
-  inv <- c("baseline_dist", "nrmse", "decomp.rssd", "mape")
-
-  # Calculate normalized and weighted scores
-  scores <- list(
-    rsq_train = normalize(dfa$rsq_train) * ifelse(
-      !"rsq_train" %in% metrics, 0, wt[which(metrics == "rsq_train")]
-    ) * ifelse("rsq_train" %in% inv, -1, 1),
-    performance = normalize(dfa$performance, na.rm = TRUE) * ifelse(
-      !"performance" %in% metrics, 0, wt[which(metrics == "performance")]
-    ) * ifelse("performance" %in% inv, -1, 1),
-    potential_improvement = normalize(dfa$potential_improvement) * ifelse(
-      !"potential_improvement" %in% metrics, 0, wt[which(metrics == "potential_improvement")]
-    ) * ifelse("potential_improvement" %in% inv, -1, 1),
-    non_zeroes = normalize(dfa$non_zeroes) * ifelse(
-      !"non_zeroes" %in% metrics, 0, wt[which(metrics == "non_zeroes")]
-    ) * ifelse("non_zeroes" %in% inv, -1, 1),
-    incluster_models = normalize(dfa$incluster_models) * ifelse(
-      !"incluster_models" %in% metrics, 0, wt[which(metrics == "incluster_models")]
-    ) * ifelse("incluster_models" %in% inv, -1, 1),
-    certainty = normalize(dfa$certainty) * ifelse(
-      !"certainty" %in% metrics, 0, wt[which(metrics == "certainty")]
-    ) * ifelse("certainty" %in% inv, -1, 1),
-    # The following are negative/inverted criteria when scoring
-    baseline_dist = normalize(dfa$baseline_dist) * ifelse(
-      !"baseline_dist" %in% metrics, 0, wt[which(metrics == "baseline_dist")]
-    ) * ifelse("baseline_dist" %in% inv, -1, 1),
-    nrmse = normalize(dfa$nrmse) * ifelse(
-      !"nrmse" %in% metrics, 0, wt[which(metrics == "nrmse")]
-    ) * ifelse("nrmse" %in% inv, -1, 1),
-    decomp.rssd = normalize(dfa$decomp.rssd) * ifelse(
-      !"decomp.rssd" %in% metrics, 0, wt[which(metrics == "decomp.rssd")]
-    ) * ifelse("decomp.rssd" %in% inv, -1, 1),
-    mape = normalize(dfa$mape) * ifelse(
-      !"mape" %in% metrics, 0, wt[which(metrics == "mape")]
-    ) * ifelse("mape" %in% inv, -1, 1)
-  )
+  # Helper function to calculate normalized and weighted scores
+  calculate_score <- function(metric_name, data, metrics, weights, invert_criteria) {
+    if (metric_name %in% metrics) {
+      normalized_value <- normalize(data[[metric_name]], na.rm = TRUE)
+      weight <- weights[which(metrics == metric_name)]
+      sign <- ifelse(metric_name %in% invert_criteria, -1, 1)
+      return(normalized_value * weight * sign)
+    }
+    return(0)
+  }
+
+  # Calculate scores
+  scores <- list()
+  for (metric in metrics_df$metric) {
+    scores[[metric]] <- calculate_score(metric, dfa, metrics, wt, invert_criteria)
+  }
   dfa <- dfa %>%
     mutate(
       score = normalize(rowSums(bind_cols(scores))),
@@ -362,7 +355,7 @@ robyn_modelselector <- function(
         rep("*", (top + 1) - .data$aux),
         collapse = ""
       ), "")) %>%
-    select(-.data$top_sol, -.data$aux) %>%
+    select(-.data$aux) %>%
     arrange(desc(.data$score), desc(3), desc(4))

   if (!quiet) message("Recommended considering these models first: ", v2t(head(dfa$solID, top)))
@@ -382,7 +375,7 @@ robyn_modelselector <- function(
     unique()
   pdat <- dfa %>%
     # So that inverted variables have larger relative bars (darker blue)
-    mutate_at(all_of(inv), function(x) 1 - x) %>%
+    mutate_at(all_of(invert_criteria), function(x) 1 - x) %>%
     mutate(
       cluster = factor(sprintf("%s (%s)", .data$cluster, .data$incluster_models), levels = sorting),
       incluster_models = .data$incluster_models / max(dfa$incluster_models, na.rm = TRUE)
@@ -447,6 +440,11 @@ robyn_modelselector <- function(

 certainty_score <- function(
     InputCollect, OutputCollect, penalization = 2, spend_wt = TRUE, ...) {
+  if (!"clusters" %in% names(OutputCollect)) {
+    return(data.frame(
+      solID = unique(OutputCollect$xDecompAgg$solID),
+      certainty = 0))
+  }
   clusters <- OutputCollect$clusters$df_cluster_ci
   perfs <- OutputCollect$xDecompAgg %>%
     filter(!is.na(.data$mean_spend)) %>% # get rid of organic
diff --git a/R/wrangling.R b/R/wrangling.R
index dca268bf..6ffc60ef 100644
--- a/R/wrangling.R
+++ b/R/wrangling.R
@@ -118,6 +118,7 @@ zerovar <- function(df) {
 #' This function normalizes numerical values into a specified range,
 #' defaulting to the 0 to 1 range.
 #'
+#' @inheritParams ohse
 #' @family Data Wrangling
 #' @param x Numeric vector. The numbers to be normalized.
 #' @param range A numeric vector of length 2 specifying the desired range
diff --git a/lares.Rproj b/lares.Rproj
index 217d8915..48012b6b 100644
--- a/lares.Rproj
+++ b/lares.Rproj
@@ -1,4 +1,5 @@
 Version: 1.0
+ProjectId: 31f77b89-5802-419f-aa4a-2c6b21055c89

 RestoreWorkspace: Default
 SaveWorkspace: Default
diff --git a/man/normalize.Rd b/man/normalize.Rd
index f1839f16..b9929dd0 100644
--- a/man/normalize.Rd
+++ b/man/normalize.Rd
@@ -11,6 +11,8 @@ normalize(x, range = c(0, 1), ...)

 \item{range}{A numeric vector of length 2 specifying the desired range
 for normalization. Default is c(0, 1).}
+
+\item{...}{Additional parameters.}
 }
 \value{
 A numeric vector with normalized \code{x} values.
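The new cluster_sd and baseline_dist scores in R/robyn.R both lean on normalize() with a negated input, so that "smaller is better" values map to higher scores before weighting. A minimal sketch of that pattern, with made-up numbers (not taken from the diff):

    library(lares)
    # Hypothetical per-cluster mean standard deviations
    mean_sd <- c(0.10, 0.25, 0.40)
    # Negating flips the ranking: the smallest deviation gets the highest score,
    # then values are rescaled into [0.01, 1] as done for cluster_sd above
    normalize(-mean_sd, range = c(0.01, 1))
    #> 1.000 0.505 0.010
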
diff --git a/man/robyn_modelselector.Rd b/man/robyn_modelselector.Rd
index f9f69f18..6b47d777 100644
--- a/man/robyn_modelselector.Rd
+++ b/man/robyn_modelselector.Rd
@@ -9,8 +9,8 @@ robyn_modelselector(
   InputCollect,
   OutputCollect,
   metrics = c("rsq_train", "performance", "potential_improvement", "non_zeroes",
-    "incluster_models", "baseline_dist", "certainty"),
-  wt = c(2, 1, 0, 1, 0.1, 0, 1),
+    "incluster_models", "cluster_sd", "certainty", "baseline_dist"),
+  wt = c(2, 0.5, 0, 1, 0.1, 0, 0, 0),
   baseline_ref = 0,
   top = 4,
   n_per_cluster = 5,
@@ -33,9 +33,11 @@ default budget allocator improvement using \code{allocator_limits},
 amount of models per cluster, "baseline_dist" for the difference between
 the model's baseline and \code{baseline_ref} value, "certainty" metric
 to minimize the channels' distance to their cluster's mean performance,
-weighted by spends \code{spend_wt = TRUE}. Additionally, you can use the
-standard MOO errors: "nrmse", "decomp.rssd", and "mape" (the lowest the
-error, the highest the score; same for "baseline_dist").}
+weighted by spends when \code{spend_wt = TRUE}, and "cluster_sd" to score
+models by the standard deviation of paid channels' performance within clusters.
+Additionally, you can use the standard MOO errors:
+"nrmse", "decomp.rssd", and "mape" (the lower the error, the higher
+the score; same for "baseline_dist" and "cluster_sd").}

 \item{wt}{Vector. Weight for each of the normalized \code{metrics} selected,
 to calculate the score and rank models. Must have the same order and length
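With these changes, the selector can be driven by any subset of the documented metrics, including the new "cluster_sd". A hedged usage sketch of the updated API (InputCollect and OutputCollect are assumed to come from a prior Robyn model run; the metric choice and weights below are illustrative, not defaults):

    library(lares)
    # Rank models emphasizing fit, sparsity, and cluster stability
    sel <- robyn_modelselector(
      InputCollect, OutputCollect,
      metrics = c("rsq_train", "performance", "non_zeroes", "cluster_sd"),
      wt = c(2, 0.5, 1, 1), # must match metrics in order and length
      top = 4
    )
    # Per the new guard, a single metric needs no explicit weight:
    # robyn_modelselector(InputCollect, OutputCollect, metrics = "cluster_sd")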