feat: new "cluster_sd" criteria for model selection
laresbernardo committed Dec 19, 2024
1 parent 2e1b195 commit d3d8856
Showing 6 changed files with 63 additions and 59 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
 Package: lares
 Type: Package
 Title: Analytics & Machine Learning Sidekick
-Version: 5.2.10.9001
+Version: 5.2.10.9002
 Authors@R: c(
     person("Bernardo", "Lares", , "[email protected]", c("aut", "cre")))
 Maintainer: Bernardo Lares <[email protected]>
104 changes: 51 additions & 53 deletions R/robyn.R
@@ -138,9 +138,11 @@ robyn_hypsbuilder <- function(
 #' amount of models per cluster, "baseline_dist" for the difference between
 #' the model's baseline and \code{baseline_ref} value, "certainty" metric
 #' to minimize the channels' distance to their cluster's mean performance,
-#' weighted by spends \code{spend_wt = TRUE}. Additionally, you can use the
-#' standard MOO errors: "nrmse", "decomp.rssd", and "mape" (the lowest the
-#' error, the highest the score; same for "baseline_dist").
+#' weighted by spends \code{spend_wt = TRUE}, "cluster_sd" metric to score
+#' based on the paid channels' performance standard deviations in clusters.
+#' Additionally, you can use the standard MOO errors:
+#' "nrmse", "decomp.rssd", and "mape" (the lowest the error, the highest
+#' the score; same for "baseline_dist" and "cluster_sd").
 #' @param wt Vector. Weight for each of the normalized \code{metrics} selected,
 #' to calculate the score and rank models. Must have the same order and length
 #' of \code{metrics} parameter input.
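
For orientation, a minimal usage sketch of how these arguments fit together (not part of the commit; assumes InputCollect and OutputCollect objects from a finished Robyn run are available):

  # Hypothetical call: rank models by fit, performance, and the new criterion
  res <- robyn_modelselector(
    InputCollect, OutputCollect,
    metrics = c("rsq_train", "performance", "cluster_sd"),
    wt = c(2, 1, 1), # must match metrics in order and length
    top = 4
  )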
@@ -164,10 +166,11 @@ robyn_modelselector <- function(
     metrics = c(
       "rsq_train", "performance",
       "potential_improvement",
-      "non_zeroes", "incluster_models",
-      "baseline_dist", "certainty"
+      "non_zeroes",
+      "incluster_models", "cluster_sd",
+      "certainty", "baseline_dist"
     ),
-    wt = c(2, 1, 0, 1, 0.1, 0, 1),
+    wt = c(2, 0.5, 0, 1, 0.1, 0, 0, 0),
     baseline_ref = 0,
     top = 4,
     n_per_cluster = 5,
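
For reference, the new default weights pair positionally with the reordered metrics as follows (annotation only, not part of the diff):

  # rsq_train = 2, performance = 0.5, potential_improvement = 0, non_zeroes = 1,
  # incluster_models = 0.1, cluster_sd = 0, certainty = 0, baseline_dist = 0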
@@ -185,7 +188,7 @@ robyn_modelselector <- function(
     class(ret) <- c("robyn_modelselector", class(ret))
     return(ret)
   }
-
+  if (length(metrics) == 1) wt <- 1
   stopifnot(length(wt) == length(metrics))
   stopifnot(length(allocator_limits) == 2)
   stopifnot(baseline_ref >= 0 && baseline_ref <= 1)
@@ -195,17 +198,19 @@ robyn_modelselector <- function(
     metric = c(
       "rsq_train", "performance", "potential_improvement",
       "non_zeroes", "incluster_models",
-      "baseline_dist", "certainty",
+      "baseline_dist", "certainty", "cluster_sd",
       "nrmse", "decomp.rssd", "mape"
     ),
     metric_name = c(
       "R^2", ifelse(InputCollect$dep_var_type == "revenue", "High ROAS", "Low CPA"),
       "Potential Boost", "Non-Zero Betas", "Models in Cluster",
       sprintf("Baseline Distance [%.0f%%]", signif(baseline_ref * 100, 2)),
-      "Certainty within Cluster",
+      "Certainty in Cluster", "Cluster Mean Std Dev",
       "1 - NRMSE", "1 - DECOMP.RSSD", "1 - MAPE"
     )
   )
+  # The following criteria are inverted because the smaller, the better
+  invert_criteria <- c("nrmse", "decomp.rssd", "mape")
   check_opts(metrics, metrics_df$metric)

   # Metrics Used
@@ -276,13 +281,20 @@ robyn_modelselector <- function(

   # Count models per cluster
   if (!"clusters" %in% names(OutputCollect)) {
-    OutputCollect$clusters$data <- data.frame(solID = sols, cluster = "None", top_sol = TRUE)
+    OutputCollect$clusters$data <- data.frame(solID = sols, cluster = "None")
     OutputCollect$clusters$clusters_means <- data.frame(cluster = "None", n = length(sols))
+    OutputCollect$clusters$df_cluster_ci <- data.frame(cluster = "None", sd = 0)
   }
+  clus_mean_sd <- OutputCollect$clusters$df_cluster_ci %>%
+    group_by(.data$cluster) %>%
+    summarise(mean_sd = mean(.data$sd, na.rm = TRUE), .groups = "drop") %>%
+    mutate(cluster_sd = normalize(-.data$mean_sd, range = c(0.01, 1)),
+           cluster = as.character(cluster))
   temp <- OutputCollect$clusters$data %>%
-    select(.data$solID, .data$cluster, .data$top_sol) %>%
+    select(.data$solID, .data$cluster) %>%
     mutate(cluster = as.character(.data$cluster)) %>%
     left_join(select(OutputCollect$clusters$clusters_means, .data$cluster, .data$n), "cluster") %>%
+    left_join(clus_mean_sd, "cluster") %>%
     rename(incluster_models = "n")

   # Calculate baselines
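
Note the sign trick above: normalizing the negated mean SD sends the tightest cluster to the top of the (0.01, 1] range. A toy check with hypothetical values, assuming normalize() rescales linearly into the given range:

  mean_sd <- c(0.10, 0.25, 0.40) # within-cluster mean standard deviations
  normalize(-mean_sd, range = c(0.01, 1))
  # 1.000 0.505 0.010 -> smallest deviation earns the highest cluster_sd score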
@@ -295,13 +307,15 @@ robyn_modelselector <- function(
     ungroup() %>%
     filter(.data$rn == "baseline") %>%
     arrange(abs(.data$baseline)) %>%
-    mutate(baseline_dist = abs(baseline_ref - .data$baseline)) %>%
+    mutate(baseline_dist_real = abs(baseline_ref - .data$baseline),
+           baseline_dist = normalize(-.data$baseline_dist_real, range = c(
+             0, 1 - min(.data$baseline_dist_real) / max(.data$baseline_dist_real)))) %>%
     select(c("solID", "baseline", "baseline_dist")) %>%
-    arrange(desc(.data$baseline_dist))
+    arrange(.data$baseline_dist)

   # Certainty Criteria: distance to cluster's mean weighted by spend
   certainty <- certainty_score(InputCollect, OutputCollect, ...) %>%
-    select("solID", "certainty" = "certainty")
+    select("solID", "certainty")

   # Gather everything up
   dfa <- OutputCollect$allPareto$resultHypParam %>%
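
With this change, baseline_dist becomes a "higher is closer" score rather than a raw distance, which is why it no longer appears in the inverted-criteria set. A toy check (hypothetical distances, same linear-rescaling assumption as above):

  baseline_dist_real <- c(0.05, 0.10, 0.20) # distances to baseline_ref
  normalize(-baseline_dist_real, range = c(0, 1 - 0.05 / 0.20))
  # 0.75 0.50 0.00 -> the model closest to the reference scores highest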
@@ -315,43 +329,22 @@ robyn_modelselector <- function(
     ungroup() %>%
     left_join(baselines, "solID")

-  # The following criteria are inverted because the smaller, the better
-  inv <- c("baseline_dist", "nrmse", "decomp.rssd", "mape")
-
-  # Calculate normalized and weighted scores
-  scores <- list(
-    rsq_train = normalize(dfa$rsq_train) * ifelse(
-      !"rsq_train" %in% metrics, 0, wt[which(metrics == "rsq_train")]
-    ) * ifelse("rsq_train" %in% inv, -1, 1),
-    performance = normalize(dfa$performance, na.rm = TRUE) * ifelse(
-      !"performance" %in% metrics, 0, wt[which(metrics == "performance")]
-    ) * ifelse("performance" %in% inv, -1, 1),
-    potential_improvement = normalize(dfa$potential_improvement) * ifelse(
-      !"potential_improvement" %in% metrics, 0, wt[which(metrics == "potential_improvement")]
-    ) * ifelse("potential_improvement" %in% inv, -1, 1),
-    non_zeroes = normalize(dfa$non_zeroes) * ifelse(
-      !"non_zeroes" %in% metrics, 0, wt[which(metrics == "non_zeroes")]
-    ) * ifelse("non_zeroes" %in% inv, -1, 1),
-    incluster_models = normalize(dfa$incluster_models) * ifelse(
-      !"incluster_models" %in% metrics, 0, wt[which(metrics == "incluster_models")]
-    ) * ifelse("incluster_models" %in% inv, -1, 1),
-    certainty = normalize(dfa$certainty) * ifelse(
-      !"certainty" %in% metrics, 0, wt[which(metrics == "certainty")]
-    ) * ifelse("certainty" %in% inv, -1, 1),
-    # The following are negative/inverted criteria when scoring
-    baseline_dist = normalize(dfa$baseline_dist) * ifelse(
-      !"baseline_dist" %in% metrics, 0, wt[which(metrics == "baseline_dist")]
-    ) * ifelse("baseline_dist" %in% inv, -1, 1),
-    nrmse = normalize(dfa$nrmse) * ifelse(
-      !"nrmse" %in% metrics, 0, wt[which(metrics == "nrmse")]
-    ) * ifelse("nrmse" %in% inv, -1, 1),
-    decomp.rssd = normalize(dfa$decomp.rssd) * ifelse(
-      !"decomp.rssd" %in% metrics, 0, wt[which(metrics == "decomp.rssd")]
-    ) * ifelse("decomp.rssd" %in% inv, -1, 1),
-    mape = normalize(dfa$mape) * ifelse(
-      !"mape" %in% metrics, 0, wt[which(metrics == "mape")]
-    ) * ifelse("mape" %in% inv, -1, 1)
-  )
+  # Helper function to calculate normalized and weighted scores
+  calculate_score <- function(metric_name, data, metrics, weights, invert_criteria) {
+    if (metric_name %in% metrics) {
+      normalized_value <- normalize(data[[metric_name]], na.rm = TRUE)
+      weight <- weights[which(metrics == metric_name)]
+      sign <- ifelse(metric_name %in% invert_criteria, -1, 1)
+      return(normalized_value * weight * sign)
+    }
+    return(0)
+  }
+
+  # Calculate scores
+  scores <- list()
+  for (metric in metrics_df$metric) {
+    scores[[metric]] <- calculate_score(metric, dfa, metrics, wt, invert_criteria)
+  }
   dfa <- dfa %>%
     mutate(
       score = normalize(rowSums(bind_cols(scores))),
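
To see the helper's sign inversion in isolation, a hypothetical one-metric call (toy data, not from the commit; calculate_score is an internal helper, shown here purely for illustration):

  toy <- data.frame(nrmse = c(0.1, 0.2, 0.4))
  calculate_score("nrmse", toy, metrics = "nrmse", weights = 1, invert_criteria = "nrmse")
  # 0.000 -0.333 -1.000 -> the lowest error keeps the highest (least negative) score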
@@ -362,7 +355,7 @@ robyn_modelselector <- function(
         rep("*", (top + 1) - .data$aux),
         collapse = ""
       ), "")) %>%
-    select(-.data$top_sol, -.data$aux) %>%
+    select(-.data$aux) %>%
     arrange(desc(.data$score), desc(3), desc(4))
   if (!quiet) message("Recommended considering these models first: ", v2t(head(dfa$solID, top)))

@@ -382,7 +375,7 @@ robyn_modelselector <- function(
     unique()
   pdat <- dfa %>%
     # So that inverted variables have larger relative bars (darker blue)
-    mutate_at(all_of(inv), function(x) 1 - x) %>%
+    mutate_at(all_of(invert_criteria), function(x) 1 - x) %>%
     mutate(
       cluster = factor(sprintf("%s (%s)", .data$cluster, .data$incluster_models), levels = sorting),
       incluster_models = .data$incluster_models / max(dfa$incluster_models, na.rm = TRUE)
@@ -447,6 +440,11 @@ robyn_modelselector <- function(
 certainty_score <- function(
     InputCollect, OutputCollect,
     penalization = 2, spend_wt = TRUE, ...) {
+  if (!"clusters" %in% names(OutputCollect)) {
+    return(data.frame(
+      solID = unique(OutputCollect$xDecompAgg$solID),
+      certainty = 0))
+  }
   clusters <- OutputCollect$clusters$df_cluster_ci
   perfs <- OutputCollect$xDecompAgg %>%
     filter(!is.na(.data$mean_spend)) %>% # get rid of organic
1 change: 1 addition & 0 deletions R/wrangling.R
@@ -118,6 +118,7 @@ zerovar <- function(df) {
 #' This function normalizes numerical values into a specified range,
 #' defaulting to the 0 to 1 range.
 #'
+#' @inheritParams ohse
 #' @family Data Wrangling
 #' @param x Numeric vector. The numbers to be normalized.
 #' @param range A numeric vector of length 2 specifying the desired range
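
Since the new scoring code leans on normalize() throughout, a quick sketch of its documented behavior (illustrative values only, assuming linear rescaling into the target range):

  normalize(c(0, 5, 10))                     # 0.000 0.500 1.000
  normalize(c(0, 5, 10), range = c(0.01, 1)) # 0.010 0.505 1.000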
1 change: 1 addition & 0 deletions lares.Rproj
@@ -1,4 +1,5 @@
 Version: 1.0
+ProjectId: 31f77b89-5802-419f-aa4a-2c6b21055c89

 RestoreWorkspace: Default
 SaveWorkspace: Default
2 changes: 2 additions & 0 deletions man/normalize.Rd


12 changes: 7 additions & 5 deletions man/robyn_modelselector.Rd

