size_tree_by: implement relative scaling

- Instead of relying on the raw data, perform the following operations: - Normalize values based on standard deviation, which ensures sizes are comparable across datasets and prevents large values from dominating. - Scale all values to be within `rescale_to`, which helps ensure consistent sizing and also handles negative values (some of which may be introduced by the previous operation). - The larger the standard deviation is, the larger the nodes can be. E.g., if `rescale_to` was `c(0, 100)` and the values in the data were `c(-2, -1, 0, 1, 2)`, the largest node would almost fill the page. - `rescale_to` is not currently an argument that can be provided by the user. There may be reasons to want to visualize size differences to a greater degree in the future (perhaps objective function value
metrumresearchgroup · Jan 15, 2025 · d6730dd · d6730dd
1 parent 71be083
commit d6730dd
Showing 1 changed file with 28 additions and 3 deletions.
diff --git a/R/model-tree.R b/R/model-tree.R
@@ -739,16 +739,41 @@ color_tree_by <- function(tree_data, color_by = "run"){
 #' another column.
 #' @inheritParams make_tree_tooltip
 #' @inheritParams model_tree
+#' @param rescale_to A numeric vector of length 2 specifying the range to rescale
+#'  `size_by` values to. Defaults to `c(1, 3)`, where `1` is the smallest node
+#'  size and `3` is the largest.
 #' @noRd
-size_tree_by <- function(tree_data, size_by = NULL){
+size_tree_by <- function(tree_data, size_by = NULL, rescale_to = c(1, 3)){
   if(!is.null(size_by)){
     checkmate::assert_true(size_by %in% names(tree_data))
+    checkmate::assert_numeric(rescale_to, len = 2, lower = 1)
+
     tree_data$node_size <- tree_data[[size_by]]
+
     # Scale size with numeric value
     if(inherits(tree_data$node_size, c("numeric", "integer"))){
-      # Set node sizes with NA values (including start node) to mean value
+      # Rescale values based on standard deviation
+      # - a large SD can lead to very large nodes
       mean_val <- mean(tree_data$node_size, na.rm = TRUE)
-      tree_data$node_size[is.na(tree_data$node_size)] <- mean_val
+      sd_val <- sd(tree_data$node_size, na.rm = TRUE)
+
+      if(sd_val != 0){
+        tree_data$node_size <- (tree_data$node_size - mean_val) / sd_val
+      }else{
+        # Would only happen if all values are the same
+        # - avoids dividing by 0
+        tree_data$node_size <- rep(1, length(tree_data$node_size))
+      }
+
+      # Rescale to specified range
+      # Note: node sizes cannot be negative
+      tree_data <- tree_data %>% dplyr::mutate(
+        node_size = scales::rescale(.data$node_size, to = rescale_to)
+      )
+
+      # Set node sizes with NA values (including start node) to mean rescale_to
+      tree_data$node_size[is.na(tree_data$node_size)] <- mean(rescale_to)
+
       # Set size_by attribute to node_size column
       attr(tree_data, "size_by") <- "node_size"
     }else{