From ddd32c7c92ba9f37c8505720129d7e10979ccc4c Mon Sep 17 00:00:00 2001 From: "Lars H. B. Olsen" <92097196+LHBO@users.noreply.github.com> Date: Thu, 18 Apr 2024 09:14:42 +0200 Subject: [PATCH] Restructured torch modules to support shapr installation without torch (#393) * Updated the mask generators * Fixed the Neural Network Modules * Fixed Dataset Utility Functions * Added check that progressr is installed, otherwise we proceed without a progress_bar * Fixed vaeac and memory layer * Missed default value in `mcar_mask_generator`. * Manuals * Typos in the vaeac vignettes * Updated the documentation to clearly state that a vaeac model cannot be moved from the folder it was trained in if one want to continue to train it. This is a limitation that I should consider fixing. But I am unsure how often this will occur. Also made sure that continue train works if the explanation object was trained by giving a path. * styler + lintr * Added self as global variable --- R/approach_vaeac.R | 22 +- R/approach_vaeac_torch_modules.R | 2375 +++++++++--------- R/zzz.R | 1 + man/memory_layer.Rd | 27 +- man/shapr-package.Rd | 4 +- man/vaeac_get_extra_para_default.Rd | 5 +- vignettes/understanding_shapr_vaeac.Rmd | 5 +- vignettes/understanding_shapr_vaeac.Rmd.orig | 5 +- 8 files changed, 1270 insertions(+), 1174 deletions(-) diff --git a/R/approach_vaeac.R b/R/approach_vaeac.R index e997d8f5f..4ba03ba20 100644 --- a/R/approach_vaeac.R +++ b/R/approach_vaeac.R @@ -438,7 +438,11 @@ vaeac_train_model <- function(x_train, best_vlb <- -Inf # Create a `progressr::progressor()` to keep track of the overall training time of the vaeac approach - progressr_bar <- progressr::progressor(steps = epochs_initiation_phase * (n_vaeacs_initialize - 1) + epochs) + if (requireNamespace("progressr", quietly = TRUE)) { + progressr_bar <- progressr::progressor(steps = epochs_initiation_phase * (n_vaeacs_initialize - 1) + epochs) + } else { + progressr_bar <- NULL + } # Iterate over the initializations. initialization_idx <- 1 @@ -835,9 +839,10 @@ vaeac_train_model_continue <- function(explanation, # Set seed for reproducibility set.seed(seed) - # Extract the vaeac list and load the model at the last epoch + # Extract the vaeac list and load the model at the last epoch or the best (default 'best' when path is provided) vaeac_model <- explanation$internal$parameters$vaeac - checkpoint <- torch::torch_load(vaeac_model$models$last) + vaeac_model_path <- if (!is.null(vaeac_model$models$last)) vaeac_model$models$last else vaeac_model$models$best + checkpoint <- torch::torch_load(vaeac_model_path) # Get which device we are to continue to train the model device <- ifelse(checkpoint$cuda, "cuda", "cpu") @@ -939,7 +944,11 @@ vaeac_train_model_continue <- function(explanation, state_list$epochs <- epochs # Create a `progressr::progressor()` to keep track of the new training - progressr_bar <- progressr::progressor(steps = epochs_new) + if (requireNamespace("progressr", quietly = TRUE)) { + progressr_bar <- progressr::progressor(steps = epochs_new) + } else { + progressr_bar <- NULL + } # Train the vaeac model for `epochs_new` number of epochs vaeac_tmp <- vaeac_train_model_auxiliary( @@ -1617,8 +1626,9 @@ vaeac_check_parameters <- function(x_train, #' then a name will be generated based on [base::Sys.time()] to ensure a unique name. We use [base::make.names()] to #' ensure a valid file name for all operating systems. #' @param vaeac.folder_to_save_model String (default is [base::tempdir()]). 
String specifying a path to a folder where -#' the function is to save the fitted vaeac model. Note that the path will be removed from the returned -#' [shapr::explain()] object if `vaeac.save_model = FALSE`. +#' the function is to save the fitted vaeac model. Note that the path will be removed from the returned +#' [shapr::explain()] object if `vaeac.save_model = FALSE`. Furthermore, the model cannot be moved from its +#' original folder if we are to use the [shapr::vaeac_train_model_continue()] function to continue training the model. #' @param vaeac.pretrained_vaeac_model List or String (default is `NULL`). 1) Either a list of class #' `vaeac`, i.e., the list stored in `explanation$internal$parameters$vaeac` where `explanation` is the returned list #' from an earlier call to the [shapr::explain()] function. 2) A string containing the path to where the `vaeac` diff --git a/R/approach_vaeac_torch_modules.R b/R/approach_vaeac_torch_modules.R index 45c89d3d6..e353327ab 100644 --- a/R/approach_vaeac_torch_modules.R +++ b/R/approach_vaeac_torch_modules.R @@ -1,5 +1,4 @@ -# VAEAC Model ========================================================================================================= -## vaeac -------------------------------------------------------------------------------------------------------------- +# VAEAC Model ========================================================================================================== #' Initializing a vaeac model #' #' @description Class that represents a vaeac model, i.e., the class creates the neural networks in the vaeac @@ -117,572 +116,531 @@ #' #' @author Lars Henry Berge Olsen #' @keywords internal -vaeac <- torch::nn_module( - - # Name of the torch::nn_module object - classname = "vaeac", - - # Initializing a vaeac model - initialize = function(one_hot_max_sizes, - width = 32, - depth = 3, - latent_dim = 8, - activation_function = torch::nn_relu, - skip_conn_layer = FALSE, - skip_conn_masked_enc_dec = FALSE, - batch_normalization = FALSE, - paired_sampling = FALSE, - mask_generator_name = c( - "mcar_mask_generator", - "specified_prob_mask_generator", - "specified_masks_mask_generator" - ), - masking_ratio = 0.5, - mask_gen_coalitions = NULL, - mask_gen_coalitions_prob = NULL, - sigma_mu = 1e4, - sigma_sigma = 1e-4) { - # Check that a valid mask_generator was provided. - mask_generator_name <- match.arg(mask_generator_name) - - # Get the number of features - n_features <- length(one_hot_max_sizes) - - # Extra strings to add to names of layers depending on if we use memory layers and/or batch normalization. - # If FALSE, they are just an empty string and do not effect the names. - name_extra_memory_layer <- ifelse(skip_conn_masked_enc_dec, "_and_memory", "") - name_extra_batch_normalize <- ifelse(batch_normalization, "_and_batch_norm", "") - - # Save some of the initializing hyperparameters to the vaeac object. Others are saved later. - self$one_hot_max_sizes <- one_hot_max_sizes - self$depth <- depth - self$width <- width - self$latent_dim <- latent_dim - self$activation_function <- activation_function - self$skip_conn_layer <- skip_conn_layer - self$skip_conn_masked_enc_dec <- skip_conn_masked_enc_dec - self$batch_normalization <- batch_normalization - self$sigma_mu <- sigma_mu - self$sigma_sigma <- sigma_sigma - self$paired_sampling <- paired_sampling - - # Save the how to compute the loss and how to sample from the vaeac model. 
- self$reconstruction_log_prob <- gauss_cat_loss(one_hot_max_sizes) - self$sampler_most_likely <- gauss_cat_sampler_most_likely(one_hot_max_sizes) - self$sampler_random <- gauss_cat_sampler_random(one_hot_max_sizes) - self$generative_parameters <- gauss_cat_parameters(one_hot_max_sizes) - self$n_features <- n_features - self$vlb_scale_factor <- 1 / n_features - - ##### Generate the mask generator - if (mask_generator_name == "mcar_mask_generator") { - # Create a mcar_mask_generator and attach it to the vaeac object. Note that masking_ratio is a singleton here. - self$mask_generator <- mcar_mask_generator( - masking_ratio = masking_ratio, - paired_sampling = paired_sampling - ) +vaeac <- function(one_hot_max_sizes, + width = 32, + depth = 3, + latent_dim = 8, + activation_function = torch::nn_relu, + skip_conn_layer = FALSE, + skip_conn_masked_enc_dec = FALSE, + batch_normalization = FALSE, + paired_sampling = FALSE, + mask_generator_name = c( + "mcar_mask_generator", + "specified_prob_mask_generator", + "specified_masks_mask_generator" + ), + masking_ratio = 0.5, + mask_gen_coalitions = NULL, + mask_gen_coalitions_prob = NULL, + sigma_mu = 1e4, + sigma_sigma = 1e-4) { + # Check that a valid mask_generator was provided. + mask_generator_name <- match.arg(mask_generator_name) + + vaeac_tmp <- torch::nn_module( + # Name of the torch::nn_module object + classname = "vaeac", + + # Initializing a vaeac model + initialize = function(one_hot_max_sizes, + width, + depth, + latent_dim, + activation_function, + skip_conn_layer, + skip_conn_masked_enc_dec, + batch_normalization, + paired_sampling, + mask_generator_name, + masking_ratio, + mask_gen_coalitions, + mask_gen_coalitions_prob, + sigma_mu, + sigma_sigma) { + # Get the number of features + n_features <- length(one_hot_max_sizes) + + # Extra strings to add to names of layers depending on if we use memory layers and/or batch normalization. + # If FALSE, they are just an empty string and do not effect the names. + name_extra_memory_layer <- ifelse(skip_conn_masked_enc_dec, "_and_memory", "") + name_extra_batch_normalize <- ifelse(batch_normalization, "_and_batch_norm", "") + + # Set up an environment that the memory_layer objects will use as "memory", i.e., where they store the tensors. + memory_layer_env <- new.env() + + # Save some of the initializing hyperparameters to the vaeac object. Others are saved later. + self$one_hot_max_sizes <- one_hot_max_sizes + self$depth <- depth + self$width <- width + self$latent_dim <- latent_dim + self$activation_function <- activation_function + self$skip_conn_layer <- skip_conn_layer + self$skip_conn_masked_enc_dec <- skip_conn_masked_enc_dec + self$batch_normalization <- batch_normalization + self$sigma_mu <- sigma_mu + self$sigma_sigma <- sigma_sigma + self$paired_sampling <- paired_sampling + + # Save the how to compute the loss and how to sample from the vaeac model. + self$reconstruction_log_prob <- gauss_cat_loss(one_hot_max_sizes) + self$sampler_most_likely <- gauss_cat_sampler_most_likely(one_hot_max_sizes) + self$sampler_random <- gauss_cat_sampler_random(one_hot_max_sizes) + self$generative_parameters <- gauss_cat_parameters(one_hot_max_sizes) + self$n_features <- n_features + self$vlb_scale_factor <- 1 / n_features + + ##### Generate the mask generator + if (mask_generator_name == "mcar_mask_generator") { + # Create a mcar_mask_generator and attach it to the vaeac object. Note that masking_ratio is a singleton here. 
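# Illustrative aside (base R, not from the shapr sources): an MCAR
# ("missing completely at random") mask is assumed here to flag each entry
# independently with the same probability, with `masking_ratio` playing the
# role of that probability; a 1 marks an entry as masked/unknown for vaeac.
illustrative_mcar_mask <- matrix(rbinom(5 * 3, size = 1, prob = 0.5), nrow = 5, ncol = 3)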
+ self$mask_generator <- mcar_mask_generator( + masking_ratio = masking_ratio, + paired_sampling = paired_sampling + ) - # Attach the masking ratio to the vaeac object. - self$masking_ratio <- masking_ratio - } else if (mask_generator_name == "specified_prob_mask_generator") { - # Create a specified_prob_mask_generator and attach it to the vaeac object. - # Note that masking_ratio is an array here. - self$mask_generator <- specified_prob_mask_generator( - masking_probs = masking_ratio, - paired_sampling = paired_sampling - ) + # Attach the masking ratio to the vaeac object. + self$masking_ratio <- masking_ratio + } else if (mask_generator_name == "specified_prob_mask_generator") { + # Create a specified_prob_mask_generator and attach it to the vaeac object. + # Note that masking_ratio is an array here. + self$mask_generator <- specified_prob_mask_generator( + masking_probs = masking_ratio, + paired_sampling = paired_sampling + ) + + # Attach the masking probabilities to the vaeac object. + self$masking_probs <- masking_ratio + } else if (mask_generator_name == "specified_masks_mask_generator") { + # Small check that they have been provided. + if (is.null(mask_gen_coalitions) | is.null(mask_gen_coalitions_prob)) { + stop(paste0( + "Both 'mask_gen_coalitions' and 'mask_gen_coalitions_prob' ", + "must be provided when using 'specified_masks_mask_generator'." + )) + } - # Attach the masking probabilities to the vaeac object. - self$masking_probs <- masking_ratio - } else if (mask_generator_name == "specified_masks_mask_generator") { - # Small check that they have been provided. - if (is.null(mask_gen_coalitions) | is.null(mask_gen_coalitions_prob)) { + # Create a specified_masks_mask_generator and attach it to the vaeac object. + self$mask_generator <- specified_masks_mask_generator( + masks = mask_gen_coalitions, + masks_probs = mask_gen_coalitions_prob, + paired_sampling = paired_sampling + ) + + # Save the possible masks and corresponding probabilities to the vaeac object. + self$masks <- mask_gen_coalitions + self$masks_probs <- mask_gen_coalitions_prob + } else { + # Print error to user. stop(paste0( - "Both 'mask_gen_coalitions' and 'mask_gen_coalitions_prob' ", - "must be provided when using 'specified_masks_mask_generator'." + "`mask_generator_name` must be one of 'mcar_mask_generator', 'specified_prob_mask_generator', or ", + "'specified_masks_mask_generator', and not '", mask_generator_name, "'." )) } - # Create a specified_masks_mask_generator and attach it to the vaeac object. - self$mask_generator <- specified_masks_mask_generator( - masks = mask_gen_coalitions, - masks_probs = mask_gen_coalitions_prob, - paired_sampling = paired_sampling - ) - - # Save the possible masks and corresponding probabilities to the vaeac object. - self$masks <- mask_gen_coalitions - self$masks_probs <- mask_gen_coalitions_prob - } else { - # Print error to user. - stop(paste0( - "`mask_generator_name` must be one of 'mcar_mask_generator', 'specified_prob_mask_generator', or ", - "'specified_masks_mask_generator', and not '", mask_generator_name, "'." 
- )) - } - - ##### Full Encoder - full_encoder_network <- torch::nn_sequential() + ##### Full Encoder + full_encoder_network <- torch::nn_sequential() - # Full Encoder: Input layer - full_encoder_network$add_module( - module = categorical_to_one_hot_layer(c(one_hot_max_sizes, rep(0, n_features)), seq(n_features)), - name = "input_layer_cat_to_one_hot" - ) - full_encoder_network$add_module( - module = torch::nn_linear( - in_features = sum(apply(rbind(one_hot_max_sizes, rep(1, n_features)), 2, max)) + n_features * 2, - out_features = width - ), - name = "input_layer_linear" - ) - full_encoder_network$add_module( - module = activation_function(), - name = "input_layer_layer_activation" - ) - if (batch_normalization) { + # Full Encoder: Input layer full_encoder_network$add_module( - module = torch::nn_batch_norm1d(width), - name = "input_layer_layer_batch_norm" + module = categorical_to_one_hot_layer(c(one_hot_max_sizes, rep(0, n_features)), seq(n_features)), + name = "input_layer_cat_to_one_hot" ) - } - - # Full Encoder: Hidden layers - for (i in seq(depth)) { - if (skip_conn_layer) { - # Add identity skip connection. Such that the input is added to the output of the linear layer - # and activation function: output = X + activation(WX + b). - full_encoder_network$add_module( - module = skip_connection( - torch::nn_linear(width, width), - activation_function(), - if (batch_normalization) torch::nn_batch_norm1d(width) - ), - name = paste0("hidden_layer_", i, "_skip_conn_with_linear_and_activation", name_extra_batch_normalize) - ) - } else { - # Do not use skip connections and do not add the input to the output. - full_encoder_network$add_module( - module = torch::nn_linear(width, width), - name = paste0("hidden_layer_", i, "_linear") - ) + full_encoder_network$add_module( + module = torch::nn_linear( + in_features = sum(apply(rbind(one_hot_max_sizes, rep(1, n_features)), 2, max)) + n_features * 2, + out_features = width + ), + name = "input_layer_linear" + ) + full_encoder_network$add_module( + module = activation_function(), + name = "input_layer_layer_activation" + ) + if (batch_normalization) { full_encoder_network$add_module( - module = activation_function(), - name = paste0("hidden_layer_", i, "_activation") + module = torch::nn_batch_norm1d(width), + name = "input_layer_layer_batch_norm" ) - if (batch_normalization) { + } + + # Full Encoder: Hidden layers + for (i in seq(depth)) { + if (skip_conn_layer) { + # Add identity skip connection. Such that the input is added to the output of the linear layer + # and activation function: output = X + activation(WX + b). full_encoder_network$add_module( - module = torch::nn_batch_norm1d(width), - name = paste0("hidden_layer_", i, "_batch_norm") + module = skip_connection( + torch::nn_linear(width, width), + activation_function(), + if (batch_normalization) torch::nn_batch_norm1d(width) + ), + name = paste0("hidden_layer_", i, "_skip_conn_with_linear_and_activation", name_extra_batch_normalize) ) + } else { + # Do not use skip connections and do not add the input to the output. 
+ full_encoder_network$add_module( + module = torch::nn_linear(width, width), + name = paste0("hidden_layer_", i, "_linear") + ) + full_encoder_network$add_module( + module = activation_function(), + name = paste0("hidden_layer_", i, "_activation") + ) + if (batch_normalization) { + full_encoder_network$add_module( + module = torch::nn_batch_norm1d(width), + name = paste0("hidden_layer_", i, "_batch_norm") + ) + } } } - } - # Full Encoder: Go to latent space - full_encoder_network$add_module( - module = torch::nn_linear(width, latent_dim * 2), - name = "latent_space_layer_linear" - ) + # Full Encoder: Go to latent space + full_encoder_network$add_module( + module = torch::nn_linear(width, latent_dim * 2), + name = "latent_space_layer_linear" + ) - ##### Masked Encoder - masked_encoder_network <- torch::nn_sequential() + ##### Masked Encoder + masked_encoder_network <- torch::nn_sequential() - # Masked Encoder: Input layer - masked_encoder_network$add_module( - module = categorical_to_one_hot_layer(c(one_hot_max_sizes, rep(0, n_features))), - name = "input_layer_cat_to_one_hot" - ) - if (skip_conn_masked_enc_dec) { + # Masked Encoder: Input layer masked_encoder_network$add_module( - module = memory_layer("#input"), - name = "input_layer_memory" + module = categorical_to_one_hot_layer(c(one_hot_max_sizes, rep(0, n_features))), + name = "input_layer_cat_to_one_hot" ) - } - masked_encoder_network$add_module( - module = torch::nn_linear( - in_features = sum(apply(rbind(one_hot_max_sizes, rep(1, n_features)), 2, max)) + n_features, - out_features = width - ), - name = "input_layer_linear" - ) - masked_encoder_network$add_module( - module = activation_function(), - name = "input_layer_activation" - ) - if (batch_normalization) { + if (skip_conn_masked_enc_dec) { + masked_encoder_network$add_module( + module = memory_layer(id = "#input", shared_env = memory_layer_env), + name = "input_layer_memory" + ) + } masked_encoder_network$add_module( - module = torch::nn_batch_norm1d(width), - name = "input_layer_batch_norm" + module = torch::nn_linear( + in_features = sum(apply(rbind(one_hot_max_sizes, rep(1, n_features)), 2, max)) + n_features, + out_features = width + ), + name = "input_layer_linear" ) - } - - # Masked Encoder: Hidden layers - for (i in seq(depth)) { - if (skip_conn_layer) { - # Add identity skip connection. Such that the input is added to the output of the linear layer - # and activation function: output = X + activation(WX + b). - # Also check inside skip_connection if we are to use memory_layer. I.e., skip connection with - # concatenation from masked encoder to decoder. + masked_encoder_network$add_module( + module = activation_function(), + name = "input_layer_activation" + ) + if (batch_normalization) { masked_encoder_network$add_module( - module = skip_connection( - if (skip_conn_masked_enc_dec) memory_layer(paste0("#", i)), - torch::nn_linear(width, width), - activation_function() - ), - name = paste0("hidden_layer_", i, "_skip_conn_with_linear_and_activation", name_extra_memory_layer) + module = torch::nn_batch_norm1d(width), + name = "input_layer_batch_norm" ) - if (batch_normalization) { + } + + # Masked Encoder: Hidden layers + for (i in seq(depth)) { + if (skip_conn_layer) { + # Add identity skip connection. Such that the input is added to the output of the linear layer + # and activation function: output = X + activation(WX + b). + # Also check inside skip_connection if we are to use memory_layer. I.e., skip connection with + # concatenation from masked encoder to decoder. 
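# Illustrative aside (not the shapr implementation): a minimal sketch of the
# memory-layer idea used below, assuming the behaviour described in these
# comments. A layer created with a given `id` and `shared_env` stores its input
# in the shared environment on the masked-encoder side, while its decoder twin
# (created with `output = TRUE`) retrieves the stored tensor and concatenates
# it to its own input, giving the U-net style skip connection.
sketch_memory_layer <- torch::nn_module(
  classname = "sketch_memory_layer",
  initialize = function(id, shared_env, output = FALSE) {
    self$id <- id
    self$shared_env <- shared_env
    self$output <- output
  },
  forward = function(x) {
    if (!self$output) {
      assign(self$id, x, envir = self$shared_env) # Encoder side: remember the input tensor
      x
    } else {
      stored <- get(self$id, envir = self$shared_env) # Decoder side: fetch the remembered tensor
      torch::torch_cat(list(stored, x), dim = -1) # Concatenate along the feature dimension
    }
  }
)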
masked_encoder_network$add_module( - module = torch::nn_batch_norm1d(width), - name = paste0("hidden_layer_", i, "_batch_norm") + module = skip_connection( + if (skip_conn_masked_enc_dec) memory_layer(id = paste0("#", i), shared_env = memory_layer_env), + torch::nn_linear(width, width), + activation_function() + ), + name = paste0("hidden_layer_", i, "_skip_conn_with_linear_and_activation", name_extra_memory_layer) ) - } - } else { - # Do not use skip connections and do not add the input to the output. - if (skip_conn_masked_enc_dec) { + if (batch_normalization) { + masked_encoder_network$add_module( + module = torch::nn_batch_norm1d(width), + name = paste0("hidden_layer_", i, "_batch_norm") + ) + } + } else { + # Do not use skip connections and do not add the input to the output. + if (skip_conn_masked_enc_dec) { + masked_encoder_network$add_module( + module = memory_layer(id = paste0("#", i), shared_env = memory_layer_env), + name = paste0("hidden_layer_", i, "_memory") + ) + } masked_encoder_network$add_module( - module = memory_layer(paste0("#", i)), - name = paste0("hidden_layer_", i, "_memory") + module = torch::nn_linear(width, width), + name = paste0("hidden_layer_", i, "_linear") ) - } - masked_encoder_network$add_module( - module = torch::nn_linear(width, width), - name = paste0("hidden_layer_", i, "_linear") - ) - masked_encoder_network$add_module( - module = activation_function(), - name = paste0("hidden_layer_", i, "_activation") - ) - if (batch_normalization) { masked_encoder_network$add_module( - module = torch::nn_batch_norm1d(width), - name = paste0("hidden_layer_", i, "_batch_norm") + module = activation_function(), + name = paste0("hidden_layer_", i, "_activation") ) + if (batch_normalization) { + masked_encoder_network$add_module( + module = torch::nn_batch_norm1d(width), + name = paste0("hidden_layer_", i, "_batch_norm") + ) + } } } - } - # Masked Encoder: Go to latent space - if (skip_conn_masked_enc_dec) { + # Masked Encoder: Go to latent space + if (skip_conn_masked_enc_dec) { + masked_encoder_network$add_module( + module = memory_layer(id = paste0("#", depth + 1), shared_env = memory_layer_env), + name = "latent_space_layer_memory" + ) + } masked_encoder_network$add_module( - module = memory_layer(paste0("#", depth + 1)), - name = "latent_space_layer_memory" + module = torch::nn_linear(width, 2 * latent_dim), + name = "latent_space_layer_linear" ) - } - masked_encoder_network$add_module( - module = torch::nn_linear(width, 2 * latent_dim), - name = "latent_space_layer_linear" - ) - ##### Decoder - decoder_network <- torch::nn_sequential() + ##### Decoder + decoder_network <- torch::nn_sequential() - # Decoder: Go from latent space - decoder_network$add_module( - module = torch::nn_linear(latent_dim, width), - name = "latent_space_layer_linear" - ) - decoder_network$add_module( - module = activation_function(), - name = "latent_space_layer_activation" - ) - if (batch_normalization) { + # Decoder: Go from latent space decoder_network$add_module( - module = torch::nn_batch_norm1d(width), - name = "latent_space_layer_batch_norm" + module = torch::nn_linear(latent_dim, width), + name = "latent_space_layer_linear" ) - } - - # Get the width of the hidden layers in the decoder. Needs to be multiplied with two if - # we use skip connections between masked encoder and decoder as we concatenate the tensors. - width_decoder <- ifelse(skip_conn_masked_enc_dec, 2 * width, width) - - # Same for the input dimension to the last layer in decoder that yields the distribution params. 
- extra_params_skip_con_mask_enc <- - ifelse(test = skip_conn_masked_enc_dec, - yes = sum(apply(rbind(one_hot_max_sizes, rep(1, n_features)), 2, max)) + n_features, - no = 0 + decoder_network$add_module( + module = activation_function(), + name = "latent_space_layer_activation" ) - - # Will need an extra hidden layer if we use skip connection from masked encoder to decoder - # as we send the full input layer of the masked encoder to the last layer in the decoder. - depth_decoder <- ifelse(skip_conn_masked_enc_dec, depth + 1, depth) - - # Decoder: Hidden layers - for (i in seq(depth_decoder)) { - if (skip_conn_layer) { - # Add identity skip connection. Such that the input is added to the output of the linear layer and activation - # function: output = X + activation(WX + b). Also check inside skip_connection if we are to use memory_layer. - # I.e., skip connection with concatenation from masked encoder to decoder. If TRUE, then the memory layers - # extracts the corresponding input used in the masked encoder and concatenate them with the current input. Note - # that we add the memory layers in the opposite direction from how they were created. So, we get a classical - # U-net with latent space at the bottom and a connection between the layers on the same height of the U-shape. + if (batch_normalization) { decoder_network$add_module( - module = torch::nn_sequential( - skip_connection( - if (skip_conn_masked_enc_dec) { - memory_layer(paste0("#", depth - i + 2), TRUE) - }, - torch::nn_linear(width_decoder, width), - activation_function() - ) - ), - name = paste0("hidden_layer_", i, "_skip_conn_with_linear_and_activation", name_extra_memory_layer) + module = torch::nn_batch_norm1d(width), + name = "latent_space_layer_batch_norm" ) - if (batch_normalization) { + } + + # Get the width of the hidden layers in the decoder. Needs to be multiplied with two if + # we use skip connections between masked encoder and decoder as we concatenate the tensors. + width_decoder <- ifelse(skip_conn_masked_enc_dec, 2 * width, width) + + # Same for the input dimension to the last layer in decoder that yields the distribution params. + extra_params_skip_con_mask_enc <- + ifelse(test = skip_conn_masked_enc_dec, + yes = sum(apply(rbind(one_hot_max_sizes, rep(1, n_features)), 2, max)) + n_features, + no = 0 + ) + + # Will need an extra hidden layer if we use skip connection from masked encoder to decoder + # as we send the full input layer of the masked encoder to the last layer in the decoder. + depth_decoder <- ifelse(skip_conn_masked_enc_dec, depth + 1, depth) + + # Decoder: Hidden layers + for (i in seq(depth_decoder)) { + if (skip_conn_layer) { + # Add identity skip connection. Such that the input is added to the output of the linear layer and activation + # function: output = X + activation(WX + b). Also check inside skip_connection if we are to use memory_layer. + # I.e., skip connection with concatenation from masked encoder to decoder. If TRUE, then the memory layers + # extracts the corresponding input used in the masked encoder and concatenate them with the current input. + # We add the memory layers in the opposite direction from how they were created. Thus, we get a classical + # U-net with latent space at the bottom and a connection between the layers on the same height of the U-shape. 
decoder_network$add_module( - module = torch::nn_batch_norm1d(n_features = width), - name = paste0("hidden_layer_", i, "_batch_norm") + module = torch::nn_sequential( + skip_connection( + if (skip_conn_masked_enc_dec) { + memory_layer(id = paste0("#", depth - i + 2), shared_env = memory_layer_env, output = TRUE) + }, + torch::nn_linear(width_decoder, width), + activation_function() + ) + ), + name = paste0("hidden_layer_", i, "_skip_conn_with_linear_and_activation", name_extra_memory_layer) ) - } - } else { - # Do not use skip connections and do not add the input to the output. - if (skip_conn_masked_enc_dec) { + if (batch_normalization) { + decoder_network$add_module( + module = torch::nn_batch_norm1d(n_features = width), + name = paste0("hidden_layer_", i, "_batch_norm") + ) + } + } else { + # Do not use skip connections and do not add the input to the output. + if (skip_conn_masked_enc_dec) { + decoder_network$add_module( + module = memory_layer(id = paste0("#", depth - i + 2), shared_env = memory_layer_env, output = TRUE), + name = paste0("hidden_layer_", i, "_memory") + ) + } decoder_network$add_module( - module = memory_layer(paste0("#", depth - i + 2), TRUE), - name = paste0("hidden_layer_", i, "_memory") + module = torch::nn_linear(width_decoder, width), + name = paste0("hidden_layer_", i, "_linear") ) - } - decoder_network$add_module( - module = torch::nn_linear(width_decoder, width), - name = paste0("hidden_layer_", i, "_linear") - ) - decoder_network$add_module( - module = activation_function(), - name = paste0("hidden_layer_", i, "_activation") - ) - if (batch_normalization) { decoder_network$add_module( - module = torch::nn_batch_norm1d(width), - name = paste0("hidden_layer_", i, "_batch_norm") + module = activation_function(), + name = paste0("hidden_layer_", i, "_activation") ) + if (batch_normalization) { + decoder_network$add_module( + module = torch::nn_batch_norm1d(width), + name = paste0("hidden_layer_", i, "_batch_norm") + ) + } } } - } - # Decoder: Go the parameter space of the generative distributions - # Concatenate the input to the first layer of the masked encoder to the last layer of the decoder network. - if (skip_conn_masked_enc_dec) { + # Decoder: Go the parameter space of the generative distributions + # Concatenate the input to the first layer of the masked encoder to the last layer of the decoder network. + if (skip_conn_masked_enc_dec) { + decoder_network$add_module( + module = memory_layer(id = "#input", shared_env = memory_layer_env, output = TRUE), + name = "output_layer_memory" + ) + } + # Linear layer to the parameters of the generative distributions Gaussian and Categorical. + # Note that sum(apply(rbind(one_hot_max_sizes, rep(1, n_features)), 2, max)) is the number of + # one hot variables to the masked encoder and n_features represents the binary variables if + # the features was masked/missing or not when they entered the masked encoder. + # The output dimension is 2 for the continuous features and K_i for categorical feature X_i, + # where K_i is the number of classes the i'th categorical feature can take on. decoder_network$add_module( - module = memory_layer("#input", TRUE), - name = "output_layer_memory" + module = torch::nn_linear( + in_features = width + extra_params_skip_con_mask_enc, + out_features = sum(apply(rbind(one_hot_max_sizes, rep(2, n_features)), 2, max)) + ), + name = "output_layer_linear" ) - } - # Linear layer to the parameters of the generative distributions Gaussian and Categorical. 
- # Note that sum(apply(rbind(one_hot_max_sizes, rep(1, n_features)), 2, max)) is the number of - # one hot variables to the masked encoder and n_features represents the binary variables if - # the features was masked/missing or not when they entered the masked encoder. - # The output dimension is 2 for the continuous features and K_i for categorical feature X_i, - # where K_i is the number of classes the i'th categorical feature can take on. - decoder_network$add_module( - module = torch::nn_linear( - in_features = width + extra_params_skip_con_mask_enc, - out_features = sum(apply(rbind(one_hot_max_sizes, rep(2, n_features)), 2, max)) - ), - name = "output_layer_linear" - ) - # Save the networks to the vaeac object - self$full_encoder_network <- full_encoder_network - self$masked_encoder_network <- masked_encoder_network - self$decoder_network <- decoder_network - - # Compute the number of trainable parameters in the different networks and save them - n_para_full_encoder <- sum(sapply(full_encoder_network$parameters, function(p) prod(p$size()))) - n_para_masked_encoder <- sum(sapply(masked_encoder_network$parameters, function(p) prod(p$size()))) - n_para_decoder <- sum(sapply(decoder_network$parameters, function(p) prod(p$size()))) - n_para_total <- n_para_full_encoder + n_para_masked_encoder + n_para_decoder - self$n_train_param <- rbind(n_para_total, n_para_full_encoder, n_para_masked_encoder, n_para_decoder) - }, - - # Forward functions are required in torch::nn_modules, but is it not needed in the way we have implemented vaeac. - forward = function(...) { - warning("NO FORWARD FUNCTION IMPLEMENTED FOR VAEAC.") - return("NO FORWARD FUNCTION IMPLEMENTED FOR VAEAC.") - }, - - # Apply Mask to Batch to Create Observed Batch - # - # description Clones the batch and applies the mask to set masked entries to 0 to create the observed batch. - # - # param batch Tensor of dimension batch_size x n_features containing a batch of observations. - # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. - make_observed = function(batch, mask) { - observed <- batch$clone()$detach() # Clone and detach the batch from the graph (remove gradient element) - observed[mask == 1] <- 0 # Apply the mask by masking every entry in batch where 'mask' is 1. - return(observed) # Return the observed batch where masked entries are set to 0. - }, - - # Compute the Latent Distributions Inferred by the Encoders - # - # description Compute the parameters for the latent normal distributions inferred by the encoders. - # If `only_masked_encoder = TRUE`, then we only compute the latent normal distributions inferred by the - # masked encoder. This is used in the deployment phase when we do not have access to the full observation. - # - # param batch Tensor of dimension batch_size x n_features containing a batch of observations. - # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. - # param only_masked_encoder Boolean. If we are only to compute the latent distributions for the masked encoder. - # Used in deployment phase when we do not have access to the full data. Always FALSE in the training phase. - make_latent_distributions = function(batch, mask, only_masked_encoder = FALSE) { - # Artificially mask the observations where mask == 1 to create the observed batch values. 
- observed <- self$make_observed(batch = batch, mask = mask) - - # Check if we are in training or deployment phase - if (only_masked_encoder) { - # In deployment phase and only use the masked encoder. - full_encoder <- NULL - } else { - # In the training phase where we need to use both masked and full encoder. - - # Column bind the batch and the mask to create the full information sent to the full encoder. - full_info <- torch::torch_cat(c(batch, mask), dim = 2) - - # Send the full_information through the full encoder. It needs the full information to know if a - # value is missing or just masked. The output tensor is of shape batch_size x (2 x latent_dim) - # In each row, i.e., each observation in the batch, the first latent_dim entries are the means mu - # while the last latent_dim entries are the softplus of the sigmas, so they can take on any - # negative or positive value. Recall that softplus(x) = ln(1+e^{x}). - full_encoder_params <- self$full_encoder_network(full_info) - - # Takes the full_encoder_parameters and returns a normal distribution, which is component-wise - # independent. If sigma (after softmax transform) is less than 1e-3, then we set sigma to 0.001. - full_encoder <- vaeac_normal_parse_params(params = full_encoder_params, min_sigma = 1e-3) - } + # Save the networks to the vaeac object + self$full_encoder_network <- full_encoder_network + self$masked_encoder_network <- masked_encoder_network + self$decoder_network <- decoder_network + + # Compute the number of trainable parameters in the different networks and save them + n_para_full_encoder <- sum(sapply(full_encoder_network$parameters, function(p) prod(p$size()))) + n_para_masked_encoder <- sum(sapply(masked_encoder_network$parameters, function(p) prod(p$size()))) + n_para_decoder <- sum(sapply(decoder_network$parameters, function(p) prod(p$size()))) + n_para_total <- n_para_full_encoder + n_para_masked_encoder + n_para_decoder + self$n_train_param <- rbind(n_para_total, n_para_full_encoder, n_para_masked_encoder, n_para_decoder) + }, + + # Forward functions are required in torch::nn_modules, but is it not needed in the way we have implemented vaeac. + forward = function(...) { + warning("NO FORWARD FUNCTION IMPLEMENTED FOR VAEAC.") + return("NO FORWARD FUNCTION IMPLEMENTED FOR VAEAC.") + }, + + # Apply Mask to Batch to Create Observed Batch + # + # description Clones the batch and applies the mask to set masked entries to 0 to create the observed batch. + # + # param batch Tensor of dimension batch_size x n_features containing a batch of observations. + # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. + make_observed = function(batch, mask) { + observed <- batch$clone()$detach() # Clone and detach the batch from the graph (remove gradient element) + observed[mask == 1] <- 0 # Apply the mask by masking every entry in batch where 'mask' is 1. + return(observed) # Return the observed batch where masked entries are set to 0. + }, + + # Compute the Latent Distributions Inferred by the Encoders + # + # description Compute the parameters for the latent normal distributions inferred by the encoders. + # If `only_masked_encoder = TRUE`, then we only compute the latent normal distributions inferred by the + # masked encoder. This is used in the deployment phase when we do not have access to the full observation. + # + # param batch Tensor of dimension batch_size x n_features containing a batch of observations. 
+ # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. + # param only_masked_encoder Boolean. If we are only to compute the latent distributions for the masked encoder. + # Used in deployment phase when we do not have access to the full data. Always FALSE in the training phase. + make_latent_distributions = function(batch, mask, only_masked_encoder = FALSE) { + # Artificially mask the observations where mask == 1 to create the observed batch values. + observed <- self$make_observed(batch = batch, mask = mask) + + # Check if we are in training or deployment phase + if (only_masked_encoder) { + # In deployment phase and only use the masked encoder. + full_encoder <- NULL + } else { + # In the training phase where we need to use both masked and full encoder. + + # Column bind the batch and the mask to create the full information sent to the full encoder. + full_info <- torch::torch_cat(c(batch, mask), dim = 2) + + # Send the full_information through the full encoder. It needs the full information to know if a + # value is missing or just masked. The output tensor is of shape batch_size x (2 x latent_dim) + # In each row, i.e., each observation in the batch, the first latent_dim entries are the means mu + # while the last latent_dim entries are the softplus of the sigmas, so they can take on any + # negative or positive value. Recall that softplus(x) = ln(1+e^{x}). + full_encoder_params <- self$full_encoder_network(full_info) + + # Takes the full_encoder_parameters and returns a normal distribution, which is component-wise + # independent. If sigma (after softmax transform) is less than 1e-3, then we set sigma to 0.001. + full_encoder <- vaeac_normal_parse_params(params = full_encoder_params, min_sigma = 1e-3) + } - # Column bind the batch and the mask to create the observed information sent to the masked encoder. - observed_info <- torch::torch_cat(c(observed, mask), dim = -1) - - # Compute the latent normal dist parameters (mu, sigma) for the masked - # encoder by sending the observed values and the mask to the masked encoder. - masked_encoder_params <- self$masked_encoder_network(observed_info) - - # Create the latent normal distributions based on the parameters (mu, sigma) from the masked encoder - masked_encoder <- vaeac_normal_parse_params(params = masked_encoder_params, min_sigma = 1e-3) - - # Return the full and masked encoders - return(list( - full_encoder = full_encoder, - masked_encoder = masked_encoder - )) - }, - - # Compute the Regularizes for the Latent Distribution Inferred by the Masked Encoder. - # - # description The masked encoder (prior) distribution regularization in the latent space. - # This is used to compute the extended variational lower bound used to train vaeac, see - # Section 3.3.1 in Olsen et al. (2022). - # Though regularizing prevents the masked encoder distribution parameters from going to infinity, - # the model usually doesn't diverge even without this regularization. It almost doesn't affect - # learning process near zero with default regularization parameters which are recommended to be used. - # - # param masked_encoder The torch_Normal object returned when calling the masked encoder. - masked_encoder_regularization = function(masked_encoder) { - # Extract the number of observations. Same as batch_size. - n_observations <- masked_encoder$mean$shape[1] - - # Extract the number of dimension in the latent space. 
- n_latent_dimensions <- masked_encoder$mean$shape[2] - - # Extract means and ensure correct shape (batch_size x latent_dim). - mu <- masked_encoder$mean$view(c(n_observations, n_latent_dimensions)) - - # Extract the sigmas and ensure correct shape (batch_size x latent_dim). - sigma <- masked_encoder$scale$view(c(n_observations, n_latent_dimensions)) - - # Note that sum(-1) indicates that we sum together the columns. - # mu_regularizer is then a tensor of length n_observations - mu_regularizer <- -(mu^2)$sum(-1) / (2 * self$sigma_mu^2) - - # sigma_regularizer is then also a tensor of length n_observations. - sigma_regularizer <- (sigma$log() - sigma)$sum(-1) * self$sigma_sigma - - # Add the regularization terms together and return them. - return(mu_regularizer + sigma_regularizer) - }, - - # Compute the Variational Lower Bound for the Observations in the Batch - # - # description Compute differentiable lower bound for the given batch of objects and mask. - # Used as the (negative) loss function for training the vaeac model. - # - # param batch Tensor of dimension batch_size x n_features containing a batch of observations. - # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. - batch_vlb = function(batch, mask) { - # Compute the latent normal distributions obtained from the full and masked encoder - encoders_list <- self$make_latent_distributions(batch = batch, mask = mask) - - # Extract the masked and full encoders. These are torch_Normal objects. - masked_encoder <- encoders_list$masked_encoder - full_encoder <- encoders_list$full_encoder - - # Apply the regularization on the mus and sigmas of the normal dist obtained from the masked encoder - # such that they don't blow up. Regularized according to their normal gamma prior, see Olsen et al. (2022). - masked_encoder_regularization <- self$masked_encoder_regularization(masked_encoder) - - # To use the reparameterization trick to train vaeac, we need to use 'rsample' - # and not 'sample', which allows backpropagation through the mean and standard deviation layers, - # see https://pytorch.org/docs/stable/distributions.html#pathwise-derivative. - # For each training instance in the batch we sample values for each of the latent variables, - # i.e., we get a tensor of dimension batch_size x latent_dim. - latent <- full_encoder$rsample() - - # Send the latent samples through the decoder and get the batch_size x 2*n_features (in cont case) - # where we for each row have a normal dist on each feature The form will be (mu_1, sigma_1, ..., mu_p, sigma_p) - reconstruction_params <- self$decoder_network(latent) - - # Compute the reconstruction loss, i.e., the log likelihood of only the masked values in - # the batch (true values) given the current reconstruction parameters from the decoder. - # We do not consider the log likelihood of observed or missing/nan values. - reconstruction_loss <- self$reconstruction_log_prob(batch, reconstruction_params, mask) - - # Compute the KL divergence between the two latent normal distributions obtained from the full encoder - # and masked encoder. Since the networks create MVN with diagonal covariance matrices, that is, the same as - # a product of individual Gaussian distributions, we can compute KL analytically very easily: - # KL(p, q) = \int p(x) log(p(x)/q(x)) dx - # = 0.5 * { (sigma_p/sigma_q)^2 + (mu_q - mu_p)^2/sigma_q^2 - 1 + 2 ln (sigma_q/sigma_p)} - # when both p and q are torch_Normal objects. 
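# Illustrative aside (plain base R, not from the shapr sources): the closed-form
# KL expression quoted above, written as a hypothetical helper `kl_normal_normal()`
# for univariate normals, together with a Monte Carlo sanity check. The form below
# is algebraically identical to the bracketed 0.5 * {...} expression in the comment.
kl_normal_normal <- function(mu_p, sigma_p, mu_q, sigma_q) {
  log(sigma_q / sigma_p) + (sigma_p^2 + (mu_p - mu_q)^2) / (2 * sigma_q^2) - 0.5
}
set.seed(1)
x <- rnorm(1e5, mean = 0.3, sd = 1.2) # Samples from p = N(0.3, 1.2^2)
mean(dnorm(x, 0.3, 1.2, log = TRUE) - dnorm(x, -0.5, 0.8, log = TRUE)) # E_p[log p - log q]
kl_normal_normal(0.3, 1.2, -0.5, 0.8) # Analytic value; the two numbers should roughly agree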
- kl <- vaeac_kl_normal_normal(full_encoder, masked_encoder)$view(c(batch$shape[1], -1))$sum(-1) - - # Return the variational lower bound with the prior regularization. See Section 3.3.1 in Olsen et al. (2022) - return(reconstruction_loss - kl + masked_encoder_regularization) - }, - - # Compute the Importance Sampling Estimator for the Observations in the Batch - # - # description Compute IWAE log likelihood estimate with K samples per object. - # - # details Technically, it is differentiable, but it is recommended to use it for - # evaluation purposes inside torch.no_grad in order to save memory. With torch::with_no_grad - # the method almost doesn't require extra memory for very large K. The method makes K independent - # passes through decoder network, so the batch size is the same as for training with batch_vlb. - # IWAE is an abbreviation for Importance Sampling Estimator - # log p_{theta, psi}(x|y) approx - # log {1/K * sum_{i=1}^K [p_theta(x|z_i, y) * p_psi(z_i|y) / q_phi(z_i|x,y)]} = - # log {sum_{i=1}^K exp(log[p_theta(x|z_i, y) * p_psi(z_i|y) / q_phi(z_i|x,y)])} - log(K) = - # log {sum_{i=1}^K exp(log[p_theta(x|z_i, y)] + log[p_psi(z_i|y)] - log[q_phi(z_i|x,y)])} - log(K) = - # logsumexp(log[p_theta(x|z_i, y)] + log[p_psi(z_i|y)] - log[q_phi(z_i|x,y)]) - log(K) = - # logsumexp(rec_loss + prior_log_prob - proposal_log_prob) - log(K), - # where z_i ~ q_phi(z|x,y). - # - # param batch Tensor of dimension batch_size x n_features containing a batch of observations. - # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. - # param K Integer. The number of samples generated to compute the IWAE for each observation in `batch`. - batch_iwae = function(batch, mask, K) { - # Compute the latent normal distributions obtained from the full and masked encoder - encoders_list <- self$make_latent_distributions(batch = batch, mask = mask) - - # Extract the masked and full encoders. These are torch_Normal objects. - masked_encoder <- encoders_list$masked_encoder - full_encoder <- encoders_list$full_encoder - - # List to store the estimates. - estimates <- list() - - # Iterate over the number of samples/passes through the decoder for each validation observation. - for (i in seq(K)) { - # See equation 18 on page 18 in Ivanov et al. (2019). Create samples from the - # full encoder; z_i ~ q_phi(z|x,y). We get a tensor of dimension batch_size x latent_dim. + # Column bind the batch and the mask to create the observed information sent to the masked encoder. + observed_info <- torch::torch_cat(c(observed, mask), dim = -1) + + # Compute the latent normal dist parameters (mu, sigma) for the masked + # encoder by sending the observed values and the mask to the masked encoder. + masked_encoder_params <- self$masked_encoder_network(observed_info) + + # Create the latent normal distributions based on the parameters (mu, sigma) from the masked encoder + masked_encoder <- vaeac_normal_parse_params(params = masked_encoder_params, min_sigma = 1e-3) + + # Return the full and masked encoders + return(list( + full_encoder = full_encoder, + masked_encoder = masked_encoder + )) + }, + + # Compute the Regularizes for the Latent Distribution Inferred by the Masked Encoder. + # + # description The masked encoder (prior) distribution regularization in the latent space. + # This is used to compute the extended variational lower bound used to train vaeac, see + # Section 3.3.1 in Olsen et al. (2022). 
+ # Though regularizing prevents the masked encoder distribution parameters from going to infinity, + # the model usually doesn't diverge even without this regularization. It almost doesn't affect + # learning process near zero with default regularization parameters which are recommended to be used. + # + # param masked_encoder The torch_Normal object returned when calling the masked encoder. + masked_encoder_regularization = function(masked_encoder) { + # Extract the number of observations. Same as batch_size. + n_observations <- masked_encoder$mean$shape[1] + + # Extract the number of dimension in the latent space. + n_latent_dimensions <- masked_encoder$mean$shape[2] + + # Extract means and ensure correct shape (batch_size x latent_dim). + mu <- masked_encoder$mean$view(c(n_observations, n_latent_dimensions)) + + # Extract the sigmas and ensure correct shape (batch_size x latent_dim). + sigma <- masked_encoder$scale$view(c(n_observations, n_latent_dimensions)) + + # Note that sum(-1) indicates that we sum together the columns. + # mu_regularizer is then a tensor of length n_observations + mu_regularizer <- -(mu^2)$sum(-1) / (2 * self$sigma_mu^2) + + # sigma_regularizer is then also a tensor of length n_observations. + sigma_regularizer <- (sigma$log() - sigma)$sum(-1) * self$sigma_sigma + + # Add the regularization terms together and return them. + return(mu_regularizer + sigma_regularizer) + }, + + # Compute the Variational Lower Bound for the Observations in the Batch + # + # description Compute differentiable lower bound for the given batch of objects and mask. + # Used as the (negative) loss function for training the vaeac model. + # + # param batch Tensor of dimension batch_size x n_features containing a batch of observations. + # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. + batch_vlb = function(batch, mask) { + # Compute the latent normal distributions obtained from the full and masked encoder + encoders_list <- self$make_latent_distributions(batch = batch, mask = mask) + + # Extract the masked and full encoders. These are torch_Normal objects. + masked_encoder <- encoders_list$masked_encoder + full_encoder <- encoders_list$full_encoder + + # Apply the regularization on the mus and sigmas of the normal dist obtained from the masked encoder + # such that they don't blow up. Regularized according to their normal gamma prior, see Olsen et al. (2022). + masked_encoder_regularization <- self$masked_encoder_regularization(masked_encoder) + + # To use the reparameterization trick to train vaeac, we need to use 'rsample' + # and not 'sample', which allows backpropagation through the mean and standard deviation layers, + # see https://pytorch.org/docs/stable/distributions.html#pathwise-derivative. + # For each training instance in the batch we sample values for each of the latent variables, + # i.e., we get a tensor of dimension batch_size x latent_dim. latent <- full_encoder$rsample() # Send the latent samples through the decoder and get the batch_size x 2*n_features (in cont case) @@ -694,82 +652,158 @@ vaeac <- torch::nn_module( # We do not consider the log likelihood of observed or missing/nan values. reconstruction_loss <- self$reconstruction_log_prob(batch, reconstruction_params, mask) - # Compute the log likelihood of observing the sampled latent representations from - # the full_encoder when using the normal distribution estimated by the masked_encoder. 
- masked_encoder_log_prob <- masked_encoder$log_prob(latent) - - # Ensure dimensions batch$shape[1] x something. - masked_encoder_log_prob <- masked_encoder_log_prob$view(c(batch$shape[1], -1)) - - # Sum over the rows (last dimension), i.e., add the log-likelihood for each instance. - masked_encoder_log_prob <- masked_encoder_log_prob$sum(-1) - - # Same explanations here as above, but now for the full_encoder. - full_encoder_log_prob <- full_encoder$log_prob(latent) - full_encoder_log_prob <- full_encoder_log_prob$view(c(batch$shape[1], -1)) - full_encoder_log_prob <- full_encoder_log_prob$sum(-1) - - # Combine the estimated loss based on the formula from equation 18 on page 18 in Ivanov et al. (2019). - # Consists of batch.shape[0] number of values - estimate <- reconstruction_loss + masked_encoder_log_prob - full_encoder_log_prob - - # Make sure that the results are a column vector of height batch_size. - estimate <- estimate$unsqueeze(-1) + # Compute the KL divergence between the two latent normal distributions obtained from the full encoder + # and masked encoder. Since the networks create MVN with diagonal covariance matrices, that is, the same as + # a product of individual Gaussian distributions, we can compute KL analytically very easily: + # KL(p, q) = \int p(x) log(p(x)/q(x)) dx + # = 0.5 * { (sigma_p/sigma_q)^2 + (mu_q - mu_p)^2/sigma_q^2 - 1 + 2 ln (sigma_q/sigma_p)} + # when both p and q are torch_Normal objects. + kl <- vaeac_kl_normal_normal(full_encoder, masked_encoder)$view(c(batch$shape[1], -1))$sum(-1) + + # Return the variational lower bound with the prior regularization. See Section 3.3.1 in Olsen et al. (2022) + return(reconstruction_loss - kl + masked_encoder_regularization) + }, + + # Compute the Importance Sampling Estimator for the Observations in the Batch + # + # description Compute IWAE log likelihood estimate with K samples per object. + # + # details Technically, it is differentiable, but it is recommended to use it for + # evaluation purposes inside torch.no_grad in order to save memory. With torch::with_no_grad + # the method almost doesn't require extra memory for very large K. The method makes K independent + # passes through decoder network, so the batch size is the same as for training with batch_vlb. + # IWAE is an abbreviation for Importance Sampling Estimator + # log p_{theta, psi}(x|y) approx + # log {1/K * sum_{i=1}^K [p_theta(x|z_i, y) * p_psi(z_i|y) / q_phi(z_i|x,y)]} = + # log {sum_{i=1}^K exp(log[p_theta(x|z_i, y) * p_psi(z_i|y) / q_phi(z_i|x,y)])} - log(K) = + # log {sum_{i=1}^K exp(log[p_theta(x|z_i, y)] + log[p_psi(z_i|y)] - log[q_phi(z_i|x,y)])} - log(K) = + # logsumexp(log[p_theta(x|z_i, y)] + log[p_psi(z_i|y)] - log[q_phi(z_i|x,y)]) - log(K) = + # logsumexp(rec_loss + prior_log_prob - proposal_log_prob) - log(K), + # where z_i ~ q_phi(z|x,y). + # + # param batch Tensor of dimension batch_size x n_features containing a batch of observations. + # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. + # param K Integer. The number of samples generated to compute the IWAE for each observation in `batch`. + batch_iwae = function(batch, mask, K) { + # Compute the latent normal distributions obtained from the full and masked encoder + encoders_list <- self$make_latent_distributions(batch = batch, mask = mask) + + # Extract the masked and full encoders. These are torch_Normal objects. 
+ masked_encoder <- encoders_list$masked_encoder + full_encoder <- encoders_list$full_encoder + + # List to store the estimates. + estimates <- list() + + # Iterate over the number of samples/passes through the decoder for each validation observation. + for (i in seq(K)) { + # See equation 18 on page 18 in Ivanov et al. (2019). Create samples from the + # full encoder; z_i ~ q_phi(z|x,y). We get a tensor of dimension batch_size x latent_dim. + latent <- full_encoder$rsample() + + # Send the latent samples through the decoder and get the batch_size x 2*n_features (in cont case) + # where we for each row have a normal dist on each feature The form will be (mu_1, sigma_1, ..., mu_p, sigma_p) + reconstruction_params <- self$decoder_network(latent) + + # Compute the reconstruction loss, i.e., the log likelihood of only the masked values in + # the batch (true values) given the current reconstruction parameters from the decoder. + # We do not consider the log likelihood of observed or missing/nan values. + reconstruction_loss <- self$reconstruction_log_prob(batch, reconstruction_params, mask) + + # Compute the log likelihood of observing the sampled latent representations from + # the full_encoder when using the normal distribution estimated by the masked_encoder. + masked_encoder_log_prob <- masked_encoder$log_prob(latent) + + # Ensure dimensions batch$shape[1] x something. + masked_encoder_log_prob <- masked_encoder_log_prob$view(c(batch$shape[1], -1)) + + # Sum over the rows (last dimension), i.e., add the log-likelihood for each instance. + masked_encoder_log_prob <- masked_encoder_log_prob$sum(-1) + + # Same explanations here as above, but now for the full_encoder. + full_encoder_log_prob <- full_encoder$log_prob(latent) + full_encoder_log_prob <- full_encoder_log_prob$view(c(batch$shape[1], -1)) + full_encoder_log_prob <- full_encoder_log_prob$sum(-1) + + # Combine the estimated loss based on the formula from equation 18 on page 18 in Ivanov et al. (2019). + # Consists of batch.shape[0] number of values + estimate <- reconstruction_loss + masked_encoder_log_prob - full_encoder_log_prob + + # Make sure that the results are a column vector of height batch_size. + estimate <- estimate$unsqueeze(-1) + + # Add the results to the estimates list + estimates <- append(estimates, estimate) + } - # Add the results to the estimates list - estimates <- append(estimates, estimate) - } + # Convert from list of tensors to a single tensor using colum bind + estimates <- torch::torch_cat(estimates, -1) + + # Use the stabilizing trick logsumexp. + # We have worked on log-scale above, hence plus and minus and not multiplication and division, + # while Eq. 18 in Ivanov et al. (2019) work on regular scale with multiplication and division. + # We take the exp of the values to get back to original scale, then sum it and convert back to + # log scale. Note that we add -log(K) instead of dividing each term by K. + # Take the log sum exp along the rows (validation samples) then subtract log(K). + return(torch::torch_logsumexp(estimates, -1) - log(K)) + }, + + # Generate the Parameters of the Generative Distributions + # + # description Generate the parameters of the generative distributions for samples from the batch. + # + # details The function makes K latent representation for each object from the batch, send these + # latent representations through the decoder to obtain the parameters for the generative distributions. 
+ # I.e., means and variances for the normal distributions (continuous features) and probabilities + # for the categorical distribution (categorical features). + # The second axis is used to index samples for an object, i.e. if the batch shape is [n x D1 x D2], then + # the result shape is [n x K x D1 x D2]. It is better to use it inside torch::with_no_grad in order to save + # memory. With torch::with_no_grad the method doesn't require extra memory except the memory for the result. + # + # param batch Tensor of dimension batch_size x n_features containing a batch of observations. + # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. + # param K Integer. The number of imputations to be done for each observation in batch. + generate_samples_params = function(batch, mask, K = 1) { + # Compute the latent normal distributions obtained from only the masked encoder. + encoders_list <- self$make_latent_distributions(batch = batch, mask = mask, only_masked_encoder = TRUE) + + # Only extract the masked encoder (torch_Normal object) as we are in the deployment phase. + masked_encoder <- encoders_list$masked_encoder + + # Create a list to keep the sampled parameters. + samples_params <- list() + + # Iterate over the number of imputations for each observation in the batch. + for (i in seq(K)) { + latent <- masked_encoder$rsample() # Generate latent representations by using the masked encoder + sample_params <- self$decoder_network(latent) # Send the latent representations through the decoder + samples_params <- append(samples_params, sample_params$unsqueeze(2)) # Store the inferred Gaussian distributions + } - # Convert from list of tensors to a single tensor using colum bind - estimates <- torch::torch_cat(estimates, -1) - - # Use the stabilizing trick logsumexp. - # We have worked on log-scale above, hence plus and minus and not multiplication and division, - # while Eq. 18 in Ivanov et al. (2019) work on regular scale with multiplication and division. - # We take the exp of the values to get back to original scale, then sum it and convert back to - # log scale. Note that we add -log(K) instead of dividing each term by K. - # Take the log sum exp along the rows (validation samples) then subtract log(K). - return(torch::torch_logsumexp(estimates, -1) - log(K)) - }, - - # Generate the Parameters of the Generative Distributions - # - # description Generate the parameters of the generative distributions for samples from the batch. - # - # details The function makes K latent representation for each object from the batch, send these - # latent representations through the decoder to obtain the parameters for the generative distributions. - # I.e., means and variances for the normal distributions (continuous features) and probabilities - # for the categorical distribution (categorical features). - # The second axis is used to index samples for an object, i.e. if the batch shape is [n x D1 x D2], then - # the result shape is [n x K x D1 x D2]. It is better to use it inside torch::with_no_grad in order to save - # memory. With torch::with_no_grad the method doesn't require extra memory except the memory for the result. - # - # param batch Tensor of dimension batch_size x n_features containing a batch of observations. - # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. - # param K Integer. The number of imputations to be done for each observation in batch. 
- generate_samples_params = function(batch, mask, K = 1) { - # Compute the latent normal distributions obtained from only the masked encoder. - encoders_list <- self$make_latent_distributions(batch = batch, mask = mask, only_masked_encoder = TRUE) - - # Only extract the masked encoder (torch_Normal object) as we are in the deployment phase. - masked_encoder <- encoders_list$masked_encoder - - # Create a list to keep the sampled parameters. - samples_params <- list() - - # Iterate over the number of imputations for each observation in the batch. - for (i in seq(K)) { - latent <- masked_encoder$rsample() # Generate latent representations by using the masked encoder - sample_params <- self$decoder_network(latent) # Send the latent representations through the decoder - samples_params <- append(samples_params, sample_params$unsqueeze(2)) # Store the inferred Gaussian distributions + # Concatenate the list to a 3d-tensor. 2nd dimensions is the imputations. + return(torch::torch_cat(samples_params, 2)) } + ) + return(vaeac_tmp( + one_hot_max_sizes = one_hot_max_sizes, + width = width, + depth = depth, + latent_dim = latent_dim, + activation_function = activation_function, + skip_conn_layer = skip_conn_layer, + skip_conn_masked_enc_dec = skip_conn_masked_enc_dec, + batch_normalization = batch_normalization, + paired_sampling = paired_sampling, + mask_generator_name = mask_generator_name, + masking_ratio = masking_ratio, + mask_gen_coalitions = mask_gen_coalitions, + mask_gen_coalitions_prob = mask_gen_coalitions_prob, + sigma_mu = sigma_mu, + sigma_sigma = sigma_sigma + )) +} - # Concatenate the list to a 3d-tensor. 2nd dimensions is the imputations. - return(torch::torch_cat(samples_params, 2)) - } -) - -# Dataset Utility Functions =========================================================================================== +# Dataset Utility Functions ============================================================================================ #' Compute Featurewise Means and Standard Deviations #' #' @description Returns the means and standard deviations for all continuous features in the data set. @@ -998,7 +1032,7 @@ vaeac_postprocess_data <- function(data, vaeac_model_state_list) { return(data) } -## vaeac_dataset ------------------------------------------------------------------------------------------------------ + #' Dataset used by the `vaeac` model #' #' @description @@ -1047,25 +1081,28 @@ vaeac_postprocess_data <- function(data, vaeac_model_state_list) { #' vaeac_iterator$.next() # batch2 #' vaeac_iterator$.next() # Empty #' } -vaeac_dataset <- torch::dataset( - name = "vaeac_dataset", # field name The name of the `torch::dataset`. - - # description Create a new vaeac_dataset object. - # param X A torch_tensor containing the data - # param one_hot_max_sizes A torch tensor of dimension p containing the one hot sizes of the p features. - # The sizes for the continuous features can either be '0' or '1'. 
-  initialize = function(X, one_hot_max_sizes) {
-    # Save the number of observations and features in X, the one hot dummy feature sizes and the dataset
-    self$N <- nrow(X)
-    self$p <- ncol(X)
-    self$one_hot_max_sizes <- one_hot_max_sizes
-    self$X <- X
-  },
-  .getbatch = function(index) self$X[index, , drop = FALSE], # Get a batch of data based on the provided indices
-  .length = function() nrow(self$X) # Get the number of observations in the dataset
-)
-
-## Paired Sampler ----------------------------------------------------------------------------------------------------
+vaeac_dataset <- function(X, one_hot_max_sizes) {
+  vaeac_dataset_tmp <- torch::dataset(
+    name = "vaeac_dataset", # field name The name of the `torch::dataset`.
+
+    # description Create a new vaeac_dataset object.
+    # param X A torch_tensor containing the data
+    # param one_hot_max_sizes A torch tensor of dimension p containing the one hot sizes of the p features.
+    # The sizes for the continuous features can either be '0' or '1'.
+    initialize = function(X, one_hot_max_sizes) {
+      # Save the number of observations and features in X, the one hot dummy feature sizes and the dataset
+      self$N <- nrow(X)
+      self$p <- ncol(X)
+      self$one_hot_max_sizes <- one_hot_max_sizes
+      self$X <- X
+    },
+    .getbatch = function(index) self$X[index, , drop = FALSE], # Get a batch of data based on the provided indices
+    .length = function() nrow(self$X) # Get the number of observations in the dataset
+  )
+  return(vaeac_dataset_tmp(X = X, one_hot_max_sizes = one_hot_max_sizes))
+}
+
+
 #' Sampling Paired Observations
 #'
 #' @description
@@ -1109,38 +1146,41 @@ vaeac_dataset <- torch::dataset(
 #' }
 #' @author Lars Henry Berge Olsen
 #' @keywords internal
-paired_sampler <- torch::sampler(
-  classname = "paired_sampler", # field Name of the paired sampler object
-  # description Initialize the paired_sampler object
-  initialize = function(vaeac_dataset_object, shuffle = FALSE) {
-    self$vaeac_dataset_object <- vaeac_dataset_object
-    self$shuffle <- shuffle
-  },
-  # description Get the number of observations in the datasaet
-  .length = function() length(self$vaeac_dataset_object) * 2, # Multiply by two do to get the actual number
-  # description Function to iterate over the data
-  .iter = function() {
-    n <- length(self$vaeac_dataset_object) # Get the number of observations in the data
-    indices <- if (self$shuffle) sample.int(n) else seq_len(n) # Check if randomly shuffle indices or increasing order
-    return(coro::as_iterator(rep(indices, each = 2))) # Duplicate each index and return an iterator
-  }
-)
+paired_sampler <- function(vaeac_dataset_object, shuffle = FALSE) {
+  paired_sampler_tmp <- torch::sampler(
+    classname = "paired_sampler", # field Name of the paired sampler object
+    # description Initialize the paired_sampler object
+    initialize = function(vaeac_dataset_object, shuffle = FALSE) {
+      self$vaeac_dataset_object <- vaeac_dataset_object
+      self$shuffle <- shuffle
+    },
+    # description Get the number of observations in the dataset
+    .length = function() length(self$vaeac_dataset_object) * 2, # Multiply by two to get the actual number
+    # description Function to iterate over the data
+    .iter = function() {
+      n <- length(self$vaeac_dataset_object) # Get the number of observations in the data
+      indices <- if (self$shuffle) sample.int(n) else seq_len(n) # Check if randomly shuffle indices or increasing order
+      return(coro::as_iterator(rep(indices, each = 2))) # Duplicate each index and return an iterator
+    }
+  )
+  
return(paired_sampler_tmp(vaeac_dataset_object = vaeac_dataset_object, shuffle = shuffle)) +} -# Neural Network Utility Functions ==================================================================================== -## memory_layer ------------------------------------------------------------------------------------------------------- +# Neural Network Utility Functions ===================================================================================== #' A [torch::nn_module()] Representing a Memory Layer #' -#' @description The layer is used to make skip-connections inside a [torch::nn_sequential] network -#' or between several [torch::nn_sequential] networks without unnecessary code complication. +#' @description The layer is used to make skip-connections inside a [torch::nn_sequential()] network +#' or between several [torch::nn_sequential()] networks without unnecessary code complication. #' -#' @details If `output = FALSE`, this layer stores its input in a static list `storage` with the key `id`` and then +#' @details If `output = FALSE`, this layer stores its input in the `shared_env` with the key `id` and then #' passes the input to the next layer. I.e., when memory layer is used in the masked encoder. If `output = TRUE`, this #' layer takes stored tensor from the storage. I.e., when memory layer is used in the decoder. If `add = TRUE`, it #' returns sum of the stored vector and an `input`, otherwise it returns their concatenation. If the tensor with #' specified `id` is not in storage when the layer with `output = TRUE` is called, it would cause an exception. #' #' @param id A unique id to use as a key in the storage list. +#' @param shared_env A shared environment for all instances of memory_layer where the inputs are stored. #' @param output Boolean variable indicating if the memory layer is to store input in storage or extract from storage. #' @param add Boolean variable indicating if the extracted value are to be added or concatenated to the input. #' Only applicable when `output = TRUE`. @@ -1151,78 +1191,81 @@ paired_sampler <- torch::sampler( #' #' @examples #' \dontrun{ +#' memory_layer_env <- new.env() #' net1 <- torch::nn_sequential( -#' memory_layer("#1"), -#' memory_layer("#0.1"), +#' memory_layer("#1", shared_env = memory_layer_env), +#' memory_layer("#0.1", shared_env = memory_layer_env), #' torch::nn_linear(512, 256), -#' torch::nn_leaky_relu(), -#' # here add cannot be TRUE because the dimensions mismatch -#' memory_layer("#0.1", output = TRUE, add = FALSE), +#' torch::nn_leaky_relu(), # Here add cannot be TRUE because the dimensions mismatch +#' memory_layer("#0.1", shared_env = memory_layer_env, output = TRUE, add = FALSE), #' torch::nn_linear(768, 256), #' # the dimension after the concatenation with skip-connection is 512 + 256 = 768 #' ) #' net2 <- torch::nn_equential( #' torch::nn_linear(512, 512), -#' memory_layer("#1", output = TRUE, add = TRUE), +#' memory_layer("#1", shared_env = memory_layer_env, output = TRUE, add = TRUE), #' ... #' ) +#' # Here a and c must be of correct dimensions, e.g., a = torch::torch_ones(1,512). #' b <- net1(a) #' d <- net2(c) # net2 must be called after net1, otherwise tensor '#1' will not be in storage. #' } -memory_layer <- torch::nn_module( - classname = "memory_layer", # field classname Name of the of torch::nn_module object. - - # field shared_env A shared environment for all instances of memory_layers. - shared_env = new.env(), - - # description Create a new `memory_layer` object. 
-  # param id A unique id to use as a key in the storage list.
-  # param output Boolean variable indicating if the memory layer is to store input in storage or extract from storage.
-  # param add Boolean variable indicating if the extracted value are to be added or concatenated to the input.
-  # Only applicable when `output = TRUE`.
-  # param verbose Boolean variable indicating if we want to give printouts to the user.
-  initialize = function(id, output = FALSE, add = FALSE, verbose = FALSE) {
-    self$id <- id
-    self$output <- output
-    self$add <- add
-    self$verbose <- verbose
-  },
-  forward = function(input) {
-    # Check if we are going to insert input into the storage or extract data from the storage.
-    if (!self$output) {
-      if (self$verbose) message(paste0("Inserting data to memory layer `self$id = ", self$id, "`."))
-
-      # Insert the input into the storage list which is in the shared environment of the memory_layer class.
-      # Note that we do not check if self$id is unique.
-      self$shared_env$storage[[self$id]] <- input
-      return(input) # Return/send the input to the next layer in the network.
-    } else {
-      # We are to extract data from the storage list.
-      if (self$verbose) {
-        message(paste0(
-          "Extracting data to memory layer `self$id = ", self$id, "`. Using concatination = ", !self$add, "."
-        ))
-      }
+memory_layer <- function(id, shared_env, output = FALSE, add = FALSE, verbose = FALSE) {
+  memory_layer_tmp <- torch::nn_module(
+    classname = "memory_layer", # field classname Name of the torch::nn_module object.
+
+    # description Create a new `memory_layer` object.
+    # param id A unique id to use as a key in the storage list.
+    # param shared_env A shared environment for all instances of memory_layer where the inputs are stored.
+    # param output Boolean variable indicating if the memory layer is to store input in storage or extract from storage.
+    # param add Boolean variable indicating if the extracted value is to be added or concatenated to the input.
+    # Only applicable when `output = TRUE`.
+    # param verbose Boolean variable indicating if we want to give printouts to the user.
+    initialize = function(id, shared_env, output = FALSE, add = FALSE, verbose = FALSE) {
+      self$id <- id
+      self$shared_env <- shared_env
+      self$output <- output
+      self$add <- add
+      self$verbose <- verbose
+    },
+    forward = function(input) {
+      # Check if we are going to insert input into the storage or extract data from the storage.
+      if (!self$output) {
+        if (self$verbose) message(paste0("Inserting data to memory layer `self$id = ", self$id, "`."))
+
+        # Insert the input into the storage list which is in the shared environment of the memory_layer class.
+        # Note that we do not check if self$id is unique.
+        self$shared_env[[self$id]] <- input
+        return(input) # Return/send the input to the next layer in the network.
+      } else {
+        # We are to extract data from the storage list.
+        if (self$verbose) {
+          message(paste0(
+            "Extracting data to memory layer `self$id = ", self$id, "`. Using concatenation = ", !self$add, "."
+          ))
+        }

-      # Check that the memory layer has data is stored in it. If not, then thorw error. 
+ if (!self$id %in% names(self$shared_env)) { + stop(paste0( + "ValueError: Looking for memory layer `self$id = ", self$id, "`, but the only available ", + "memory layers are: ", paste(names(self$shared_env), collapse = "`, `"), "`." + )) + } - # Extract the stored data for the given memory layer and check if we are to concatenate or add the input - stored <- self$shared_env$storage[[self$id]] - data <- if (self$add) input + stored else torch::torch_cat(c(input, stored), -1) + # Extract the stored data for the given memory layer and check if we are to concatenate or add the input + stored <- self$shared_env[[self$id]] + data <- if (self$add) input + stored else torch::torch_cat(c(input, stored), -1) - # Return the data - return(data) + # Return the data + return(data) + } } - } -) + ) + + return(memory_layer_tmp(id = id, shared_env = shared_env, output = output, add = add, verbose = verbose)) +} -## skip_connection ----------------------------------------------------------------------------------------------------- #' A [torch::nn_module()] Representing a skip connection #' #' @description Skip connection over the sequence of layers in the constructor. The module passes @@ -1233,15 +1276,20 @@ memory_layer <- torch::nn_module( #' #' @author Lars Henry Berge Olsen #' @keywords internal -skip_connection <- torch::nn_module( - classname = "skip_connection", # field classname Name of the of torch::nn_module object - # description Initialize a new skip_connection module - initialize = function(...) self$inner_net <- torch::nn_sequential(...), - # description What to do when a skip_connection module is called - forward = function(input) { - return(input + self$inner_net(input)) - } -) +skip_connection <- function(...) { + skip_connection_tmp <- torch::nn_module( + classname = "skip_connection", # field classname Name of the of torch::nn_module object + # description Initialize a new skip_connection module + initialize = function(...) self$inner_net <- torch::nn_sequential(...), + # description What to do when a skip_connection module is called + forward = function(input) { + return(input + self$inner_net(input)) + } + ) + + return(skip_connection_tmp(... = ...)) +} + # Training Utility Functions ========================================================================================== #' Extends Incomplete Batches by Sampling Extra Data from Dataloader @@ -1424,8 +1472,8 @@ vaeac_kl_normal_normal <- function(p, q) { return(0.5 * (var_ratio + t1 - 1 - var_ratio$log())) } + # Neural Network Modules =============================================================================================== -## gauss_cat_sampler_most_likely ------------------------------------------------------------------------------- #' A [torch::nn_module()] Representing a `gauss_cat_sampler_most_likely` #' #' @description The `gauss_cat_sampler_most_likely` generates the most likely samples from the generative distribution @@ -1439,50 +1487,56 @@ vaeac_kl_normal_normal <- function(p, q) { #' #' @keywords internal #' @author Lars Henry Berge Olsen -gauss_cat_sampler_most_likely <- torch::nn_module( - classname = "gauss_cat_sampler_most_likely", # field classname Type of torch::nn_module - - # description Initialize a gauss_cat_sampler_most_likely which generates the most likely - # sample from the generative distribution defined by the output of the neural network. 
- initialize = function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { - self$one_hot_max_sizes <- one_hot_max_sizes - self$min_sigma <- min_sigma - self$min_prob <- min_prob - }, - - # param dist_params A matrix of form batch_size x (mu_1, sigma_1, ..., mu_p, sigma_p), when only considering - # continuous features. For categorical features, we do NOT have mu and sigma for the decoder at the end of the vaeac, - # but rather logits for the categorical distribution. - # return A tensor containing the generated data. - forward = function(distr_params) { - cur_distr_col <- 1 # A counter to keep track of which column to extract from - sample <- list() # List to store the samples sampled from the normal distribution with parameters from distr_params - - # Iterate over the features - for (i in seq_along(self$one_hot_max_sizes)) { - size <- self$one_hot_max_sizes[i] # Get the number of one hot dummy features to see if feature is cont or cat - - if (size <= 1) { - # Continuous feature which are modeled using the Gaussian distribution - params <- distr_params[, cur_distr_col:(cur_distr_col + 1), drop = FALSE] # Extract mean and sd, batch_size x 2 - cur_distr_col <- cur_distr_col + 2 # Update the pointer index by two (mean and sd) - distr <- vaeac_normal_parse_params(params, self$min_sigma) # Create a Gaussian distribution based on params - col_sample <- distr$mean # We sample the mean (most likely value) - } else { - # Categorical feature which are modeled using the categorical distribution - # Extract the logits for each of the K-classes for the ith feature. The dimension is batch_size x size. - params <- distr_params[, cur_distr_col:(cur_distr_col + size - 1)] - cur_distr_col <- cur_distr_col + size # Update the pointer index by the number of categorical levels - distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params - col_sample <- torch::torch_argmax(distr$probs, -1)[, NULL]$to(dtype = torch::torch_float()) # Most likely class +gauss_cat_sampler_most_likely <- function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { + gauss_cat_sampler_most_lik_tmp <- torch::nn_module( + classname = "gauss_cat_sampler_most_likely", # field classname Type of torch::nn_module + + # description Initialize a gauss_cat_sampler_most_likely which generates the most likely + # sample from the generative distribution defined by the output of the neural network. + initialize = function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { + self$one_hot_max_sizes <- one_hot_max_sizes + self$min_sigma <- min_sigma + self$min_prob <- min_prob + }, + + # param dist_params A matrix of form batch_size x (mu_1, sigma_1, ..., mu_p, sigma_p), when only considering + # continuous features. For categorical features, we do NOT have mu and sigma for the decoder at the end of the + # vaeac, but rather logits for the categorical distribution. + # return A tensor containing the generated data. + forward = function(distr_params) { + cur_distr_col <- 1 # A counter to keep track of which column to extract from + sample <- list() # List to store the samples sampled from the normal distr. 
with parameters from distr_params + + # Iterate over the features + for (i in seq_along(self$one_hot_max_sizes)) { + size <- self$one_hot_max_sizes[i] # Get the number of one hot dummy features to see if feature is cont or cat + + if (size <= 1) { + # Continuous feature which are modeled using the Gaussian distribution + params <- distr_params[, cur_distr_col:(cur_distr_col + 1), drop = FALSE] # Extract mean & sd, batch_size x 2 + cur_distr_col <- cur_distr_col + 2 # Update the pointer index by two (mean and sd) + distr <- vaeac_normal_parse_params(params, self$min_sigma) # Create a Gaussian distribution based on params + col_sample <- distr$mean # We sample the mean (most likely value) + } else { + # Categorical feature which are modeled using the categorical distribution + # Extract the logits for each of the K-classes for the ith feature. The dimension is batch_size x size. + params <- distr_params[, cur_distr_col:(cur_distr_col + size - 1)] + cur_distr_col <- cur_distr_col + size # Update the pointer index by the number of categorical levels + distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params + col_sample <- torch::torch_argmax(distr$probs, -1)[, NULL]$to(dtype = torch::torch_float()) # Most lik class + } + sample <- append(sample, col_sample) # Add the vector of sampled values for the i´th feature to the sample list } - sample <- append(sample, col_sample) # Add the vector of sampled values for the i´th feature to the sample list + return(torch::torch_cat(sample, -1)) # Create a 2D torch by column binding the vectors in the list } - return(torch::torch_cat(sample, -1)) # Create a 2D torch by column binding the vectors in the list - } -) + ) + + return( + gauss_cat_sampler_most_lik_tmp(one_hot_max_sizes = one_hot_max_sizes, min_sigma = min_sigma, min_prob = min_prob) + ) +} + -## gauss_cat_sampler_random ----------------------------------------------------------------------------------- #' A [torch::nn_module()] Representing a gauss_cat_sampler_random #' #' @description The `gauss_cat_sampler_random` generates random samples from the generative distribution defined by the @@ -1494,52 +1548,56 @@ gauss_cat_sampler_most_likely <- torch::nn_module( #' #' @author Lars Henry Berge Olsen #' @keywords internal -gauss_cat_sampler_random <- torch::nn_module( - classname = "gauss_cat_sampler_random", # field classname Type of torch::nn_module - - # description Initialize a gauss_cat_sampler_random which generates a sample from the - # generative distribution defined by the output of the neural network by random sampling. - # return A new `gauss_cat_sampler_random` object. - initialize = function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { - self$one_hot_max_sizes <- one_hot_max_sizes - self$min_sigma <- min_sigma - self$min_prob <- min_prob - }, - - # param dist_params A matrix of form batch_size x (mu_1, sigma_1, ..., mu_p, sigma_p), when only considering - # continuous features. For categorical features, we do NOT have mu and sigma for the decoder at the end of the vaeac, - # but rather logits for the categorical distribution. - # return A tensor containing the generated data. 
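# A minimal sketch (not package code) of how the gauss_cat_* modules walk over the decoder output columns
# described above: (mu, sigma) pairs for continuous features and one logit per level for categorical
# features. One continuous feature followed by one categorical feature with three levels is assumed.
one_hot_max_sizes <- c(1, 3)
cur_distr_col <- 1
for (size in one_hot_max_sizes) {
  if (size <= 1) {
    cols <- cur_distr_col:(cur_distr_col + 1) # Two columns: the Gaussian mean and standard deviation
    cur_distr_col <- cur_distr_col + 2
  } else {
    cols <- cur_distr_col:(cur_distr_col + size - 1) # One column of logits per category
    cur_distr_col <- cur_distr_col + size
  }
  print(cols)
}
# [1] 1 2    <- Gaussian mean and sd of feature 1
# [1] 3 4 5  <- categorical logits of feature 2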
- forward = function(distr_params) { - cur_distr_col <- 1 # A counter to keep track of which column to extract from - sample <- list() # List to store the samples sampled from the normal distribution with parameters from distr_params - - # Iterate over the features - for (i in seq_along(self$one_hot_max_sizes)) { - size <- self$one_hot_max_sizes[i] # Get the number of one hot dummy features to see if feature is cont or cat - - if (size <= 1) { - # Continuous feature which are modeled using the Gaussian distribution - params <- distr_params[, cur_distr_col:(cur_distr_col + 1), drop = FALSE] # Extract mean and sd, batch_size x 2 - cur_distr_col <- cur_distr_col + 2 # Update the pointer index by two (mean and sd) - distr <- vaeac_normal_parse_params(params, self$min_sigma) # Create a Gaussian distribution based on params - col_sample <- distr$sample() # Sample from the inferred Gaussian distributions - } else { - # Categorical feature which are modeled using the categorical distribution - # Extract the logits for each of the K-classes for the ith feature. The dimension is batch_size x size. - params <- distr_params[, cur_distr_col:(cur_distr_col + size - 1)] - cur_distr_col <- cur_distr_col + size # Update the pointer index by the number of categorical levels - distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params - col_sample <- distr$sample()$unsqueeze(-1)$to(dtype = torch::torch_float()) # Sample class based on class prob +gauss_cat_sampler_random <- function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { + gauss_cat_sampler_random_tmp <- torch::nn_module( + classname = "gauss_cat_sampler_random", # field classname Type of torch::nn_module + + # description Initialize a gauss_cat_sampler_random which generates a sample from the + # generative distribution defined by the output of the neural network by random sampling. + # return A new `gauss_cat_sampler_random` object. + initialize = function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { + self$one_hot_max_sizes <- one_hot_max_sizes + self$min_sigma <- min_sigma + self$min_prob <- min_prob + }, + + # param dist_params A matrix of form batch_size x (mu_1, sigma_1, ..., mu_p, sigma_p), when only considering + # continuous features. For categorical features, we do NOT have mu and sigma for the decoder at the end of the + # vaeac, but rather logits for the categorical distribution. + # return A tensor containing the generated data. + forward = function(distr_params) { + cur_distr_col <- 1 # A counter to keep track of which column to extract from + sample <- list() # List to store the samples sampled from the normal distr with parameters from distr_params + + # Iterate over the features + for (i in seq_along(self$one_hot_max_sizes)) { + size <- self$one_hot_max_sizes[i] # Get the number of one hot dummy features to see if feature is cont or cat + + if (size <= 1) { + # Continuous feature which are modeled using the Gaussian distribution + params <- distr_params[, cur_distr_col:(cur_distr_col + 1), drop = FALSE] # Extract mean & sd, batch_size x 2 + cur_distr_col <- cur_distr_col + 2 # Update the pointer index by two (mean and sd) + distr <- vaeac_normal_parse_params(params, self$min_sigma) # Create a Gaussian distribution based on params + col_sample <- distr$sample() # Sample from the inferred Gaussian distributions + } else { + # Categorical feature which are modeled using the categorical distribution + # Extract the logits for each of the K-classes for the ith feature. 
The dimension is batch_size x size. + params <- distr_params[, cur_distr_col:(cur_distr_col + size - 1)] + cur_distr_col <- cur_distr_col + size # Update the pointer index by the number of categorical levels + distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params + col_sample <- distr$sample()$unsqueeze(-1)$to(dtype = torch::torch_float()) # Sample class using class prob + } + sample <- append(sample, col_sample) # Add the vector of sampled values for the i´th feature to the sample list } - sample <- append(sample, col_sample) # Add the vector of sampled values for the i´th feature to the sample list + return(torch::torch_cat(sample, -1)) # Create a 2D torch by column binding the vectors in the list } - return(torch::torch_cat(sample, -1)) # Create a 2D torch by column binding the vectors in the list - } -) + ) + return( + gauss_cat_sampler_random_tmp(one_hot_max_sizes = one_hot_max_sizes, min_sigma = min_sigma, min_prob = min_prob) + ) +} -## gauss_cat_parameters -------------------------------------------------------------------------------------- #' A [torch::nn_module()] Representing a `gauss_cat_parameters` #' #' @description The `gauss_cat_parameters` module extracts the parameters from the inferred generative Gaussian and @@ -1558,54 +1616,55 @@ gauss_cat_sampler_random <- torch::nn_module( #' #' @author Lars Henry Berge Olsen #' @keywords internal -gauss_cat_parameters <- torch::nn_module( - # field classname Type of torch::nn_module - classname = "gauss_cat_parameters", - - # description Initialize a `gauss_cat_parameters` which extract the parameters from the generative distribution - # defined by the output of the neural network. - # return A new `gauss_cat_parameters` object. - initialize = function(one_hot_max_sizes, - min_sigma = 1e-4, - min_prob = 1e-4) { - self$one_hot_max_sizes <- one_hot_max_sizes - self$min_sigma <- min_sigma - self$min_prob <- min_prob - }, - - # param dist_params A matrix of form batch_size x (mu_1, sigma_1, ..., mu_p, sigma_p), when only - # considering continuous features. For categorical features, we do NOT have mu and sigma for the - # decoder at the end of the vaeac, but rather logits for the categorical distribution. - # return A tensor containing the final parameters of the generative distributions (after transformations). - forward = function(distr_params) { - cur_distr_col <- 1 # A counter to keep track of which column to extract from - parameters <- list() # List to store the generative parameters from the normal and categorical distributions - - # Iterate over the features - for (i in seq_along(self$one_hot_max_sizes)) { - size <- self$one_hot_max_sizes[i] # Get the number of one hot dummy features to see if feature is cont or cat - - if (size <= 1) { - # Continuous feature which are modeled using the Gaussian distribution. - params <- distr_params[, cur_distr_col:(cur_distr_col + 1), drop = FALSE] # Extract mean and sd, batch_size x 2 - cur_distr_col <- cur_distr_col + 2 # Update the pointer index by two (mean and sd) - distr <- vaeac_normal_parse_params(params, self$min_sigma) # Create a Gaussian distribution based on params - current_parameters <- torch::torch_cat(c(distr$mean, distr$scale), -1) # Combine the current parameters - } else { - # Categorical feature which are modeled using the categorical distribution - # Extract the logits for each of the K-classes for the ith feature. The dimension is batch_size x size. 
- params <- distr_params[, cur_distr_col:(cur_distr_col + size - 1)] - cur_distr_col <- cur_distr_col + size # Update the pointer index by the number of categorical levels - distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params - current_parameters <- distr$probs # Extract the current probabilities for each classs +gauss_cat_parameters <- function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { + gauss_cat_parameters_tmp <- torch::nn_module( + # field classname Type of torch::nn_module + classname = "gauss_cat_parameters", + + # description Initialize a `gauss_cat_parameters` which extract the parameters from the generative distribution + # defined by the output of the neural network. + # return A new `gauss_cat_parameters` object. + initialize = function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { + self$one_hot_max_sizes <- one_hot_max_sizes + self$min_sigma <- min_sigma + self$min_prob <- min_prob + }, + + # param dist_params A matrix of form batch_size x (mu_1, sigma_1, ..., mu_p, sigma_p), when only + # considering continuous features. For categorical features, we do NOT have mu and sigma for the + # decoder at the end of the vaeac, but rather logits for the categorical distribution. + # return A tensor containing the final parameters of the generative distributions (after transformations). + forward = function(distr_params) { + cur_distr_col <- 1 # A counter to keep track of which column to extract from + parameters <- list() # List to store the generative parameters from the normal and categorical distributions + + # Iterate over the features + for (i in seq_along(self$one_hot_max_sizes)) { + size <- self$one_hot_max_sizes[i] # Get the number of one hot dummy features to see if feature is cont or cat + + if (size <= 1) { + # Continuous feature which are modeled using the Gaussian distribution. + params <- distr_params[, cur_distr_col:(cur_distr_col + 1), drop = FALSE] # Extract mean & sd, batch_size x 2 + cur_distr_col <- cur_distr_col + 2 # Update the pointer index by two (mean and sd) + distr <- vaeac_normal_parse_params(params, self$min_sigma) # Create a Gaussian distribution based on params + current_parameters <- torch::torch_cat(c(distr$mean, distr$scale), -1) # Combine the current parameters + } else { + # Categorical feature which are modeled using the categorical distribution + # Extract the logits for each of the K-classes for the ith feature. The dimension is batch_size x size. 
+ params <- distr_params[, cur_distr_col:(cur_distr_col + size - 1)] + cur_distr_col <- cur_distr_col + size # Update the pointer index by the number of categorical levels + distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params + current_parameters <- distr$probs # Extract the current probabilities for each classs + } + parameters <- append(parameters, current_parameters) # Add the i´th feature's parameters to the parameters list } - parameters <- append(parameters, current_parameters) # Add parameters for the i´th feature to the parameters list + return(torch::torch_cat(parameters, -1)) # Create a 2D torch_tensor by column binding the tensors in the list } - return(torch::torch_cat(parameters, -1)) # Create a 2D torch_tensor by column binding the tensors in the list - } -) + ) + return(gauss_cat_parameters_tmp(one_hot_max_sizes = one_hot_max_sizes, min_sigma = min_sigma, min_prob = min_prob)) +} + -## gauss_cat_loss -------------------------------------------------------------------------------------------- #' A [torch::nn_module()] Representing a `gauss_cat_loss` #' #' @description The `gauss_cat_loss module` layer computes the log probability of the `groundtruth` for each object @@ -1620,67 +1679,69 @@ gauss_cat_parameters <- torch::nn_module( #' #' @author Lars Henry Berge Olsen #' @keywords internal -gauss_cat_loss <- torch::nn_module( - classname = "gauss_cat_loss", # field classname Type of torch::nn_module - - # description Initialize a `gauss_cat_loss`. - # return A new `gauss_cat_loss` object. - initialize = function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { - self$one_hot_max_sizes <- one_hot_max_sizes - self$min_sigma <- min_sigma - self$min_prob <- min_prob - }, - forward = function(groundtruth, distr_params, mask) { - cur_distr_col <- 1 # A counter to keep track of which column to extract from - log_prob <- list() # List to store the log probabilities - - # Iterate over the features - for (i in seq_along(self$one_hot_max_sizes)) { - size <- self$one_hot_max_sizes[i] # Get the number of one hot dummy features to see if feature is cont or cat - groundtruth_col <- groundtruth[, i, drop = FALSE] # Get at the ith column of the truth - mask_col <- mask[, i, drop = FALSE] # Get the ith column of the mask - gt_col_nansafe <- groundtruth_col$clone()$detach() # Copy the ground truth column, can now alter this object - nan_mask <- torch::torch_isnan(groundtruth_col) # Check if truth contains any missing values - gt_col_nansafe[nan_mask] <- 0 # Set any missing values to 0 - - # Mask_col masks both the nan/missing values and the artificially masked values. We want to compute the log prob - # only over the artificially missing features, so we omit the true missing values. We remove the masking of the - # missing values. So those ones in mask_col which are there due to missing values are now turned in to zeros. 
- mask_col <- mask_col * (torch::torch_logical_not(nan_mask))$to(dtype = torch::torch_float()) - - if (size <= 1) { - # Continuous feature which are modeled using the Gaussian distribution - params <- distr_params[, cur_distr_col:(cur_distr_col + 1), drop = FALSE] # Extract mean and sd, batch_size x 2 - cur_distr_col <- cur_distr_col + 2 # Update the pointer index by two (mean and sd) - distr <- vaeac_normal_parse_params(params, self$min_sigma) # Create a Gaussian distribution based on params - - # Get the log-likelihood, but only of the masked values i.e., the ones hat are masked by the masking scheme - # MCARGenerator. This one is batch_size x 1 and is the log-lik of observing the ground truth given the current - # parameters, for only the artificially masked features. - col_log_prob <- distr$log_prob(gt_col_nansafe) * mask_col - } else { - # Categorical feature which are modeled using the categorical distribution - # Extract the probabilities for each of the K-classes for the ith feature. The dimension is batch_size x size. - params <- distr_params[, cur_distr_col:(cur_distr_col + size - 1), drop = FALSE] - cur_distr_col <- cur_distr_col + size # Update the pointer index by the number of categorical levels - distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params - col_log_prob <- distr$log_prob(gt_col_nansafe$squeeze())[, NULL] * mask_col # Get the log-likelihood +gauss_cat_loss <- function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { + gauss_cat_loss_tmp <- torch::nn_module( + classname = "gauss_cat_loss", # field classname Type of torch::nn_module + + # description Initialize a `gauss_cat_loss`. + # return A new `gauss_cat_loss` object. + initialize = function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { + self$one_hot_max_sizes <- one_hot_max_sizes + self$min_sigma <- min_sigma + self$min_prob <- min_prob + }, + forward = function(groundtruth, distr_params, mask) { + cur_distr_col <- 1 # A counter to keep track of which column to extract from + log_prob <- list() # List to store the log probabilities + + # Iterate over the features + for (i in seq_along(self$one_hot_max_sizes)) { + size <- self$one_hot_max_sizes[i] # Get the number of one hot dummy features to see if feature is cont or cat + groundtruth_col <- groundtruth[, i, drop = FALSE] # Get at the ith column of the truth + mask_col <- mask[, i, drop = FALSE] # Get the ith column of the mask + gt_col_nansafe <- groundtruth_col$clone()$detach() # Copy the ground truth column, can now alter this object + nan_mask <- torch::torch_isnan(groundtruth_col) # Check if truth contains any missing values + gt_col_nansafe[nan_mask] <- 0 # Set any missing values to 0 + + # Mask_col masks both the nan/missing values and the artificially masked values. We want to compute the log prob + # only over the artificially missing features, so we omit the true missing values. We remove the masking of the + # missing values. So those ones in mask_col which are there due to missing values are now turned in to zeros. 
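# A minimal sketch (not package code) of the masking step described in the comment above: the mask marks
# both truly missing values and artificially hidden ones, and multiplying by the negated NaN mask keeps
# only the artificially hidden entries in the loss. The toy column below is an arbitrary assumption.
gt_toy <- torch::torch_tensor(matrix(c(1.2, NaN, -0.5, 2.0), ncol = 1))
mask_toy <- torch::torch_tensor(matrix(c(1, 1, 0, 1), ncol = 1)) # Row 2 is masked only because it is NaN
nan_toy <- torch::torch_isnan(gt_toy)
mask_toy <- mask_toy * (torch::torch_logical_not(nan_toy))$to(dtype = torch::torch_float())
print(mask_toy) # 1, 0, 0, 1: the NaN row no longer contributes to the log-likelihood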
+ mask_col <- mask_col * (torch::torch_logical_not(nan_mask))$to(dtype = torch::torch_float()) + + if (size <= 1) { + # Continuous feature which are modeled using the Gaussian distribution + params <- distr_params[, cur_distr_col:(cur_distr_col + 1), drop = FALSE] # Extract mean & sd, batch_size x 2 + cur_distr_col <- cur_distr_col + 2 # Update the pointer index by two (mean and sd) + distr <- vaeac_normal_parse_params(params, self$min_sigma) # Create a Gaussian distribution based on params + + # Get the log-likelihood, but only of the masked values i.e., the ones hat are masked by the masking scheme + # MCARGenerator. This one is batch_size x 1 and is the log-lik of observing the ground truth given the current + # parameters, for only the artificially masked features. + col_log_prob <- distr$log_prob(gt_col_nansafe) * mask_col + } else { + # Categorical feature which are modeled using the categorical distribution + # Extract the probabilities for each of the K-classes for the ith feature. The dimension is batch_size x size. + params <- distr_params[, cur_distr_col:(cur_distr_col + size - 1), drop = FALSE] + cur_distr_col <- cur_distr_col + size # Update the pointer index by the number of categorical levels + distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params + col_log_prob <- distr$log_prob(gt_col_nansafe$squeeze())[, NULL] * mask_col # Get the log-likelihood + } + + # Append the column of log probabilities for the i-th feature for those instances that are masked into log_prob. + # log_prob is now a list of length n_features, where each element is a tensor batch_size x 1 containing the + # log-lik of the parameters of the masked values. + log_prob <- append(log_prob, col_log_prob) } - # Append the column of log probabilities for the i-th feature for those instances that are masked into log_prob. - # log_prob is now a list of length n_features, where each element is a tensor batch_size x 1 containing the - # log-lik of the parameters of the masked values. - log_prob <- append(log_prob, col_log_prob) + # Concatenate the list into a tensor of dim batch x features. Then sum along the the rows. + # That is, for each observation in the batch to get a tensor of length batch size. + return(torch::torch_cat(log_prob, 2)$sum(-1)) } - - # Concatenate the list into a tensor of dim batch x features. Then sum along the the rows. - # That is, for each observation in the batch to get a tensor of length batch size. - return(torch::torch_cat(log_prob, 2)$sum(-1)) - } -) + ) + return(gauss_cat_loss_tmp(one_hot_max_sizes = one_hot_max_sizes, min_sigma = min_sigma, min_prob = min_prob)) +} -## categorical_to_one_hot_layer ---------------------------------------------------------------------------------------- #' A [torch::nn_module()] Representing a `categorical_to_one_hot_layer` #' #' @description @@ -1698,74 +1759,80 @@ gauss_cat_loss <- torch::nn_module( #' #' @author Lars Henry Berge Olsen #' @keywords internal -categorical_to_one_hot_layer <- torch::nn_module( - classname = "categorical_to_one_hot_layer", # field classname Type of torch::nn_module - - # description Initialize a `categorical_to_one_hot_layer`. - # return A new `categorical_to_one_hot_layer` object. 
- initialize = function(one_hot_max_sizes, add_nans_map_for_columns = NULL) { - # Here one_hot_max_sizes includes zeros at the end of the list: one_hot_max_sizes + [0] * len(one_hot_max_sizes) - # Thus, if features have this many categories [1, 2, 3, 1], then one_hot_max_sizes = [1, 2, 3, 1, 0, 0, 0, 0] - self$one_hot_max_sizes <- one_hot_max_sizes - - # Always an empty column for the Masked Encoder network while it is a list [0, 1, ..., length(one_hot_max_sizes)-1) - # for the Full Encoder network. So for the Full Encoder network we apply the nan masks to each column/feature - self$add_nans_map_for_columns <- add_nans_map_for_columns - }, - forward = function(input) { - # Input is torch::torch_cat(c(batch, mask), -1), so a torch of dimension batch_size x 2*sum(one_hot_max_sizes) - # for continuous data where one_hot_max_sizes only consists of ones. Recall that one_hot_max_sizes are padded with - # zeros at the end in this function. - n <- input$shape[1] # Get the number of instances in the input batch. - out_cols <- NULL # variable to store the out columns, i.e., the input columns / one hot encoding + is nan.mask. - - # Iterate over the features. Note that i goes from 0 to 2*n_features-1. - for (i in seq_along(self$one_hot_max_sizes)) { - # Get the number of categories for each feature. For i in [n_features, 2*n_features-1], size <= 1, even for - # categorical features. - size <- self$one_hot_max_sizes[i] - - # Distinguish between continuous and categorical features - if (size <= 1) { - # Continuous feature. Copy it and replace NaNs with either zeros or the last half of self.one_hot_max_sizes - # Take the ith column of the input (NOTE THAT THIS IS NOT A DEEP COPY, so changing out_col changes input) - out_col <- input[, i:i] - nan_mask <- torch::torch_isnan(out_col) # check if any of the values are nan, i.e., missing - out_col[nan_mask] <- 0 # set all the missing values to 0. (This changes the input too) - } else { - # Categorical feature. Get the categories for each instance for the ith feature and start to count at zero. - # So if we have 2 cat, then this vector will contains zeros and ones. - cat_idx <- input[, i:i] - nan_mask <- torch::torch_isnan(cat_idx) # Check if any of the categories are nan / missing - cat_idx[nan_mask] <- 0 # Set the nan values to 0 - - # Create a matrix, where the jth row is the one-hot encoding of the ith feature of the jth instance. - out_col <- matrix(0, nrow = n, ncol = size) - out_col[cbind(seq(n), as.matrix(cat_idx$cpu()))] <- 1 - out_col <- torch::torch_tensor(out_col, device = input$device) - } +categorical_to_one_hot_layer <- function(one_hot_max_sizes, add_nans_map_for_columns = NULL) { + cat_to_one_hot_layer_tmp <- torch::nn_module( + classname = "categorical_to_one_hot_layer", # field classname Type of torch::nn_module + + # description Initialize a `categorical_to_one_hot_layer`. + # return A new `categorical_to_one_hot_layer` object. + initialize = function(one_hot_max_sizes, add_nans_map_for_columns = NULL) { + # Here one_hot_max_sizes includes zeros at the end of the list: one_hot_max_sizes + [0] * len(one_hot_max_sizes) + # Thus, if features have this many categories [1, 2, 3, 1], then one_hot_max_sizes = [1, 2, 3, 1, 0, 0, 0, 0] + self$one_hot_max_sizes <- one_hot_max_sizes + + # Always an empty column for the Masked Encoder network while it's a list [0, 1, ..., length(one_hot_max_sizes)-1) + # for the Full Encoder network. 
So for the Full Encoder network we apply the nan masks to each column/feature + self$add_nans_map_for_columns <- add_nans_map_for_columns + }, + forward = function(input) { + # Input is torch::torch_cat(c(batch, mask), -1), so a torch of dimension batch_size x 2*sum(one_hot_max_sizes) + # for continuous data where one_hot_max_sizes only consists of ones. Recall that one_hot_max_sizes are padded with + # zeros at the end in this function. + n <- input$shape[1] # Get the number of instances in the input batch. + out_cols <- NULL # variable to store the out columns, i.e., the input columns / one hot encoding + is nan.mask. + + # Iterate over the features. Note that i goes from 0 to 2*n_features-1. + for (i in seq_along(self$one_hot_max_sizes)) { + # Get the number of categories for each feature. For i in [n_features, 2*n_features-1], size <= 1, even for + # categorical features. + size <- self$one_hot_max_sizes[i] + + # Distinguish between continuous and categorical features + if (size <= 1) { + # Continuous feature. Copy it and replace NaNs with either zeros or the last half of self.one_hot_max_sizes + # Take the ith column of the input (NOTE THAT THIS IS NOT A DEEP COPY, so changing out_col changes input) + out_col <- input[, i:i] + nan_mask <- torch::torch_isnan(out_col) # check if any of the values are nan, i.e., missing + out_col[nan_mask] <- 0 # set all the missing values to 0. (This changes the input too) + } else { + # Categorical feature. Get the categories for each instance for the ith feature and start to count at zero. + # So if we have 2 cat, then this vector will contains zeros and ones. + cat_idx <- input[, i:i] + nan_mask <- torch::torch_isnan(cat_idx) # Check if any of the categories are nan / missing + cat_idx[nan_mask] <- 0 # Set the nan values to 0 + + # Create a matrix, where the jth row is the one-hot encoding of the ith feature of the jth instance. + out_col <- matrix(0, nrow = n, ncol = size) + out_col[cbind(seq(n), as.matrix(cat_idx$cpu()))] <- 1 + out_col <- torch::torch_tensor(out_col, device = input$device) + } - # Append this feature column to the result. out_col is n x size = batch_size x n_categories_for_this_feature - out_cols <- torch::torch_cat(c(out_cols, out_col), dim = -1) + # Append this feature column to the result. out_col is n x size = batch_size x n_categories_for_this_feature + out_cols <- torch::torch_cat(c(out_cols, out_col), dim = -1) - # If necessary, append isnan mask of this feature to the result which we always do for the proposal network. - # This only happens for the first half of the i's, so for i = 1, ..., n_features. - if (i %in% self$add_nans_map_for_columns) { - # add the columns of nan_mask - out_cols <- torch::torch_cat(c(out_cols, nan_mask$to(dtype = torch::torch_float())), dim = -1) + # If necessary, append isnan mask of this feature to the result which we always do for the proposal network. + # This only happens for the first half of the i's, so for i = 1, ..., n_features. + if (i %in% self$add_nans_map_for_columns) { + # add the columns of nan_mask + out_cols <- torch::torch_cat(c(out_cols, nan_mask$to(dtype = torch::torch_float())), dim = -1) + } } + + # ONLY FOR CONTINUOUS FEATURES: out_cols now is a list of n_features tensors of shape n x size = n x 1 for + # continuous variables. 
We concatenate them to a matrix of dim n x 2*n_features (in cont case) for prior net, but + # for proposal net, it is n x 3*n_features, and they take the form + # [batch1, is.nan1, batch2, is.nan2, …, batch12, is.nan12, mask1, mask2, …, mask12] + return(out_cols) } + ) + + return( + cat_to_one_hot_layer_tmp(one_hot_max_sizes = one_hot_max_sizes, add_nans_map_for_columns = add_nans_map_for_columns) + ) +} - # ONLY FOR CONTINUOUS FEATURES: out_cols now is a list of n_features tensors of shape n x size = n x 1 for - # continuous variables. We concatenate them to a matrix of dim n x 2*n_features (in cont case) for prior net, but - # for proposal net, it is n x 3*n_features, and they take the form - # [batch1, is.nan1, batch2, is.nan2, …, batch12, is.nan12, mask1, mask2, …, mask12] - return(out_cols) - } -) -# Mask Generators ===================================================================================================== -## mcar_mask_generator ------------------------------------------------------------------------------------------------ +# Mask Generators ====================================================================================================== #' Missing Completely at Random (MCAR) Mask Generator #' #' @description A mask generator which masks the entries in the input completely at random. @@ -1795,84 +1862,86 @@ categorical_to_one_hot_layer <- torch::nn_module( #' #' @author Lars Henry Berge Olsen #' @keywords internal -mcar_mask_generator <- torch::nn_module( - name = "mcar_mask_generator", # field name Type of mask generator - - # description Initialize a missing completely at random mask generator. - # param masking_ratio The probability for an entry in the generated mask to be 1 (masked). - # param paired_sampling Boolean. If we are doing paired sampling. So include both S and \eqn{\bar{S}}. - # If TRUE, then batch must be sampled using `paired_sampler` which creates batches where - # the first half and second half of the rows are duplicates of each other. That is, - # batch = [row1, row1, row2, row2, row3, row3, ...]. - # return A new `mcar_mask_generator` object. - initialize = function(masking_ratio = 0.5, paired_sampling = FALSE) { - self$masking_ratio <- masking_ratio - self$paired_sampling <- paired_sampling - }, - - # description Generates a MCAR mask by calling self$mcar_mask_generator_function function. - # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the - # entries are missing. If any are missing, then the returned mask will ensure that - # these missing entries are masked. - forward = function(batch) { - self$mcar_mask_generator_function(batch, prob = self$masking_ratio, paired_sampling = self$paired_sampling) - }, - - # description Missing Completely At Random Mask Generator: A mask generator where the masking - # is determined by component-wise independent Bernoulli distribution. - # - # details Function that takes in a batch of observations and the probability - # of masking each element based on a component-wise independent Bernoulli - # distribution. Default value is 0.5, so all masks are equally likely to be trained. - # Function returns the mask of same shape as batch. - # Note that the batch can contain missing values, indicated by the "NaN" token. - # The mask will always mask missing values. - # - # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the - # entries are missing. If any are missing, then the returned mask will ensure that - # these missing entries are masked. 
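# A minimal sketch (not package code) of the one-hot encoding step in categorical_to_one_hot_layer()
# further up: row j of out_col gets a 1 in the column of instance j's category. Four instances and a
# categorical feature with three (1-based) levels are assumed.
cat_idx <- c(2, 1, 3, 1)
n <- length(cat_idx)
size <- 3
out_col <- matrix(0, nrow = n, ncol = size)
out_col[cbind(seq(n), cat_idx)] <- 1
out_col
#      [,1] [,2] [,3]
# [1,]    0    1    0
# [2,]    1    0    0
# [3,]    0    0    1
# [4,]    1    0    0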
- # param prob Numeric between 0 and 1. The probability that an entry will be masked. - # param seed Integer. Used to set the seed for the sampling process such that we - # can reproduce the same masks. - # param paired_sampling Boolean. If we are doing paired sampling. So include both S and \eqn{\bar{S}}. - # If TRUE, then batch must be sampled using 'paired_sampler' which creates batches where - # the first half and second half of the rows are duplicates of each other. That is, - # batch = [row1, row1, row2, row2, row3, row3, ...]. - # - # examples - # mcar_mask_generator_function(torch::torch_rand(c(5, 3))) - # - # return A binary matrix of the same size as 'batch'. An entry of '1' indicates that the - # observed feature value will be masked. '0' means that the entry is NOT masked, - # i.e., the feature value will be observed/given/available. - mcar_mask_generator_function = function(batch, prob = 0.5, seed = NULL, paired_sampling = FALSE) { - if (!is.null(seed)) set.seed(seed) # If the user specify a seed for reproducibility - size <- prod(batch$shape) # Get the number of entries in the batch. - nan_mask <- batch$isnan()$to(torch::torch_float()) # Check for missing values in the batch - - # If doing paired sampling, divide size by two as we later concatenate with the inverse mask. - if (paired_sampling) size <- size / 2 - - # # Torch version, but marginally slower than r version when batch_size <= 128 and n_features <= 50 - # mask = torch::torch_bernoulli(torch::torch_full_like(batch, prob)) - # Create the Bernoulli mask where an element is masked (1) with probability 'prob'. - mask <- torch::torch_tensor( - matrix(sample(c(0, 1), size = size, replace = TRUE, prob = c(prob, 1 - prob)), ncol = ncol(batch)), - dtype = torch::torch_float() - ) +mcar_mask_generator <- function(masking_ratio = 0.5, paired_sampling = FALSE) { + mcar_mask_gen_tmp <- torch::nn_module( + name = "mcar_mask_generator", # field name Type of mask generator + + # description Initialize a missing completely at random mask generator. + # param masking_ratio The probability for an entry in the generated mask to be 1 (masked). + # param paired_sampling Boolean. If we are doing paired sampling. So include both S and \eqn{\bar{S}}. + # If TRUE, then batch must be sampled using `paired_sampler` which creates batches where + # the first half and second half of the rows are duplicates of each other. That is, + # batch = [row1, row1, row2, row2, row3, row3, ...]. + # return A new `mcar_mask_generator` object. + initialize = function(masking_ratio = 0.5, paired_sampling = FALSE) { + self$masking_ratio <- masking_ratio + self$paired_sampling <- paired_sampling + }, + + # description Generates a MCAR mask by calling self$mcar_mask_generator_function function. + # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the + # entries are missing. If any are missing, then the returned mask will ensure that + # these missing entries are masked. + forward = function(batch) { + self$mcar_mask_generator_function(batch, prob = self$masking_ratio, paired_sampling = self$paired_sampling) + }, + + # description Missing Completely At Random Mask Generator: A mask generator where the masking + # is determined by component-wise independent Bernoulli distribution. + # + # details Function that takes in a batch of observations and the probability + # of masking each element based on a component-wise independent Bernoulli + # distribution. Default value is 0.5, so all masks are equally likely to be trained. 
+ # Function returns the mask of same shape as batch. + # Note that the batch can contain missing values, indicated by the "NaN" token. + # The mask will always mask missing values. + # + # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the + # entries are missing. If any are missing, then the returned mask will ensure that + # these missing entries are masked. + # param prob Numeric between 0 and 1. The probability that an entry will be masked. + # param seed Integer. Used to set the seed for the sampling process such that we + # can reproduce the same masks. + # param paired_sampling Boolean. If we are doing paired sampling. So include both S and \eqn{\bar{S}}. + # If TRUE, then batch must be sampled using 'paired_sampler' which creates batches where + # the first half and second half of the rows are duplicates of each other. That is, + # batch = [row1, row1, row2, row2, row3, row3, ...]. + # + # examples + # mcar_mask_generator_function(torch::torch_rand(c(5, 3))) + # + # return A binary matrix of the same size as 'batch'. An entry of '1' indicates that the + # observed feature value will be masked. '0' means that the entry is NOT masked, + # i.e., the feature value will be observed/given/available. + mcar_mask_generator_function = function(batch, prob = 0.5, seed = NULL, paired_sampling = FALSE) { + if (!is.null(seed)) set.seed(seed) # If the user specify a seed for reproducibility + size <- prod(batch$shape) # Get the number of entries in the batch. + nan_mask <- batch$isnan()$to(torch::torch_float()) # Check for missing values in the batch + + # If doing paired sampling, divide size by two as we later concatenate with the inverse mask. + if (paired_sampling) size <- size / 2 + + # # Torch version, but marginally slower than r version when batch_size <= 128 and n_features <= 50 + # mask = torch::torch_bernoulli(torch::torch_full_like(batch, prob)) + # Create the Bernoulli mask where an element is masked (1) with probability 'prob'. + mask <- torch::torch_tensor( + matrix(sample(c(0, 1), size = size, replace = TRUE, prob = c(prob, 1 - prob)), ncol = ncol(batch)), + dtype = torch::torch_float() + ) - # If paired sampling, then concatenate the inverse mask and reorder to ensure correct order [m1, !m1, m2, !m2, ...]. - if (paired_sampling) { - mask <- torch::torch_cat(c(mask, !mask), 1L)[c(matrix(seq_len(nrow(batch)), nrow = 2, byrow = TRUE)), ] - } + # If paired sampling: concatenate the inverse mask and reorder to ensure correct order [m1, !m1, m2, !m2, ...]. + if (paired_sampling) { + mask <- torch::torch_cat(c(mask, !mask), 1L)[c(matrix(seq_len(nrow(batch)), nrow = 2, byrow = TRUE)), ] + } - # Mask all entries that are missing or artificially masked by the Bernoulli mask. 1 means that the entry is masked. - return(mask + nan_mask >= 1) - } -) + # Mask all missing or artificially masked entries by the Bernoulli mask. 1 means that the entry is masked. + return(mask + nan_mask >= 1) + } + ) + return(mcar_mask_gen_tmp(masking_ratio = masking_ratio, paired_sampling = paired_sampling)) +} -## specified_prob_mask_generator ------------------------------------------------------------------------------- #' A [torch::nn_module()] Representing a specified_prob_mask_generator #' #' @description A mask generator which masks the entries based on specified probabilities. 
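For context (not part of the patch itself): after this restructuring, `mcar_mask_generator()` is an ordinary R function that builds the `torch::nn_module()` internally and returns an instantiated generator, so `torch` is only needed once a generator is actually created. A minimal usage sketch under stated assumptions (the constructor is internal and accessed via `:::`, `torch` is installed, and the batch dimensions are arbitrary toy values):

```r
library(torch)

# A small toy batch: 4 observations with 3 continuous features.
batch <- torch_randn(4, 3)

# The wrapper instantiates the nn_module and returns a ready-to-use mask generator.
mask_gen <- shapr:::mcar_mask_generator(masking_ratio = 0.5, paired_sampling = FALSE)

# Calling the generator invokes its forward() method and returns a mask of the same
# shape as the batch; an entry of 1 (TRUE) means the value is masked, 0 (FALSE) means
# it is observed/given/available.
mask <- mask_gen(batch)
mask
```

The same wrapper-function pattern is applied to `specified_prob_mask_generator()` and `specified_masks_mask_generator()` in the hunks below.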
@@ -1907,79 +1976,83 @@ mcar_mask_generator <- torch::nn_module( #' } #' #' @keywords internal -specified_prob_mask_generator <- torch::nn_module( - name = "specified_prob_mask_generator", # field name Type of mask generator - - # description Initialize a specified_probability mask generator. - initialize = function(masking_probs, paired_sampling = FALSE) { - self$masking_probs <- masking_probs / sum(masking_probs) - self$paired_sampling <- paired_sampling - }, - - # description Generates a specified probability mask by calling the self$specified_prob_mask_generator_function. - # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the entries are - # missing. If any are missing, then the returned mask will ensure that these missing entries are masked. - forward = function(batch) { - self$specified_prob_mask_generator_function( - batch = batch, - masking_prob = self$masking_probs, - paired_sampling = self$paired_sampling - ) - }, - - # description Specified Probability Mask Generator: A mask generator that first samples the number of entries 'd' to - # be masked in the 'M'-dimensional observation 'x' in the batch based on the given M+1 probabilities. The 'd' maskes - # are uniformly sampled from the 'M' possible feature indices. The d'th entry of the probability of having d-1 masked - # values. - # - # details Note that mcar_mask_generator with p = 0.5 is the same as using specified_prob_mask_generator - # with masking_ratio = choose(M, 0:M), where M is the number of features. This function was initially - # created to check if increasing the probability of having a masks with many masked features improved - # vaeac's performance by focusing more on these situations during training. - # - # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the entries are missing. If any - # are missing, then the returned mask will ensure that these missing entries are masked. - # param masking_probs An M+1 numerics containing the probabilities masking 'd' (0,...M) entries for each observation. - # param seed Integer. Used to set the seed for the sampling process such that we can reproduce the same masks. - # param paired_sampling Boolean. If we are doing paired sampling. So include both S and \bar{S}. - # If TRUE, then batch must be sampled using 'paired_sampler' which creates batches where - # the first half and second half of the rows are duplicates of each other. That is, - # `batch = [row1, row1, row2, row2, row3, row3, ...]`. - # - # examples specified_prob_mask_generator_function(torch::torch_rand(c(5, 4)), masking_probs = c(2,7,5,3,3)) - # - # return A binary matrix of the same size as 'batch'. An entry of '1' indicates that the - # observed feature value will be masked. '0' means that the entry is NOT masked, - # i.e., the feature value will be observed/given/available. - specified_prob_mask_generator_function = function(batch, masking_probs, seed = NULL, paired_sampling = FALSE) { - if (!is.null(seed)) set.seed(seed) # If the user specify a seed for reproducibility - n_features <- ncol(batch) # Get the number of features in the batch - size <- nrow(batch) # Get the number of observations in the batch - nan_mask <- batch$isnan()$to(torch::torch_float()) # Check for missing values in the batch - - # If doing paired sampling, divide size by two as we later concatenate with the inverse mask. - if (paired_sampling) size <- size / 2 - - # Sample the number of masked features in each row. 
- n_masked_each_row <- sample(x = seq(0, n_features), size = size, replace = TRUE, prob = masking_probs) - - # Crate the mask matrix - mask <- torch::torch_zeros_like(batch) - for (i in seq(size)) { - if (n_masked_each_row[i] != 0) mask[i, sample(n_features, size = n_masked_each_row[i], replace = FALSE)] <- 1 - } +specified_prob_mask_generator <- function(masking_probs, paired_sampling = FALSE) { + specified_prob_mask_gen_tmp <- torch::nn_module( + name = "specified_prob_mask_generator", # field name Type of mask generator + + # description Initialize a specified_probability mask generator. + initialize = function(masking_probs, paired_sampling = FALSE) { + self$masking_probs <- masking_probs / sum(masking_probs) + self$paired_sampling <- paired_sampling + }, + + # description Generates a specified probability mask by calling the self$specified_prob_mask_generator_function. + # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the entries are + # missing. If any are missing, then the returned mask will ensure that these missing entries are masked. + forward = function(batch) { + self$specified_prob_mask_generator_function( + batch = batch, + masking_prob = self$masking_probs, + paired_sampling = self$paired_sampling + ) + }, + + # description Specified Probability Mask Generator: A mask generator that first samples the number of entries 'd' to + # be masked in the 'M'-dimensional observation 'x' in the batch based on the given M+1 probabilities. The 'd' masks + # are uniformly sampled from the 'M' possible feature indices. The d'th entry of the probability of having d-1 + # masked values. + # + # details Note that mcar_mask_generator with p = 0.5 is the same as using specified_prob_mask_generator + # with masking_ratio = choose(M, 0:M), where M is the number of features. This function was initially + # created to check if increasing the probability of having a masks with many masked features improved + # vaeac's performance by focusing more on these situations during training. + # + # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the entries are missing. If any + # are missing, then the returned mask will ensure that these missing entries are masked. + # param masking_probs An M+1 numerics containing the probabilities masking 'd' (0,...M) entries for each instance. + # param seed Integer. Used to set the seed for the sampling process such that we can reproduce the same masks. + # param paired_sampling Boolean. If we are doing paired sampling. So include both S and \bar{S}. + # If TRUE, then batch must be sampled using 'paired_sampler' which creates batches where + # the first half and second half of the rows are duplicates of each other. That is, + # `batch = [row1, row1, row2, row2, row3, row3, ...]`. + # + # examples specified_prob_mask_generator_function(torch::torch_rand(c(5, 4)), masking_probs = c(2,7,5,3,3)) + # + # return A binary matrix of the same size as 'batch'. An entry of '1' indicates that the + # observed feature value will be masked. '0' means that the entry is NOT masked, + # i.e., the feature value will be observed/given/available. 
+ specified_prob_mask_generator_function = function(batch, masking_probs, seed = NULL, paired_sampling = FALSE) { + if (!is.null(seed)) set.seed(seed) # If the user specify a seed for reproducibility + n_features <- ncol(batch) # Get the number of features in the batch + size <- nrow(batch) # Get the number of observations in the batch + nan_mask <- batch$isnan()$to(torch::torch_float()) # Check for missing values in the batch + + # If doing paired sampling, divide size by two as we later concatenate with the inverse mask. + if (paired_sampling) size <- size / 2 + + # Sample the number of masked features in each row. + n_masked_each_row <- sample(x = seq(0, n_features), size = size, replace = TRUE, prob = masking_probs) + + # Crate the mask matrix + mask <- torch::torch_zeros_like(batch) + for (i in seq(size)) { + if (n_masked_each_row[i] != 0) mask[i, sample(n_features, size = n_masked_each_row[i], replace = FALSE)] <- 1 + } - # If paired sampling, then concatenate the inverse mask and reorder to ensure correct order [m1, !m1, m2, !m2, ...]. - if (paired_sampling) { - mask <- torch::torch_cat(c(mask, !mask), 1L)[c(matrix(seq_len(nrow(batch)), nrow = 2, byrow = TRUE)), ] + # If paired sampling: concatenate the inverse mask and reorder to ensure correct order [m1, !m1, m2, !m2, ...]. + if (paired_sampling) { + mask <- torch::torch_cat(c(mask, !mask), 1L)[c(matrix(seq_len(nrow(batch)), nrow = 2, byrow = TRUE)), ] + } + + # Mask all missing or artificially masked entries by the Bernoulli mask. 1 means that the entry is masked. + return(mask + nan_mask >= 1) } + ) + + return(specified_prob_mask_gen_tmp(masking_probs = masking_probs, paired_sampling = paired_sampling)) +} - # Mask all entries that are missing or artificially masked by the Bernoulli mask. 1 means that the entry is masked. - return(mask + nan_mask >= 1) - } -) -## specified_masks_mask_generator ------------------------------------------------------------------------------------- #' A [torch::nn_module()] Representing a specified_masks_mask_generator #' #' @description @@ -2010,72 +2083,78 @@ specified_prob_mask_generator <- torch::nn_module( #' #' @author Lars Henry Berge Olsen #' @keywords internal -specified_masks_mask_generator <- torch::nn_module( - name = "specified_masks_mask_generator", # field name Type of mask generator - - # description Initialize a specified masks mask generator. - initialize = function(masks, masks_probs, paired_sampling = FALSE) { - self$masks <- masks - self$masks_probs <- masks_probs / sum(masks_probs) - self$paired_sampling <- paired_sampling - }, - - # description Generates a mask by calling self$specified_masks_mask_generator_function function. - # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the - # entries are missing. If any are missing, then the returned mask will ensure that - # these missing entries are masked. - forward = function(batch) { - self$specified_masks_mask_generator_function( - batch = batch, - masks = self$masks, - masks_probs = self$masks_probs, - paired_sampling = self$paired_sampling - ) - }, - - # description Sampling Masks from the Provided Masks with the Given Probabilities - # - # details Function that takes in a 'batch' of observations and matrix of possible/allowed - # 'masks' which we are going to sample from based on the provided probability in 'masks_probs'. - # Function returns a mask of same shape as batch. Note that the batch can contain missing values, - # indicated by the "NaN" token. 
The mask will always mask missing values. - # - # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the - # entries are missing. If any are missing, then the returned mask will ensure that - # these missing entries are masked. - # param masks Matrix/Tensor of possible/allowed 'masks' which we sample from. - # param masks_probs Array of 'probabilities' for each of the masks specified in 'masks'. - # Note that they do not need to be between 0 and 1. They are scaled, hence, they only need to be positive. - # param seed Integer. Used to set the seed for the sampling process such that we - # can reproduce the same masks. - # param paired_sampling Boolean. If we are doing paired sampling. So include both S and \bar{S}. - # If TRUE, then batch must be sampled using 'paired_sampler' which creates batches where - # the first half and second half of the rows are duplicates of each other. That is, - # batch = [row1, row1, row2, row2, row3, row3, ...]. - # - # return A binary matrix of the same size as 'batch'. An entry of '1' indicates that the - # observed feature value will be masked. '0' means that the entry is NOT masked, - # i.e., the feature value will be observed/given/available. - specified_masks_mask_generator_function = function(batch, masks, masks_probs, seed = NULL, paired_sampling = FALSE) { - if (!is.null(seed)) set.seed(seed) # Set seed if the user specifies a seed for reproducibility. - nan_mask <- batch$isnan()$to(torch::torch_float()) # Check for missing values in the batch - n_masks <- nrow(masks) # Get the number of masks to choose from - size <- nrow(batch) # Get the number of observations in the batch - - # If doing paired sampling, divide size by two as we later concatenate with the inverse mask. - if (paired_sampling) size <- size / 2 - - # Sample 'n_observation' masks from the possible masks by first sampling the row indices - # based on the given mask probabilities and then use these indices to extract the masks. - mask_rows_indices <- sample.int(n = n_masks, size = size, replace = TRUE, prob = masks_probs) - mask <- torch::torch_tensor(masks[mask_rows_indices, ], dtype = torch::torch_float()) - - # If paired sampling, then concatenate the inverse mask and reorder to ensure correct order [m1, !m1, m2, !m2, ...]. - if (paired_sampling) { - mask <- torch::torch_cat(c(mask, !mask), 1L)[c(matrix(seq_len(nrow(batch)), nrow = 2, byrow = TRUE)), ] - } +specified_masks_mask_generator <- function(masks, masks_probs, paired_sampling = FALSE) { + specified_masks_mask_gen_tmp <- torch::nn_module( + name = "specified_masks_mask_generator", # field name Type of mask generator + + # description Initialize a specified masks mask generator. + initialize = function(masks, masks_probs, paired_sampling = FALSE) { + self$masks <- masks + self$masks_probs <- masks_probs / sum(masks_probs) + self$paired_sampling <- paired_sampling + }, + + # description Generates a mask by calling self$specified_masks_mask_generator_function function. + # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the + # entries are missing. If any are missing, then the returned mask will ensure that + # these missing entries are masked. 
+ forward = function(batch) { + self$specified_masks_mask_generator_function( + batch = batch, + masks = self$masks, + masks_probs = self$masks_probs, + paired_sampling = self$paired_sampling + ) + }, + + # description Sampling Masks from the Provided Masks with the Given Probabilities + # + # details Function that takes in a 'batch' of observations and matrix of possible/allowed + # 'masks' which we are going to sample from based on the provided probability in 'masks_probs'. + # Function returns a mask of same shape as batch. Note that the batch can contain missing values, + # indicated by the "NaN" token. The mask will always mask missing values. + # + # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the + # entries are missing. If any are missing, then the returned mask will ensure that + # these missing entries are masked. + # param masks Matrix/Tensor of possible/allowed 'masks' which we sample from. + # param masks_probs Array of 'probabilities' for each of the masks specified in 'masks'. + # Note that they do not need to be between 0 and 1. They are scaled, hence, they only need to be positive. + # param seed Integer. Used to set the seed for the sampling process such that we + # can reproduce the same masks. + # param paired_sampling Boolean. If we are doing paired sampling. So include both S and \bar{S}. + # If TRUE, then batch must be sampled using 'paired_sampler' which creates batches where + # the first half and second half of the rows are duplicates of each other. That is, + # batch = [row1, row1, row2, row2, row3, row3, ...]. + # + # return A binary matrix of the same size as 'batch'. An entry of '1' indicates that the + # observed feature value will be masked. '0' means that the entry is NOT masked, + # i.e., the feature value will be observed/given/available. + specified_masks_mask_generator_function = + function(batch, masks, masks_probs, seed = NULL, paired_sampling = FALSE) { + if (!is.null(seed)) set.seed(seed) # Set seed if the user specifies a seed for reproducibility. + nan_mask <- batch$isnan()$to(torch::torch_float()) # Check for missing values in the batch + n_masks <- nrow(masks) # Get the number of masks to choose from + size <- nrow(batch) # Get the number of observations in the batch + + # If doing paired sampling, divide size by two as we later concatenate with the inverse mask. + if (paired_sampling) size <- size / 2 + + # Sample 'n_observation' masks from the possible masks by first sampling the row indices + # based on the given mask probabilities and then use these indices to extract the masks. + mask_rows_indices <- sample.int(n = n_masks, size = size, replace = TRUE, prob = masks_probs) + mask <- torch::torch_tensor(masks[mask_rows_indices, ], dtype = torch::torch_float()) + + # If paired sampling: concatenate the inverse mask and reorder to ensure correct order [m1, !m1, m2, !m2, ...]. + if (paired_sampling) { + mask <- torch::torch_cat(c(mask, !mask), 1L)[c(matrix(seq_len(nrow(batch)), nrow = 2, byrow = TRUE)), ] + } - # Mask all entries that are missing or artificially masked by the Bernoulli mask. 1 means that the entry is masked. - return(mask + nan_mask >= 1) - } -) + # Mask all missing or artificially masked entries by the Bernoulli mask. 1 means that the entry is masked. 
+ return(mask + nan_mask >= 1) + } + ) + + # Initate the specified_masks_mask_generator and return it + return(specified_masks_mask_gen_tmp(masks = masks, masks_probs = masks_probs, paired_sampling = paired_sampling)) +} diff --git a/R/zzz.R b/R/zzz.R index cc55b6de6..f540d5c86 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -105,6 +105,7 @@ "x_train", "x_train_preprocessed", "x_train_torch", + "self", "..current_comb", "..regression.response_var" ) diff --git a/man/memory_layer.Rd b/man/memory_layer.Rd index f3b3ce54d..c0a5d1d37 100644 --- a/man/memory_layer.Rd +++ b/man/memory_layer.Rd @@ -4,11 +4,13 @@ \alias{memory_layer} \title{A \code{\link[torch:nn_module]{torch::nn_module()}} Representing a Memory Layer} \usage{ -memory_layer(id, output = FALSE, add = FALSE, verbose = FALSE) +memory_layer(id, shared_env, output = FALSE, add = FALSE, verbose = FALSE) } \arguments{ \item{id}{A unique id to use as a key in the storage list.} +\item{shared_env}{A shared environment for all instances of memory_layer where the inputs are stored.} + \item{output}{Boolean variable indicating if the memory layer is to store input in storage or extract from storage.} \item{add}{Boolean variable indicating if the extracted value are to be added or concatenated to the input. @@ -17,29 +19,34 @@ Only applicable when \code{output = TRUE}.} \item{verbose}{Boolean variable indicating if we want to give printouts to the user.} } \description{ -The layer is used to make skip-connections inside a \link[torch:nn_sequential]{torch::nn_sequential} network -or between several \link[torch:nn_sequential]{torch::nn_sequential} networks without unnecessary code complication. +The layer is used to make skip-connections inside a \code{\link[torch:nn_sequential]{torch::nn_sequential()}} network +or between several \code{\link[torch:nn_sequential]{torch::nn_sequential()}} networks without unnecessary code complication. } \details{ -If \code{output = FALSE}, this layer stores its input in a static list \code{storage} with the key \verb{id`` and then passes the input to the next layer. I.e., when memory layer is used in the masked encoder. If }output = TRUE\verb{, this layer takes stored tensor from the storage. I.e., when memory layer is used in the decoder. If }add = TRUE\verb{, it returns sum of the stored vector and an }input\verb{, otherwise it returns their concatenation. If the tensor with specified }id\verb{is not in storage when the layer with}output = TRUE` is called, it would cause an exception. +If \code{output = FALSE}, this layer stores its input in the \code{shared_env} with the key \code{id} and then +passes the input to the next layer. I.e., when memory layer is used in the masked encoder. If \code{output = TRUE}, this +layer takes stored tensor from the storage. I.e., when memory layer is used in the decoder. If \code{add = TRUE}, it +returns sum of the stored vector and an \code{input}, otherwise it returns their concatenation. If the tensor with +specified \code{id} is not in storage when the layer with \code{output = TRUE} is called, it would cause an exception. 
} \examples{ \dontrun{ +memory_layer_env <- new.env() net1 <- torch::nn_sequential( - memory_layer("#1"), - memory_layer("#0.1"), + memory_layer("#1", shared_env = memory_layer_env), + memory_layer("#0.1", shared_env = memory_layer_env), torch::nn_linear(512, 256), - torch::nn_leaky_relu(), - # here add cannot be TRUE because the dimensions mismatch - memory_layer("#0.1", output = TRUE, add = FALSE), + torch::nn_leaky_relu(), # Here add cannot be TRUE because the dimensions mismatch + memory_layer("#0.1", shared_env = memory_layer_env, output = TRUE, add = FALSE), torch::nn_linear(768, 256), # the dimension after the concatenation with skip-connection is 512 + 256 = 768 ) net2 <- torch::nn_equential( torch::nn_linear(512, 512), - memory_layer("#1", output = TRUE, add = TRUE), + memory_layer("#1", shared_env = memory_layer_env, output = TRUE, add = TRUE), ... ) +# Here a and c must be of correct dimensions, e.g., a = torch::torch_ones(1,512). b <- net1(a) d <- net2(c) # net2 must be called after net1, otherwise tensor '#1' will not be in storage. } diff --git a/man/shapr-package.Rd b/man/shapr-package.Rd index 0cf15f8e8..1041460af 100644 --- a/man/shapr-package.Rd +++ b/man/shapr-package.Rd @@ -30,9 +30,9 @@ Authors: Other contributors: \itemize{ - \item Anders Lland \email{Anders.Loland@nr.no} [contributor] + \item Anders Løland \email{Anders.Loland@nr.no} [contributor] \item Jens Christian Wahl \email{Jens.Christian.Wahl@nr.no} [contributor] - \item Camilla Lingjrde [contributor] + \item Camilla Lingjærde [contributor] \item Norsk Regnesentral [copyright holder, funder] } diff --git a/man/vaeac_get_extra_para_default.Rd b/man/vaeac_get_extra_para_default.Rd index d3a35736a..f2229c3b3 100644 --- a/man/vaeac_get_extra_para_default.Rd +++ b/man/vaeac_get_extra_para_default.Rd @@ -40,8 +40,9 @@ then a name will be generated based on \code{\link[base:Sys.time]{base::Sys.time ensure a valid file name for all operating systems.} \item{vaeac.folder_to_save_model}{String (default is \code{\link[base:tempfile]{base::tempdir()}}). String specifying a path to a folder where -the function is to save the fitted vaeac model. Note that the path will be removed from the returned -\code{\link[=explain]{explain()}} object if \code{vaeac.save_model = FALSE}.} +the function is to save the fitted vaeac model. Note that the path will be removed from the returned +\code{\link[=explain]{explain()}} object if \code{vaeac.save_model = FALSE}. Furthermore, the model cannot be moved from its +original folder if we are to use the \code{\link[=vaeac_train_model_continue]{vaeac_train_model_continue()}} function to continue training the model.} \item{vaeac.pretrained_vaeac_model}{List or String (default is \code{NULL}). 1) Either a list of class \code{vaeac}, i.e., the list stored in \code{explanation$internal$parameters$vaeac} where \code{explanation} is the returned list diff --git a/vignettes/understanding_shapr_vaeac.Rmd b/vignettes/understanding_shapr_vaeac.Rmd index 6eb6088f9..79053b197 100644 --- a/vignettes/understanding_shapr_vaeac.Rmd +++ b/vignettes/understanding_shapr_vaeac.Rmd @@ -456,7 +456,7 @@ rbind( ## Progressr {#progress_bar} -As discussed in the main vignette, the `shapr` package provides two ways for recieving information about the progress +As discussed in the main vignette, the `shapr` package provides two ways for receiving information about the progress of the approach. First, the `shapr` package provides progress updates of the computation of the Shapley values through the `progressr` package. 
Second, the user can also get information by setting `verbose = 2` in `explain()`, which will print out extra information related to the `vaeac` approach. The `verbose` parameter works independently of the @@ -466,7 +466,6 @@ two examples here, and refer the reader to the main vignette for more detailed i By setting `verbose = 2`, we get messages about the progress of the `vaeac` approach. ```r -handlers("void") # To silence all progressr updates expl_with_messages <- explain( model = model, x_explain = x_explain, @@ -528,7 +527,7 @@ approach and only get the progress bars. See the main vignette for examples for ```r library(progressr) -progressr::handlers("cli") +progressr::handlers("cli") # Use `progressr::handlers("void")` to silence all `progressr` updates progressr::with_progress({ expl_with_progressr <- explain( model = model, diff --git a/vignettes/understanding_shapr_vaeac.Rmd.orig b/vignettes/understanding_shapr_vaeac.Rmd.orig index bb1673d90..9d12a64c5 100644 --- a/vignettes/understanding_shapr_vaeac.Rmd.orig +++ b/vignettes/understanding_shapr_vaeac.Rmd.orig @@ -367,7 +367,7 @@ rbind( ## Progressr {#progress_bar} -As discussed in the main vignette, the `shapr` package provides two ways for recieving information about the progress +As discussed in the main vignette, the `shapr` package provides two ways for receiving information about the progress of the approach. First, the `shapr` package provides progress updates of the computation of the Shapley values through the `progressr` package. Second, the user can also get information by setting `verbose = 2` in `explain()`, which will print out extra information related to the `vaeac` approach. The `verbose` parameter works independently of the @@ -376,7 +376,6 @@ two examples here, and refer the reader to the main vignette for more detailed i By setting `verbose = 2`, we get messages about the progress of the `vaeac` approach. ```{r progressr-false-verbose-2, cache = TRUE} -handlers("void") # To silence all progressr updates expl_with_messages <- explain( model = model, x_explain = x_explain, @@ -396,7 +395,7 @@ step for the final `vaeac` model. Note that one can set `verbose = 0` to not get approach and only get the progress bars. See the main vignette for examples for how to change the progress bar. ```{r progressr-true-verbose-2, cache = TRUE} library(progressr) -progressr::handlers("cli") +progressr::handlers("cli") # Use `progressr::handlers("void")` to silence all `progressr` updates progressr::with_progress({ expl_with_progressr <- explain( model = model,