From ddd32c7c92ba9f37c8505720129d7e10979ccc4c Mon Sep 17 00:00:00 2001 From: "Lars H. B. Olsen" <92097196+LHBO@users.noreply.github.com> Date: Thu, 18 Apr 2024 09:14:42 +0200 Subject: [PATCH] Restructured torch modules to support shapr installation without torch (#393) * Updated the mask generators * Fixed the Neural Network Modules * Fixed Dataset Utility Functions * Added check that progressr is installed, otherwise we proceed without a progress_bar * Fixed vaeac and memory layer * Missed default value in `mcar_mask_generator`. * Manuals * Typos in the vaeac vignettes * Updated the documentation to clearly state that a vaeac model cannot be moved from the folder it was trained in if one want to continue to train it. This is a limitation that I should consider fixing. But I am unsure how often this will occur. Also made sure that continue train works if the explanation object was trained by giving a path. * styler + lintr * Added self as global variable --- R/approach_vaeac.R | 22 +- R/approach_vaeac_torch_modules.R | 2375 +++++++++--------- R/zzz.R | 1 + man/memory_layer.Rd | 27 +- man/shapr-package.Rd | 4 +- man/vaeac_get_extra_para_default.Rd | 5 +- vignettes/understanding_shapr_vaeac.Rmd | 5 +- vignettes/understanding_shapr_vaeac.Rmd.orig | 5 +- 8 files changed, 1270 insertions(+), 1174 deletions(-) diff --git a/R/approach_vaeac.R b/R/approach_vaeac.R index e997d8f5f..4ba03ba20 100644 --- a/R/approach_vaeac.R +++ b/R/approach_vaeac.R @@ -438,7 +438,11 @@ vaeac_train_model <- function(x_train, best_vlb <- -Inf # Create a `progressr::progressor()` to keep track of the overall training time of the vaeac approach - progressr_bar <- progressr::progressor(steps = epochs_initiation_phase * (n_vaeacs_initialize - 1) + epochs) + if (requireNamespace("progressr", quietly = TRUE)) { + progressr_bar <- progressr::progressor(steps = epochs_initiation_phase * (n_vaeacs_initialize - 1) + epochs) + } else { + progressr_bar <- NULL + } # Iterate over the initializations. initialization_idx <- 1 @@ -835,9 +839,10 @@ vaeac_train_model_continue <- function(explanation, # Set seed for reproducibility set.seed(seed) - # Extract the vaeac list and load the model at the last epoch + # Extract the vaeac list and load the model at the last epoch or the best (default 'best' when path is provided) vaeac_model <- explanation$internal$parameters$vaeac - checkpoint <- torch::torch_load(vaeac_model$models$last) + vaeac_model_path <- if (!is.null(vaeac_model$models$last)) vaeac_model$models$last else vaeac_model$models$best + checkpoint <- torch::torch_load(vaeac_model_path) # Get which device we are to continue to train the model device <- ifelse(checkpoint$cuda, "cuda", "cpu") @@ -939,7 +944,11 @@ vaeac_train_model_continue <- function(explanation, state_list$epochs <- epochs # Create a `progressr::progressor()` to keep track of the new training - progressr_bar <- progressr::progressor(steps = epochs_new) + if (requireNamespace("progressr", quietly = TRUE)) { + progressr_bar <- progressr::progressor(steps = epochs_new) + } else { + progressr_bar <- NULL + } # Train the vaeac model for `epochs_new` number of epochs vaeac_tmp <- vaeac_train_model_auxiliary( @@ -1617,8 +1626,9 @@ vaeac_check_parameters <- function(x_train, #' then a name will be generated based on [base::Sys.time()] to ensure a unique name. We use [base::make.names()] to #' ensure a valid file name for all operating systems. #' @param vaeac.folder_to_save_model String (default is [base::tempdir()]). 
String specifying a path to a folder where -#' the function is to save the fitted vaeac model. Note that the path will be removed from the returned -#' [shapr::explain()] object if `vaeac.save_model = FALSE`. +#' the function is to save the fitted vaeac model. Note that the path will be removed from the returned +#' [shapr::explain()] object if `vaeac.save_model = FALSE`. Furthermore, the model cannot be moved from its +#' original folder if we are to use the [shapr::vaeac_train_model_continue()] function to continue training the model. #' @param vaeac.pretrained_vaeac_model List or String (default is `NULL`). 1) Either a list of class #' `vaeac`, i.e., the list stored in `explanation$internal$parameters$vaeac` where `explanation` is the returned list #' from an earlier call to the [shapr::explain()] function. 2) A string containing the path to where the `vaeac` diff --git a/R/approach_vaeac_torch_modules.R b/R/approach_vaeac_torch_modules.R index 45c89d3d6..e353327ab 100644 --- a/R/approach_vaeac_torch_modules.R +++ b/R/approach_vaeac_torch_modules.R @@ -1,5 +1,4 @@ -# VAEAC Model ========================================================================================================= -## vaeac -------------------------------------------------------------------------------------------------------------- +# VAEAC Model ========================================================================================================== #' Initializing a vaeac model #' #' @description Class that represents a vaeac model, i.e., the class creates the neural networks in the vaeac @@ -117,572 +116,531 @@ #' #' @author Lars Henry Berge Olsen #' @keywords internal -vaeac <- torch::nn_module( - - # Name of the torch::nn_module object - classname = "vaeac", - - # Initializing a vaeac model - initialize = function(one_hot_max_sizes, - width = 32, - depth = 3, - latent_dim = 8, - activation_function = torch::nn_relu, - skip_conn_layer = FALSE, - skip_conn_masked_enc_dec = FALSE, - batch_normalization = FALSE, - paired_sampling = FALSE, - mask_generator_name = c( - "mcar_mask_generator", - "specified_prob_mask_generator", - "specified_masks_mask_generator" - ), - masking_ratio = 0.5, - mask_gen_coalitions = NULL, - mask_gen_coalitions_prob = NULL, - sigma_mu = 1e4, - sigma_sigma = 1e-4) { - # Check that a valid mask_generator was provided. - mask_generator_name <- match.arg(mask_generator_name) - - # Get the number of features - n_features <- length(one_hot_max_sizes) - - # Extra strings to add to names of layers depending on if we use memory layers and/or batch normalization. - # If FALSE, they are just an empty string and do not effect the names. - name_extra_memory_layer <- ifelse(skip_conn_masked_enc_dec, "_and_memory", "") - name_extra_batch_normalize <- ifelse(batch_normalization, "_and_batch_norm", "") - - # Save some of the initializing hyperparameters to the vaeac object. Others are saved later. - self$one_hot_max_sizes <- one_hot_max_sizes - self$depth <- depth - self$width <- width - self$latent_dim <- latent_dim - self$activation_function <- activation_function - self$skip_conn_layer <- skip_conn_layer - self$skip_conn_masked_enc_dec <- skip_conn_masked_enc_dec - self$batch_normalization <- batch_normalization - self$sigma_mu <- sigma_mu - self$sigma_sigma <- sigma_sigma - self$paired_sampling <- paired_sampling - - # Save the how to compute the loss and how to sample from the vaeac model. 
- self$reconstruction_log_prob <- gauss_cat_loss(one_hot_max_sizes) - self$sampler_most_likely <- gauss_cat_sampler_most_likely(one_hot_max_sizes) - self$sampler_random <- gauss_cat_sampler_random(one_hot_max_sizes) - self$generative_parameters <- gauss_cat_parameters(one_hot_max_sizes) - self$n_features <- n_features - self$vlb_scale_factor <- 1 / n_features - - ##### Generate the mask generator - if (mask_generator_name == "mcar_mask_generator") { - # Create a mcar_mask_generator and attach it to the vaeac object. Note that masking_ratio is a singleton here. - self$mask_generator <- mcar_mask_generator( - masking_ratio = masking_ratio, - paired_sampling = paired_sampling - ) +vaeac <- function(one_hot_max_sizes, + width = 32, + depth = 3, + latent_dim = 8, + activation_function = torch::nn_relu, + skip_conn_layer = FALSE, + skip_conn_masked_enc_dec = FALSE, + batch_normalization = FALSE, + paired_sampling = FALSE, + mask_generator_name = c( + "mcar_mask_generator", + "specified_prob_mask_generator", + "specified_masks_mask_generator" + ), + masking_ratio = 0.5, + mask_gen_coalitions = NULL, + mask_gen_coalitions_prob = NULL, + sigma_mu = 1e4, + sigma_sigma = 1e-4) { + # Check that a valid mask_generator was provided. + mask_generator_name <- match.arg(mask_generator_name) + + vaeac_tmp <- torch::nn_module( + # Name of the torch::nn_module object + classname = "vaeac", + + # Initializing a vaeac model + initialize = function(one_hot_max_sizes, + width, + depth, + latent_dim, + activation_function, + skip_conn_layer, + skip_conn_masked_enc_dec, + batch_normalization, + paired_sampling, + mask_generator_name, + masking_ratio, + mask_gen_coalitions, + mask_gen_coalitions_prob, + sigma_mu, + sigma_sigma) { + # Get the number of features + n_features <- length(one_hot_max_sizes) + + # Extra strings to add to names of layers depending on if we use memory layers and/or batch normalization. + # If FALSE, they are just an empty string and do not effect the names. + name_extra_memory_layer <- ifelse(skip_conn_masked_enc_dec, "_and_memory", "") + name_extra_batch_normalize <- ifelse(batch_normalization, "_and_batch_norm", "") + + # Set up an environment that the memory_layer objects will use as "memory", i.e., where they store the tensors. + memory_layer_env <- new.env() + + # Save some of the initializing hyperparameters to the vaeac object. Others are saved later. + self$one_hot_max_sizes <- one_hot_max_sizes + self$depth <- depth + self$width <- width + self$latent_dim <- latent_dim + self$activation_function <- activation_function + self$skip_conn_layer <- skip_conn_layer + self$skip_conn_masked_enc_dec <- skip_conn_masked_enc_dec + self$batch_normalization <- batch_normalization + self$sigma_mu <- sigma_mu + self$sigma_sigma <- sigma_sigma + self$paired_sampling <- paired_sampling + + # Save the how to compute the loss and how to sample from the vaeac model. + self$reconstruction_log_prob <- gauss_cat_loss(one_hot_max_sizes) + self$sampler_most_likely <- gauss_cat_sampler_most_likely(one_hot_max_sizes) + self$sampler_random <- gauss_cat_sampler_random(one_hot_max_sizes) + self$generative_parameters <- gauss_cat_parameters(one_hot_max_sizes) + self$n_features <- n_features + self$vlb_scale_factor <- 1 / n_features + + ##### Generate the mask generator + if (mask_generator_name == "mcar_mask_generator") { + # Create a mcar_mask_generator and attach it to the vaeac object. Note that masking_ratio is a singleton here. 
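# Illustrative aside (base R, not from the shapr sources): an MCAR
# ("missing completely at random") mask is assumed here to flag each entry
# independently with the same probability, with `masking_ratio` playing the
# role of that probability; a 1 marks an entry as masked/unknown for vaeac.
illustrative_mcar_mask <- matrix(rbinom(5 * 3, size = 1, prob = 0.5), nrow = 5, ncol = 3)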
+ self$mask_generator <- mcar_mask_generator( + masking_ratio = masking_ratio, + paired_sampling = paired_sampling + ) - # Attach the masking ratio to the vaeac object. - self$masking_ratio <- masking_ratio - } else if (mask_generator_name == "specified_prob_mask_generator") { - # Create a specified_prob_mask_generator and attach it to the vaeac object. - # Note that masking_ratio is an array here. - self$mask_generator <- specified_prob_mask_generator( - masking_probs = masking_ratio, - paired_sampling = paired_sampling - ) + # Attach the masking ratio to the vaeac object. + self$masking_ratio <- masking_ratio + } else if (mask_generator_name == "specified_prob_mask_generator") { + # Create a specified_prob_mask_generator and attach it to the vaeac object. + # Note that masking_ratio is an array here. + self$mask_generator <- specified_prob_mask_generator( + masking_probs = masking_ratio, + paired_sampling = paired_sampling + ) + + # Attach the masking probabilities to the vaeac object. + self$masking_probs <- masking_ratio + } else if (mask_generator_name == "specified_masks_mask_generator") { + # Small check that they have been provided. + if (is.null(mask_gen_coalitions) | is.null(mask_gen_coalitions_prob)) { + stop(paste0( + "Both 'mask_gen_coalitions' and 'mask_gen_coalitions_prob' ", + "must be provided when using 'specified_masks_mask_generator'." + )) + } - # Attach the masking probabilities to the vaeac object. - self$masking_probs <- masking_ratio - } else if (mask_generator_name == "specified_masks_mask_generator") { - # Small check that they have been provided. - if (is.null(mask_gen_coalitions) | is.null(mask_gen_coalitions_prob)) { + # Create a specified_masks_mask_generator and attach it to the vaeac object. + self$mask_generator <- specified_masks_mask_generator( + masks = mask_gen_coalitions, + masks_probs = mask_gen_coalitions_prob, + paired_sampling = paired_sampling + ) + + # Save the possible masks and corresponding probabilities to the vaeac object. + self$masks <- mask_gen_coalitions + self$masks_probs <- mask_gen_coalitions_prob + } else { + # Print error to user. stop(paste0( - "Both 'mask_gen_coalitions' and 'mask_gen_coalitions_prob' ", - "must be provided when using 'specified_masks_mask_generator'." + "`mask_generator_name` must be one of 'mcar_mask_generator', 'specified_prob_mask_generator', or ", + "'specified_masks_mask_generator', and not '", mask_generator_name, "'." )) } - # Create a specified_masks_mask_generator and attach it to the vaeac object. - self$mask_generator <- specified_masks_mask_generator( - masks = mask_gen_coalitions, - masks_probs = mask_gen_coalitions_prob, - paired_sampling = paired_sampling - ) - - # Save the possible masks and corresponding probabilities to the vaeac object. - self$masks <- mask_gen_coalitions - self$masks_probs <- mask_gen_coalitions_prob - } else { - # Print error to user. - stop(paste0( - "`mask_generator_name` must be one of 'mcar_mask_generator', 'specified_prob_mask_generator', or ", - "'specified_masks_mask_generator', and not '", mask_generator_name, "'." 
- )) - } - - ##### Full Encoder - full_encoder_network <- torch::nn_sequential() + ##### Full Encoder + full_encoder_network <- torch::nn_sequential() - # Full Encoder: Input layer - full_encoder_network$add_module( - module = categorical_to_one_hot_layer(c(one_hot_max_sizes, rep(0, n_features)), seq(n_features)), - name = "input_layer_cat_to_one_hot" - ) - full_encoder_network$add_module( - module = torch::nn_linear( - in_features = sum(apply(rbind(one_hot_max_sizes, rep(1, n_features)), 2, max)) + n_features * 2, - out_features = width - ), - name = "input_layer_linear" - ) - full_encoder_network$add_module( - module = activation_function(), - name = "input_layer_layer_activation" - ) - if (batch_normalization) { + # Full Encoder: Input layer full_encoder_network$add_module( - module = torch::nn_batch_norm1d(width), - name = "input_layer_layer_batch_norm" + module = categorical_to_one_hot_layer(c(one_hot_max_sizes, rep(0, n_features)), seq(n_features)), + name = "input_layer_cat_to_one_hot" ) - } - - # Full Encoder: Hidden layers - for (i in seq(depth)) { - if (skip_conn_layer) { - # Add identity skip connection. Such that the input is added to the output of the linear layer - # and activation function: output = X + activation(WX + b). - full_encoder_network$add_module( - module = skip_connection( - torch::nn_linear(width, width), - activation_function(), - if (batch_normalization) torch::nn_batch_norm1d(width) - ), - name = paste0("hidden_layer_", i, "_skip_conn_with_linear_and_activation", name_extra_batch_normalize) - ) - } else { - # Do not use skip connections and do not add the input to the output. - full_encoder_network$add_module( - module = torch::nn_linear(width, width), - name = paste0("hidden_layer_", i, "_linear") - ) + full_encoder_network$add_module( + module = torch::nn_linear( + in_features = sum(apply(rbind(one_hot_max_sizes, rep(1, n_features)), 2, max)) + n_features * 2, + out_features = width + ), + name = "input_layer_linear" + ) + full_encoder_network$add_module( + module = activation_function(), + name = "input_layer_layer_activation" + ) + if (batch_normalization) { full_encoder_network$add_module( - module = activation_function(), - name = paste0("hidden_layer_", i, "_activation") + module = torch::nn_batch_norm1d(width), + name = "input_layer_layer_batch_norm" ) - if (batch_normalization) { + } + + # Full Encoder: Hidden layers + for (i in seq(depth)) { + if (skip_conn_layer) { + # Add identity skip connection. Such that the input is added to the output of the linear layer + # and activation function: output = X + activation(WX + b). full_encoder_network$add_module( - module = torch::nn_batch_norm1d(width), - name = paste0("hidden_layer_", i, "_batch_norm") + module = skip_connection( + torch::nn_linear(width, width), + activation_function(), + if (batch_normalization) torch::nn_batch_norm1d(width) + ), + name = paste0("hidden_layer_", i, "_skip_conn_with_linear_and_activation", name_extra_batch_normalize) ) + } else { + # Do not use skip connections and do not add the input to the output. 
+ full_encoder_network$add_module( + module = torch::nn_linear(width, width), + name = paste0("hidden_layer_", i, "_linear") + ) + full_encoder_network$add_module( + module = activation_function(), + name = paste0("hidden_layer_", i, "_activation") + ) + if (batch_normalization) { + full_encoder_network$add_module( + module = torch::nn_batch_norm1d(width), + name = paste0("hidden_layer_", i, "_batch_norm") + ) + } } } - } - # Full Encoder: Go to latent space - full_encoder_network$add_module( - module = torch::nn_linear(width, latent_dim * 2), - name = "latent_space_layer_linear" - ) + # Full Encoder: Go to latent space + full_encoder_network$add_module( + module = torch::nn_linear(width, latent_dim * 2), + name = "latent_space_layer_linear" + ) - ##### Masked Encoder - masked_encoder_network <- torch::nn_sequential() + ##### Masked Encoder + masked_encoder_network <- torch::nn_sequential() - # Masked Encoder: Input layer - masked_encoder_network$add_module( - module = categorical_to_one_hot_layer(c(one_hot_max_sizes, rep(0, n_features))), - name = "input_layer_cat_to_one_hot" - ) - if (skip_conn_masked_enc_dec) { + # Masked Encoder: Input layer masked_encoder_network$add_module( - module = memory_layer("#input"), - name = "input_layer_memory" + module = categorical_to_one_hot_layer(c(one_hot_max_sizes, rep(0, n_features))), + name = "input_layer_cat_to_one_hot" ) - } - masked_encoder_network$add_module( - module = torch::nn_linear( - in_features = sum(apply(rbind(one_hot_max_sizes, rep(1, n_features)), 2, max)) + n_features, - out_features = width - ), - name = "input_layer_linear" - ) - masked_encoder_network$add_module( - module = activation_function(), - name = "input_layer_activation" - ) - if (batch_normalization) { + if (skip_conn_masked_enc_dec) { + masked_encoder_network$add_module( + module = memory_layer(id = "#input", shared_env = memory_layer_env), + name = "input_layer_memory" + ) + } masked_encoder_network$add_module( - module = torch::nn_batch_norm1d(width), - name = "input_layer_batch_norm" + module = torch::nn_linear( + in_features = sum(apply(rbind(one_hot_max_sizes, rep(1, n_features)), 2, max)) + n_features, + out_features = width + ), + name = "input_layer_linear" ) - } - - # Masked Encoder: Hidden layers - for (i in seq(depth)) { - if (skip_conn_layer) { - # Add identity skip connection. Such that the input is added to the output of the linear layer - # and activation function: output = X + activation(WX + b). - # Also check inside skip_connection if we are to use memory_layer. I.e., skip connection with - # concatenation from masked encoder to decoder. + masked_encoder_network$add_module( + module = activation_function(), + name = "input_layer_activation" + ) + if (batch_normalization) { masked_encoder_network$add_module( - module = skip_connection( - if (skip_conn_masked_enc_dec) memory_layer(paste0("#", i)), - torch::nn_linear(width, width), - activation_function() - ), - name = paste0("hidden_layer_", i, "_skip_conn_with_linear_and_activation", name_extra_memory_layer) + module = torch::nn_batch_norm1d(width), + name = "input_layer_batch_norm" ) - if (batch_normalization) { + } + + # Masked Encoder: Hidden layers + for (i in seq(depth)) { + if (skip_conn_layer) { + # Add identity skip connection. Such that the input is added to the output of the linear layer + # and activation function: output = X + activation(WX + b). + # Also check inside skip_connection if we are to use memory_layer. I.e., skip connection with + # concatenation from masked encoder to decoder. 
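# Illustrative aside (not the shapr implementation): a minimal sketch of the
# memory-layer idea used below, assuming the behaviour described in these
# comments. A layer created with a given `id` and `shared_env` stores its input
# in the shared environment on the masked-encoder side, while its decoder twin
# (created with `output = TRUE`) retrieves the stored tensor and concatenates
# it to its own input, giving the U-net style skip connection.
sketch_memory_layer <- torch::nn_module(
  classname = "sketch_memory_layer",
  initialize = function(id, shared_env, output = FALSE) {
    self$id <- id
    self$shared_env <- shared_env
    self$output <- output
  },
  forward = function(x) {
    if (!self$output) {
      assign(self$id, x, envir = self$shared_env) # Encoder side: remember the input tensor
      x
    } else {
      stored <- get(self$id, envir = self$shared_env) # Decoder side: fetch the remembered tensor
      torch::torch_cat(list(stored, x), dim = -1) # Concatenate along the feature dimension
    }
  }
)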
masked_encoder_network$add_module( - module = torch::nn_batch_norm1d(width), - name = paste0("hidden_layer_", i, "_batch_norm") + module = skip_connection( + if (skip_conn_masked_enc_dec) memory_layer(id = paste0("#", i), shared_env = memory_layer_env), + torch::nn_linear(width, width), + activation_function() + ), + name = paste0("hidden_layer_", i, "_skip_conn_with_linear_and_activation", name_extra_memory_layer) ) - } - } else { - # Do not use skip connections and do not add the input to the output. - if (skip_conn_masked_enc_dec) { + if (batch_normalization) { + masked_encoder_network$add_module( + module = torch::nn_batch_norm1d(width), + name = paste0("hidden_layer_", i, "_batch_norm") + ) + } + } else { + # Do not use skip connections and do not add the input to the output. + if (skip_conn_masked_enc_dec) { + masked_encoder_network$add_module( + module = memory_layer(id = paste0("#", i), shared_env = memory_layer_env), + name = paste0("hidden_layer_", i, "_memory") + ) + } masked_encoder_network$add_module( - module = memory_layer(paste0("#", i)), - name = paste0("hidden_layer_", i, "_memory") + module = torch::nn_linear(width, width), + name = paste0("hidden_layer_", i, "_linear") ) - } - masked_encoder_network$add_module( - module = torch::nn_linear(width, width), - name = paste0("hidden_layer_", i, "_linear") - ) - masked_encoder_network$add_module( - module = activation_function(), - name = paste0("hidden_layer_", i, "_activation") - ) - if (batch_normalization) { masked_encoder_network$add_module( - module = torch::nn_batch_norm1d(width), - name = paste0("hidden_layer_", i, "_batch_norm") + module = activation_function(), + name = paste0("hidden_layer_", i, "_activation") ) + if (batch_normalization) { + masked_encoder_network$add_module( + module = torch::nn_batch_norm1d(width), + name = paste0("hidden_layer_", i, "_batch_norm") + ) + } } } - } - # Masked Encoder: Go to latent space - if (skip_conn_masked_enc_dec) { + # Masked Encoder: Go to latent space + if (skip_conn_masked_enc_dec) { + masked_encoder_network$add_module( + module = memory_layer(id = paste0("#", depth + 1), shared_env = memory_layer_env), + name = "latent_space_layer_memory" + ) + } masked_encoder_network$add_module( - module = memory_layer(paste0("#", depth + 1)), - name = "latent_space_layer_memory" + module = torch::nn_linear(width, 2 * latent_dim), + name = "latent_space_layer_linear" ) - } - masked_encoder_network$add_module( - module = torch::nn_linear(width, 2 * latent_dim), - name = "latent_space_layer_linear" - ) - ##### Decoder - decoder_network <- torch::nn_sequential() + ##### Decoder + decoder_network <- torch::nn_sequential() - # Decoder: Go from latent space - decoder_network$add_module( - module = torch::nn_linear(latent_dim, width), - name = "latent_space_layer_linear" - ) - decoder_network$add_module( - module = activation_function(), - name = "latent_space_layer_activation" - ) - if (batch_normalization) { + # Decoder: Go from latent space decoder_network$add_module( - module = torch::nn_batch_norm1d(width), - name = "latent_space_layer_batch_norm" + module = torch::nn_linear(latent_dim, width), + name = "latent_space_layer_linear" ) - } - - # Get the width of the hidden layers in the decoder. Needs to be multiplied with two if - # we use skip connections between masked encoder and decoder as we concatenate the tensors. - width_decoder <- ifelse(skip_conn_masked_enc_dec, 2 * width, width) - - # Same for the input dimension to the last layer in decoder that yields the distribution params. 
- extra_params_skip_con_mask_enc <- - ifelse(test = skip_conn_masked_enc_dec, - yes = sum(apply(rbind(one_hot_max_sizes, rep(1, n_features)), 2, max)) + n_features, - no = 0 + decoder_network$add_module( + module = activation_function(), + name = "latent_space_layer_activation" ) - - # Will need an extra hidden layer if we use skip connection from masked encoder to decoder - # as we send the full input layer of the masked encoder to the last layer in the decoder. - depth_decoder <- ifelse(skip_conn_masked_enc_dec, depth + 1, depth) - - # Decoder: Hidden layers - for (i in seq(depth_decoder)) { - if (skip_conn_layer) { - # Add identity skip connection. Such that the input is added to the output of the linear layer and activation - # function: output = X + activation(WX + b). Also check inside skip_connection if we are to use memory_layer. - # I.e., skip connection with concatenation from masked encoder to decoder. If TRUE, then the memory layers - # extracts the corresponding input used in the masked encoder and concatenate them with the current input. Note - # that we add the memory layers in the opposite direction from how they were created. So, we get a classical - # U-net with latent space at the bottom and a connection between the layers on the same height of the U-shape. + if (batch_normalization) { decoder_network$add_module( - module = torch::nn_sequential( - skip_connection( - if (skip_conn_masked_enc_dec) { - memory_layer(paste0("#", depth - i + 2), TRUE) - }, - torch::nn_linear(width_decoder, width), - activation_function() - ) - ), - name = paste0("hidden_layer_", i, "_skip_conn_with_linear_and_activation", name_extra_memory_layer) + module = torch::nn_batch_norm1d(width), + name = "latent_space_layer_batch_norm" ) - if (batch_normalization) { + } + + # Get the width of the hidden layers in the decoder. Needs to be multiplied with two if + # we use skip connections between masked encoder and decoder as we concatenate the tensors. + width_decoder <- ifelse(skip_conn_masked_enc_dec, 2 * width, width) + + # Same for the input dimension to the last layer in decoder that yields the distribution params. + extra_params_skip_con_mask_enc <- + ifelse(test = skip_conn_masked_enc_dec, + yes = sum(apply(rbind(one_hot_max_sizes, rep(1, n_features)), 2, max)) + n_features, + no = 0 + ) + + # Will need an extra hidden layer if we use skip connection from masked encoder to decoder + # as we send the full input layer of the masked encoder to the last layer in the decoder. + depth_decoder <- ifelse(skip_conn_masked_enc_dec, depth + 1, depth) + + # Decoder: Hidden layers + for (i in seq(depth_decoder)) { + if (skip_conn_layer) { + # Add identity skip connection. Such that the input is added to the output of the linear layer and activation + # function: output = X + activation(WX + b). Also check inside skip_connection if we are to use memory_layer. + # I.e., skip connection with concatenation from masked encoder to decoder. If TRUE, then the memory layers + # extracts the corresponding input used in the masked encoder and concatenate them with the current input. + # We add the memory layers in the opposite direction from how they were created. Thus, we get a classical + # U-net with latent space at the bottom and a connection between the layers on the same height of the U-shape. 
decoder_network$add_module( - module = torch::nn_batch_norm1d(n_features = width), - name = paste0("hidden_layer_", i, "_batch_norm") + module = torch::nn_sequential( + skip_connection( + if (skip_conn_masked_enc_dec) { + memory_layer(id = paste0("#", depth - i + 2), shared_env = memory_layer_env, output = TRUE) + }, + torch::nn_linear(width_decoder, width), + activation_function() + ) + ), + name = paste0("hidden_layer_", i, "_skip_conn_with_linear_and_activation", name_extra_memory_layer) ) - } - } else { - # Do not use skip connections and do not add the input to the output. - if (skip_conn_masked_enc_dec) { + if (batch_normalization) { + decoder_network$add_module( + module = torch::nn_batch_norm1d(n_features = width), + name = paste0("hidden_layer_", i, "_batch_norm") + ) + } + } else { + # Do not use skip connections and do not add the input to the output. + if (skip_conn_masked_enc_dec) { + decoder_network$add_module( + module = memory_layer(id = paste0("#", depth - i + 2), shared_env = memory_layer_env, output = TRUE), + name = paste0("hidden_layer_", i, "_memory") + ) + } decoder_network$add_module( - module = memory_layer(paste0("#", depth - i + 2), TRUE), - name = paste0("hidden_layer_", i, "_memory") + module = torch::nn_linear(width_decoder, width), + name = paste0("hidden_layer_", i, "_linear") ) - } - decoder_network$add_module( - module = torch::nn_linear(width_decoder, width), - name = paste0("hidden_layer_", i, "_linear") - ) - decoder_network$add_module( - module = activation_function(), - name = paste0("hidden_layer_", i, "_activation") - ) - if (batch_normalization) { decoder_network$add_module( - module = torch::nn_batch_norm1d(width), - name = paste0("hidden_layer_", i, "_batch_norm") + module = activation_function(), + name = paste0("hidden_layer_", i, "_activation") ) + if (batch_normalization) { + decoder_network$add_module( + module = torch::nn_batch_norm1d(width), + name = paste0("hidden_layer_", i, "_batch_norm") + ) + } } } - } - # Decoder: Go the parameter space of the generative distributions - # Concatenate the input to the first layer of the masked encoder to the last layer of the decoder network. - if (skip_conn_masked_enc_dec) { + # Decoder: Go the parameter space of the generative distributions + # Concatenate the input to the first layer of the masked encoder to the last layer of the decoder network. + if (skip_conn_masked_enc_dec) { + decoder_network$add_module( + module = memory_layer(id = "#input", shared_env = memory_layer_env, output = TRUE), + name = "output_layer_memory" + ) + } + # Linear layer to the parameters of the generative distributions Gaussian and Categorical. + # Note that sum(apply(rbind(one_hot_max_sizes, rep(1, n_features)), 2, max)) is the number of + # one hot variables to the masked encoder and n_features represents the binary variables if + # the features was masked/missing or not when they entered the masked encoder. + # The output dimension is 2 for the continuous features and K_i for categorical feature X_i, + # where K_i is the number of classes the i'th categorical feature can take on. decoder_network$add_module( - module = memory_layer("#input", TRUE), - name = "output_layer_memory" + module = torch::nn_linear( + in_features = width + extra_params_skip_con_mask_enc, + out_features = sum(apply(rbind(one_hot_max_sizes, rep(2, n_features)), 2, max)) + ), + name = "output_layer_linear" ) - } - # Linear layer to the parameters of the generative distributions Gaussian and Categorical. 
- # Note that sum(apply(rbind(one_hot_max_sizes, rep(1, n_features)), 2, max)) is the number of - # one hot variables to the masked encoder and n_features represents the binary variables if - # the features was masked/missing or not when they entered the masked encoder. - # The output dimension is 2 for the continuous features and K_i for categorical feature X_i, - # where K_i is the number of classes the i'th categorical feature can take on. - decoder_network$add_module( - module = torch::nn_linear( - in_features = width + extra_params_skip_con_mask_enc, - out_features = sum(apply(rbind(one_hot_max_sizes, rep(2, n_features)), 2, max)) - ), - name = "output_layer_linear" - ) - # Save the networks to the vaeac object - self$full_encoder_network <- full_encoder_network - self$masked_encoder_network <- masked_encoder_network - self$decoder_network <- decoder_network - - # Compute the number of trainable parameters in the different networks and save them - n_para_full_encoder <- sum(sapply(full_encoder_network$parameters, function(p) prod(p$size()))) - n_para_masked_encoder <- sum(sapply(masked_encoder_network$parameters, function(p) prod(p$size()))) - n_para_decoder <- sum(sapply(decoder_network$parameters, function(p) prod(p$size()))) - n_para_total <- n_para_full_encoder + n_para_masked_encoder + n_para_decoder - self$n_train_param <- rbind(n_para_total, n_para_full_encoder, n_para_masked_encoder, n_para_decoder) - }, - - # Forward functions are required in torch::nn_modules, but is it not needed in the way we have implemented vaeac. - forward = function(...) { - warning("NO FORWARD FUNCTION IMPLEMENTED FOR VAEAC.") - return("NO FORWARD FUNCTION IMPLEMENTED FOR VAEAC.") - }, - - # Apply Mask to Batch to Create Observed Batch - # - # description Clones the batch and applies the mask to set masked entries to 0 to create the observed batch. - # - # param batch Tensor of dimension batch_size x n_features containing a batch of observations. - # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. - make_observed = function(batch, mask) { - observed <- batch$clone()$detach() # Clone and detach the batch from the graph (remove gradient element) - observed[mask == 1] <- 0 # Apply the mask by masking every entry in batch where 'mask' is 1. - return(observed) # Return the observed batch where masked entries are set to 0. - }, - - # Compute the Latent Distributions Inferred by the Encoders - # - # description Compute the parameters for the latent normal distributions inferred by the encoders. - # If `only_masked_encoder = TRUE`, then we only compute the latent normal distributions inferred by the - # masked encoder. This is used in the deployment phase when we do not have access to the full observation. - # - # param batch Tensor of dimension batch_size x n_features containing a batch of observations. - # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. - # param only_masked_encoder Boolean. If we are only to compute the latent distributions for the masked encoder. - # Used in deployment phase when we do not have access to the full data. Always FALSE in the training phase. - make_latent_distributions = function(batch, mask, only_masked_encoder = FALSE) { - # Artificially mask the observations where mask == 1 to create the observed batch values. 
- observed <- self$make_observed(batch = batch, mask = mask) - - # Check if we are in training or deployment phase - if (only_masked_encoder) { - # In deployment phase and only use the masked encoder. - full_encoder <- NULL - } else { - # In the training phase where we need to use both masked and full encoder. - - # Column bind the batch and the mask to create the full information sent to the full encoder. - full_info <- torch::torch_cat(c(batch, mask), dim = 2) - - # Send the full_information through the full encoder. It needs the full information to know if a - # value is missing or just masked. The output tensor is of shape batch_size x (2 x latent_dim) - # In each row, i.e., each observation in the batch, the first latent_dim entries are the means mu - # while the last latent_dim entries are the softplus of the sigmas, so they can take on any - # negative or positive value. Recall that softplus(x) = ln(1+e^{x}). - full_encoder_params <- self$full_encoder_network(full_info) - - # Takes the full_encoder_parameters and returns a normal distribution, which is component-wise - # independent. If sigma (after softmax transform) is less than 1e-3, then we set sigma to 0.001. - full_encoder <- vaeac_normal_parse_params(params = full_encoder_params, min_sigma = 1e-3) - } + # Save the networks to the vaeac object + self$full_encoder_network <- full_encoder_network + self$masked_encoder_network <- masked_encoder_network + self$decoder_network <- decoder_network + + # Compute the number of trainable parameters in the different networks and save them + n_para_full_encoder <- sum(sapply(full_encoder_network$parameters, function(p) prod(p$size()))) + n_para_masked_encoder <- sum(sapply(masked_encoder_network$parameters, function(p) prod(p$size()))) + n_para_decoder <- sum(sapply(decoder_network$parameters, function(p) prod(p$size()))) + n_para_total <- n_para_full_encoder + n_para_masked_encoder + n_para_decoder + self$n_train_param <- rbind(n_para_total, n_para_full_encoder, n_para_masked_encoder, n_para_decoder) + }, + + # Forward functions are required in torch::nn_modules, but is it not needed in the way we have implemented vaeac. + forward = function(...) { + warning("NO FORWARD FUNCTION IMPLEMENTED FOR VAEAC.") + return("NO FORWARD FUNCTION IMPLEMENTED FOR VAEAC.") + }, + + # Apply Mask to Batch to Create Observed Batch + # + # description Clones the batch and applies the mask to set masked entries to 0 to create the observed batch. + # + # param batch Tensor of dimension batch_size x n_features containing a batch of observations. + # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. + make_observed = function(batch, mask) { + observed <- batch$clone()$detach() # Clone and detach the batch from the graph (remove gradient element) + observed[mask == 1] <- 0 # Apply the mask by masking every entry in batch where 'mask' is 1. + return(observed) # Return the observed batch where masked entries are set to 0. + }, + + # Compute the Latent Distributions Inferred by the Encoders + # + # description Compute the parameters for the latent normal distributions inferred by the encoders. + # If `only_masked_encoder = TRUE`, then we only compute the latent normal distributions inferred by the + # masked encoder. This is used in the deployment phase when we do not have access to the full observation. + # + # param batch Tensor of dimension batch_size x n_features containing a batch of observations. 
+ # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. + # param only_masked_encoder Boolean. If we are only to compute the latent distributions for the masked encoder. + # Used in deployment phase when we do not have access to the full data. Always FALSE in the training phase. + make_latent_distributions = function(batch, mask, only_masked_encoder = FALSE) { + # Artificially mask the observations where mask == 1 to create the observed batch values. + observed <- self$make_observed(batch = batch, mask = mask) + + # Check if we are in training or deployment phase + if (only_masked_encoder) { + # In deployment phase and only use the masked encoder. + full_encoder <- NULL + } else { + # In the training phase where we need to use both masked and full encoder. + + # Column bind the batch and the mask to create the full information sent to the full encoder. + full_info <- torch::torch_cat(c(batch, mask), dim = 2) + + # Send the full_information through the full encoder. It needs the full information to know if a + # value is missing or just masked. The output tensor is of shape batch_size x (2 x latent_dim) + # In each row, i.e., each observation in the batch, the first latent_dim entries are the means mu + # while the last latent_dim entries are the softplus of the sigmas, so they can take on any + # negative or positive value. Recall that softplus(x) = ln(1+e^{x}). + full_encoder_params <- self$full_encoder_network(full_info) + + # Takes the full_encoder_parameters and returns a normal distribution, which is component-wise + # independent. If sigma (after softmax transform) is less than 1e-3, then we set sigma to 0.001. + full_encoder <- vaeac_normal_parse_params(params = full_encoder_params, min_sigma = 1e-3) + } - # Column bind the batch and the mask to create the observed information sent to the masked encoder. - observed_info <- torch::torch_cat(c(observed, mask), dim = -1) - - # Compute the latent normal dist parameters (mu, sigma) for the masked - # encoder by sending the observed values and the mask to the masked encoder. - masked_encoder_params <- self$masked_encoder_network(observed_info) - - # Create the latent normal distributions based on the parameters (mu, sigma) from the masked encoder - masked_encoder <- vaeac_normal_parse_params(params = masked_encoder_params, min_sigma = 1e-3) - - # Return the full and masked encoders - return(list( - full_encoder = full_encoder, - masked_encoder = masked_encoder - )) - }, - - # Compute the Regularizes for the Latent Distribution Inferred by the Masked Encoder. - # - # description The masked encoder (prior) distribution regularization in the latent space. - # This is used to compute the extended variational lower bound used to train vaeac, see - # Section 3.3.1 in Olsen et al. (2022). - # Though regularizing prevents the masked encoder distribution parameters from going to infinity, - # the model usually doesn't diverge even without this regularization. It almost doesn't affect - # learning process near zero with default regularization parameters which are recommended to be used. - # - # param masked_encoder The torch_Normal object returned when calling the masked encoder. - masked_encoder_regularization = function(masked_encoder) { - # Extract the number of observations. Same as batch_size. - n_observations <- masked_encoder$mean$shape[1] - - # Extract the number of dimension in the latent space. 
- n_latent_dimensions <- masked_encoder$mean$shape[2] - - # Extract means and ensure correct shape (batch_size x latent_dim). - mu <- masked_encoder$mean$view(c(n_observations, n_latent_dimensions)) - - # Extract the sigmas and ensure correct shape (batch_size x latent_dim). - sigma <- masked_encoder$scale$view(c(n_observations, n_latent_dimensions)) - - # Note that sum(-1) indicates that we sum together the columns. - # mu_regularizer is then a tensor of length n_observations - mu_regularizer <- -(mu^2)$sum(-1) / (2 * self$sigma_mu^2) - - # sigma_regularizer is then also a tensor of length n_observations. - sigma_regularizer <- (sigma$log() - sigma)$sum(-1) * self$sigma_sigma - - # Add the regularization terms together and return them. - return(mu_regularizer + sigma_regularizer) - }, - - # Compute the Variational Lower Bound for the Observations in the Batch - # - # description Compute differentiable lower bound for the given batch of objects and mask. - # Used as the (negative) loss function for training the vaeac model. - # - # param batch Tensor of dimension batch_size x n_features containing a batch of observations. - # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. - batch_vlb = function(batch, mask) { - # Compute the latent normal distributions obtained from the full and masked encoder - encoders_list <- self$make_latent_distributions(batch = batch, mask = mask) - - # Extract the masked and full encoders. These are torch_Normal objects. - masked_encoder <- encoders_list$masked_encoder - full_encoder <- encoders_list$full_encoder - - # Apply the regularization on the mus and sigmas of the normal dist obtained from the masked encoder - # such that they don't blow up. Regularized according to their normal gamma prior, see Olsen et al. (2022). - masked_encoder_regularization <- self$masked_encoder_regularization(masked_encoder) - - # To use the reparameterization trick to train vaeac, we need to use 'rsample' - # and not 'sample', which allows backpropagation through the mean and standard deviation layers, - # see https://pytorch.org/docs/stable/distributions.html#pathwise-derivative. - # For each training instance in the batch we sample values for each of the latent variables, - # i.e., we get a tensor of dimension batch_size x latent_dim. - latent <- full_encoder$rsample() - - # Send the latent samples through the decoder and get the batch_size x 2*n_features (in cont case) - # where we for each row have a normal dist on each feature The form will be (mu_1, sigma_1, ..., mu_p, sigma_p) - reconstruction_params <- self$decoder_network(latent) - - # Compute the reconstruction loss, i.e., the log likelihood of only the masked values in - # the batch (true values) given the current reconstruction parameters from the decoder. - # We do not consider the log likelihood of observed or missing/nan values. - reconstruction_loss <- self$reconstruction_log_prob(batch, reconstruction_params, mask) - - # Compute the KL divergence between the two latent normal distributions obtained from the full encoder - # and masked encoder. Since the networks create MVN with diagonal covariance matrices, that is, the same as - # a product of individual Gaussian distributions, we can compute KL analytically very easily: - # KL(p, q) = \int p(x) log(p(x)/q(x)) dx - # = 0.5 * { (sigma_p/sigma_q)^2 + (mu_q - mu_p)^2/sigma_q^2 - 1 + 2 ln (sigma_q/sigma_p)} - # when both p and q are torch_Normal objects. 
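# Illustrative aside (plain base R, not from the shapr sources): the closed-form
# KL expression quoted above, written as a hypothetical helper `kl_normal_normal()`
# for univariate normals, together with a Monte Carlo sanity check. The form below
# is algebraically identical to the bracketed 0.5 * {...} expression in the comment.
kl_normal_normal <- function(mu_p, sigma_p, mu_q, sigma_q) {
  log(sigma_q / sigma_p) + (sigma_p^2 + (mu_p - mu_q)^2) / (2 * sigma_q^2) - 0.5
}
set.seed(1)
x <- rnorm(1e5, mean = 0.3, sd = 1.2) # Samples from p = N(0.3, 1.2^2)
mean(dnorm(x, 0.3, 1.2, log = TRUE) - dnorm(x, -0.5, 0.8, log = TRUE)) # E_p[log p - log q]
kl_normal_normal(0.3, 1.2, -0.5, 0.8) # Analytic value; the two numbers should roughly agree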
- kl <- vaeac_kl_normal_normal(full_encoder, masked_encoder)$view(c(batch$shape[1], -1))$sum(-1) - - # Return the variational lower bound with the prior regularization. See Section 3.3.1 in Olsen et al. (2022) - return(reconstruction_loss - kl + masked_encoder_regularization) - }, - - # Compute the Importance Sampling Estimator for the Observations in the Batch - # - # description Compute IWAE log likelihood estimate with K samples per object. - # - # details Technically, it is differentiable, but it is recommended to use it for - # evaluation purposes inside torch.no_grad in order to save memory. With torch::with_no_grad - # the method almost doesn't require extra memory for very large K. The method makes K independent - # passes through decoder network, so the batch size is the same as for training with batch_vlb. - # IWAE is an abbreviation for Importance Sampling Estimator - # log p_{theta, psi}(x|y) approx - # log {1/K * sum_{i=1}^K [p_theta(x|z_i, y) * p_psi(z_i|y) / q_phi(z_i|x,y)]} = - # log {sum_{i=1}^K exp(log[p_theta(x|z_i, y) * p_psi(z_i|y) / q_phi(z_i|x,y)])} - log(K) = - # log {sum_{i=1}^K exp(log[p_theta(x|z_i, y)] + log[p_psi(z_i|y)] - log[q_phi(z_i|x,y)])} - log(K) = - # logsumexp(log[p_theta(x|z_i, y)] + log[p_psi(z_i|y)] - log[q_phi(z_i|x,y)]) - log(K) = - # logsumexp(rec_loss + prior_log_prob - proposal_log_prob) - log(K), - # where z_i ~ q_phi(z|x,y). - # - # param batch Tensor of dimension batch_size x n_features containing a batch of observations. - # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. - # param K Integer. The number of samples generated to compute the IWAE for each observation in `batch`. - batch_iwae = function(batch, mask, K) { - # Compute the latent normal distributions obtained from the full and masked encoder - encoders_list <- self$make_latent_distributions(batch = batch, mask = mask) - - # Extract the masked and full encoders. These are torch_Normal objects. - masked_encoder <- encoders_list$masked_encoder - full_encoder <- encoders_list$full_encoder - - # List to store the estimates. - estimates <- list() - - # Iterate over the number of samples/passes through the decoder for each validation observation. - for (i in seq(K)) { - # See equation 18 on page 18 in Ivanov et al. (2019). Create samples from the - # full encoder; z_i ~ q_phi(z|x,y). We get a tensor of dimension batch_size x latent_dim. + # Column bind the batch and the mask to create the observed information sent to the masked encoder. + observed_info <- torch::torch_cat(c(observed, mask), dim = -1) + + # Compute the latent normal dist parameters (mu, sigma) for the masked + # encoder by sending the observed values and the mask to the masked encoder. + masked_encoder_params <- self$masked_encoder_network(observed_info) + + # Create the latent normal distributions based on the parameters (mu, sigma) from the masked encoder + masked_encoder <- vaeac_normal_parse_params(params = masked_encoder_params, min_sigma = 1e-3) + + # Return the full and masked encoders + return(list( + full_encoder = full_encoder, + masked_encoder = masked_encoder + )) + }, + + # Compute the Regularizes for the Latent Distribution Inferred by the Masked Encoder. + # + # description The masked encoder (prior) distribution regularization in the latent space. + # This is used to compute the extended variational lower bound used to train vaeac, see + # Section 3.3.1 in Olsen et al. (2022). 
+ # Though regularizing prevents the masked encoder distribution parameters from going to infinity, + # the model usually doesn't diverge even without this regularization. It almost doesn't affect + # learning process near zero with default regularization parameters which are recommended to be used. + # + # param masked_encoder The torch_Normal object returned when calling the masked encoder. + masked_encoder_regularization = function(masked_encoder) { + # Extract the number of observations. Same as batch_size. + n_observations <- masked_encoder$mean$shape[1] + + # Extract the number of dimension in the latent space. + n_latent_dimensions <- masked_encoder$mean$shape[2] + + # Extract means and ensure correct shape (batch_size x latent_dim). + mu <- masked_encoder$mean$view(c(n_observations, n_latent_dimensions)) + + # Extract the sigmas and ensure correct shape (batch_size x latent_dim). + sigma <- masked_encoder$scale$view(c(n_observations, n_latent_dimensions)) + + # Note that sum(-1) indicates that we sum together the columns. + # mu_regularizer is then a tensor of length n_observations + mu_regularizer <- -(mu^2)$sum(-1) / (2 * self$sigma_mu^2) + + # sigma_regularizer is then also a tensor of length n_observations. + sigma_regularizer <- (sigma$log() - sigma)$sum(-1) * self$sigma_sigma + + # Add the regularization terms together and return them. + return(mu_regularizer + sigma_regularizer) + }, + + # Compute the Variational Lower Bound for the Observations in the Batch + # + # description Compute differentiable lower bound for the given batch of objects and mask. + # Used as the (negative) loss function for training the vaeac model. + # + # param batch Tensor of dimension batch_size x n_features containing a batch of observations. + # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. + batch_vlb = function(batch, mask) { + # Compute the latent normal distributions obtained from the full and masked encoder + encoders_list <- self$make_latent_distributions(batch = batch, mask = mask) + + # Extract the masked and full encoders. These are torch_Normal objects. + masked_encoder <- encoders_list$masked_encoder + full_encoder <- encoders_list$full_encoder + + # Apply the regularization on the mus and sigmas of the normal dist obtained from the masked encoder + # such that they don't blow up. Regularized according to their normal gamma prior, see Olsen et al. (2022). + masked_encoder_regularization <- self$masked_encoder_regularization(masked_encoder) + + # To use the reparameterization trick to train vaeac, we need to use 'rsample' + # and not 'sample', which allows backpropagation through the mean and standard deviation layers, + # see https://pytorch.org/docs/stable/distributions.html#pathwise-derivative. + # For each training instance in the batch we sample values for each of the latent variables, + # i.e., we get a tensor of dimension batch_size x latent_dim. latent <- full_encoder$rsample() # Send the latent samples through the decoder and get the batch_size x 2*n_features (in cont case) @@ -694,82 +652,158 @@ vaeac <- torch::nn_module( # We do not consider the log likelihood of observed or missing/nan values. reconstruction_loss <- self$reconstruction_log_prob(batch, reconstruction_params, mask) - # Compute the log likelihood of observing the sampled latent representations from - # the full_encoder when using the normal distribution estimated by the masked_encoder. 
- masked_encoder_log_prob <- masked_encoder$log_prob(latent) - - # Ensure dimensions batch$shape[1] x something. - masked_encoder_log_prob <- masked_encoder_log_prob$view(c(batch$shape[1], -1)) - - # Sum over the rows (last dimension), i.e., add the log-likelihood for each instance. - masked_encoder_log_prob <- masked_encoder_log_prob$sum(-1) - - # Same explanations here as above, but now for the full_encoder. - full_encoder_log_prob <- full_encoder$log_prob(latent) - full_encoder_log_prob <- full_encoder_log_prob$view(c(batch$shape[1], -1)) - full_encoder_log_prob <- full_encoder_log_prob$sum(-1) - - # Combine the estimated loss based on the formula from equation 18 on page 18 in Ivanov et al. (2019). - # Consists of batch.shape[0] number of values - estimate <- reconstruction_loss + masked_encoder_log_prob - full_encoder_log_prob - - # Make sure that the results are a column vector of height batch_size. - estimate <- estimate$unsqueeze(-1) + # Compute the KL divergence between the two latent normal distributions obtained from the full encoder + # and masked encoder. Since the networks create MVN with diagonal covariance matrices, that is, the same as + # a product of individual Gaussian distributions, we can compute KL analytically very easily: + # KL(p, q) = \int p(x) log(p(x)/q(x)) dx + # = 0.5 * { (sigma_p/sigma_q)^2 + (mu_q - mu_p)^2/sigma_q^2 - 1 + 2 ln (sigma_q/sigma_p)} + # when both p and q are torch_Normal objects. + kl <- vaeac_kl_normal_normal(full_encoder, masked_encoder)$view(c(batch$shape[1], -1))$sum(-1) + + # Return the variational lower bound with the prior regularization. See Section 3.3.1 in Olsen et al. (2022) + return(reconstruction_loss - kl + masked_encoder_regularization) + }, + + # Compute the Importance Sampling Estimator for the Observations in the Batch + # + # description Compute IWAE log likelihood estimate with K samples per object. + # + # details Technically, it is differentiable, but it is recommended to use it for + # evaluation purposes inside torch.no_grad in order to save memory. With torch::with_no_grad + # the method almost doesn't require extra memory for very large K. The method makes K independent + # passes through decoder network, so the batch size is the same as for training with batch_vlb. + # IWAE is an abbreviation for Importance Sampling Estimator + # log p_{theta, psi}(x|y) approx + # log {1/K * sum_{i=1}^K [p_theta(x|z_i, y) * p_psi(z_i|y) / q_phi(z_i|x,y)]} = + # log {sum_{i=1}^K exp(log[p_theta(x|z_i, y) * p_psi(z_i|y) / q_phi(z_i|x,y)])} - log(K) = + # log {sum_{i=1}^K exp(log[p_theta(x|z_i, y)] + log[p_psi(z_i|y)] - log[q_phi(z_i|x,y)])} - log(K) = + # logsumexp(log[p_theta(x|z_i, y)] + log[p_psi(z_i|y)] - log[q_phi(z_i|x,y)]) - log(K) = + # logsumexp(rec_loss + prior_log_prob - proposal_log_prob) - log(K), + # where z_i ~ q_phi(z|x,y). + # + # param batch Tensor of dimension batch_size x n_features containing a batch of observations. + # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. + # param K Integer. The number of samples generated to compute the IWAE for each observation in `batch`. + batch_iwae = function(batch, mask, K) { + # Compute the latent normal distributions obtained from the full and masked encoder + encoders_list <- self$make_latent_distributions(batch = batch, mask = mask) + + # Extract the masked and full encoders. These are torch_Normal objects. 
+ masked_encoder <- encoders_list$masked_encoder + full_encoder <- encoders_list$full_encoder + + # List to store the estimates. + estimates <- list() + + # Iterate over the number of samples/passes through the decoder for each validation observation. + for (i in seq(K)) { + # See equation 18 on page 18 in Ivanov et al. (2019). Create samples from the + # full encoder; z_i ~ q_phi(z|x,y). We get a tensor of dimension batch_size x latent_dim. + latent <- full_encoder$rsample() + + # Send the latent samples through the decoder and get the batch_size x 2*n_features (in cont case) + # where we for each row have a normal dist on each feature The form will be (mu_1, sigma_1, ..., mu_p, sigma_p) + reconstruction_params <- self$decoder_network(latent) + + # Compute the reconstruction loss, i.e., the log likelihood of only the masked values in + # the batch (true values) given the current reconstruction parameters from the decoder. + # We do not consider the log likelihood of observed or missing/nan values. + reconstruction_loss <- self$reconstruction_log_prob(batch, reconstruction_params, mask) + + # Compute the log likelihood of observing the sampled latent representations from + # the full_encoder when using the normal distribution estimated by the masked_encoder. + masked_encoder_log_prob <- masked_encoder$log_prob(latent) + + # Ensure dimensions batch$shape[1] x something. + masked_encoder_log_prob <- masked_encoder_log_prob$view(c(batch$shape[1], -1)) + + # Sum over the rows (last dimension), i.e., add the log-likelihood for each instance. + masked_encoder_log_prob <- masked_encoder_log_prob$sum(-1) + + # Same explanations here as above, but now for the full_encoder. + full_encoder_log_prob <- full_encoder$log_prob(latent) + full_encoder_log_prob <- full_encoder_log_prob$view(c(batch$shape[1], -1)) + full_encoder_log_prob <- full_encoder_log_prob$sum(-1) + + # Combine the estimated loss based on the formula from equation 18 on page 18 in Ivanov et al. (2019). + # Consists of batch.shape[0] number of values + estimate <- reconstruction_loss + masked_encoder_log_prob - full_encoder_log_prob + + # Make sure that the results are a column vector of height batch_size. + estimate <- estimate$unsqueeze(-1) + + # Add the results to the estimates list + estimates <- append(estimates, estimate) + } - # Add the results to the estimates list - estimates <- append(estimates, estimate) - } + # Convert from list of tensors to a single tensor using colum bind + estimates <- torch::torch_cat(estimates, -1) + + # Use the stabilizing trick logsumexp. + # We have worked on log-scale above, hence plus and minus and not multiplication and division, + # while Eq. 18 in Ivanov et al. (2019) work on regular scale with multiplication and division. + # We take the exp of the values to get back to original scale, then sum it and convert back to + # log scale. Note that we add -log(K) instead of dividing each term by K. + # Take the log sum exp along the rows (validation samples) then subtract log(K). + return(torch::torch_logsumexp(estimates, -1) - log(K)) + }, + + # Generate the Parameters of the Generative Distributions + # + # description Generate the parameters of the generative distributions for samples from the batch. + # + # details The function makes K latent representation for each object from the batch, send these + # latent representations through the decoder to obtain the parameters for the generative distributions. 
+ # I.e., means and variances for the normal distributions (continuous features) and probabilities + # for the categorical distribution (categorical features). + # The second axis is used to index samples for an object, i.e. if the batch shape is [n x D1 x D2], then + # the result shape is [n x K x D1 x D2]. It is better to use it inside torch::with_no_grad in order to save + # memory. With torch::with_no_grad the method doesn't require extra memory except the memory for the result. + # + # param batch Tensor of dimension batch_size x n_features containing a batch of observations. + # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. + # param K Integer. The number of imputations to be done for each observation in batch. + generate_samples_params = function(batch, mask, K = 1) { + # Compute the latent normal distributions obtained from only the masked encoder. + encoders_list <- self$make_latent_distributions(batch = batch, mask = mask, only_masked_encoder = TRUE) + + # Only extract the masked encoder (torch_Normal object) as we are in the deployment phase. + masked_encoder <- encoders_list$masked_encoder + + # Create a list to keep the sampled parameters. + samples_params <- list() + + # Iterate over the number of imputations for each observation in the batch. + for (i in seq(K)) { + latent <- masked_encoder$rsample() # Generate latent representations by using the masked encoder + sample_params <- self$decoder_network(latent) # Send the latent representations through the decoder + samples_params <- append(samples_params, sample_params$unsqueeze(2)) # Store the inferred Gaussian distributions + } - # Convert from list of tensors to a single tensor using colum bind - estimates <- torch::torch_cat(estimates, -1) - - # Use the stabilizing trick logsumexp. - # We have worked on log-scale above, hence plus and minus and not multiplication and division, - # while Eq. 18 in Ivanov et al. (2019) work on regular scale with multiplication and division. - # We take the exp of the values to get back to original scale, then sum it and convert back to - # log scale. Note that we add -log(K) instead of dividing each term by K. - # Take the log sum exp along the rows (validation samples) then subtract log(K). - return(torch::torch_logsumexp(estimates, -1) - log(K)) - }, - - # Generate the Parameters of the Generative Distributions - # - # description Generate the parameters of the generative distributions for samples from the batch. - # - # details The function makes K latent representation for each object from the batch, send these - # latent representations through the decoder to obtain the parameters for the generative distributions. - # I.e., means and variances for the normal distributions (continuous features) and probabilities - # for the categorical distribution (categorical features). - # The second axis is used to index samples for an object, i.e. if the batch shape is [n x D1 x D2], then - # the result shape is [n x K x D1 x D2]. It is better to use it inside torch::with_no_grad in order to save - # memory. With torch::with_no_grad the method doesn't require extra memory except the memory for the result. - # - # param batch Tensor of dimension batch_size x n_features containing a batch of observations. - # param mask Tensor of zeros and ones indicating which entries in batch to mask. Same dimension as `batch`. - # param K Integer. The number of imputations to be done for each observation in batch. 
- generate_samples_params = function(batch, mask, K = 1) { - # Compute the latent normal distributions obtained from only the masked encoder. - encoders_list <- self$make_latent_distributions(batch = batch, mask = mask, only_masked_encoder = TRUE) - - # Only extract the masked encoder (torch_Normal object) as we are in the deployment phase. - masked_encoder <- encoders_list$masked_encoder - - # Create a list to keep the sampled parameters. - samples_params <- list() - - # Iterate over the number of imputations for each observation in the batch. - for (i in seq(K)) { - latent <- masked_encoder$rsample() # Generate latent representations by using the masked encoder - sample_params <- self$decoder_network(latent) # Send the latent representations through the decoder - samples_params <- append(samples_params, sample_params$unsqueeze(2)) # Store the inferred Gaussian distributions + # Concatenate the list to a 3d-tensor. 2nd dimensions is the imputations. + return(torch::torch_cat(samples_params, 2)) } + ) + return(vaeac_tmp( + one_hot_max_sizes = one_hot_max_sizes, + width = width, + depth = depth, + latent_dim = latent_dim, + activation_function = activation_function, + skip_conn_layer = skip_conn_layer, + skip_conn_masked_enc_dec = skip_conn_masked_enc_dec, + batch_normalization = batch_normalization, + paired_sampling = paired_sampling, + mask_generator_name = mask_generator_name, + masking_ratio = masking_ratio, + mask_gen_coalitions = mask_gen_coalitions, + mask_gen_coalitions_prob = mask_gen_coalitions_prob, + sigma_mu = sigma_mu, + sigma_sigma = sigma_sigma + )) +} - # Concatenate the list to a 3d-tensor. 2nd dimensions is the imputations. - return(torch::torch_cat(samples_params, 2)) - } -) - -# Dataset Utility Functions =========================================================================================== +# Dataset Utility Functions ============================================================================================ #' Compute Featurewise Means and Standard Deviations #' #' @description Returns the means and standard deviations for all continuous features in the data set. @@ -998,7 +1032,7 @@ vaeac_postprocess_data <- function(data, vaeac_model_state_list) { return(data) } -## vaeac_dataset ------------------------------------------------------------------------------------------------------ + #' Dataset used by the `vaeac` model #' #' @description @@ -1047,25 +1081,28 @@ vaeac_postprocess_data <- function(data, vaeac_model_state_list) { #' vaeac_iterator$.next() # batch2 #' vaeac_iterator$.next() # Empty #' } -vaeac_dataset <- torch::dataset( - name = "vaeac_dataset", # field name The name of the `torch::dataset`. - - # description Create a new vaeac_dataset object. - # param X A torch_tensor containing the data - # param one_hot_max_sizes A torch tensor of dimension p containing the one hot sizes of the p features. - # The sizes for the continuous features can either be '0' or '1'. 
-  initialize = function(X, one_hot_max_sizes) {
-    # Save the number of observations and features in X, the one hot dummy feature sizes and the dataset
-    self$N <- nrow(X)
-    self$p <- ncol(X)
-    self$one_hot_max_sizes <- one_hot_max_sizes
-    self$X <- X
-  },
-  .getbatch = function(index) self$X[index, , drop = FALSE], # Get a batch of data based on the provided indices
-  .length = function() nrow(self$X) # Get the number of observations in the dataset
-)
-
-## Paired Sampler ----------------------------------------------------------------------------------------------------
+vaeac_dataset <- function(X, one_hot_max_sizes) {
+  vaeac_dataset_tmp <- torch::dataset(
+    name = "vaeac_dataset", # field name The name of the `torch::dataset`.
+
+    # description Create a new vaeac_dataset object.
+    # param X A torch_tensor containing the data
+    # param one_hot_max_sizes A torch tensor of dimension p containing the one hot sizes of the p features.
+    # The sizes for the continuous features can either be '0' or '1'.
+    initialize = function(X, one_hot_max_sizes) {
+      # Save the number of observations and features in X, the one hot dummy feature sizes and the dataset
+      self$N <- nrow(X)
+      self$p <- ncol(X)
+      self$one_hot_max_sizes <- one_hot_max_sizes
+      self$X <- X
+    },
+    .getbatch = function(index) self$X[index, , drop = FALSE], # Get a batch of data based on the provided indices
+    .length = function() nrow(self$X) # Get the number of observations in the dataset
+  )
+  return(vaeac_dataset_tmp(X = X, one_hot_max_sizes = one_hot_max_sizes))
+}
+
+
 #' Sampling Paired Observations
 #'
 #' @description
@@ -1109,38 +1146,41 @@ vaeac_dataset <- torch::dataset(
 #' }
 #' @author Lars Henry Berge Olsen
 #' @keywords internal
-paired_sampler <- torch::sampler(
-  classname = "paired_sampler", # field Name of the paired sampler object
-  # description Initialize the paired_sampler object
-  initialize = function(vaeac_dataset_object, shuffle = FALSE) {
-    self$vaeac_dataset_object <- vaeac_dataset_object
-    self$shuffle <- shuffle
-  },
-  # description Get the number of observations in the datasaet
-  .length = function() length(self$vaeac_dataset_object) * 2, # Multiply by two do to get the actual number
-  # description Function to iterate over the data
-  .iter = function() {
-    n <- length(self$vaeac_dataset_object) # Get the number of observations in the data
-    indices <- if (self$shuffle) sample.int(n) else seq_len(n) # Check if randomly shuffle indices or increasing order
-    return(coro::as_iterator(rep(indices, each = 2))) # Duplicate each index and return an iterator
-  }
-)
+paired_sampler <- function(vaeac_dataset_object, shuffle = FALSE) {
+  paired_sampler_tmp <- torch::sampler(
+    classname = "paired_sampler", # field Name of the paired sampler object
+    # description Initialize the paired_sampler object
+    initialize = function(vaeac_dataset_object, shuffle = FALSE) {
+      self$vaeac_dataset_object <- vaeac_dataset_object
+      self$shuffle <- shuffle
+    },
+    # description Get the number of observations in the dataset
+    .length = function() length(self$vaeac_dataset_object) * 2, # Multiply by two to get the actual number
+    # description Function to iterate over the data
+    .iter = function() {
+      n <- length(self$vaeac_dataset_object) # Get the number of observations in the data
+      indices <- if (self$shuffle) sample.int(n) else seq_len(n) # Check if randomly shuffle indices or increasing order
+      return(coro::as_iterator(rep(indices, each = 2))) # Duplicate each index and return an iterator
+    }
+  )
+  
return(paired_sampler_tmp(vaeac_dataset_object = vaeac_dataset_object, shuffle = shuffle)) +} -# Neural Network Utility Functions ==================================================================================== -## memory_layer ------------------------------------------------------------------------------------------------------- +# Neural Network Utility Functions ===================================================================================== #' A [torch::nn_module()] Representing a Memory Layer #' -#' @description The layer is used to make skip-connections inside a [torch::nn_sequential] network -#' or between several [torch::nn_sequential] networks without unnecessary code complication. +#' @description The layer is used to make skip-connections inside a [torch::nn_sequential()] network +#' or between several [torch::nn_sequential()] networks without unnecessary code complication. #' -#' @details If `output = FALSE`, this layer stores its input in a static list `storage` with the key `id`` and then +#' @details If `output = FALSE`, this layer stores its input in the `shared_env` with the key `id` and then #' passes the input to the next layer. I.e., when memory layer is used in the masked encoder. If `output = TRUE`, this #' layer takes stored tensor from the storage. I.e., when memory layer is used in the decoder. If `add = TRUE`, it #' returns sum of the stored vector and an `input`, otherwise it returns their concatenation. If the tensor with #' specified `id` is not in storage when the layer with `output = TRUE` is called, it would cause an exception. #' #' @param id A unique id to use as a key in the storage list. +#' @param shared_env A shared environment for all instances of memory_layer where the inputs are stored. #' @param output Boolean variable indicating if the memory layer is to store input in storage or extract from storage. #' @param add Boolean variable indicating if the extracted value are to be added or concatenated to the input. #' Only applicable when `output = TRUE`. @@ -1151,78 +1191,81 @@ paired_sampler <- torch::sampler( #' #' @examples #' \dontrun{ +#' memory_layer_env <- new.env() #' net1 <- torch::nn_sequential( -#' memory_layer("#1"), -#' memory_layer("#0.1"), +#' memory_layer("#1", shared_env = memory_layer_env), +#' memory_layer("#0.1", shared_env = memory_layer_env), #' torch::nn_linear(512, 256), -#' torch::nn_leaky_relu(), -#' # here add cannot be TRUE because the dimensions mismatch -#' memory_layer("#0.1", output = TRUE, add = FALSE), +#' torch::nn_leaky_relu(), # Here add cannot be TRUE because the dimensions mismatch +#' memory_layer("#0.1", shared_env = memory_layer_env, output = TRUE, add = FALSE), #' torch::nn_linear(768, 256), #' # the dimension after the concatenation with skip-connection is 512 + 256 = 768 #' ) #' net2 <- torch::nn_equential( #' torch::nn_linear(512, 512), -#' memory_layer("#1", output = TRUE, add = TRUE), +#' memory_layer("#1", shared_env = memory_layer_env, output = TRUE, add = TRUE), #' ... #' ) +#' # Here a and c must be of correct dimensions, e.g., a = torch::torch_ones(1,512). #' b <- net1(a) #' d <- net2(c) # net2 must be called after net1, otherwise tensor '#1' will not be in storage. #' } -memory_layer <- torch::nn_module( - classname = "memory_layer", # field classname Name of the of torch::nn_module object. - - # field shared_env A shared environment for all instances of memory_layers. - shared_env = new.env(), - - # description Create a new `memory_layer` object. 
-  # param id A unique id to use as a key in the storage list.
-  # param output Boolean variable indicating if the memory layer is to store input in storage or extract from storage.
-  # param add Boolean variable indicating if the extracted value are to be added or concatenated to the input.
-  # Only applicable when `output = TRUE`.
-  # param verbose Boolean variable indicating if we want to give printouts to the user.
-  initialize = function(id, output = FALSE, add = FALSE, verbose = FALSE) {
-    self$id <- id
-    self$output <- output
-    self$add <- add
-    self$verbose <- verbose
-  },
-  forward = function(input) {
-    # Check if we are going to insert input into the storage or extract data from the storage.
-    if (!self$output) {
-      if (self$verbose) message(paste0("Inserting data to memory layer `self$id = ", self$id, "`."))
-
-      # Insert the input into the storage list which is in the shared environment of the memory_layer class.
-      # Note that we do not check if self$id is unique.
-      self$shared_env$storage[[self$id]] <- input
-      return(input) # Return/send the input to the next layer in the network.
-    } else {
-      # We are to extract data from the storage list.
-      if (self$verbose) {
-        message(paste0(
-          "Extracting data to memory layer `self$id = ", self$id, "`. Using concatination = ", !self$add, "."
-        ))
-      }
+memory_layer <- function(id, shared_env, output = FALSE, add = FALSE, verbose = FALSE) {
+  memory_layer_tmp <- torch::nn_module(
+    classname = "memory_layer", # field classname Name of the torch::nn_module object.
+
+    # description Create a new `memory_layer` object.
+    # param id A unique id to use as a key in the storage list.
+    # param shared_env A shared environment for all instances of memory_layer where the inputs are stored.
+    # param output Boolean variable indicating if the memory layer is to store input in storage or extract from storage.
+    # param add Boolean variable indicating if the extracted value is to be added or concatenated to the input.
+    # Only applicable when `output = TRUE`.
+    # param verbose Boolean variable indicating if we want to give printouts to the user.
+    initialize = function(id, shared_env, output = FALSE, add = FALSE, verbose = FALSE) {
+      self$id <- id
+      self$shared_env <- shared_env
+      self$output <- output
+      self$add <- add
+      self$verbose <- verbose
+    },
+    forward = function(input) {
+      # Check if we are going to insert input into the storage or extract data from the storage.
+      if (!self$output) {
+        if (self$verbose) message(paste0("Inserting data to memory layer `self$id = ", self$id, "`."))
+
+        # Insert the input into the storage list which is in the shared environment of the memory_layer class.
+        # Note that we do not check if self$id is unique.
+        self$shared_env[[self$id]] <- input
+        return(input) # Return/send the input to the next layer in the network.
+      } else {
+        # We are to extract data from the storage list.
+        if (self$verbose) {
+          message(paste0(
+            "Extracting data to memory layer `self$id = ", self$id, "`. Using concatenation = ", !self$add, "."
+          ))
+        }

-      # Check that the memory layer has data is stored in it. If not, then thorw error. 
+ if (!self$id %in% names(self$shared_env)) { + stop(paste0( + "ValueError: Looking for memory layer `self$id = ", self$id, "`, but the only available ", + "memory layers are: ", paste(names(self$shared_env), collapse = "`, `"), "`." + )) + } - # Extract the stored data for the given memory layer and check if we are to concatenate or add the input - stored <- self$shared_env$storage[[self$id]] - data <- if (self$add) input + stored else torch::torch_cat(c(input, stored), -1) + # Extract the stored data for the given memory layer and check if we are to concatenate or add the input + stored <- self$shared_env[[self$id]] + data <- if (self$add) input + stored else torch::torch_cat(c(input, stored), -1) - # Return the data - return(data) + # Return the data + return(data) + } } - } -) + ) + + return(memory_layer_tmp(id = id, shared_env = shared_env, output = output, add = add, verbose = verbose)) +} -## skip_connection ----------------------------------------------------------------------------------------------------- #' A [torch::nn_module()] Representing a skip connection #' #' @description Skip connection over the sequence of layers in the constructor. The module passes @@ -1233,15 +1276,20 @@ memory_layer <- torch::nn_module( #' #' @author Lars Henry Berge Olsen #' @keywords internal -skip_connection <- torch::nn_module( - classname = "skip_connection", # field classname Name of the of torch::nn_module object - # description Initialize a new skip_connection module - initialize = function(...) self$inner_net <- torch::nn_sequential(...), - # description What to do when a skip_connection module is called - forward = function(input) { - return(input + self$inner_net(input)) - } -) +skip_connection <- function(...) { + skip_connection_tmp <- torch::nn_module( + classname = "skip_connection", # field classname Name of the of torch::nn_module object + # description Initialize a new skip_connection module + initialize = function(...) self$inner_net <- torch::nn_sequential(...), + # description What to do when a skip_connection module is called + forward = function(input) { + return(input + self$inner_net(input)) + } + ) + + return(skip_connection_tmp(... = ...)) +} + # Training Utility Functions ========================================================================================== #' Extends Incomplete Batches by Sampling Extra Data from Dataloader @@ -1424,8 +1472,8 @@ vaeac_kl_normal_normal <- function(p, q) { return(0.5 * (var_ratio + t1 - 1 - var_ratio$log())) } + # Neural Network Modules =============================================================================================== -## gauss_cat_sampler_most_likely ------------------------------------------------------------------------------- #' A [torch::nn_module()] Representing a `gauss_cat_sampler_most_likely` #' #' @description The `gauss_cat_sampler_most_likely` generates the most likely samples from the generative distribution @@ -1439,50 +1487,56 @@ vaeac_kl_normal_normal <- function(p, q) { #' #' @keywords internal #' @author Lars Henry Berge Olsen -gauss_cat_sampler_most_likely <- torch::nn_module( - classname = "gauss_cat_sampler_most_likely", # field classname Type of torch::nn_module - - # description Initialize a gauss_cat_sampler_most_likely which generates the most likely - # sample from the generative distribution defined by the output of the neural network. 
- initialize = function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { - self$one_hot_max_sizes <- one_hot_max_sizes - self$min_sigma <- min_sigma - self$min_prob <- min_prob - }, - - # param dist_params A matrix of form batch_size x (mu_1, sigma_1, ..., mu_p, sigma_p), when only considering - # continuous features. For categorical features, we do NOT have mu and sigma for the decoder at the end of the vaeac, - # but rather logits for the categorical distribution. - # return A tensor containing the generated data. - forward = function(distr_params) { - cur_distr_col <- 1 # A counter to keep track of which column to extract from - sample <- list() # List to store the samples sampled from the normal distribution with parameters from distr_params - - # Iterate over the features - for (i in seq_along(self$one_hot_max_sizes)) { - size <- self$one_hot_max_sizes[i] # Get the number of one hot dummy features to see if feature is cont or cat - - if (size <= 1) { - # Continuous feature which are modeled using the Gaussian distribution - params <- distr_params[, cur_distr_col:(cur_distr_col + 1), drop = FALSE] # Extract mean and sd, batch_size x 2 - cur_distr_col <- cur_distr_col + 2 # Update the pointer index by two (mean and sd) - distr <- vaeac_normal_parse_params(params, self$min_sigma) # Create a Gaussian distribution based on params - col_sample <- distr$mean # We sample the mean (most likely value) - } else { - # Categorical feature which are modeled using the categorical distribution - # Extract the logits for each of the K-classes for the ith feature. The dimension is batch_size x size. - params <- distr_params[, cur_distr_col:(cur_distr_col + size - 1)] - cur_distr_col <- cur_distr_col + size # Update the pointer index by the number of categorical levels - distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params - col_sample <- torch::torch_argmax(distr$probs, -1)[, NULL]$to(dtype = torch::torch_float()) # Most likely class +gauss_cat_sampler_most_likely <- function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { + gauss_cat_sampler_most_lik_tmp <- torch::nn_module( + classname = "gauss_cat_sampler_most_likely", # field classname Type of torch::nn_module + + # description Initialize a gauss_cat_sampler_most_likely which generates the most likely + # sample from the generative distribution defined by the output of the neural network. + initialize = function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { + self$one_hot_max_sizes <- one_hot_max_sizes + self$min_sigma <- min_sigma + self$min_prob <- min_prob + }, + + # param dist_params A matrix of form batch_size x (mu_1, sigma_1, ..., mu_p, sigma_p), when only considering + # continuous features. For categorical features, we do NOT have mu and sigma for the decoder at the end of the + # vaeac, but rather logits for the categorical distribution. + # return A tensor containing the generated data. + forward = function(distr_params) { + cur_distr_col <- 1 # A counter to keep track of which column to extract from + sample <- list() # List to store the samples sampled from the normal distr. 
with parameters from distr_params + + # Iterate over the features + for (i in seq_along(self$one_hot_max_sizes)) { + size <- self$one_hot_max_sizes[i] # Get the number of one hot dummy features to see if feature is cont or cat + + if (size <= 1) { + # Continuous feature which are modeled using the Gaussian distribution + params <- distr_params[, cur_distr_col:(cur_distr_col + 1), drop = FALSE] # Extract mean & sd, batch_size x 2 + cur_distr_col <- cur_distr_col + 2 # Update the pointer index by two (mean and sd) + distr <- vaeac_normal_parse_params(params, self$min_sigma) # Create a Gaussian distribution based on params + col_sample <- distr$mean # We sample the mean (most likely value) + } else { + # Categorical feature which are modeled using the categorical distribution + # Extract the logits for each of the K-classes for the ith feature. The dimension is batch_size x size. + params <- distr_params[, cur_distr_col:(cur_distr_col + size - 1)] + cur_distr_col <- cur_distr_col + size # Update the pointer index by the number of categorical levels + distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params + col_sample <- torch::torch_argmax(distr$probs, -1)[, NULL]$to(dtype = torch::torch_float()) # Most lik class + } + sample <- append(sample, col_sample) # Add the vector of sampled values for the i´th feature to the sample list } - sample <- append(sample, col_sample) # Add the vector of sampled values for the i´th feature to the sample list + return(torch::torch_cat(sample, -1)) # Create a 2D torch by column binding the vectors in the list } - return(torch::torch_cat(sample, -1)) # Create a 2D torch by column binding the vectors in the list - } -) + ) + + return( + gauss_cat_sampler_most_lik_tmp(one_hot_max_sizes = one_hot_max_sizes, min_sigma = min_sigma, min_prob = min_prob) + ) +} + -## gauss_cat_sampler_random ----------------------------------------------------------------------------------- #' A [torch::nn_module()] Representing a gauss_cat_sampler_random #' #' @description The `gauss_cat_sampler_random` generates random samples from the generative distribution defined by the @@ -1494,52 +1548,56 @@ gauss_cat_sampler_most_likely <- torch::nn_module( #' #' @author Lars Henry Berge Olsen #' @keywords internal -gauss_cat_sampler_random <- torch::nn_module( - classname = "gauss_cat_sampler_random", # field classname Type of torch::nn_module - - # description Initialize a gauss_cat_sampler_random which generates a sample from the - # generative distribution defined by the output of the neural network by random sampling. - # return A new `gauss_cat_sampler_random` object. - initialize = function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { - self$one_hot_max_sizes <- one_hot_max_sizes - self$min_sigma <- min_sigma - self$min_prob <- min_prob - }, - - # param dist_params A matrix of form batch_size x (mu_1, sigma_1, ..., mu_p, sigma_p), when only considering - # continuous features. For categorical features, we do NOT have mu and sigma for the decoder at the end of the vaeac, - # but rather logits for the categorical distribution. - # return A tensor containing the generated data. 
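# A minimal sketch (not package code) of how the gauss_cat_* modules walk over the decoder output columns
# described above: (mu, sigma) pairs for continuous features and one logit per level for categorical
# features. One continuous feature followed by one categorical feature with three levels is assumed.
one_hot_max_sizes <- c(1, 3)
cur_distr_col <- 1
for (size in one_hot_max_sizes) {
  if (size <= 1) {
    cols <- cur_distr_col:(cur_distr_col + 1) # Two columns: the Gaussian mean and standard deviation
    cur_distr_col <- cur_distr_col + 2
  } else {
    cols <- cur_distr_col:(cur_distr_col + size - 1) # One column of logits per category
    cur_distr_col <- cur_distr_col + size
  }
  print(cols)
}
# [1] 1 2    <- Gaussian mean and sd of feature 1
# [1] 3 4 5  <- categorical logits of feature 2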
- forward = function(distr_params) { - cur_distr_col <- 1 # A counter to keep track of which column to extract from - sample <- list() # List to store the samples sampled from the normal distribution with parameters from distr_params - - # Iterate over the features - for (i in seq_along(self$one_hot_max_sizes)) { - size <- self$one_hot_max_sizes[i] # Get the number of one hot dummy features to see if feature is cont or cat - - if (size <= 1) { - # Continuous feature which are modeled using the Gaussian distribution - params <- distr_params[, cur_distr_col:(cur_distr_col + 1), drop = FALSE] # Extract mean and sd, batch_size x 2 - cur_distr_col <- cur_distr_col + 2 # Update the pointer index by two (mean and sd) - distr <- vaeac_normal_parse_params(params, self$min_sigma) # Create a Gaussian distribution based on params - col_sample <- distr$sample() # Sample from the inferred Gaussian distributions - } else { - # Categorical feature which are modeled using the categorical distribution - # Extract the logits for each of the K-classes for the ith feature. The dimension is batch_size x size. - params <- distr_params[, cur_distr_col:(cur_distr_col + size - 1)] - cur_distr_col <- cur_distr_col + size # Update the pointer index by the number of categorical levels - distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params - col_sample <- distr$sample()$unsqueeze(-1)$to(dtype = torch::torch_float()) # Sample class based on class prob +gauss_cat_sampler_random <- function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { + gauss_cat_sampler_random_tmp <- torch::nn_module( + classname = "gauss_cat_sampler_random", # field classname Type of torch::nn_module + + # description Initialize a gauss_cat_sampler_random which generates a sample from the + # generative distribution defined by the output of the neural network by random sampling. + # return A new `gauss_cat_sampler_random` object. + initialize = function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { + self$one_hot_max_sizes <- one_hot_max_sizes + self$min_sigma <- min_sigma + self$min_prob <- min_prob + }, + + # param dist_params A matrix of form batch_size x (mu_1, sigma_1, ..., mu_p, sigma_p), when only considering + # continuous features. For categorical features, we do NOT have mu and sigma for the decoder at the end of the + # vaeac, but rather logits for the categorical distribution. + # return A tensor containing the generated data. + forward = function(distr_params) { + cur_distr_col <- 1 # A counter to keep track of which column to extract from + sample <- list() # List to store the samples sampled from the normal distr with parameters from distr_params + + # Iterate over the features + for (i in seq_along(self$one_hot_max_sizes)) { + size <- self$one_hot_max_sizes[i] # Get the number of one hot dummy features to see if feature is cont or cat + + if (size <= 1) { + # Continuous feature which are modeled using the Gaussian distribution + params <- distr_params[, cur_distr_col:(cur_distr_col + 1), drop = FALSE] # Extract mean & sd, batch_size x 2 + cur_distr_col <- cur_distr_col + 2 # Update the pointer index by two (mean and sd) + distr <- vaeac_normal_parse_params(params, self$min_sigma) # Create a Gaussian distribution based on params + col_sample <- distr$sample() # Sample from the inferred Gaussian distributions + } else { + # Categorical feature which are modeled using the categorical distribution + # Extract the logits for each of the K-classes for the ith feature. 
The dimension is batch_size x size. + params <- distr_params[, cur_distr_col:(cur_distr_col + size - 1)] + cur_distr_col <- cur_distr_col + size # Update the pointer index by the number of categorical levels + distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params + col_sample <- distr$sample()$unsqueeze(-1)$to(dtype = torch::torch_float()) # Sample class using class prob + } + sample <- append(sample, col_sample) # Add the vector of sampled values for the i´th feature to the sample list } - sample <- append(sample, col_sample) # Add the vector of sampled values for the i´th feature to the sample list + return(torch::torch_cat(sample, -1)) # Create a 2D torch by column binding the vectors in the list } - return(torch::torch_cat(sample, -1)) # Create a 2D torch by column binding the vectors in the list - } -) + ) + return( + gauss_cat_sampler_random_tmp(one_hot_max_sizes = one_hot_max_sizes, min_sigma = min_sigma, min_prob = min_prob) + ) +} -## gauss_cat_parameters -------------------------------------------------------------------------------------- #' A [torch::nn_module()] Representing a `gauss_cat_parameters` #' #' @description The `gauss_cat_parameters` module extracts the parameters from the inferred generative Gaussian and @@ -1558,54 +1616,55 @@ gauss_cat_sampler_random <- torch::nn_module( #' #' @author Lars Henry Berge Olsen #' @keywords internal -gauss_cat_parameters <- torch::nn_module( - # field classname Type of torch::nn_module - classname = "gauss_cat_parameters", - - # description Initialize a `gauss_cat_parameters` which extract the parameters from the generative distribution - # defined by the output of the neural network. - # return A new `gauss_cat_parameters` object. - initialize = function(one_hot_max_sizes, - min_sigma = 1e-4, - min_prob = 1e-4) { - self$one_hot_max_sizes <- one_hot_max_sizes - self$min_sigma <- min_sigma - self$min_prob <- min_prob - }, - - # param dist_params A matrix of form batch_size x (mu_1, sigma_1, ..., mu_p, sigma_p), when only - # considering continuous features. For categorical features, we do NOT have mu and sigma for the - # decoder at the end of the vaeac, but rather logits for the categorical distribution. - # return A tensor containing the final parameters of the generative distributions (after transformations). - forward = function(distr_params) { - cur_distr_col <- 1 # A counter to keep track of which column to extract from - parameters <- list() # List to store the generative parameters from the normal and categorical distributions - - # Iterate over the features - for (i in seq_along(self$one_hot_max_sizes)) { - size <- self$one_hot_max_sizes[i] # Get the number of one hot dummy features to see if feature is cont or cat - - if (size <= 1) { - # Continuous feature which are modeled using the Gaussian distribution. - params <- distr_params[, cur_distr_col:(cur_distr_col + 1), drop = FALSE] # Extract mean and sd, batch_size x 2 - cur_distr_col <- cur_distr_col + 2 # Update the pointer index by two (mean and sd) - distr <- vaeac_normal_parse_params(params, self$min_sigma) # Create a Gaussian distribution based on params - current_parameters <- torch::torch_cat(c(distr$mean, distr$scale), -1) # Combine the current parameters - } else { - # Categorical feature which are modeled using the categorical distribution - # Extract the logits for each of the K-classes for the ith feature. The dimension is batch_size x size. 
- params <- distr_params[, cur_distr_col:(cur_distr_col + size - 1)] - cur_distr_col <- cur_distr_col + size # Update the pointer index by the number of categorical levels - distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params - current_parameters <- distr$probs # Extract the current probabilities for each classs +gauss_cat_parameters <- function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { + gauss_cat_parameters_tmp <- torch::nn_module( + # field classname Type of torch::nn_module + classname = "gauss_cat_parameters", + + # description Initialize a `gauss_cat_parameters` which extract the parameters from the generative distribution + # defined by the output of the neural network. + # return A new `gauss_cat_parameters` object. + initialize = function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { + self$one_hot_max_sizes <- one_hot_max_sizes + self$min_sigma <- min_sigma + self$min_prob <- min_prob + }, + + # param dist_params A matrix of form batch_size x (mu_1, sigma_1, ..., mu_p, sigma_p), when only + # considering continuous features. For categorical features, we do NOT have mu and sigma for the + # decoder at the end of the vaeac, but rather logits for the categorical distribution. + # return A tensor containing the final parameters of the generative distributions (after transformations). + forward = function(distr_params) { + cur_distr_col <- 1 # A counter to keep track of which column to extract from + parameters <- list() # List to store the generative parameters from the normal and categorical distributions + + # Iterate over the features + for (i in seq_along(self$one_hot_max_sizes)) { + size <- self$one_hot_max_sizes[i] # Get the number of one hot dummy features to see if feature is cont or cat + + if (size <= 1) { + # Continuous feature which are modeled using the Gaussian distribution. + params <- distr_params[, cur_distr_col:(cur_distr_col + 1), drop = FALSE] # Extract mean & sd, batch_size x 2 + cur_distr_col <- cur_distr_col + 2 # Update the pointer index by two (mean and sd) + distr <- vaeac_normal_parse_params(params, self$min_sigma) # Create a Gaussian distribution based on params + current_parameters <- torch::torch_cat(c(distr$mean, distr$scale), -1) # Combine the current parameters + } else { + # Categorical feature which are modeled using the categorical distribution + # Extract the logits for each of the K-classes for the ith feature. The dimension is batch_size x size. 
+ params <- distr_params[, cur_distr_col:(cur_distr_col + size - 1)] + cur_distr_col <- cur_distr_col + size # Update the pointer index by the number of categorical levels + distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params + current_parameters <- distr$probs # Extract the current probabilities for each classs + } + parameters <- append(parameters, current_parameters) # Add the i´th feature's parameters to the parameters list } - parameters <- append(parameters, current_parameters) # Add parameters for the i´th feature to the parameters list + return(torch::torch_cat(parameters, -1)) # Create a 2D torch_tensor by column binding the tensors in the list } - return(torch::torch_cat(parameters, -1)) # Create a 2D torch_tensor by column binding the tensors in the list - } -) + ) + return(gauss_cat_parameters_tmp(one_hot_max_sizes = one_hot_max_sizes, min_sigma = min_sigma, min_prob = min_prob)) +} + -## gauss_cat_loss -------------------------------------------------------------------------------------------- #' A [torch::nn_module()] Representing a `gauss_cat_loss` #' #' @description The `gauss_cat_loss module` layer computes the log probability of the `groundtruth` for each object @@ -1620,67 +1679,69 @@ gauss_cat_parameters <- torch::nn_module( #' #' @author Lars Henry Berge Olsen #' @keywords internal -gauss_cat_loss <- torch::nn_module( - classname = "gauss_cat_loss", # field classname Type of torch::nn_module - - # description Initialize a `gauss_cat_loss`. - # return A new `gauss_cat_loss` object. - initialize = function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { - self$one_hot_max_sizes <- one_hot_max_sizes - self$min_sigma <- min_sigma - self$min_prob <- min_prob - }, - forward = function(groundtruth, distr_params, mask) { - cur_distr_col <- 1 # A counter to keep track of which column to extract from - log_prob <- list() # List to store the log probabilities - - # Iterate over the features - for (i in seq_along(self$one_hot_max_sizes)) { - size <- self$one_hot_max_sizes[i] # Get the number of one hot dummy features to see if feature is cont or cat - groundtruth_col <- groundtruth[, i, drop = FALSE] # Get at the ith column of the truth - mask_col <- mask[, i, drop = FALSE] # Get the ith column of the mask - gt_col_nansafe <- groundtruth_col$clone()$detach() # Copy the ground truth column, can now alter this object - nan_mask <- torch::torch_isnan(groundtruth_col) # Check if truth contains any missing values - gt_col_nansafe[nan_mask] <- 0 # Set any missing values to 0 - - # Mask_col masks both the nan/missing values and the artificially masked values. We want to compute the log prob - # only over the artificially missing features, so we omit the true missing values. We remove the masking of the - # missing values. So those ones in mask_col which are there due to missing values are now turned in to zeros. 
- mask_col <- mask_col * (torch::torch_logical_not(nan_mask))$to(dtype = torch::torch_float()) - - if (size <= 1) { - # Continuous feature which are modeled using the Gaussian distribution - params <- distr_params[, cur_distr_col:(cur_distr_col + 1), drop = FALSE] # Extract mean and sd, batch_size x 2 - cur_distr_col <- cur_distr_col + 2 # Update the pointer index by two (mean and sd) - distr <- vaeac_normal_parse_params(params, self$min_sigma) # Create a Gaussian distribution based on params - - # Get the log-likelihood, but only of the masked values i.e., the ones hat are masked by the masking scheme - # MCARGenerator. This one is batch_size x 1 and is the log-lik of observing the ground truth given the current - # parameters, for only the artificially masked features. - col_log_prob <- distr$log_prob(gt_col_nansafe) * mask_col - } else { - # Categorical feature which are modeled using the categorical distribution - # Extract the probabilities for each of the K-classes for the ith feature. The dimension is batch_size x size. - params <- distr_params[, cur_distr_col:(cur_distr_col + size - 1), drop = FALSE] - cur_distr_col <- cur_distr_col + size # Update the pointer index by the number of categorical levels - distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params - col_log_prob <- distr$log_prob(gt_col_nansafe$squeeze())[, NULL] * mask_col # Get the log-likelihood +gauss_cat_loss <- function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { + gauss_cat_loss_tmp <- torch::nn_module( + classname = "gauss_cat_loss", # field classname Type of torch::nn_module + + # description Initialize a `gauss_cat_loss`. + # return A new `gauss_cat_loss` object. + initialize = function(one_hot_max_sizes, min_sigma = 1e-4, min_prob = 1e-4) { + self$one_hot_max_sizes <- one_hot_max_sizes + self$min_sigma <- min_sigma + self$min_prob <- min_prob + }, + forward = function(groundtruth, distr_params, mask) { + cur_distr_col <- 1 # A counter to keep track of which column to extract from + log_prob <- list() # List to store the log probabilities + + # Iterate over the features + for (i in seq_along(self$one_hot_max_sizes)) { + size <- self$one_hot_max_sizes[i] # Get the number of one hot dummy features to see if feature is cont or cat + groundtruth_col <- groundtruth[, i, drop = FALSE] # Get at the ith column of the truth + mask_col <- mask[, i, drop = FALSE] # Get the ith column of the mask + gt_col_nansafe <- groundtruth_col$clone()$detach() # Copy the ground truth column, can now alter this object + nan_mask <- torch::torch_isnan(groundtruth_col) # Check if truth contains any missing values + gt_col_nansafe[nan_mask] <- 0 # Set any missing values to 0 + + # Mask_col masks both the nan/missing values and the artificially masked values. We want to compute the log prob + # only over the artificially missing features, so we omit the true missing values. We remove the masking of the + # missing values. So those ones in mask_col which are there due to missing values are now turned in to zeros. 
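# A minimal sketch (not package code) of the masking step described in the comment above: the mask marks
# both truly missing values and artificially hidden ones, and multiplying by the negated NaN mask keeps
# only the artificially hidden entries in the loss. The toy column below is an arbitrary assumption.
gt_toy <- torch::torch_tensor(matrix(c(1.2, NaN, -0.5, 2.0), ncol = 1))
mask_toy <- torch::torch_tensor(matrix(c(1, 1, 0, 1), ncol = 1)) # Row 2 is masked only because it is NaN
nan_toy <- torch::torch_isnan(gt_toy)
mask_toy <- mask_toy * (torch::torch_logical_not(nan_toy))$to(dtype = torch::torch_float())
print(mask_toy) # 1, 0, 0, 1: the NaN row no longer contributes to the log-likelihood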
+ mask_col <- mask_col * (torch::torch_logical_not(nan_mask))$to(dtype = torch::torch_float()) + + if (size <= 1) { + # Continuous feature which are modeled using the Gaussian distribution + params <- distr_params[, cur_distr_col:(cur_distr_col + 1), drop = FALSE] # Extract mean & sd, batch_size x 2 + cur_distr_col <- cur_distr_col + 2 # Update the pointer index by two (mean and sd) + distr <- vaeac_normal_parse_params(params, self$min_sigma) # Create a Gaussian distribution based on params + + # Get the log-likelihood, but only of the masked values i.e., the ones hat are masked by the masking scheme + # MCARGenerator. This one is batch_size x 1 and is the log-lik of observing the ground truth given the current + # parameters, for only the artificially masked features. + col_log_prob <- distr$log_prob(gt_col_nansafe) * mask_col + } else { + # Categorical feature which are modeled using the categorical distribution + # Extract the probabilities for each of the K-classes for the ith feature. The dimension is batch_size x size. + params <- distr_params[, cur_distr_col:(cur_distr_col + size - 1), drop = FALSE] + cur_distr_col <- cur_distr_col + size # Update the pointer index by the number of categorical levels + distr <- vaeac_categorical_parse_params(params, self$min_prob) # Create a categorical distr based on params + col_log_prob <- distr$log_prob(gt_col_nansafe$squeeze())[, NULL] * mask_col # Get the log-likelihood + } + + # Append the column of log probabilities for the i-th feature for those instances that are masked into log_prob. + # log_prob is now a list of length n_features, where each element is a tensor batch_size x 1 containing the + # log-lik of the parameters of the masked values. + log_prob <- append(log_prob, col_log_prob) } - # Append the column of log probabilities for the i-th feature for those instances that are masked into log_prob. - # log_prob is now a list of length n_features, where each element is a tensor batch_size x 1 containing the - # log-lik of the parameters of the masked values. - log_prob <- append(log_prob, col_log_prob) + # Concatenate the list into a tensor of dim batch x features. Then sum along the the rows. + # That is, for each observation in the batch to get a tensor of length batch size. + return(torch::torch_cat(log_prob, 2)$sum(-1)) } - - # Concatenate the list into a tensor of dim batch x features. Then sum along the the rows. - # That is, for each observation in the batch to get a tensor of length batch size. - return(torch::torch_cat(log_prob, 2)$sum(-1)) - } -) + ) + return(gauss_cat_loss_tmp(one_hot_max_sizes = one_hot_max_sizes, min_sigma = min_sigma, min_prob = min_prob)) +} -## categorical_to_one_hot_layer ---------------------------------------------------------------------------------------- #' A [torch::nn_module()] Representing a `categorical_to_one_hot_layer` #' #' @description @@ -1698,74 +1759,80 @@ gauss_cat_loss <- torch::nn_module( #' #' @author Lars Henry Berge Olsen #' @keywords internal -categorical_to_one_hot_layer <- torch::nn_module( - classname = "categorical_to_one_hot_layer", # field classname Type of torch::nn_module - - # description Initialize a `categorical_to_one_hot_layer`. - # return A new `categorical_to_one_hot_layer` object. 
- initialize = function(one_hot_max_sizes, add_nans_map_for_columns = NULL) { - # Here one_hot_max_sizes includes zeros at the end of the list: one_hot_max_sizes + [0] * len(one_hot_max_sizes) - # Thus, if features have this many categories [1, 2, 3, 1], then one_hot_max_sizes = [1, 2, 3, 1, 0, 0, 0, 0] - self$one_hot_max_sizes <- one_hot_max_sizes - - # Always an empty column for the Masked Encoder network while it is a list [0, 1, ..., length(one_hot_max_sizes)-1) - # for the Full Encoder network. So for the Full Encoder network we apply the nan masks to each column/feature - self$add_nans_map_for_columns <- add_nans_map_for_columns - }, - forward = function(input) { - # Input is torch::torch_cat(c(batch, mask), -1), so a torch of dimension batch_size x 2*sum(one_hot_max_sizes) - # for continuous data where one_hot_max_sizes only consists of ones. Recall that one_hot_max_sizes are padded with - # zeros at the end in this function. - n <- input$shape[1] # Get the number of instances in the input batch. - out_cols <- NULL # variable to store the out columns, i.e., the input columns / one hot encoding + is nan.mask. - - # Iterate over the features. Note that i goes from 0 to 2*n_features-1. - for (i in seq_along(self$one_hot_max_sizes)) { - # Get the number of categories for each feature. For i in [n_features, 2*n_features-1], size <= 1, even for - # categorical features. - size <- self$one_hot_max_sizes[i] - - # Distinguish between continuous and categorical features - if (size <= 1) { - # Continuous feature. Copy it and replace NaNs with either zeros or the last half of self.one_hot_max_sizes - # Take the ith column of the input (NOTE THAT THIS IS NOT A DEEP COPY, so changing out_col changes input) - out_col <- input[, i:i] - nan_mask <- torch::torch_isnan(out_col) # check if any of the values are nan, i.e., missing - out_col[nan_mask] <- 0 # set all the missing values to 0. (This changes the input too) - } else { - # Categorical feature. Get the categories for each instance for the ith feature and start to count at zero. - # So if we have 2 cat, then this vector will contains zeros and ones. - cat_idx <- input[, i:i] - nan_mask <- torch::torch_isnan(cat_idx) # Check if any of the categories are nan / missing - cat_idx[nan_mask] <- 0 # Set the nan values to 0 - - # Create a matrix, where the jth row is the one-hot encoding of the ith feature of the jth instance. - out_col <- matrix(0, nrow = n, ncol = size) - out_col[cbind(seq(n), as.matrix(cat_idx$cpu()))] <- 1 - out_col <- torch::torch_tensor(out_col, device = input$device) - } +categorical_to_one_hot_layer <- function(one_hot_max_sizes, add_nans_map_for_columns = NULL) { + cat_to_one_hot_layer_tmp <- torch::nn_module( + classname = "categorical_to_one_hot_layer", # field classname Type of torch::nn_module + + # description Initialize a `categorical_to_one_hot_layer`. + # return A new `categorical_to_one_hot_layer` object. + initialize = function(one_hot_max_sizes, add_nans_map_for_columns = NULL) { + # Here one_hot_max_sizes includes zeros at the end of the list: one_hot_max_sizes + [0] * len(one_hot_max_sizes) + # Thus, if features have this many categories [1, 2, 3, 1], then one_hot_max_sizes = [1, 2, 3, 1, 0, 0, 0, 0] + self$one_hot_max_sizes <- one_hot_max_sizes + + # Always an empty column for the Masked Encoder network while it's a list [0, 1, ..., length(one_hot_max_sizes)-1) + # for the Full Encoder network. 
So for the Full Encoder network we apply the nan masks to each column/feature + self$add_nans_map_for_columns <- add_nans_map_for_columns + }, + forward = function(input) { + # Input is torch::torch_cat(c(batch, mask), -1), so a torch of dimension batch_size x 2*sum(one_hot_max_sizes) + # for continuous data where one_hot_max_sizes only consists of ones. Recall that one_hot_max_sizes are padded with + # zeros at the end in this function. + n <- input$shape[1] # Get the number of instances in the input batch. + out_cols <- NULL # variable to store the out columns, i.e., the input columns / one hot encoding + is nan.mask. + + # Iterate over the features. Note that i goes from 0 to 2*n_features-1. + for (i in seq_along(self$one_hot_max_sizes)) { + # Get the number of categories for each feature. For i in [n_features, 2*n_features-1], size <= 1, even for + # categorical features. + size <- self$one_hot_max_sizes[i] + + # Distinguish between continuous and categorical features + if (size <= 1) { + # Continuous feature. Copy it and replace NaNs with either zeros or the last half of self.one_hot_max_sizes + # Take the ith column of the input (NOTE THAT THIS IS NOT A DEEP COPY, so changing out_col changes input) + out_col <- input[, i:i] + nan_mask <- torch::torch_isnan(out_col) # check if any of the values are nan, i.e., missing + out_col[nan_mask] <- 0 # set all the missing values to 0. (This changes the input too) + } else { + # Categorical feature. Get the categories for each instance for the ith feature and start to count at zero. + # So if we have 2 cat, then this vector will contains zeros and ones. + cat_idx <- input[, i:i] + nan_mask <- torch::torch_isnan(cat_idx) # Check if any of the categories are nan / missing + cat_idx[nan_mask] <- 0 # Set the nan values to 0 + + # Create a matrix, where the jth row is the one-hot encoding of the ith feature of the jth instance. + out_col <- matrix(0, nrow = n, ncol = size) + out_col[cbind(seq(n), as.matrix(cat_idx$cpu()))] <- 1 + out_col <- torch::torch_tensor(out_col, device = input$device) + } - # Append this feature column to the result. out_col is n x size = batch_size x n_categories_for_this_feature - out_cols <- torch::torch_cat(c(out_cols, out_col), dim = -1) + # Append this feature column to the result. out_col is n x size = batch_size x n_categories_for_this_feature + out_cols <- torch::torch_cat(c(out_cols, out_col), dim = -1) - # If necessary, append isnan mask of this feature to the result which we always do for the proposal network. - # This only happens for the first half of the i's, so for i = 1, ..., n_features. - if (i %in% self$add_nans_map_for_columns) { - # add the columns of nan_mask - out_cols <- torch::torch_cat(c(out_cols, nan_mask$to(dtype = torch::torch_float())), dim = -1) + # If necessary, append isnan mask of this feature to the result which we always do for the proposal network. + # This only happens for the first half of the i's, so for i = 1, ..., n_features. + if (i %in% self$add_nans_map_for_columns) { + # add the columns of nan_mask + out_cols <- torch::torch_cat(c(out_cols, nan_mask$to(dtype = torch::torch_float())), dim = -1) + } } + + # ONLY FOR CONTINUOUS FEATURES: out_cols now is a list of n_features tensors of shape n x size = n x 1 for + # continuous variables. 
We concatenate them to a matrix of dim n x 2*n_features (in cont case) for prior net, but + # for proposal net, it is n x 3*n_features, and they take the form + # [batch1, is.nan1, batch2, is.nan2, …, batch12, is.nan12, mask1, mask2, …, mask12] + return(out_cols) } + ) + + return( + cat_to_one_hot_layer_tmp(one_hot_max_sizes = one_hot_max_sizes, add_nans_map_for_columns = add_nans_map_for_columns) + ) +} - # ONLY FOR CONTINUOUS FEATURES: out_cols now is a list of n_features tensors of shape n x size = n x 1 for - # continuous variables. We concatenate them to a matrix of dim n x 2*n_features (in cont case) for prior net, but - # for proposal net, it is n x 3*n_features, and they take the form - # [batch1, is.nan1, batch2, is.nan2, …, batch12, is.nan12, mask1, mask2, …, mask12] - return(out_cols) - } -) -# Mask Generators ===================================================================================================== -## mcar_mask_generator ------------------------------------------------------------------------------------------------ +# Mask Generators ====================================================================================================== #' Missing Completely at Random (MCAR) Mask Generator #' #' @description A mask generator which masks the entries in the input completely at random. @@ -1795,84 +1862,86 @@ categorical_to_one_hot_layer <- torch::nn_module( #' #' @author Lars Henry Berge Olsen #' @keywords internal -mcar_mask_generator <- torch::nn_module( - name = "mcar_mask_generator", # field name Type of mask generator - - # description Initialize a missing completely at random mask generator. - # param masking_ratio The probability for an entry in the generated mask to be 1 (masked). - # param paired_sampling Boolean. If we are doing paired sampling. So include both S and \eqn{\bar{S}}. - # If TRUE, then batch must be sampled using `paired_sampler` which creates batches where - # the first half and second half of the rows are duplicates of each other. That is, - # batch = [row1, row1, row2, row2, row3, row3, ...]. - # return A new `mcar_mask_generator` object. - initialize = function(masking_ratio = 0.5, paired_sampling = FALSE) { - self$masking_ratio <- masking_ratio - self$paired_sampling <- paired_sampling - }, - - # description Generates a MCAR mask by calling self$mcar_mask_generator_function function. - # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the - # entries are missing. If any are missing, then the returned mask will ensure that - # these missing entries are masked. - forward = function(batch) { - self$mcar_mask_generator_function(batch, prob = self$masking_ratio, paired_sampling = self$paired_sampling) - }, - - # description Missing Completely At Random Mask Generator: A mask generator where the masking - # is determined by component-wise independent Bernoulli distribution. - # - # details Function that takes in a batch of observations and the probability - # of masking each element based on a component-wise independent Bernoulli - # distribution. Default value is 0.5, so all masks are equally likely to be trained. - # Function returns the mask of same shape as batch. - # Note that the batch can contain missing values, indicated by the "NaN" token. - # The mask will always mask missing values. - # - # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the - # entries are missing. If any are missing, then the returned mask will ensure that - # these missing entries are masked. 
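# A minimal sketch (not package code) of the one-hot encoding step in categorical_to_one_hot_layer()
# further up: row j of out_col gets a 1 in the column of instance j's category. Four instances and a
# categorical feature with three (1-based) levels are assumed.
cat_idx <- c(2, 1, 3, 1)
n <- length(cat_idx)
size <- 3
out_col <- matrix(0, nrow = n, ncol = size)
out_col[cbind(seq(n), cat_idx)] <- 1
out_col
#      [,1] [,2] [,3]
# [1,]    0    1    0
# [2,]    1    0    0
# [3,]    0    0    1
# [4,]    1    0    0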
- # param prob Numeric between 0 and 1. The probability that an entry will be masked. - # param seed Integer. Used to set the seed for the sampling process such that we - # can reproduce the same masks. - # param paired_sampling Boolean. If we are doing paired sampling. So include both S and \eqn{\bar{S}}. - # If TRUE, then batch must be sampled using 'paired_sampler' which creates batches where - # the first half and second half of the rows are duplicates of each other. That is, - # batch = [row1, row1, row2, row2, row3, row3, ...]. - # - # examples - # mcar_mask_generator_function(torch::torch_rand(c(5, 3))) - # - # return A binary matrix of the same size as 'batch'. An entry of '1' indicates that the - # observed feature value will be masked. '0' means that the entry is NOT masked, - # i.e., the feature value will be observed/given/available. - mcar_mask_generator_function = function(batch, prob = 0.5, seed = NULL, paired_sampling = FALSE) { - if (!is.null(seed)) set.seed(seed) # If the user specify a seed for reproducibility - size <- prod(batch$shape) # Get the number of entries in the batch. - nan_mask <- batch$isnan()$to(torch::torch_float()) # Check for missing values in the batch - - # If doing paired sampling, divide size by two as we later concatenate with the inverse mask. - if (paired_sampling) size <- size / 2 - - # # Torch version, but marginally slower than r version when batch_size <= 128 and n_features <= 50 - # mask = torch::torch_bernoulli(torch::torch_full_like(batch, prob)) - # Create the Bernoulli mask where an element is masked (1) with probability 'prob'. - mask <- torch::torch_tensor( - matrix(sample(c(0, 1), size = size, replace = TRUE, prob = c(prob, 1 - prob)), ncol = ncol(batch)), - dtype = torch::torch_float() - ) +mcar_mask_generator <- function(masking_ratio = 0.5, paired_sampling = FALSE) { + mcar_mask_gen_tmp <- torch::nn_module( + name = "mcar_mask_generator", # field name Type of mask generator + + # description Initialize a missing completely at random mask generator. + # param masking_ratio The probability for an entry in the generated mask to be 1 (masked). + # param paired_sampling Boolean. If we are doing paired sampling. So include both S and \eqn{\bar{S}}. + # If TRUE, then batch must be sampled using `paired_sampler` which creates batches where + # the first half and second half of the rows are duplicates of each other. That is, + # batch = [row1, row1, row2, row2, row3, row3, ...]. + # return A new `mcar_mask_generator` object. + initialize = function(masking_ratio = 0.5, paired_sampling = FALSE) { + self$masking_ratio <- masking_ratio + self$paired_sampling <- paired_sampling + }, + + # description Generates a MCAR mask by calling self$mcar_mask_generator_function function. + # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the + # entries are missing. If any are missing, then the returned mask will ensure that + # these missing entries are masked. + forward = function(batch) { + self$mcar_mask_generator_function(batch, prob = self$masking_ratio, paired_sampling = self$paired_sampling) + }, + + # description Missing Completely At Random Mask Generator: A mask generator where the masking + # is determined by component-wise independent Bernoulli distribution. + # + # details Function that takes in a batch of observations and the probability + # of masking each element based on a component-wise independent Bernoulli + # distribution. Default value is 0.5, so all masks are equally likely to be trained. 
+ # Function returns the mask of same shape as batch. + # Note that the batch can contain missing values, indicated by the "NaN" token. + # The mask will always mask missing values. + # + # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the + # entries are missing. If any are missing, then the returned mask will ensure that + # these missing entries are masked. + # param prob Numeric between 0 and 1. The probability that an entry will be masked. + # param seed Integer. Used to set the seed for the sampling process such that we + # can reproduce the same masks. + # param paired_sampling Boolean. If we are doing paired sampling. So include both S and \eqn{\bar{S}}. + # If TRUE, then batch must be sampled using 'paired_sampler' which creates batches where + # the first half and second half of the rows are duplicates of each other. That is, + # batch = [row1, row1, row2, row2, row3, row3, ...]. + # + # examples + # mcar_mask_generator_function(torch::torch_rand(c(5, 3))) + # + # return A binary matrix of the same size as 'batch'. An entry of '1' indicates that the + # observed feature value will be masked. '0' means that the entry is NOT masked, + # i.e., the feature value will be observed/given/available. + mcar_mask_generator_function = function(batch, prob = 0.5, seed = NULL, paired_sampling = FALSE) { + if (!is.null(seed)) set.seed(seed) # If the user specify a seed for reproducibility + size <- prod(batch$shape) # Get the number of entries in the batch. + nan_mask <- batch$isnan()$to(torch::torch_float()) # Check for missing values in the batch + + # If doing paired sampling, divide size by two as we later concatenate with the inverse mask. + if (paired_sampling) size <- size / 2 + + # # Torch version, but marginally slower than r version when batch_size <= 128 and n_features <= 50 + # mask = torch::torch_bernoulli(torch::torch_full_like(batch, prob)) + # Create the Bernoulli mask where an element is masked (1) with probability 'prob'. + mask <- torch::torch_tensor( + matrix(sample(c(0, 1), size = size, replace = TRUE, prob = c(prob, 1 - prob)), ncol = ncol(batch)), + dtype = torch::torch_float() + ) - # If paired sampling, then concatenate the inverse mask and reorder to ensure correct order [m1, !m1, m2, !m2, ...]. - if (paired_sampling) { - mask <- torch::torch_cat(c(mask, !mask), 1L)[c(matrix(seq_len(nrow(batch)), nrow = 2, byrow = TRUE)), ] - } + # If paired sampling: concatenate the inverse mask and reorder to ensure correct order [m1, !m1, m2, !m2, ...]. + if (paired_sampling) { + mask <- torch::torch_cat(c(mask, !mask), 1L)[c(matrix(seq_len(nrow(batch)), nrow = 2, byrow = TRUE)), ] + } - # Mask all entries that are missing or artificially masked by the Bernoulli mask. 1 means that the entry is masked. - return(mask + nan_mask >= 1) - } -) + # Mask all missing or artificially masked entries by the Bernoulli mask. 1 means that the entry is masked. + return(mask + nan_mask >= 1) + } + ) + return(mcar_mask_gen_tmp(masking_ratio = masking_ratio, paired_sampling = paired_sampling)) +} -## specified_prob_mask_generator ------------------------------------------------------------------------------- #' A [torch::nn_module()] Representing a specified_prob_mask_generator #' #' @description A mask generator which masks the entries based on specified probabilities. 
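For context (not part of the patch itself): after this restructuring, `mcar_mask_generator()` is an ordinary R function that builds the `torch::nn_module()` internally and returns an instantiated generator, so `torch` is only needed once a generator is actually created. A minimal usage sketch under stated assumptions (the constructor is internal and accessed via `:::`, `torch` is installed, and the batch dimensions are arbitrary toy values):

```r
library(torch)

# A small toy batch: 4 observations with 3 continuous features.
batch <- torch_randn(4, 3)

# The wrapper instantiates the nn_module and returns a ready-to-use mask generator.
mask_gen <- shapr:::mcar_mask_generator(masking_ratio = 0.5, paired_sampling = FALSE)

# Calling the generator invokes its forward() method and returns a mask of the same
# shape as the batch; an entry of 1 (TRUE) means the value is masked, 0 (FALSE) means
# it is observed/given/available.
mask <- mask_gen(batch)
mask
```

The same wrapper-function pattern is applied to `specified_prob_mask_generator()` and `specified_masks_mask_generator()` in the hunks below.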
@@ -1907,79 +1976,83 @@ mcar_mask_generator <- torch::nn_module( #' } #' #' @keywords internal -specified_prob_mask_generator <- torch::nn_module( - name = "specified_prob_mask_generator", # field name Type of mask generator - - # description Initialize a specified_probability mask generator. - initialize = function(masking_probs, paired_sampling = FALSE) { - self$masking_probs <- masking_probs / sum(masking_probs) - self$paired_sampling <- paired_sampling - }, - - # description Generates a specified probability mask by calling the self$specified_prob_mask_generator_function. - # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the entries are - # missing. If any are missing, then the returned mask will ensure that these missing entries are masked. - forward = function(batch) { - self$specified_prob_mask_generator_function( - batch = batch, - masking_prob = self$masking_probs, - paired_sampling = self$paired_sampling - ) - }, - - # description Specified Probability Mask Generator: A mask generator that first samples the number of entries 'd' to - # be masked in the 'M'-dimensional observation 'x' in the batch based on the given M+1 probabilities. The 'd' maskes - # are uniformly sampled from the 'M' possible feature indices. The d'th entry of the probability of having d-1 masked - # values. - # - # details Note that mcar_mask_generator with p = 0.5 is the same as using specified_prob_mask_generator - # with masking_ratio = choose(M, 0:M), where M is the number of features. This function was initially - # created to check if increasing the probability of having a masks with many masked features improved - # vaeac's performance by focusing more on these situations during training. - # - # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the entries are missing. If any - # are missing, then the returned mask will ensure that these missing entries are masked. - # param masking_probs An M+1 numerics containing the probabilities masking 'd' (0,...M) entries for each observation. - # param seed Integer. Used to set the seed for the sampling process such that we can reproduce the same masks. - # param paired_sampling Boolean. If we are doing paired sampling. So include both S and \bar{S}. - # If TRUE, then batch must be sampled using 'paired_sampler' which creates batches where - # the first half and second half of the rows are duplicates of each other. That is, - # `batch = [row1, row1, row2, row2, row3, row3, ...]`. - # - # examples specified_prob_mask_generator_function(torch::torch_rand(c(5, 4)), masking_probs = c(2,7,5,3,3)) - # - # return A binary matrix of the same size as 'batch'. An entry of '1' indicates that the - # observed feature value will be masked. '0' means that the entry is NOT masked, - # i.e., the feature value will be observed/given/available. - specified_prob_mask_generator_function = function(batch, masking_probs, seed = NULL, paired_sampling = FALSE) { - if (!is.null(seed)) set.seed(seed) # If the user specify a seed for reproducibility - n_features <- ncol(batch) # Get the number of features in the batch - size <- nrow(batch) # Get the number of observations in the batch - nan_mask <- batch$isnan()$to(torch::torch_float()) # Check for missing values in the batch - - # If doing paired sampling, divide size by two as we later concatenate with the inverse mask. - if (paired_sampling) size <- size / 2 - - # Sample the number of masked features in each row. 
- n_masked_each_row <- sample(x = seq(0, n_features), size = size, replace = TRUE, prob = masking_probs) - - # Crate the mask matrix - mask <- torch::torch_zeros_like(batch) - for (i in seq(size)) { - if (n_masked_each_row[i] != 0) mask[i, sample(n_features, size = n_masked_each_row[i], replace = FALSE)] <- 1 - } +specified_prob_mask_generator <- function(masking_probs, paired_sampling = FALSE) { + specified_prob_mask_gen_tmp <- torch::nn_module( + name = "specified_prob_mask_generator", # field name Type of mask generator + + # description Initialize a specified_probability mask generator. + initialize = function(masking_probs, paired_sampling = FALSE) { + self$masking_probs <- masking_probs / sum(masking_probs) + self$paired_sampling <- paired_sampling + }, + + # description Generates a specified probability mask by calling the self$specified_prob_mask_generator_function. + # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the entries are + # missing. If any are missing, then the returned mask will ensure that these missing entries are masked. + forward = function(batch) { + self$specified_prob_mask_generator_function( + batch = batch, + masking_prob = self$masking_probs, + paired_sampling = self$paired_sampling + ) + }, + + # description Specified Probability Mask Generator: A mask generator that first samples the number of entries 'd' to + # be masked in the 'M'-dimensional observation 'x' in the batch based on the given M+1 probabilities. The 'd' masks + # are uniformly sampled from the 'M' possible feature indices. The d'th entry of the probability of having d-1 + # masked values. + # + # details Note that mcar_mask_generator with p = 0.5 is the same as using specified_prob_mask_generator + # with masking_ratio = choose(M, 0:M), where M is the number of features. This function was initially + # created to check if increasing the probability of having a masks with many masked features improved + # vaeac's performance by focusing more on these situations during training. + # + # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the entries are missing. If any + # are missing, then the returned mask will ensure that these missing entries are masked. + # param masking_probs An M+1 numerics containing the probabilities masking 'd' (0,...M) entries for each instance. + # param seed Integer. Used to set the seed for the sampling process such that we can reproduce the same masks. + # param paired_sampling Boolean. If we are doing paired sampling. So include both S and \bar{S}. + # If TRUE, then batch must be sampled using 'paired_sampler' which creates batches where + # the first half and second half of the rows are duplicates of each other. That is, + # `batch = [row1, row1, row2, row2, row3, row3, ...]`. + # + # examples specified_prob_mask_generator_function(torch::torch_rand(c(5, 4)), masking_probs = c(2,7,5,3,3)) + # + # return A binary matrix of the same size as 'batch'. An entry of '1' indicates that the + # observed feature value will be masked. '0' means that the entry is NOT masked, + # i.e., the feature value will be observed/given/available. 
+ specified_prob_mask_generator_function = function(batch, masking_probs, seed = NULL, paired_sampling = FALSE) { + if (!is.null(seed)) set.seed(seed) # If the user specify a seed for reproducibility + n_features <- ncol(batch) # Get the number of features in the batch + size <- nrow(batch) # Get the number of observations in the batch + nan_mask <- batch$isnan()$to(torch::torch_float()) # Check for missing values in the batch + + # If doing paired sampling, divide size by two as we later concatenate with the inverse mask. + if (paired_sampling) size <- size / 2 + + # Sample the number of masked features in each row. + n_masked_each_row <- sample(x = seq(0, n_features), size = size, replace = TRUE, prob = masking_probs) + + # Crate the mask matrix + mask <- torch::torch_zeros_like(batch) + for (i in seq(size)) { + if (n_masked_each_row[i] != 0) mask[i, sample(n_features, size = n_masked_each_row[i], replace = FALSE)] <- 1 + } - # If paired sampling, then concatenate the inverse mask and reorder to ensure correct order [m1, !m1, m2, !m2, ...]. - if (paired_sampling) { - mask <- torch::torch_cat(c(mask, !mask), 1L)[c(matrix(seq_len(nrow(batch)), nrow = 2, byrow = TRUE)), ] + # If paired sampling: concatenate the inverse mask and reorder to ensure correct order [m1, !m1, m2, !m2, ...]. + if (paired_sampling) { + mask <- torch::torch_cat(c(mask, !mask), 1L)[c(matrix(seq_len(nrow(batch)), nrow = 2, byrow = TRUE)), ] + } + + # Mask all missing or artificially masked entries by the Bernoulli mask. 1 means that the entry is masked. + return(mask + nan_mask >= 1) } + ) + + return(specified_prob_mask_gen_tmp(masking_probs = masking_probs, paired_sampling = paired_sampling)) +} - # Mask all entries that are missing or artificially masked by the Bernoulli mask. 1 means that the entry is masked. - return(mask + nan_mask >= 1) - } -) -## specified_masks_mask_generator ------------------------------------------------------------------------------------- #' A [torch::nn_module()] Representing a specified_masks_mask_generator #' #' @description @@ -2010,72 +2083,78 @@ specified_prob_mask_generator <- torch::nn_module( #' #' @author Lars Henry Berge Olsen #' @keywords internal -specified_masks_mask_generator <- torch::nn_module( - name = "specified_masks_mask_generator", # field name Type of mask generator - - # description Initialize a specified masks mask generator. - initialize = function(masks, masks_probs, paired_sampling = FALSE) { - self$masks <- masks - self$masks_probs <- masks_probs / sum(masks_probs) - self$paired_sampling <- paired_sampling - }, - - # description Generates a mask by calling self$specified_masks_mask_generator_function function. - # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the - # entries are missing. If any are missing, then the returned mask will ensure that - # these missing entries are masked. - forward = function(batch) { - self$specified_masks_mask_generator_function( - batch = batch, - masks = self$masks, - masks_probs = self$masks_probs, - paired_sampling = self$paired_sampling - ) - }, - - # description Sampling Masks from the Provided Masks with the Given Probabilities - # - # details Function that takes in a 'batch' of observations and matrix of possible/allowed - # 'masks' which we are going to sample from based on the provided probability in 'masks_probs'. - # Function returns a mask of same shape as batch. Note that the batch can contain missing values, - # indicated by the "NaN" token. 
The mask will always mask missing values. - # - # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the - # entries are missing. If any are missing, then the returned mask will ensure that - # these missing entries are masked. - # param masks Matrix/Tensor of possible/allowed 'masks' which we sample from. - # param masks_probs Array of 'probabilities' for each of the masks specified in 'masks'. - # Note that they do not need to be between 0 and 1. They are scaled, hence, they only need to be positive. - # param seed Integer. Used to set the seed for the sampling process such that we - # can reproduce the same masks. - # param paired_sampling Boolean. If we are doing paired sampling. So include both S and \bar{S}. - # If TRUE, then batch must be sampled using 'paired_sampler' which creates batches where - # the first half and second half of the rows are duplicates of each other. That is, - # batch = [row1, row1, row2, row2, row3, row3, ...]. - # - # return A binary matrix of the same size as 'batch'. An entry of '1' indicates that the - # observed feature value will be masked. '0' means that the entry is NOT masked, - # i.e., the feature value will be observed/given/available. - specified_masks_mask_generator_function = function(batch, masks, masks_probs, seed = NULL, paired_sampling = FALSE) { - if (!is.null(seed)) set.seed(seed) # Set seed if the user specifies a seed for reproducibility. - nan_mask <- batch$isnan()$to(torch::torch_float()) # Check for missing values in the batch - n_masks <- nrow(masks) # Get the number of masks to choose from - size <- nrow(batch) # Get the number of observations in the batch - - # If doing paired sampling, divide size by two as we later concatenate with the inverse mask. - if (paired_sampling) size <- size / 2 - - # Sample 'n_observation' masks from the possible masks by first sampling the row indices - # based on the given mask probabilities and then use these indices to extract the masks. - mask_rows_indices <- sample.int(n = n_masks, size = size, replace = TRUE, prob = masks_probs) - mask <- torch::torch_tensor(masks[mask_rows_indices, ], dtype = torch::torch_float()) - - # If paired sampling, then concatenate the inverse mask and reorder to ensure correct order [m1, !m1, m2, !m2, ...]. - if (paired_sampling) { - mask <- torch::torch_cat(c(mask, !mask), 1L)[c(matrix(seq_len(nrow(batch)), nrow = 2, byrow = TRUE)), ] - } +specified_masks_mask_generator <- function(masks, masks_probs, paired_sampling = FALSE) { + specified_masks_mask_gen_tmp <- torch::nn_module( + name = "specified_masks_mask_generator", # field name Type of mask generator + + # description Initialize a specified masks mask generator. + initialize = function(masks, masks_probs, paired_sampling = FALSE) { + self$masks <- masks + self$masks_probs <- masks_probs / sum(masks_probs) + self$paired_sampling <- paired_sampling + }, + + # description Generates a mask by calling self$specified_masks_mask_generator_function function. + # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the + # entries are missing. If any are missing, then the returned mask will ensure that + # these missing entries are masked. 
+ forward = function(batch) { + self$specified_masks_mask_generator_function( + batch = batch, + masks = self$masks, + masks_probs = self$masks_probs, + paired_sampling = self$paired_sampling + ) + }, + + # description Sampling Masks from the Provided Masks with the Given Probabilities + # + # details Function that takes in a 'batch' of observations and matrix of possible/allowed + # 'masks' which we are going to sample from based on the provided probability in 'masks_probs'. + # Function returns a mask of same shape as batch. Note that the batch can contain missing values, + # indicated by the "NaN" token. The mask will always mask missing values. + # + # param batch Matrix/Tensor. Only used to get the dimensions and to check if any of the + # entries are missing. If any are missing, then the returned mask will ensure that + # these missing entries are masked. + # param masks Matrix/Tensor of possible/allowed 'masks' which we sample from. + # param masks_probs Array of 'probabilities' for each of the masks specified in 'masks'. + # Note that they do not need to be between 0 and 1. They are scaled, hence, they only need to be positive. + # param seed Integer. Used to set the seed for the sampling process such that we + # can reproduce the same masks. + # param paired_sampling Boolean. If we are doing paired sampling. So include both S and \bar{S}. + # If TRUE, then batch must be sampled using 'paired_sampler' which creates batches where + # the first half and second half of the rows are duplicates of each other. That is, + # batch = [row1, row1, row2, row2, row3, row3, ...]. + # + # return A binary matrix of the same size as 'batch'. An entry of '1' indicates that the + # observed feature value will be masked. '0' means that the entry is NOT masked, + # i.e., the feature value will be observed/given/available. + specified_masks_mask_generator_function = + function(batch, masks, masks_probs, seed = NULL, paired_sampling = FALSE) { + if (!is.null(seed)) set.seed(seed) # Set seed if the user specifies a seed for reproducibility. + nan_mask <- batch$isnan()$to(torch::torch_float()) # Check for missing values in the batch + n_masks <- nrow(masks) # Get the number of masks to choose from + size <- nrow(batch) # Get the number of observations in the batch + + # If doing paired sampling, divide size by two as we later concatenate with the inverse mask. + if (paired_sampling) size <- size / 2 + + # Sample 'n_observation' masks from the possible masks by first sampling the row indices + # based on the given mask probabilities and then use these indices to extract the masks. + mask_rows_indices <- sample.int(n = n_masks, size = size, replace = TRUE, prob = masks_probs) + mask <- torch::torch_tensor(masks[mask_rows_indices, ], dtype = torch::torch_float()) + + # If paired sampling: concatenate the inverse mask and reorder to ensure correct order [m1, !m1, m2, !m2, ...]. + if (paired_sampling) { + mask <- torch::torch_cat(c(mask, !mask), 1L)[c(matrix(seq_len(nrow(batch)), nrow = 2, byrow = TRUE)), ] + } - # Mask all entries that are missing or artificially masked by the Bernoulli mask. 1 means that the entry is masked. - return(mask + nan_mask >= 1) - } -) + # Mask all missing or artificially masked entries by the Bernoulli mask. 1 means that the entry is masked. 
+ return(mask + nan_mask >= 1) + } + ) + + # Initate the specified_masks_mask_generator and return it + return(specified_masks_mask_gen_tmp(masks = masks, masks_probs = masks_probs, paired_sampling = paired_sampling)) +} diff --git a/R/zzz.R b/R/zzz.R index cc55b6de6..f540d5c86 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -105,6 +105,7 @@ "x_train", "x_train_preprocessed", "x_train_torch", + "self", "..current_comb", "..regression.response_var" ) diff --git a/man/memory_layer.Rd b/man/memory_layer.Rd index f3b3ce54d..c0a5d1d37 100644 --- a/man/memory_layer.Rd +++ b/man/memory_layer.Rd @@ -4,11 +4,13 @@ \alias{memory_layer} \title{A \code{\link[torch:nn_module]{torch::nn_module()}} Representing a Memory Layer} \usage{ -memory_layer(id, output = FALSE, add = FALSE, verbose = FALSE) +memory_layer(id, shared_env, output = FALSE, add = FALSE, verbose = FALSE) } \arguments{ \item{id}{A unique id to use as a key in the storage list.} +\item{shared_env}{A shared environment for all instances of memory_layer where the inputs are stored.} + \item{output}{Boolean variable indicating if the memory layer is to store input in storage or extract from storage.} \item{add}{Boolean variable indicating if the extracted value are to be added or concatenated to the input. @@ -17,29 +19,34 @@ Only applicable when \code{output = TRUE}.} \item{verbose}{Boolean variable indicating if we want to give printouts to the user.} } \description{ -The layer is used to make skip-connections inside a \link[torch:nn_sequential]{torch::nn_sequential} network -or between several \link[torch:nn_sequential]{torch::nn_sequential} networks without unnecessary code complication. +The layer is used to make skip-connections inside a \code{\link[torch:nn_sequential]{torch::nn_sequential()}} network +or between several \code{\link[torch:nn_sequential]{torch::nn_sequential()}} networks without unnecessary code complication. } \details{ -If \code{output = FALSE}, this layer stores its input in a static list \code{storage} with the key \verb{id`` and then passes the input to the next layer. I.e., when memory layer is used in the masked encoder. If }output = TRUE\verb{, this layer takes stored tensor from the storage. I.e., when memory layer is used in the decoder. If }add = TRUE\verb{, it returns sum of the stored vector and an }input\verb{, otherwise it returns their concatenation. If the tensor with specified }id\verb{is not in storage when the layer with}output = TRUE` is called, it would cause an exception. +If \code{output = FALSE}, this layer stores its input in the \code{shared_env} with the key \code{id} and then +passes the input to the next layer. I.e., when memory layer is used in the masked encoder. If \code{output = TRUE}, this +layer takes stored tensor from the storage. I.e., when memory layer is used in the decoder. If \code{add = TRUE}, it +returns sum of the stored vector and an \code{input}, otherwise it returns their concatenation. If the tensor with +specified \code{id} is not in storage when the layer with \code{output = TRUE} is called, it would cause an exception. 
} \examples{ \dontrun{ +memory_layer_env <- new.env() net1 <- torch::nn_sequential( - memory_layer("#1"), - memory_layer("#0.1"), + memory_layer("#1", shared_env = memory_layer_env), + memory_layer("#0.1", shared_env = memory_layer_env), torch::nn_linear(512, 256), - torch::nn_leaky_relu(), - # here add cannot be TRUE because the dimensions mismatch - memory_layer("#0.1", output = TRUE, add = FALSE), + torch::nn_leaky_relu(), # Here add cannot be TRUE because the dimensions mismatch + memory_layer("#0.1", shared_env = memory_layer_env, output = TRUE, add = FALSE), torch::nn_linear(768, 256), # the dimension after the concatenation with skip-connection is 512 + 256 = 768 ) net2 <- torch::nn_equential( torch::nn_linear(512, 512), - memory_layer("#1", output = TRUE, add = TRUE), + memory_layer("#1", shared_env = memory_layer_env, output = TRUE, add = TRUE), ... ) +# Here a and c must be of correct dimensions, e.g., a = torch::torch_ones(1,512). b <- net1(a) d <- net2(c) # net2 must be called after net1, otherwise tensor '#1' will not be in storage. } diff --git a/man/shapr-package.Rd b/man/shapr-package.Rd index 0cf15f8e8..1041460af 100644 --- a/man/shapr-package.Rd +++ b/man/shapr-package.Rd @@ -30,9 +30,9 @@ Authors: Other contributors: \itemize{ - \item Anders Lland \email{Anders.Loland@nr.no} [contributor] + \item Anders Løland \email{Anders.Loland@nr.no} [contributor] \item Jens Christian Wahl \email{Jens.Christian.Wahl@nr.no} [contributor] - \item Camilla Lingjrde [contributor] + \item Camilla Lingjærde [contributor] \item Norsk Regnesentral [copyright holder, funder] } diff --git a/man/vaeac_get_extra_para_default.Rd b/man/vaeac_get_extra_para_default.Rd index d3a35736a..f2229c3b3 100644 --- a/man/vaeac_get_extra_para_default.Rd +++ b/man/vaeac_get_extra_para_default.Rd @@ -40,8 +40,9 @@ then a name will be generated based on \code{\link[base:Sys.time]{base::Sys.time ensure a valid file name for all operating systems.} \item{vaeac.folder_to_save_model}{String (default is \code{\link[base:tempfile]{base::tempdir()}}). String specifying a path to a folder where -the function is to save the fitted vaeac model. Note that the path will be removed from the returned -\code{\link[=explain]{explain()}} object if \code{vaeac.save_model = FALSE}.} +the function is to save the fitted vaeac model. Note that the path will be removed from the returned +\code{\link[=explain]{explain()}} object if \code{vaeac.save_model = FALSE}. Furthermore, the model cannot be moved from its +original folder if we are to use the \code{\link[=vaeac_train_model_continue]{vaeac_train_model_continue()}} function to continue training the model.} \item{vaeac.pretrained_vaeac_model}{List or String (default is \code{NULL}). 1) Either a list of class \code{vaeac}, i.e., the list stored in \code{explanation$internal$parameters$vaeac} where \code{explanation} is the returned list diff --git a/vignettes/understanding_shapr_vaeac.Rmd b/vignettes/understanding_shapr_vaeac.Rmd index 6eb6088f9..79053b197 100644 --- a/vignettes/understanding_shapr_vaeac.Rmd +++ b/vignettes/understanding_shapr_vaeac.Rmd @@ -456,7 +456,7 @@ rbind( ## Progressr {#progress_bar} -As discussed in the main vignette, the `shapr` package provides two ways for recieving information about the progress +As discussed in the main vignette, the `shapr` package provides two ways for receiving information about the progress of the approach. First, the `shapr` package provides progress updates of the computation of the Shapley values through the `progressr` package. 
Second, the user can also get information by setting `verbose = 2` in `explain()`, which will print out extra information related to the `vaeac` approach. The `verbose` parameter works independently of the @@ -466,7 +466,6 @@ two examples here, and refer the reader to the main vignette for more detailed i By setting `verbose = 2`, we get messages about the progress of the `vaeac` approach. ```r -handlers("void") # To silence all progressr updates expl_with_messages <- explain( model = model, x_explain = x_explain, @@ -528,7 +527,7 @@ approach and only get the progress bars. See the main vignette for examples for ```r library(progressr) -progressr::handlers("cli") +progressr::handlers("cli") # Use `progressr::handlers("void")` to silence all `progressr` updates progressr::with_progress({ expl_with_progressr <- explain( model = model, diff --git a/vignettes/understanding_shapr_vaeac.Rmd.orig b/vignettes/understanding_shapr_vaeac.Rmd.orig index bb1673d90..9d12a64c5 100644 --- a/vignettes/understanding_shapr_vaeac.Rmd.orig +++ b/vignettes/understanding_shapr_vaeac.Rmd.orig @@ -367,7 +367,7 @@ rbind( ## Progressr {#progress_bar} -As discussed in the main vignette, the `shapr` package provides two ways for recieving information about the progress +As discussed in the main vignette, the `shapr` package provides two ways for receiving information about the progress of the approach. First, the `shapr` package provides progress updates of the computation of the Shapley values through the `progressr` package. Second, the user can also get information by setting `verbose = 2` in `explain()`, which will print out extra information related to the `vaeac` approach. The `verbose` parameter works independently of the @@ -376,7 +376,6 @@ two examples here, and refer the reader to the main vignette for more detailed i By setting `verbose = 2`, we get messages about the progress of the `vaeac` approach. ```{r progressr-false-verbose-2, cache = TRUE} -handlers("void") # To silence all progressr updates expl_with_messages <- explain( model = model, x_explain = x_explain, @@ -396,7 +395,7 @@ step for the final `vaeac` model. Note that one can set `verbose = 0` to not get approach and only get the progress bars. See the main vignette for examples for how to change the progress bar. ```{r progressr-true-verbose-2, cache = TRUE} library(progressr) -progressr::handlers("cli") +progressr::handlers("cli") # Use `progressr::handlers("void")` to silence all `progressr` updates progressr::with_progress({ expl_with_progressr <- explain( model = model,