From 96e51c2dde8f685645a78bf91cd98058df2efd64 Mon Sep 17 00:00:00 2001
From: Jouni Helske <jouni.helske@utu.fi>
Date: Mon, 21 Oct 2024 13:15:03 +0300
Subject: [PATCH] remove penalty

---
 R/ame.R                              |  2 +-
 R/bootstrap.R                        | 20 ++++--------
 R/check_build_arguments.R            |  8 ++---
 R/estimate_mnhmm.R                   |  4 +--
 R/estimate_nhmm.R                    |  8 ++---
 R/fit_mnhmm.R                        | 38 ++++++++++++----------
 R/fit_nhmm.R                         | 47 +++++++++++++++-------------
 R/get_probs.R                        |  2 +-
 R/model_matrix.R                     | 12 +++----
 R/utilities.R                        |  3 ++
 man/bootstrap.Rd                     |  5 ---
 man/estimate_mnhmm.Rd                |  4 ---
 man/estimate_nhmm.Rd                 |  6 +---
 tests/testthat/test-simulate_mnhmm.R |  6 ++--
 14 files changed, 74 insertions(+), 91 deletions(-)

diff --git a/R/ame.R b/R/ame.R
index 60cd261f..bb56f9c2 100644
--- a/R/ame.R
+++ b/R/ame.R
@@ -216,7 +216,7 @@ ame.mnhmm <- function(
   newdata[[variable]] <- values[2]
   model2 <- update(model, newdata)
   
-  if (!attr(model$X_omega, "iv")) {
+  if (!attr(model$X_cluster, "iv")) {
     X1 <- model1$X_cluster[, 1L, drop = FALSE]
     X2 <- model2$X_cluster[, 1L, drop = FALSE]
   } else {
diff --git a/R/bootstrap.R b/R/bootstrap.R
index 190def1d..b9ab4341 100644
--- a/R/bootstrap.R
+++ b/R/bootstrap.R
@@ -67,8 +67,6 @@ permute_clusters <- function(model, pcp_mle) {
 #' nonparametric or parametric bootstrap should be used. The former samples 
 #' sequences with replacement, whereas the latter simulates new datasets based 
 #' on the model.
-#' @param penalty penalty term for model estimation. By default, same penalty is used 
-#' as was in model estimation by [estimate_nhmm()] or [estimate_mnhmm()].
 #' @param verbose Should the progress bar be displayed? Default is `FALSE`.
 #' @param ... Additional arguments to [nloptr()].
 #' @return The original model with additional element `model$boot`, which 
@@ -83,16 +81,13 @@ bootstrap_coefs <- function(model, ...) {
 #' @export
 bootstrap_coefs.nhmm <- function(model, B = 1000, 
                                  method = c("nonparametric", "parametric"),
-                                 penalty, verbose = FALSE, ...) {
+                                 verbose = FALSE, ...) {
   method <- match.arg(method)
   stopifnot_(
     checkmate::test_int(x = B, lower = 0L), 
     "Argument {.arg B} must be a single positive integer."
   )
   init <- model$etas
-  if (missing(penalty)) {
-    penalty <- model$estimation_results$penalty
-  }
   gammas_mle <- model$gammas
   gamma_pi <- replicate(B, gammas_mle$pi, simplify = FALSE)
   gamma_A <- replicate(B, gammas_mle$A, simplify = FALSE)
@@ -102,7 +97,7 @@ bootstrap_coefs.nhmm <- function(model, B = 1000,
     for (i in seq_len(B)) {
       mod <- bootstrap_model(model)
       fit <- fit_nhmm(mod, init, init_sd = 0, restarts = 0, threads = 1, 
-                      penalty = penalty, ...)
+                      ...)
       fit$gammas <- permute_states(fit$gammas, gammas_mle)
       gamma_pi[[i]] <- fit$gammas$pi
       gamma_A[[i]] <- fit$gammas$A
@@ -125,7 +120,7 @@ bootstrap_coefs.nhmm <- function(model, B = 1000,
         N, T_, M, S, formula_pi, formula_A, formula_B,
         data = d, time, id, init)$model
       fit <- fit_nhmm(mod, init, init_sd = 0, restarts = 0, threads = 1, 
-                       penalty = penalty, ...)
+                      ...)
       fit$gammas <- permute_states(fit$gammas, gammas_mle)
       gamma_pi[[i]] <- fit$gammas$pi
       gamma_A[[i]] <- fit$gammas$A
@@ -141,16 +136,13 @@ bootstrap_coefs.nhmm <- function(model, B = 1000,
 #' @export
 bootstrap_coefs.mnhmm <- function(model, B = 1000, 
                                   method = c("nonparametric", "parametric"),
-                                  penalty, verbose = FALSE, ...) {
+                                  verbose = FALSE, ...) {
   method <- match.arg(method)
   stopifnot_(
     checkmate::test_int(x = B, lower = 0L), 
     "Argument {.arg B} must be a single positive integer."
   )
   init <- model$etas
-  if (missing(penalty)) {
-    penalty <- model$estimation_results$penalty
-  }
   gammas_mle <- model$gammas
   pcp_mle <- posterior_cluster_probabilities(model)
   gamma_pi <- replicate(B, gammas_mle$pi, simplify = FALSE)
@@ -163,7 +155,7 @@ bootstrap_coefs.mnhmm <- function(model, B = 1000,
     for (i in seq_len(B)) {
       mod <- bootstrap_model(model)
       fit <- fit_mnhmm(mod, init, init_sd = 0, restarts = 0, threads = 1, 
-                       penalty = penalty, ...)
+                       ...)
       fit <- permute_clusters(fit, pcp_mle)
       for (j in seq_len(D)) {
         out <- permute_states(
@@ -197,7 +189,7 @@ bootstrap_coefs.mnhmm <- function(model, B = 1000,
         N, T_, M, S, D, formula_pi, formula_A, formula_B, formula_omega,
         data = d, time, id, init)$model
       fit <- fit_mnhmm(mod, init, init_sd = 0, restarts = 0, threads = 1, 
-                        penalty = penalty, ...)
+                       ...)
       fit <- permute_clusters(fit, pcp_mle)
       for (j in seq_len(D)) {
         out <- permute_states(
diff --git a/R/check_build_arguments.R b/R/check_build_arguments.R
index 793a2f95..6b1bf7fd 100644
--- a/R/check_build_arguments.R
+++ b/R/check_build_arguments.R
@@ -122,14 +122,10 @@
   dim(sequence_lengths) <- length(sequence_lengths)
   if (n_channels > 1L) {
     nobs <- sum(sapply(x, function(x) {
-      sum(!(x == attr(x[[1]], "nr") |
-              x == attr(x[[1]], "void") |
-              is.na(x)))
+      sum(!(x == attr(x, "nr") | x == attr(x, "void") | is.na(x)))
     })) / n_channels
   } else {
-    nobs <- sum(!(x == attr(x, "nr") |
-                    x == attr(x, "void") |
-                    is.na(x)))
+    nobs <- sum(!(x == attr(x, "nr") | x == attr(x, "void") | is.na(x)))
   }
   
   attr(x, "n_channels") <- ifelse(
diff --git a/R/estimate_mnhmm.R b/R/estimate_mnhmm.R
index 3b62b26f..aff93324 100644
--- a/R/estimate_mnhmm.R
+++ b/R/estimate_mnhmm.R
@@ -44,7 +44,7 @@ estimate_mnhmm <- function(
     transition_formula = ~1, emission_formula = ~1, cluster_formula = ~1,
     data = NULL, time = NULL, id = NULL, state_names = NULL, 
     channel_names = NULL, cluster_names = NULL, inits = "random", init_sd = 2, 
-    restarts = 0L, threads = 1L, store_data = TRUE, penalty = 0, ...) {
+    restarts = 0L, threads = 1L, store_data = TRUE, ...) {
   
   call <- match.call()
   model <- build_mnhmm(
@@ -59,7 +59,7 @@ estimate_mnhmm <- function(
   if (store_data) {
     model$data <- data
   }
-  out <- fit_mnhmm(model, inits, init_sd, restarts, threads, penalty, ...)
+  out <- fit_mnhmm(model, inits, init_sd, restarts, threads, ...)
   
   attr(out, "call") <- call
   out
diff --git a/R/estimate_nhmm.R b/R/estimate_nhmm.R
index 6478a43b..d16964c8 100644
--- a/R/estimate_nhmm.R
+++ b/R/estimate_nhmm.R
@@ -5,7 +5,7 @@
 #' (potentially) depend on covariates.
 #' 
 #' By default, the model parameters are estimated using LBFGS algorithm of 
-#' [nloptr::nloptr()]. The log-likelihood or the penalized log-likelihood is 
+#' [nloptr::nloptr()]. The log-likelihood is 
 #' scaled by the number of non-missing observations (`nobs(model)`), and the 
 #' convergence is claimed when either the absolute or relative change of this 
 #' objective function is less than `1e-8`, or the absolute change of the 
@@ -51,8 +51,6 @@
 #' is stored to the model object. For large datasets, this can be set to 
 #' `FALSE`, in which case you might need to pass the data separately to some 
 #' post-prosessing functions.
-#' @param penalty Penalty for penalized maximum likelihood. Default is `0` 
-#' i.e. no penalization as the penalty is of form `0.5 * sum(pars^2) * penalty`.
 #' @param ... Additional arguments to [nloptr::nloptr()]. Most importantly,
 #' argument `maxeval` defines the maximum number of iterations for optimization.
 #' The default is `1000` for restarts and `10000` for the final optimization. 
@@ -81,7 +79,7 @@ estimate_nhmm <- function(
     transition_formula = ~1, emission_formula = ~1, 
     data = NULL, time = NULL, id = NULL, state_names = NULL, channel_names = NULL, 
     inits = "random", init_sd = 2, restarts = 0L, threads = 1L, 
-    store_data = TRUE, penalty = 0, ...) {
+    store_data = TRUE, ...) {
   
   call <- match.call()
   
@@ -97,7 +95,7 @@ estimate_nhmm <- function(
   if (store_data) {
     model$data <- data
   }
-  out <- fit_nhmm(model, inits, init_sd, restarts, threads, penalty, ...)
+  out <- fit_nhmm(model, inits, init_sd, restarts, threads, ...)
   attr(out, "call") <- call
   out
 }
diff --git a/R/fit_mnhmm.R b/R/fit_mnhmm.R
index d980ea96..e93d2203 100644
--- a/R/fit_mnhmm.R
+++ b/R/fit_mnhmm.R
@@ -1,7 +1,7 @@
 #' Estimate a Mixture Non-homogeneous Hidden Markov Model
 #'
 #' @noRd
-fit_mnhmm <- function(model, inits, init_sd, restarts, threads, penalty, 
+fit_mnhmm <- function(model, inits, init_sd, restarts, threads, 
                       save_all_solutions = FALSE, ...) {
   stopifnot_(
     checkmate::test_int(x = threads, lower = 1L), 
@@ -11,14 +11,15 @@ fit_mnhmm <- function(model, inits, init_sd, restarts, threads, penalty,
     checkmate::test_int(x = restarts, lower = 0L), 
     "Argument {.arg restarts} must be a single integer."
   )
-  obs <- create_obsArray(model)
-  if (model$n_channels == 1) {
-    obs <- array(obs, dim(obs)[2:3])
-  }
   M <- model$n_symbols
   S <- model$n_states
   D <- model$n_clusters
   T_ <- model$length_of_sequences
+  C <- model$n_channels
+  obs <- create_obsArray(model)
+  if (C == 1) {
+    obs <- array(obs, dim(obs)[2:3])
+  }
   if (identical(inits, "random")) {
     inits <- list(
       initial_probs = NULL, 
@@ -37,7 +38,7 @@ fit_mnhmm <- function(model, inits, init_sd, restarts, threads, penalty,
   iv_pi <- attr(model$X_initial, "iv")
   iv_A <- attr(model$X_transition, "iv")
   iv_B <- attr(model$X_emission, "iv")
-  iv_omega <- attr(model$X_omega, "iv")
+  iv_omega <- attr(model$X_cluster, "iv")
   tv_A <- attr(model$X_transition, "tv")
   tv_B <- attr(model$X_emission, "tv")
   X_i <- model$X_initial
@@ -68,7 +69,7 @@ fit_mnhmm <- function(model, inits, init_sd, restarts, threads, penalty,
     model$gammas$A <- c(eta_to_gamma_cube_field(
       model$etas$A
     ))
-    if (model$n_channels == 1L) {
+    if (C == 1L) {
       model$etas$B <- create_eta_B_mnhmm(
         pars[n_i + n_s + seq_len(n_o)], S, M, K_o, D
       )
@@ -107,7 +108,7 @@ fit_mnhmm <- function(model, inits, init_sd, restarts, threads, penalty,
   if (is.null(dots$check_derivatives)) 
     dots$check_derivatives <- FALSE
   
-  if (model$n_channels == 1L) {
+  if (C == 1L) {
     Qs <- t(create_Q(S))
     Qm <- t(create_Q(M))
     Qd <- t(create_Q(D))
@@ -126,8 +127,10 @@ fit_mnhmm <- function(model, inits, init_sd, restarts, threads, penalty,
           eta_omega, X_d, obs, iv_pi, iv_A, iv_B, tv_A, tv_B, iv_omega, 
           Ti
         )
-        list(objective = - (out$loglik + 0.5 * sum(pars^2) * penalty) / n_obs,
-             gradient = - (unlist(out[-1]) + pars * penalty) / n_obs)
+        list(
+          objective = - out$loglik / n_obs, 
+          gradient = - unlist(out[-1]) / n_obs
+        )
       }
     } else {
       objectivef <- function(pars) {
@@ -144,7 +147,7 @@ fit_mnhmm <- function(model, inits, init_sd, restarts, threads, penalty,
           eta_omega, X_d, obs
         )
         
-        - (sum(apply(out[, T_, ], 2, logSumExp)) + 0.5 * sum(pars^2) * penalty) / n_obs
+        - sum(apply(out[, T_, ], 2, logSumExp)) / n_obs
       }
     }
   } else {
@@ -172,8 +175,10 @@ fit_mnhmm <- function(model, inits, init_sd, restarts, threads, penalty,
           eta_omega, X_d, obs, M, iv_pi, iv_A, iv_B, tv_A, tv_B, iv_omega,
           Ti
         )
-        list(objective = - (out$loglik + 0.5 * sum(pars^2) * penalty) / n_obs,
-             gradient = - (unlist(out[-1]) + pars * penalty) / n_obs)
+        list(
+          objective = - out$loglik / n_obs,
+          gradient = - unlist(out[-1]) / n_obs
+        )
       }
     } else {
       objectivef <- function(pars) {
@@ -196,8 +201,8 @@ fit_mnhmm <- function(model, inits, init_sd, restarts, threads, penalty,
           eta_B, X_o,
           eta_omega, X_d,
           obs, M)
-        
-        - (sum(apply(out[, T_, ], 2, logSumExp)) + 0.5 * sum(pars^2) * penalty) / n_obs
+
+        - sum(apply(out[, T_, ], 2, logSumExp)) / n_obs
       }
     }
   }
@@ -267,7 +272,7 @@ fit_mnhmm <- function(model, inits, init_sd, restarts, threads, penalty,
   model$gammas$A <- c(eta_to_gamma_cube_field(
     model$etas$A
   ))
-  if (model$n_channels == 1L) {
+  if (C == 1L) {
     model$etas$B <- create_eta_B_mnhmm(
       pars[n_i + n_s + seq_len(n_o)], S, M, K_o, D
     )
@@ -297,7 +302,6 @@ fit_mnhmm <- function(model, inits, init_sd, restarts, threads, penalty,
     logliks_of_restarts = if(restarts > 0L) logliks else NULL, 
     return_codes_of_restarts = if(restarts > 0L) return_codes else NULL,
     all_solutions = all_solutions,
-    penalty = penalty,
     time = end_time - start_time
   )
   model
diff --git a/R/fit_nhmm.R b/R/fit_nhmm.R
index 015210ec..ef3d3057 100644
--- a/R/fit_nhmm.R
+++ b/R/fit_nhmm.R
@@ -1,7 +1,7 @@
 #' Estimate a Non-homogeneous Hidden Markov Model
 #'
 #' @noRd
-fit_nhmm <- function(model, inits, init_sd, restarts, threads, penalty, save_all_solutions = FALSE, ...) {
+fit_nhmm <- function(model, inits, init_sd, restarts, threads, save_all_solutions = FALSE, ...) {
   stopifnot_(
     checkmate::test_int(x = threads, lower = 1L), 
     "Argument {.arg threads} must be a single positive integer."
@@ -10,14 +10,14 @@ fit_nhmm <- function(model, inits, init_sd, restarts, threads, penalty, save_all
     checkmate::test_int(x = restarts, lower = 0L), 
     "Argument {.arg restarts} must be a single integer."
   )
-  obs <- create_obsArray(model)
-  if (model$n_channels == 1) {
-    obs <- array(obs, dim(obs)[2:3])
-  }
   M <- model$n_symbols
   S <- model$n_states
   T_ <- model$length_of_sequences
-  
+  C <- model$n_channels
+  obs <- create_obsArray(model)
+  if (C == 1) {
+    obs <- array(obs, dim(obs)[2:3])
+  }
   if (identical(inits, "random")) {
     inits <- list(
       initial_probs = NULL, 
@@ -45,6 +45,7 @@ fit_nhmm <- function(model, inits, init_sd, restarts, threads, penalty, save_all
   K_o <- nrow(X_o)
   Ti <- model$sequence_lengths
   n_obs <- nobs(model)
+ 
   dots <- list(...)
   if (isTRUE(dots$maxeval < 0)) {
     pars <- unlist(create_initial_values(
@@ -54,7 +55,7 @@ fit_nhmm <- function(model, inits, init_sd, restarts, threads, penalty, save_all
     model$gammas$pi <- eta_to_gamma_mat(model$etas$pi)
     model$etas$A <- create_eta_A_nhmm(pars[n_i + seq_len(n_s)], S, K_s)
     model$gammas$A <- eta_to_gamma_cube(model$etas$A)
-    if (model$n_channels == 1L) {
+    if (C == 1L) {
       model$etas$B <- create_eta_B_nhmm(
         pars[n_i + n_s + seq_len(n_o)], S, M, K_o
       )
@@ -85,7 +86,7 @@ fit_nhmm <- function(model, inits, init_sd, restarts, threads, penalty, save_all
   if (is.null(dots$check_derivatives)) 
     dots$check_derivatives <- FALSE
   
-  if (model$n_channels == 1L) {
+  if (C == 1L) {
     Qs <- t(create_Q(S))
     Qm <- t(create_Q(M))
     if (need_grad) {
@@ -99,8 +100,10 @@ fit_nhmm <- function(model, inits, init_sd, restarts, threads, penalty, save_all
           Qs, Qm, eta_pi, X_i, eta_A, X_s, eta_B, X_o, obs,
           iv_pi, iv_A, iv_B, tv_A, tv_B, Ti
         )
-        list(objective = - (out$loglik + 0.5 * sum(pars^2) * penalty) / n_obs,
-             gradient = - (unlist(out[-1]) + pars * penalty)/ n_obs)
+        list(
+          objective = - out$loglik / n_obs, 
+          gradient = - unlist(out[-1]) / n_obs
+        )
       }
     } else {
       objectivef <- function(pars) {
@@ -112,8 +115,7 @@ fit_nhmm <- function(model, inits, init_sd, restarts, threads, penalty, save_all
         out <- forward_nhmm_singlechannel(
           eta_pi, X_i, eta_A, X_s, eta_B, X_o, obs
         )
-        
-        - (sum(apply(out[, T_, ], 2, logSumExp))  + 0.5 * sum(pars^2) * penalty) / n_obs
+        - sum(apply(out[, T_, ], 2, logSumExp)) / n_obs
       }
     }
   } else {
@@ -130,8 +132,10 @@ fit_nhmm <- function(model, inits, init_sd, restarts, threads, penalty, save_all
           Qs, Qm, eta_pi, X_i, eta_A, X_s, eta_B, X_o, obs, M,
           iv_pi, iv_A, iv_B, tv_A, tv_B, Ti
         )
-        list(objective = - (out$loglik + 0.5 * sum(pars^2) * penalty) / n_obs,
-             gradient = - (unlist(out[-1]) + pars * penalty)/ n_obs)
+        list(
+          objective = - out$loglik / n_obs, 
+          gradient = - unlist(out[-1]) / n_obs
+        )
       }
     } else {
       objectivef <- function(pars) {
@@ -143,7 +147,7 @@ fit_nhmm <- function(model, inits, init_sd, restarts, threads, penalty, save_all
         out <- forward_nhmm_multichannel(
           eta_pi, X_i, eta_A, X_s, eta_B, X_o, obs, M
         )
-        - (sum(apply(out[, T_, ], 2, logSumExp)) + 0.5 * sum(pars^2) * penalty) / n_obs
+        - sum(apply(out[, T_, ], 2, logSumExp)) / n_obs
       }
     }
   }
@@ -193,7 +197,7 @@ fit_nhmm <- function(model, inits, init_sd, restarts, threads, penalty, save_all
       inits, S, M, init_sd, K_i, K_s, K_o
     ))
   }
-
+  
   out <- nloptr(
     x0 = init, eval_f = objectivef,
     opts = dots
@@ -207,15 +211,15 @@ fit_nhmm <- function(model, inits, init_sd, restarts, threads, penalty, save_all
   model$gammas$pi <- eta_to_gamma_mat(model$etas$pi)
   model$etas$A <- create_eta_A_nhmm(pars[n_i + seq_len(n_s)], S, K_s)
   model$gammas$A <- eta_to_gamma_cube(model$etas$A)
-  if (model$n_channels == 1L) {
+  if (C == 1L) {
     model$etas$B <- create_eta_B_nhmm(
-        pars[n_i + n_s + seq_len(n_o)], S, M, K_o
-      )
+      pars[n_i + n_s + seq_len(n_o)], S, M, K_o
+    )
     model$gammas$B <- eta_to_gamma_cube(model$etas$B)
   } else {
     model$etas$B <- create_eta_multichannel_B_nhmm(
-        pars[n_i + n_s + seq_len(n_o)], S, M, K_o
-      )
+      pars[n_i + n_s + seq_len(n_o)], S, M, K_o
+    )
     model$gammas$B <- eta_to_gamma_cube_field(model$etas$B)
   }
   
@@ -227,7 +231,6 @@ fit_nhmm <- function(model, inits, init_sd, restarts, threads, penalty, save_all
     logliks_of_restarts = if(restarts > 0L) logliks else NULL, 
     return_codes_of_restarts = if(restarts > 0L) return_codes else NULL,
     all_solutions = all_solutions,
-    penalty = penalty,
     time = end_time - start_time
   )
   model
diff --git a/R/get_probs.R b/R/get_probs.R
index 19eb28e7..f7cf0370 100644
--- a/R/get_probs.R
+++ b/R/get_probs.R
@@ -271,7 +271,7 @@ get_cluster_probs.mnhmm <- function(model, probs, ...) {
   } else {
     ids <- rownames(model$observations[[1]])
   }
-  if (!attr(model$X_omega, "iv")) {
+  if (!attr(model$X_cluster, "iv")) {
     X <- model$X_cluster[, 1L, drop = FALSE]
   } else {
     X <- model$X_cluster
diff --git a/R/model_matrix.R b/R/model_matrix.R
index 94147100..33c96d35 100644
--- a/R/model_matrix.R
+++ b/R/model_matrix.R
@@ -76,7 +76,7 @@ model_matrix_initial_formula <- function(formula, data, n_sequences,
 #' @noRd
 model_matrix_transition_formula <- function(formula, data, n_sequences,
                                             length_of_sequences, n_states,
-                                            time, id, sequence_length, 
+                                            time, id, sequence_lengths, 
                                             X_mean, X_sd) {
   icp_only <- intercept_only(formula)
   if (icp_only) {
@@ -155,8 +155,8 @@ model_matrix_emission_formula <- function(formula, data, n_sequences,
     coef_names <- "(Intercept)"
     iv <- tv <- FALSE
     missing_values <- integer(0)
-    attr(X, "X_mean") <- NULL
-    attr(X, "X_sd") <- NULL
+    X_mean <- NULL
+    X_sd <- NULL
   } else {
     X <- stats::model.matrix.lm(
       formula, 
@@ -223,8 +223,8 @@ model_matrix_cluster_formula <- function(formula, data, n_sequences, n_clusters,
     X <- matrix(1, n_sequences, 1)
     coef_names <- "(Intercept)"
     iv <- FALSE
-    attr(X, "X_mean") <- NULL
-    attr(X, "X_sd") <- NULL
+    X_mean <- NULL
+    X_sd <- NULL
   } else {
     first_time_point <- min(data[[time]])
     X <- stats::model.matrix.lm(
@@ -232,7 +232,7 @@ model_matrix_cluster_formula <- function(formula, data, n_sequences, n_clusters,
       data = data[data[[time]] == first_time_point, ], 
       na.action = stats::na.pass
     )
-    cols <- which(colnames(X) != "(Intercept)")
+    cols <- which(colnames(X) != "(Intercept)") #always first column(?)
     if (missing(X_mean)) {
       X_mean <- X_sd <- TRUE
     }
diff --git a/R/utilities.R b/R/utilities.R
index e8d456c8..5424e86d 100644
--- a/R/utilities.R
+++ b/R/utilities.R
@@ -1,3 +1,6 @@
+no_itcp_idx <- function(x) {
+  which(arrayInd(seq_along(x), dim(x))[ , 2] != 1)
+}
 #' Split mnhmm for multiple nhmms for get_probs
 #' 
 #' @noRd
diff --git a/man/bootstrap.Rd b/man/bootstrap.Rd
index 99b15d15..6dc7e92f 100644
--- a/man/bootstrap.Rd
+++ b/man/bootstrap.Rd
@@ -12,7 +12,6 @@ bootstrap_coefs(model, ...)
   model,
   B = 1000,
   method = c("nonparametric", "parametric"),
-  penalty,
   verbose = FALSE,
   ...
 )
@@ -21,7 +20,6 @@ bootstrap_coefs(model, ...)
   model,
   B = 1000,
   method = c("nonparametric", "parametric"),
-  penalty,
   verbose = FALSE,
   ...
 )
@@ -38,9 +36,6 @@ nonparametric or parametric bootstrap should be used. The former samples
 sequences with replacement, whereas the latter simulates new datasets based
 on the model.}
 
-\item{penalty}{penalty term for model estimation. By default, same penalty is used
-as was in model estimation by \code{\link[=estimate_nhmm]{estimate_nhmm()}} or \code{\link[=estimate_mnhmm]{estimate_mnhmm()}}.}
-
 \item{verbose}{Should the progress bar be displayed? Default is \code{FALSE}.}
 }
 \value{
diff --git a/man/estimate_mnhmm.Rd b/man/estimate_mnhmm.Rd
index 5cf10838..222116fb 100644
--- a/man/estimate_mnhmm.Rd
+++ b/man/estimate_mnhmm.Rd
@@ -23,7 +23,6 @@ estimate_mnhmm(
   restarts = 0L,
   threads = 1L,
   store_data = TRUE,
-  penalty = 0,
   ...
 )
 }
@@ -92,9 +91,6 @@ is stored to the model object. For large datasets, this can be set to
 \code{FALSE}, in which case you might need to pass the data separately to some
 post-prosessing functions.}
 
-\item{penalty}{Penalty for penalized maximum likelihood. Default is \code{0}
-i.e. no penalization as the penalty is of form \code{0.5 * sum(pars^2) * penalty}.}
-
 \item{...}{Additional arguments to \code{\link[nloptr:nloptr]{nloptr::nloptr()}}. Most importantly,
 argument \code{maxeval} defines the maximum number of iterations for optimization.
 The default is \code{1000} for restarts and \code{10000} for the final optimization.
diff --git a/man/estimate_nhmm.Rd b/man/estimate_nhmm.Rd
index b5244f02..30b580a2 100644
--- a/man/estimate_nhmm.Rd
+++ b/man/estimate_nhmm.Rd
@@ -20,7 +20,6 @@ estimate_nhmm(
   restarts = 0L,
   threads = 1L,
   store_data = TRUE,
-  penalty = 0,
   ...
 )
 }
@@ -79,9 +78,6 @@ is stored to the model object. For large datasets, this can be set to
 \code{FALSE}, in which case you might need to pass the data separately to some
 post-prosessing functions.}
 
-\item{penalty}{Penalty for penalized maximum likelihood. Default is \code{0}
-i.e. no penalization as the penalty is of form \code{0.5 * sum(pars^2) * penalty}.}
-
 \item{...}{Additional arguments to \code{\link[nloptr:nloptr]{nloptr::nloptr()}}. Most importantly,
 argument \code{maxeval} defines the maximum number of iterations for optimization.
 The default is \code{1000} for restarts and \code{10000} for the final optimization.
@@ -100,7 +96,7 @@ Function \code{estimate_nhmm} estimates a hidden Markov model object of class
 }
 \details{
 By default, the model parameters are estimated using LBFGS algorithm of
-\code{\link[nloptr:nloptr]{nloptr::nloptr()}}. The log-likelihood or the penalized log-likelihood is
+\code{\link[nloptr:nloptr]{nloptr::nloptr()}}. The log-likelihood is
 scaled by the number of non-missing observations (\code{nobs(model)}), and the
 convergence is claimed when either the absolute or relative change of this
 objective function is less than \code{1e-8}, or the absolute change of the
diff --git a/tests/testthat/test-simulate_mnhmm.R b/tests/testthat/test-simulate_mnhmm.R
index 2fb012f6..e22c8c2c 100644
--- a/tests/testthat/test-simulate_mnhmm.R
+++ b/tests/testthat/test-simulate_mnhmm.R
@@ -25,9 +25,9 @@ test_that("simulate_mnhmm, coef and get_probs works", {
   expect_equal(
     table(unlist(sim$states)),
     structure(
-      c(`Cluster 1: State 1` = 88L, `Cluster 1: State 2` = 142L, 
-        `Cluster 2: State 1` = 140L, `Cluster 2: State 2` = 40L, 
-        `Cluster 3: State 1` = 46L, `Cluster 3: State 2` = 44L,
+      c(`Cluster 1: State 1` = 98L, `Cluster 1: State 2` = 162L, 
+        `Cluster 2: State 1` = 158L, `Cluster 2: State 2` = 42L, 
+        `Cluster 3: State 1` = 21L, `Cluster 3: State 2` = 19L,
         `*` = 0L, `%` = 0L), dim = 8L, 
       dimnames = structure(list(
         c("Cluster 1: State 1", "Cluster 1: State 2", "Cluster 2: State 1",