Merge pull request #78 from NIEHS/main-sciome
Parallel createUMultivariate and test fixes
kyle-messier authored Dec 6, 2024
2 parents c3937f4 + b7e6887 commit 39c0ffc
Showing 12 changed files with 136 additions and 267 deletions.
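The parallel code paths touched in this commit rely on a registered foreach backend: the %dopar% loops in impute_y_lod and the parallel = parallel arguments forwarded to cv.glmnet only run on multiple workers once a backend such as doParallel is registered in the calling session. A minimal sketch of that setup, using toy data and an assumed worker count rather than anything from this commit:

# Register a doParallel backend so foreach's %dopar% loops and
# cv.glmnet(..., parallel = TRUE) can fan work out to multiple workers.
library(doParallel)
library(glmnet)

cl <- makeCluster(4)  # assumed worker count; adjust to the machine
registerDoParallel(cl)

# Toy data, purely illustrative
set.seed(1)
X <- matrix(rnorm(200 * 10), nrow = 200)
y <- X[, 1] - 2 * X[, 2] + rnorm(200)

# With a backend registered, the cross-validation folds are fit in parallel
fit <- cv.glmnet(X, y, parallel = TRUE)

stopCluster(cl)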
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Package: PrestoGP
Type: Package
Title: Penalized Regression for Spatio-Temporal Outcomes via Gaussian Processes
Version: 0.2.0.9036
Version: 0.2.0.9037
Authors@R: c(
person(given = "Eric",
family = "Bair",
@@ -23,7 +23,7 @@ Description: Simultaneous variable selection and estimation of LUR models with sp
Depends:
R (>= 3.5.0)
LinkingTo:
Rcpp, RcppArmadillo
Rcpp, RcppArmadillo, BH
Imports:
GPvecchia,
fields,
19 changes: 17 additions & 2 deletions R/PrestoGP_Model.R
@@ -826,16 +826,31 @@ setMethod(
model <- specify(model)

if (is.null(beta.hat)) {
if (!quiet) {
cat("\n")
}
if (sum(!model@Y_obs) > 0 & min(lodv) < Inf) {
if (!quiet) {
cat("Imputing missing y's and estimating initial beta...", "\n")
}
cur.mi <- lod_reg_mi(model@Y_train, model@X_train, lodv, !model@Y_obs,
parallel = parallel, foldid = foldid)
parallel = parallel, foldid = foldid, verbose = verbose)
model@Y_train[!model@Y_obs] <- cur.mi$y.impute
beta.hat <- as.matrix(cur.mi$coef)
if (!quiet) {
cat("Estimation of initial beta complete", "\n")
}
} else {
if (!quiet) {
cat("Estimating initial beta...", "\n")
}
beta0.glmnet <- cv.glmnet(model@X_train, model@Y_train,
parallel = parallel, foldid = foldid)
beta.hat <- as.matrix(predict(beta0.glmnet,
type = "coefficients", s = beta0.glmnet$lambda.1se))
if (!quiet) {
cat("Estimation of initial beta complete", "\n")
}
}
}
Y.hat <- beta.hat[1, 1] + model@X_train %*% beta.hat[-1, ]
@@ -927,7 +942,7 @@ setMethod(
}
if (!quiet) {
cat("Iteration", iter, "complete", "\n")
cat("Current penalized likelihood:", model@error, "\n")
cat("Current penalized negative log likelihood:", model@error, "\n")
cat("Current MSE:", mean(model@res^2), "\n")
}
iter <- iter + 1
47 changes: 21 additions & 26 deletions R/PrestoGP_Multivariate_Vecchia.R
@@ -435,38 +435,33 @@ setMethod("impute_y_lod", "MultivariateVecchiaModel", function(model, lod,
yhat.ni <- X %*% cur.coef[-1]
yhat.ni <- yhat.ni + mean(y[!miss]) - mean(yhat.ni[!miss])

coef.mat <- matrix(nrow = n.mi, ncol = (ncol(X) + 1))
if (parallel) {
coef.mat <- foreach(i = seq_len(n.mi), .combine = rbind) %dopar% {
yi <- rtmvn_snn(y - yhat.ni, rep(-Inf, length(y)),
rep(lod, length(y)) - yhat.ni, is.na(y), locs.nn, covmat = Sigma.hat)
yi <- yi + yhat.ni

tiid <- transform_miid(cbind(yi, X), vecchia.approx, params)
yt <- tiid[, 1]
Xt <- as.matrix(tiid[, -1])

cur.glmnet <- cv.glmnet(as.matrix(Xt), as.matrix(yt),
alpha = model@alpha, family = family, nfolds = nfolds,
foldid = foldid)
as.vector(coef(cur.glmnet, s = "lambda.min"))
yi <- foreach(i = seq_len(n.mi), .combine = cbind) %dopar% {
out <- rtmvn_snn(y - yhat.ni, rep(-Inf, length(y)),
rep(lod, length(y)) - yhat.ni, is.na(y), locs.nn,
covmat = Sigma.hat)
out + yhat.ni
}
} else {
coef.mat <- matrix(nrow = n.mi, ncol = (ncol(X) + 1))
yi <- matrix(nrow = length(yhat.ni), ncol = n.mi)
for (i in seq_len(n.mi)) {
yi <- rtmvn_snn(y - yhat.ni, rep(-Inf, length(y)),
rep(lod, length(y)) - yhat.ni, is.na(y), locs.nn, covmat = Sigma.hat)
yi <- yi + yhat.ni

tiid <- transform_miid(cbind(yi, X), vecchia.approx, params)
yt <- tiid[, 1]
Xt <- as.matrix(tiid[, -1])

cur.glmnet <- cv.glmnet(as.matrix(Xt), as.matrix(yt),
alpha = model@alpha, family = family, nfolds = nfolds,
foldid = foldid)
coef.mat[i, ] <- as.matrix(coef(cur.glmnet, s = "lambda.min"))
yi[, i] <- rtmvn_snn(y - yhat.ni, rep(-Inf, length(y)),
rep(lod, length(y)) - yhat.ni, is.na(y), locs.nn,
covmat = Sigma.hat)
yi[, i] <- yi[, i] + yhat.ni
}
}
tiid <- transform_miid(cbind(yi, X), vecchia.approx, params)
yt <- tiid[, seq_len(ncol(yi))]
Xt <- as.matrix(tiid[, -(seq_len(ncol(yi)))])

for (i in seq_len(n.mi)) {
cur.glmnet <- cv.glmnet(as.matrix(Xt), as.matrix(yt[, i]),
alpha = model@alpha, family = family, nfolds = nfolds,
foldid = foldid, parallel = parallel)
coef.mat[i, ] <- as.matrix(coef(cur.glmnet, s = "lambda.min"))
}
last.coef <- cur.coef
cur.coef <- colMeans(coef.mat)
if (verbose) {
7 changes: 6 additions & 1 deletion R/PrestoGP_Util_Functions.R
@@ -351,7 +351,7 @@ eliminate_dupes <- function(locs, locs.pred = NULL) {
}

lod_reg_mi <- function(y, X, lod, miss, n.mi = 10, eps = 0.01, maxit = 10,
parallel, foldid) {
parallel, foldid, verbose) {
lod <- lod[miss]
last.coef <- rep(Inf, ncol(X) + 1)
cur.glmnet <- cv.glmnet(X, y, parallel = parallel, foldid = foldid,
@@ -375,6 +375,11 @@ lod_reg_mi <- function(y, X, lod, miss, n.mi = 10, eps = 0.01, maxit = 10,
s = "lambda.min"))
}
cur.coef <- colMeans(coef.mat)
if (verbose) {
cat("LOD imputation iteration", itn, "complete", "\n")
cat("Current coefficients:", "\n")
print(cur.coef)
}
}
miss.means <- X[miss, ] %*% cur.coef[-1] + cur.coef[1]
obs.means <- X[!miss, ] %*% cur.coef[-1] + cur.coef[1]
50 changes: 22 additions & 28 deletions R/PrestoGP_Vecchia.R
@@ -261,40 +261,34 @@ setMethod("impute_y_lod", "VecchiaModel", function(model, lod, n.mi = 10,
yhat.ni <- X %*% cur.coef[-1]
yhat.ni <- yhat.ni + mean(y[!miss]) - mean(yhat.ni[!miss])

coef.mat <- matrix(nrow = n.mi, ncol = (ncol(X) + 1))
if (parallel) {
coef.mat <- foreach(i = seq_len(n.mi), .combine = rbind) %dopar% {
yi <- rtmvn_snn(y - yhat.ni, rep(-Inf, length(y)),
rep(lod, length(y)) - yhat.ni, is.na(y), locs.nn, covmat = Sigma.hat)
yi <- yi + yhat.ni

tiid <- transform_iid(cbind(yi, X), vecchia.approx, params[1:3],
params[4])
yt <- tiid[, 1]
Xt <- as.matrix(tiid[, -1])

cur.glmnet <- cv.glmnet(as.matrix(Xt), as.matrix(yt),
alpha = model@alpha, family = family, nfolds = nfolds,
foldid = foldid)
as.vector(coef(cur.glmnet, s = "lambda.min"))
yi <- foreach(i = seq_len(n.mi), .combine = cbind) %dopar% {
out <- rtmvn_snn(y - yhat.ni, rep(-Inf, length(y)),
rep(lod, length(y)) - yhat.ni, is.na(y), locs.nn,
covmat = Sigma.hat)
out + yhat.ni
}
} else {
coef.mat <- matrix(nrow = n.mi, ncol = (ncol(X) + 1))
yi <- matrix(nrow = length(yhat.ni), ncol = n.mi)
for (i in seq_len(n.mi)) {
yi <- rtmvn_snn(y - yhat.ni, rep(-Inf, length(y)),
rep(lod, length(y)) - yhat.ni, is.na(y), locs.nn, covmat = Sigma.hat)
yi <- yi + yhat.ni

tiid <- transform_iid(cbind(yi, X), vecchia.approx, params[1:3],
params[4])
yt <- tiid[, 1]
Xt <- as.matrix(tiid[, -1])

cur.glmnet <- cv.glmnet(as.matrix(Xt), as.matrix(yt),
alpha = model@alpha, family = family, nfolds = nfolds,
foldid = foldid)
coef.mat[i, ] <- as.matrix(coef(cur.glmnet, s = "lambda.min"))
yi[, i] <- rtmvn_snn(y - yhat.ni, rep(-Inf, length(y)),
rep(lod, length(y)) - yhat.ni, is.na(y), locs.nn,
covmat = Sigma.hat)
yi[, i] <- yi[, i] + yhat.ni
}
}
tiid <- transform_iid(cbind(yi, X), vecchia.approx, params[1:3],
params[4])
yt <- tiid[, seq_len(ncol(yi))]
Xt <- as.matrix(tiid[, -(seq_len(ncol(yi)))])

for (i in seq_len(n.mi)) {
cur.glmnet <- cv.glmnet(as.matrix(Xt), as.matrix(yt[, i]),
alpha = model@alpha, family = family, nfolds = nfolds,
foldid = foldid, parallel = parallel)
coef.mat[i, ] <- as.matrix(coef(cur.glmnet, s = "lambda.min"))
}
last.coef <- cur.coef
cur.coef <- colMeans(coef.mat)
if (verbose) {
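In both impute_y_lod methods the restructuring follows the same pattern: the truncated-normal imputation draws are generated first (in parallel under %dopar%, otherwise in a plain loop), all draws are decorrelated in a single transform_iid/transform_miid call, and each lasso fit is then handed to cv.glmnet with parallel = parallel so glmnet parallelizes over the folds itself. A minimal sketch of that three-stage pattern, with toy data and placeholder steps standing in for rtmvn_snn and the Vecchia-based transforms (the placeholders are assumptions, not the package's code):

library(foreach)
library(doParallel)
library(glmnet)

registerDoParallel(2)  # assumed worker count

set.seed(1)
n <- 100; p <- 5; n.mi <- 4
X <- matrix(rnorm(n * p), n, p)
y <- drop(X %*% rnorm(p) + rnorm(n))

# Stage 1: generate the imputed responses in parallel, one column per draw
yi <- foreach(i = seq_len(n.mi), .combine = cbind) %dopar% {
  y + rnorm(n, sd = 0.1)  # placeholder for rtmvn_snn() + yhat.ni
}

# Stage 2: decorrelate all draws at once
# (placeholder for transform_iid()/transform_miid())
tiid <- cbind(yi, X)
yt <- tiid[, seq_len(ncol(yi))]
Xt <- as.matrix(tiid[, -seq_len(ncol(yi))])

# Stage 3: one cv.glmnet per draw; parallel = TRUE reuses the same backend
coef.mat <- matrix(nrow = n.mi, ncol = p + 1)
for (i in seq_len(n.mi)) {
  cur.glmnet <- cv.glmnet(Xt, yt[, i], parallel = TRUE)
  coef.mat[i, ] <- as.matrix(coef(cur.glmnet, s = "lambda.min"))
}
cur.coef <- colMeans(coef.mat)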
89 changes: 46 additions & 43 deletions src/createUMC.cpp
@@ -1,60 +1,47 @@
#define ARMA_64BIT_WORD 1
#ifdef _OPENMP
#include <omp.h>
#endif
#include <RcppArmadillo.h>
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::plugins(cpp17)]]
// [[Rcpp::plugins(openmp)]]
#include <cmath>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/gamma.hpp>

double matern1(const double &x, const double &smooth, const double &alpha) {
if (smooth == 0.5) {
return exp(-x * alpha);
} else if (smooth == 1.5) {
double normcon = (pow(alpha, 3) / sqrt(M_PI)) * (tgamma(2.0) / tgamma(1.5));
double normcon = (pow(alpha, 3) / sqrt(M_PI)) *
(boost::math::tgamma(2.0) / boost::math::tgamma(1.5));
return normcon * 0.5 * M_PI * pow(alpha, -3) * exp(-x * alpha) *
(1 + x * alpha);
} else if (smooth == 2.5) {
double normcon = (pow(alpha, 5) / sqrt(M_PI)) * (tgamma(3.0) / tgamma(2.5));
double normcon = (pow(alpha, 5) / sqrt(M_PI)) *
(boost::math::tgamma(3.0) / boost::math::tgamma(2.5));
double scaled_x = x * alpha;
return normcon * 0.125 * M_PI * pow(alpha, -5) * exp(-scaled_x) *
(3 + 3 * scaled_x + scaled_x * scaled_x);
} else if (x == 0.0) {
return 1.0;
} else {
double normcon = pow(2.0, 1.0 - smooth) / tgamma(smooth);
double normcon = pow(2.0, 1.0 - smooth) / boost::math::tgamma(smooth);
double scaled_x = x * alpha;
return normcon * pow(scaled_x, smooth) * R::bessel_k(scaled_x, smooth, 1);
return normcon * pow(scaled_x, smooth) *
boost::math::cyl_bessel_k(smooth, scaled_x);
}
}

// computes Euclidean distance between 2 vectors ([slightly slower] helper
// function for rdist in C++)
double dist1(const arma::rowvec &a, const arma::rowvec &b) {
double temp = 0.0;
// loop over each element and square the distance, add to the placeholder
for (arma::uword i = 0; i < a.n_elem; i++) {
temp += (a(i) - b(i)) * (a(i) - b(i));
}
return sqrt(temp);
return arma::norm(a - b, 2);
}

// [[Rcpp::export]]
arma::vec na_omit_c(arma::vec x) {
// make placeholder vector r
arma::vec r(x.size());
// make counter for number of non-NAs
int k = 0;
for (unsigned j = 0; j < x.size(); j++) {
// if not NA, add 1 to counter and set r(j) = x(j)
if (x(j) == x(j)) {
r(j) = x(j);
k++;
}
}
// resize r to non-NA size (removing NAs)
r.resize(k);
// subtract one because C++ indexes from 0 while inputs from R index from 1
// r -= 1.0;
return r;
}
arma::vec na_omit_c(arma::vec x) { return x(arma::find_finite(x)); }

// [[Rcpp::export]]
arma::sp_mat createU_helper_mat(const arma::mat &olocs, const arma::vec &ondx,
@@ -81,6 +68,9 @@ arma::sp_mat createU_helper_mat(const arma::mat &olocs, const arma::vec &ondx,
// feeder(2, arma::span(0,6)) = U_beginning.t();
int ind = 7;
// int ind = 0;
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (arma::uword i = 3; i < (ondx.n_elem + 1); i++) {
double temppi = ondx(i - 1);
arma::vec cy = na_omit_c(curqys.col(i - 1));
@@ -109,10 +99,8 @@ arma::sp_mat createU_helper_mat(const arma::mat &olocs, const arma::vec &ondx,
k2(j, j) += nugget(temppj - 1);
}
}
// //make symmetric
k2 += k2.t();
// //but diagonal is double counted
k2.diag() /= 2.0;
// make symmetric
k2 = arma::symmatu(k2);
// Rcpp::Rcout << k2;
arma::vec bi(nq);
bool status = arma::solve(bi, k2, k1);
@@ -136,18 +124,22 @@ arma::sp_mat createU_helper_mat(const arma::mat &olocs, const arma::vec &ondx,
// arma::vec bi = arma::solve(k2, k1);
// uhat(2*i - 1, 2*i - 1) = pow(nugget(temppi - 1), -0.5);
// uhat(2*i - 2, 2*i - 1) = -1.0 * uhat(2*i - 1, 2*i - 1);

int siz = cq.n_elem + 3;
arma::umat ndx_private(2, siz);
arma::vec vals_private(siz);
// uhat(2*i - 2, 2*i - 2) = 1.0 / ri;
for (arma::uword j = 0; j < cq.n_elem; j++) {
arma::uword xind = 2 * cq(j) - 2;
if (j >= cy.n_elem) {
xind += 1;
}
// feeder.col(ind) = {(double)xind,(double) 2*i - 2, bi(j)};
ndx.col(ind) = {xind, 2 * i - 2};
vals(ind) = bi(j);
// ndx.col(ind) = {xind, 2 * i - 2};
// vals(ind) = bi(j);
ndx_private.col(j) = {xind, 2 * i - 2};
vals_private(j) = bi(j);
// vals[ind] = bi(j);
ind++;
// ind++;
// uhat(xind, 2.0*i - 2.0) = bi(j);
}
// vals[ind] = 1.0 / ri;
@@ -156,17 +148,28 @@ arma::sp_mat createU_helper_mat(const arma::mat &olocs, const arma::vec &ondx,
-0.5); // used twice so temp storage to avoid recomputation
// feeder.col(ind) = {(double)2*i - 2,(double)2*i - 2, 1.0 / ri};
// feeder.col(ind + 2) = {(double)2*i - 1,(double) 2*i - 1, feed_temp};
ndx.col(ind) = {2 * i - 2, 2 * i - 2};
vals(ind) = 1.0 / ri;
ndx.col(ind + 2) = {2 * i - 1, 2 * i - 1};
vals(ind + 2) = feed_temp;
// ndx.col(ind) = {2 * i - 2, 2 * i - 2};
// vals(ind) = 1.0 / ri;
// ndx.col(ind + 2) = {2 * i - 1, 2 * i - 1};
// vals(ind + 2) = feed_temp;
ndx_private.col(nq) = {2 * i - 2, 2 * i - 2};
vals_private(nq) = 1.0 / ri;
ndx_private.col(nq + 2) = {2 * i - 1, 2 * i - 1};
vals_private(nq + 2) = feed_temp;
// vals[ind + 2] = pow(nugget(temppi - 1), -0.5);
// vals[ind + 1] = -1.0 * vals[ind + 2];
// feeder.col(ind + 1) = {(double)2*i - 2,(double) 2*i - 1, -1.0 *
// feed_temp};
ndx.col(ind + 1) = {2 * i - 2, 2 * i - 1};
vals(ind + 1) = -1.0 * feed_temp;
ind += 3;
// ndx.col(ind + 1) = {2 * i - 2, 2 * i - 1};
// vals(ind + 1) = -1.0 * feed_temp;
// ind += 3;
ndx_private.col(nq + 1) = {2 * i - 2, 2 * i - 1};
vals_private(nq + 1) = -1.0 * feed_temp;
int local_ind;
#pragma omp atomic capture
{local_ind = ind; ind += siz;}
ndx.cols(local_ind, local_ind + siz - 1) = ndx_private;
vals.subvec(local_ind, local_ind + siz - 1) = vals_private;
}
arma::sp_mat uhat(ndx, vals);
// 0 indexing -> 1 indexing for R
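For reference, matern1 above evaluates the Matérn correlation with smoothness smooth (ν) and rate alpha (α); the switch to boost::math::cyl_bessel_k and boost::math::tgamma (and the new BH entry in LinkingTo) avoids calling the R API, which is presumably what makes the new OpenMP-parallel loop in createU_helper_mat safe. The closed forms the code implements, written out in LaTeX:

M_{\nu}(d) = \frac{2^{1-\nu}}{\Gamma(\nu)} (\alpha d)^{\nu} K_{\nu}(\alpha d)
M_{1/2}(d) = e^{-\alpha d}
M_{3/2}(d) = (1 + \alpha d)\, e^{-\alpha d}
M_{5/2}(d) = \left(1 + \alpha d + \tfrac{(\alpha d)^2}{3}\right) e^{-\alpha d}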
(Diffs for the remaining changed files are not shown.)
