statdivlab · svteichman · Mar 10, 2024 · Mar 11, 2024 · Mar 13, 2024
diff --git a/R/emuFit.R b/R/emuFit.R
@@ -4,8 +4,8 @@
 #' @param X an n x p matrix or dataframe of covariates (optional)
 #' @param formula a one-sided formula specifying the form of the mean model to be fit
 #' @param data an n x p data frame containing variables given in \code{formula}
-#' @param cluster a numeric vector giving cluster membership for each row of Y to 
-#' be used in computing GEE test statistics. Default is NULL, in which case rows of 
+#' @param cluster a vector giving cluster membership for each row of Y or the name of a variable included in \code{data} object 
+#' to be used in computing GEE test statistics. Default is NULL, in which case rows of 
 #' Y are treated as independent.
 #' @param penalize logical: should Firth penalty be used in fitting model? Default is TRUE.
 #' @param B starting value of coefficient matrix (p x J). If not provided,
@@ -174,13 +174,21 @@ have no observations. These samples must be excluded before fitting model.")
   }
 
   #check that cluster is correctly type if provided
-  if(!is.null(cluster)){
-    if(!is.numeric(cluster)){
-      stop("If provided, argument 'cluster' must be a numeric vector.")
-    }
-    if(length(cluster)!=nrow(Y)){
-      stop("If provided, argument 'cluster' must be a numeric vector with 
+  if(length(paste0(substitute(cluster))) > 0){
+    cluster_name <- paste0(substitute(cluster))
+    if(!exists(cluster_name, envir = rlang::caller_env())){
+      # if cluster argument doesn't exist in environment, check whether it is 
+      # the name of a variable in data 
+      if (cluster_name %in% names(data)) {
+        cluster <- data[, which(names(data) == cluster_name)]
+      } else {
+        stop("Argument 'cluster' does not refer to a vector in the environment or the name of a variable in 'data'.")
+      }
+    } else {
+      if(length(cluster)!=nrow(Y)){
+        stop("If provided as a vector, argument 'cluster' must have 
 length equal to n (the number of rows in Y).")
+      }
     }
     if(length(unique(cluster)) == nrow(Y)){
       warning("Number of unique values in 'cluster' equal to number of rows of Y; 

diff --git a/man/emuFit.Rd b/man/emuFit.Rd
diff --git a/tests/testthat/test-cluster.R b/tests/testthat/test-cluster.R
@@ -4,31 +4,54 @@ test_that("clusters work as I want", {
   n <- 64
   J <- 50
   Y <- matrix(rpois(n*J, 100)*rbinom(n*J, 100, 0.7), nrow=n, ncol = J)
-  cage <- rep(c(1:16), 4) 
-  treatment <- (cage <= 8)
-  XX <- data.frame(cage, treatment)
-  ef <- emuFit(formula = ~ treatment, 
+  cage_num <- rep(c(1:16), 4) 
+  treatment <- (cage_num <= 8)
+  XX <- data.frame(treatment)
+
+  # check that cluster argument works as a numeric vector 
+  ef_num <- emuFit(formula = ~ treatment, 
                data = XX, 
                Y = Y, 
-               cluster=cage, 
+               cluster=cage_num, 
                run_score_tests=FALSE) #### very fast
-  expect_equal(ef$coef %>% class, "data.frame")
+  expect_equal(ef_num$coef %>% class, "data.frame")
 
-  set.seed(101)
-  n <- 64
-  J <- 50
-  Y <- matrix(rpois(n*J, 100)*rbinom(n*J, 100, 0.7), nrow=n, ncol = J)
-  cage <- rep(c(LETTERS[1:16]), 4) %>% as.factor
-  treatment <- (cage %in% LETTERS[1:8])
-  XX <- data.frame(cage, treatment)
+  # check that cluster argument works as character vector and gives 
+  # equivalent results to numeric vector 
+  cage_char <- rep(c(LETTERS[1:16]), 4)
+  ef_char <- emuFit(formula = ~ treatment, 
+                   data = XX, 
+                   Y = Y, 
+                   cluster=cage_char, 
+                   run_score_tests=FALSE) 
+  expect_equal(ef_num$coef, ef_char$coef)
+
+  # check that cluster argument works as factor and gives equivalent results
+  # to numeric vector 
+  cage_fact <- cage_char %>% as.factor
+  ef_fact <- emuFit(formula = ~ treatment, 
+                    data = XX, 
+                    Y = Y, 
+                    cluster=cage_fact, 
+                    run_score_tests=FALSE) 
+  expect_equal(ef_num$coef, ef_fact$coef)
+
+  # check that cluster argument works as variable name in quotes 
+  XX$cage <- cage_num 
+  ef_name <- emuFit(formula = ~ treatment, 
+                    data = XX, 
+                    Y = Y, 
+                    cluster="cage", 
+                    run_score_tests=FALSE) 
+  expect_equal(ef_num$coef, ef_name$coef)
 
-  ##### FAILS
-  # expect_equal(emuFit(formula = ~ treatment, 
-  #                      data = XX, 
-  #                      Y = Y, 
-  #                      cluster=cage, 
-  #                      run_score_tests=FALSE) %>% class, 
-  #              "data.frame")
+  # check that cluster argument works as variable name not in quotes
+  ef_name2 <- emuFit(formula = ~ treatment, 
+                    data = XX, 
+                    Y = Y, 
+                    cluster=cage, 
+                    run_score_tests=FALSE) 
+  expect_equal(ef_num$coef, ef_name2$coef)
 
 })
 

diff --git a/vignettes/.gitignore b/vignettes/.gitignore
@@ -0,0 +1,2 @@
+*.html
+*.R
diff --git a/vignettes/radEmu_clustered_data.Rmd b/vignettes/radEmu_clustered_data.Rmd
@@ -0,0 +1,109 @@
+---
+title: "Using radEmu on clustered data"
+author: "Sarah Teichman"
+date: "`r Sys.Date()`"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Using radEmu on clustered data}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+```
+
+First, we will install `radEmu`, if we haven't already. 
+
+```{r, eval = FALSE}
+# if (!require("remotes", quietly = TRUE))
+#     install.packages("remotes")
+# 
+# remotes::install_github("statdivlab/radEmu")
+```
+
+Next, we can load `radEmu` as well as the `tidyverse` package suite. 
+
+```{r setup, message = FALSE}
+library(magrittr)
+library(dplyr)
+library(ggplot2)
+library(stringr)
+library(radEmu)
+```
+
+## Introduction 
+
+In this vignette we will explore the use of `radEmu` with clustered data. Data with cluster dependence is a common phenomenon in microbiome studies. This type of dependence can come from experimental factors such as shared cages or tanks for study animals. When cluster dependence is not accounted for, statistical methods often operate under the assumption that all samples are independent. This will typically lead to anti-conservative inference (i.e. p-values that are smaller than they should be). 
+
+Luckily, we have tools to account for cluster dependence in statistical inference, and they are implemented in `radEmu` through the `cluster` argument! This argument is only implemented from `radEmu` v1.2.0.0 forward, so if you are having trouble using the `cluster` argument, check that you have a recent enough version so use this functionality. 
+
+## Generating data with cluster dependence
+
+To start, let's generate a small data example in which there is cluster dependence within our data. 
+
+```{r}
+set.seed(10)
+# 6 categories
+J <- 6
+# 2 columns in the design matrix 
+p <- 2
+# 60 samples 
+n <- 60
+# generate design matrix 
+X <- cbind(1, rnorm(n))
+cov_dat <- data.frame(cov = X[, 2])
+# sample-specific effects
+z <- rnorm(n) + 5
+# cluster membership 
+cluster <- rep(1:4, each = 15)
+cov_dat$cluster <- cluster
+# cluster effects 
+cluster_effs <- lapply(1:4, function(i) log(matrix(rexp(2*J), nrow = 2)))
+# intercepts for each category
+b0 <- rnorm(J)
+# coefficients for X1 for each category 
+b1 <- seq(1, 5, length.out = J)
+# mean center the coefficients
+b1 <- b1 - mean(b1)
+# set the coefficient for the 3rd category to 0
+b1[3] <- 0
+# generate B coefficient matrix 
+b <- rbind(b0, b1)
+
+# set up response matrix 
+Y <- matrix(0, ncol = J, nrow = n)
+for (i in 1:n) {
+  for(j in 1:J){
+    # mean model is exp(X_i %*% B_j + cluster_effect + z_i)
+    temp_mean <- exp(X[i, , drop = FALSE] %*% 
+                       (b[, j, drop = FALSE] + 
+                          cluster_effs[[ cluster[i] ]][,j]) + z[i])
+    # draw from a zero-inflated negative binomial with our mean 
+    Y[i,j] <- rnbinom(1, mu = temp_mean, size = 5) * rbinom(1, 1, 0.8)
+  }
+}
+```
+
+Now that we have our data, we can get estimates of our parameters using the radEmu model and do inference. We will specifically test the log fold difference parameter for the 3rd category, because we know that the true log fold difference for the third category associated with our covariate is $0$. 
+```{r}
+ef_no_cluster <- emuFit(formula = ~ cov,
+                        data = cov_dat, 
+                        Y = Y,
+                        test_kj = data.frame(k = 2, j = 3))
+```
+
+When we ignore clustering, we get an estimate of the log fold difference in category 3 across values of our covariate of `r round(ef_no_cluster$coef$estimate[3], 3)` and a p-value of `r round(ef_no_cluster$coef$pval[3], 3)`.
+
+```{r}
+ef_cluster <- emuFit(formula = ~ cov,
+                     data = cov_dat, 
+                     Y = Y,
+                     cluster = cluster, 
+                     test_kj = data.frame(k = 2, j = 3))
+```
+
+Here, when we account for clustering, we get an estimate of the log fold difference in category 3 across values of our covariate of `r round(ef_cluster$coef$estimate[3], 3)` and a p-value of `r round(ef_cluster$coef$pval[3], 3)`. We can see that our estimates are the same whether or not we account for cluster, but our p-values are different.