replaced variance for loop with apply in findArtifacts. Removed featu…

…re for loops.
MicTott · Apr 10, 2024 · 900c30e · 900c30e
1 parent fe8a56e
commit 900c30e
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 109 deletions.
diff --git a/R/findArtifacts.R b/R/findArtifacts.R
@@ -80,60 +80,46 @@ findArtifacts <- function(
       stop("'n_rings' must be a positive integer.")
     }
 
-
-    # log transform specified features
-    features <- c(mito_percent, mito_sum)
-    features_to_use <- character()
-    if (log) {
-        for (feature in features) {
-            feature_log <- paste0(feature, "_log")
-            colData(spe)[feature_log] <- log1p(colData(spe)[[feature]])
-            features_to_use <- c(features_to_use, feature_log)
-        }
-    } else {
-        features_to_use <- features
-    }
-
     # get unique sample IDs
-    sample_ids <- unique(colData(spe)[[samples]])
+    unique_sample_ids <- unique(colData(spe)[[samples]])
 
     # Initialize a list to store spe for each sample
-    colData_list <- list()
+    columnData_list <- sapply(unique_sample_ids, FUN = function(x) NULL)
 
-    for (sample in sample_ids) {
+    for (sample in unique_sample_ids) {
         # subset by sample
         spe.temp <- spe[, colData(spe)[[samples]] == sample]
 
         # ======= Calculate local mito variance ========
-        var_matrix <- c()
-        for (i in seq_len(n_rings)) {
-            # ==== local mito variance ==== get n neighbors for i rings
-            n_neighbors <- 3 * i * (i + 1)
-            tmp.name <- paste0("k", n_neighbors)
-
-            # get local variance of mito ratio
-            spe.temp <- localVariance(spe.temp,
-                features = mito_percent,
-                n_neighbors = n_neighbors,
-                name = tmp.name
-            )
-
-            # add to var_matrix
-            var_matrix <- cbind(var_matrix, colData(spe.temp)[[tmp.name]])
-        }
-
-
-        # ========== PCA and clustering ========== add mito and mito_percent to
-        # var dataframe
+        # Use vapply to iterate over rings_seq
+        var_matrix <- sapply(seq_len(n_rings), function(i) {
+          # Calculate n_neighbors for the current ring
+          n_neighbors <- 3 * i * (i + 1)
+          tmp.name <- paste0("k", n_neighbors)
+
+          # Apply local variance calculation
+          spe.temp <<- localVariance(spe.temp,
+                                    metric = mito_percent,
+                                    n_neighbors = n_neighbors,
+                                    name = tmp.name)
+
+          # Extract and return the column corresponding to tmp.name from colData
+          colData(spe.temp)[[tmp.name]]
+        })
+
+
+        # ========== PCA and clustering ==========
         var_matrix <- cbind(
-            var_matrix, colData(spe.temp)[[mito_percent]],
+            var_matrix,
+            colData(spe.temp)[[mito_percent]],
             colData(spe.temp)[[mito_sum]]
         )
 
         var_df <- data.frame(var_matrix)
 
         # Run PCA and add to reduced dims
         pc <- prcomp(var_df, center = TRUE, scale. = TRUE)
+        rownames(pc$x) <- colnames(spe.temp) # assign rownames to avoid error
         reducedDim(spe.temp, "PCA_artifacts") <- pc$x
 
         # Cluster using kmeans and add to temp sce
@@ -143,13 +129,9 @@ findArtifacts <- function(
         # =========== Artifact annotation ===========
 
         # calculate average local variance of the two clusters
-        clus1_mean <- mean(colData(spe.temp)[[paste0(
-            "k",
-            18
+        clus1_mean <- mean(colData(spe.temp)[[paste0("k",18
         )]][spe.temp$Kmeans == 1])
-        clus2_mean <- mean(colData(spe.temp)[[paste0(
-            "k",
-            18
+        clus2_mean <- mean(colData(spe.temp)[[paste0("k",18
         )]][spe.temp$Kmeans == 2])
 
         artifact_clus <- which.min(c(clus1_mean, clus2_mean))
@@ -158,16 +140,18 @@ findArtifacts <- function(
         # 1 to 'TRUE'
         spe.temp$artifact <- FALSE
         spe.temp$artifact[spe.temp$Kmeans == artifact_clus] <- TRUE
+        spe.temp$Kmeans <- NULL
 
         # add to sample list
-        colData_list[[sample]] <- colData(spe.temp)
+        columnData_list[[sample]] <- colData(spe.temp)
     }
 
     # rbind the list of dataframes
-    colData_aggregated <- do.call(rbind, colData_list)
+    columnData_aggregated <- do.call(rbind, columnData_list)
 
     # replace SPE column data with aggregated data
-    colData(spe) <- colData_aggregated
+    colData(spe) <- columnData_aggregated
+    reducedDims(spe) <- reducedDims(spe.temp)
 
     # return sce
     return(spe)

diff --git a/R/localVariance.R b/R/localVariance.R
@@ -6,12 +6,12 @@
 #'        sample_id, sum_umi, sum_gene
 #' @param n_neighbors Number of nearest neighbors to use for variance
 #'        calculation
-#' @param features Features to use for variance calculation
+#' @param metric metric to use for variance calculation
 #' @param samples Column in colData to use for sample ID
-#' @param log Whether to log1p transform the features
+#' @param log Whether to log1p transform the metric
 #' @param name Name of the new column to add to colData
 #'
-#' @return SpatialExperiment object with feature variance added to colData
+#' @return SpatialExperiment object with metric variance added to colData
 #'
 #' @importFrom SummarizedExperiment colData
 #' @importFrom BiocNeighbors findKNN
@@ -42,20 +42,20 @@
 #' # Identifying the mitochondrial transcripts in our SpatialExperiment.
 #' is.mito <- rownames(spe)[grepl("^MT-", rownames(spe))]
 #'
-#' # Calculating QC metrics for each spot using scuttle
+#' # Calculating QC metric for each spot using scuttle
 #' spe <- scuttle::addPerCellQCMetrics(spe, subsets = list(Mito = is.mito))
 #' colnames(colData(spe))
 #'
 #' spe <- localVariance(spe,
-#'     features = "subsets_Mito_percent",
+#'     metric = "subsets_Mito_percent",
 #'     n_neighbors = 36,
 #'     name = "local_mito_variance_k36"
 #'     )
 #'
 #' plotQC(spe, metric="local_mito_variance_k36")
 #'
 localVariance <- function(spe, n_neighbors = 36,
-                          features = c("expr_chrM_ratio"),
+                          metric = c("expr_chrM_ratio"),
                           samples = "sample_id", log = FALSE, name = NULL) {
 
     # ===== Validity checks =====
@@ -64,14 +64,14 @@ localVariance <- function(spe, n_neighbors = 36,
       stop("Input data must be a SpatialExperiment object.")
     }
 
-    # Validate 'features' is a character vector
-    if (!is.character(features)) {
-      stop("'features' must be a character vector.")
+    # Validate 'metric' is a character vector
+    if (!is.character(metric)) {
+      stop("'metric' must be a character vector.")
     }
 
-    # validate 'features' are valid colData
-    if (!all(features %in% colnames(colData(spe)))) {
-      stop("All features must be present in colData.")
+    # validate 'metric' are valid colData
+    if (!all(metric %in% colnames(colData(spe)))) {
+      stop("Metric must be present in colData.")
     }
 
     # validate samples is valid colData
@@ -87,17 +87,13 @@ localVariance <- function(spe, n_neighbors = 36,
     }
 
     # ===== start function =====
-    # log1p transform specified features
+    # log1p transform specified metric
     if (log) {
-      features_log <- lapply(features, function(feature) {
-        feature_log <- paste0(feature, "_log")
-        colData(spe)[[feature_log]] <- log1p(colData(spe)[[feature]])
-        return(feature_log)  # Return the new feature name
-      })
-
-      features_to_use <- c(features, unlist(features_log))
+      metric_log <- paste0(metric, "_log")
+      colData(spe)[metric_log] <- log1p(colData(spe)[[metric]])
+      metric_to_use <- metric_log
     } else {
-      features_to_use <- features
+      metric_to_use <- metric
     }
 
     # Get a list of unique sample IDs
@@ -112,7 +108,7 @@ localVariance <- function(spe, n_neighbors = 36,
         sample <- unique_sample_ids[sample_id]
         spe_subset <- subset(spe, , sample_id == sample)
 
-        # Create a list of spatial coordinates and qc features
+        # Create a list of spatial coordinates and qc metric
         columnData <- colData(spe_subset)
         columnData$coords <- spatialCoords(spe_subset)
 
@@ -122,53 +118,33 @@ localVariance <- function(spe, n_neighbors = 36,
             warn.ties = FALSE
         )$index
 
-        #  === Get local variance ===
-        # Initialize a matrix to store variance for each feature
-        var_matrix <- matrix(NA, nrow(columnData), length(features_to_use))
-        colnames(var_matrix) <- features_to_use
-
-        mean_matrix <- matrix(NA, nrow(columnData), length(features_to_use))
-        colnames(mean_matrix) <- features_to_use
+        #  === Compute local variance ===
+        # get neighborhood metric
+        neighborhoods <- lapply(seq_len(nrow(dnn)), function(i) {
+          indices <- dnn[i, ]
+          indices <- indices[indices != 0]
+          indices <- c(i, indices)
 
-        # Loop through each row in the nearest neighbor index matrix
-        for (i in seq_len(nrow(dnn))) {
-            dnn.idx <- dnn[i, ]
-            for (j in seq_along(features_to_use)) {
-                neighborhood <- columnData[c(i, dnn.idx[dnn.idx != 0]), ][[features_to_use[j]]]
-
-                var_matrix[i, j] <- var(neighborhood, na.rm = TRUE)[1]
-                mean_matrix[i, j] <- mean(neighborhood, na.rm = TRUE)[1]
-            }
-        }
+          columnData[indices, metric_to_use]
+        })
 
-        # Handle non-finite values
-        var_matrix[!is.finite(var_matrix)] <- 0
-        mean_matrix[!is.finite(mean_matrix)] <- 0
+        # Compute variance and mean
+        stats_matrix <- t(sapply(neighborhoods, function(x) {
+          c(var = var(x, na.rm = TRUE), mean = mean(x, na.rm = TRUE))
+        }))
 
-        # == Regress out mean-variance bias ==
-        for (feature_idx in seq_along(features_to_use)) {
-            # Prepare data.frame for current feature
-            mito_var_df <- data.frame(
-                mito_var = log2(var_matrix[, feature_idx]),
-                mito_mean = log2(mean_matrix[, feature_idx])
-            )
+        # Perform robust linear regression to regress out mean-var bias
+        fit.irls <- MASS::rlm(log2(var) ~ mean,
+                              data = as.data.frame(stats_matrix))
 
-            # Perform robust linear regression (IRLS) of variance vs mean
-            fit.irls <- MASS::rlm(mito_var ~ mito_mean, data = mito_var_df)
-
-            # Get residuals and update the variance matrix
-            resid.irls <- resid(fit.irls)
-
-            # Replace original variance values with residuals
-            var_matrix[, feature_idx] <- resid.irls
-        }
+        var_resid <- resid(fit.irls) # Get residuals
 
         # add local variance to columnData dataframe
         if (!is.null(name)) {
-            columnData[name] <- var_matrix[, j]
+            columnData[name] <- var_resid
         } else {
-            feature_var <- paste0(features[j], "_var")
-            columnData[feature_var] <- var_matrix[, j]
+            metric_var <- paste0(metric, "_var")
+            columnData[metric_var] <- var_resid
         }
 
         # Store the modified columnData dataframe in the list

diff --git a/tests/testthat/test-findArtifacts.R b/tests/testthat/test-findArtifacts.R
@@ -21,5 +21,5 @@ test_that("example object contains artifacts colDta", {
 })
 
 test_that("examples gives correct number of artifact spots", {
-  expect_equal(sum(spe$artifact), 349)
+  expect_equal(sum(spe$artifact), 355)
 })