MicTott · MicTott · Apr 8, 2024 · Apr 8, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -2,13 +2,9 @@ Package: SpotSweeper
 Title: Spatially-aware quality control for spatial transcriptomics
 Version: 0.99.1
 Date: 2023-10-30
-Authors@R: c(
+Authors@R: 
     person("Michael", "Totty", ,"[email protected]", role = c("aut", "cre"),
-           comment = c(ORCID = "0000-0002-9292-8556")),
-    person("Boyi", "Guo", ,email = "[email protected]", role = c("aut"),
-           comment = c(ORCID = "0000-0003-2950-2349"))
-)
-
+           comment = c(ORCID = "0000-0002-9292-8556"))
 Description: Spatially-aware quality control (QC) software for both spot-level 
     and artifact-level QC in spot-based spatial transcripomics, such as 10x 
     Visium. These methods calculate local (nearest-neighbors) mean and variance 

diff --git a/R/findArtifacts.R b/R/findArtifacts.R
@@ -60,21 +60,11 @@ findArtifacts <- function(
     features <- c(mito_percent, mito_sum)
     features_to_use <- character()
     if (log2) {
-        # for (feature in features) {
-        #     feature_log2 <- paste0(feature, "_log2")
-        #     colData(spe)[feature_log2] <- log2(colData(spe)[[feature]])
-        #     features_to_use <- c(features_to_use, feature_log2)
-        # }
-      log_mat <- apply(
-        colData(spe)[, features],
-        MARGIN = 2,
-        FUN = log2
-      )
-      colnames(log_mat) <- paste0(colnames(log_mat), "_log2")
-      # Merge to spe
-      colData(spe)[colnames(log_mat)] <- log_mat
-      features_to_use <- colnames(log_mat)
-
+        for (feature in features) {
+            feature_log2 <- paste0(feature, "_log2")
+            colData(spe)[feature_log2] <- log2(colData(spe)[[feature]])
+            features_to_use <- c(features_to_use, feature_log2)
+        }
     } else {
         features_to_use <- features
     }
@@ -90,43 +80,23 @@ findArtifacts <- function(
         spe.temp <- spe[, colData(spe)[[samples]] == sample]
 
         # ======= Calculate local mito variance ========
-
-        var_mat <- sapply(
-          seq_len(n_rings),
-          FUN = function(i){
+        var_matrix <- c()
+        for (i in seq_len(n_rings)) {
+            # ==== local mito variance ==== get n neighbors for i rings
             n_neighbors <- 3 * i * (i + 1)
             tmp.name <- paste0("k", n_neighbors)
-
-                spe.temp <- localVariance(spe.temp,
-                    features = mito_percent,
-                    n_neighbors = n_neighbors,
-                    name = tmp.name
-                )
-
-                colData(spe.temp)[[tmp.name]]
-                # # add to var_matrix
-                # var_matrix <- cbind(var_matrix, colData(spe.temp)[[tmp.name]])
-          }
-        )
-
-        # var_matrix <- c()
-        # for (i in seq_len(n_rings)) {
-        #     # ==== local mito variance ==== get n neighbors for i rings
-        #     n_neighbors <- 3 * i * (i + 1)
-        #     tmp.name <- paste0("k", n_neighbors)
-        # 
-        #     # get local variance of mito ratio
-        #     spe.temp <- localVariance(spe.temp,
-        #         features = mito_percent,
-        #         n_neighbors = n_neighbors,
-        #         name = tmp.name
-        #     )
-        # 
-        #     # add to var_matrix
-        #     var_matrix <- cbind(var_matrix, colData(spe.temp)[[tmp.name]])
-        # }
-
-        # browser()
+
+            # get local variance of mito ratio
+            spe.temp <- localVariance(spe.temp,
+                features = mito_percent,
+                n_neighbors = n_neighbors,
+                name = tmp.name
+            )
+
+            # add to var_matrix
+            var_matrix <- cbind(var_matrix, colData(spe.temp)[[tmp.name]])
+        }
+
 
         # ========== PCA and clustering ========== add mito and mito_percent to
         # var dataframe

diff --git a/R/localOutliers.R b/R/localOutliers.R
@@ -84,9 +84,8 @@ localOutliers <- function(
     # Get a list of unique sample IDs
     unique_sample_ids <- unique(colData(spe)[[samples]])
 
-    # Initialize named list to store each spaQC dataframe
-    # spaQC_list <- vector("list", length(unique_sample_ids))
-    spaQC_list <- sapply(unique_sample_ids, FUN = function(x) NULL)
+    # Initialize list to store each spaQC dataframe
+    spaQC_list <- vector("list", length(unique_sample_ids))
 
     # Loop through each unique sample ID
     for (sample in unique_sample_ids) {
@@ -102,46 +101,20 @@ localOutliers <- function(
             k = n_neighbors, warn.ties = FALSE
         )$index
 
-        dnn_aug <- cbind(
-          seq_len(nrow(dnn)), # Index of the center spot
-          dnn
-          )
-        mod_z_matrix <- apply(
-          dnn_aug, MARGIN = 1,
-          FUN = function(.idx, spaQC, metric_to_use ){
-            # browser()
-            .idx <- .idx[.idx!=0]
-            neighborhood <- spaQC[.idx, metric_to_use] |> as.matrix()
-            return(
-              spatialEco::outliers(neighborhood)[1] # The spot among neighbors
-            )
-          },
-          spaQC = spaQC,
-          metric_to_use = metric_to_use
-        )
-
-        # browser()
         # Initialize a matrix to store z-scores
-        # mod_z_matrix <- array(NA, nrow(dnn))
+        mod_z_matrix <- array(NA, nrow(dnn))
 
         # Loop through each row in the nearest neighbor index matrix
-        # for (i in seq_len(nrow(dnn))) {
-        #     dnn.idx <- dnn[i, ]
-        #     neighborhood <- spaQC[c(i, dnn.idx[dnn.idx != 0]), ][[metric_to_use]]
-        # 
-        #     # modified z-score
-        #     mod_z_matrix[i] <- spatialEco::outliers(neighborhood)[1]
-        # }
+        for (i in seq_len(nrow(dnn))) {
+            dnn.idx <- dnn[i, ]
+            neighborhood <- spaQC[c(i, dnn.idx[dnn.idx != 0]), ][[metric_to_use]]
 
-        # Handle non-finite values
-        if(any(!is.finite(mod_z_matrix))){
-          warning(
-            sample_id,
-            " contains infinite value mod_z_matrix. Forcing to be 0s."
-          )
-          mod_z_matrix[!is.finite(mod_z_matrix)] <- 0
+            # modified z-score
+            mod_z_matrix[i] <- spatialEco::outliers(neighborhood)[1]
         }
 
+        # Handle non-finite values
+        mod_z_matrix[!is.finite(mod_z_matrix)] <- 0
 
         # find outliers based on cutoff, store in colData
         metric_outliers <- paste0(metric, "_outliers")

diff --git a/R/localVariance.R b/R/localVariance.R
@@ -50,11 +50,6 @@
 #'     n_neighbors = 36,
 #'     name = "local_mito_variance_k36"
 #'     )
-#'     
-#' spe2 <- localVariance(spe,
-#'     features = c("subsets_Mito_percent", "total"),
-#'     n_neighbors = 36
-#'     )
 #'
 #' plotOutliers(spe, metric="local_mito_variance_k36")
 #'
@@ -65,160 +60,88 @@ localVariance <- function(spe, n_neighbors = 36,
     # log2 transform specified features
     features_to_use <- character()
     if (log2) {
-      log_mat <- apply(
-        colData(spe)[, features],
-        MARGIN = 2,
-        FUN = log2
-      )
-      colnames(log_mat) <- paste0(colnames(log_mat), "_log2")
-      # for (feature in features) {
-      #     # feature_log2 <- 
-      #     colData(spe)[paste0(feature, "_log2")] <- log2(colData(spe)[[feature]])
-      #     # features_to_use <- c(features_to_use, feature_log2)
-      # }
-      # features_to_use <- paste0(features, "_log2")
-
-      # Merge to spe
-      colData(spe)[colnames(log_mat)] <- log_mat
-      features_to_use <- colnames(log_mat)
-
+        for (feature in features) {
+            feature_log2 <- paste0(feature, "_log2")
+            colData(spe)[feature_log2] <- log2(colData(spe)[[feature]])
+            features_to_use <- c(features_to_use, feature_log2)
+        }
     } else {
         features_to_use <- features
     }
 
-    if (!is.null(name)) {
-      stopifnot(
-        "name has different length as feature" = 
-          length(names) == length(features_to_use))
-    } 
-
     # Get a list of unique sample IDs
     unique_sample_ids <- unique(colData(spe)[[samples]])
 
-    # Initialize a named list equal the length of unique_sample_ids
-    # spaQC_list <- vector("list", length(unique_sample_ids))
-    spaQC_list <- sapply(unique_sample_ids, FUN = function(x) NULL)
+    # Initialize list to store each spaQC dataframe
+    spaQC_list <- vector("list", length(unique_sample_ids))
 
     # Loop through each unique sample ID
-    # NOTE: use for loop to avoid copying any large spe object.
-    # for (sample_id in seq_along(unique_sample_ids)) {
-    for (.sample_id in unique_sample_ids) {
+    for (sample_id in seq_along(unique_sample_ids)) {
         # Subset the data for the current sample
-        # sample <- unique_sample_ids[sample_id]
-        spe_subset <- subset(spe, , sample_id == .sample_id)
+        sample <- unique_sample_ids[sample_id]
+        spe_subset <- subset(spe, , sample_id == sample)
 
         # Create a list of spatial coordinates and qc features
         spaQC <- colData(spe_subset)
-        # NOTE: Doens't seem useful.
-        # spaQC$coords <- spatialCoords(spe_subset)
+        spaQC$coords <- spatialCoords(spe_subset)
 
         # Find nearest neighbors
         dnn <- BiocNeighbors::findKNN(spatialCoords(spe_subset),
             k = n_neighbors,
             warn.ties = FALSE
         )$index
+
         #  === Get local variance ===
         # Initialize a matrix to store variance for each feature
-        # var_matrix <- matrix(NA, nrow(spaQC), length(features_to_use))
-        # colnames(var_matrix) <- features_to_use
-        # 
-        # mean_matrix <- matrix(NA, nrow(spaQC), length(features_to_use))
-        # colnames(mean_matrix) <- features_to_use
-
-        # for (i in seq_len(nrow(dnn))) {
-        #   dnn.idx <- dnn[i, ]
-        #   for (j in seq_along(features_to_use)) {
-        dnn_aug <- cbind(dnn, seq_len(nrow(dnn)))
-        mean_var_mat <- apply(dnn_aug, 1, FUN = function(.idx, spaQC, features_to_use ){
-          neighborhood <- spaQC[.idx, features_to_use, drop=FALSE] |> as.matrix()
-          means_vec <- colMeans(neighborhood, na.rm = TRUE)
-          var_vec <- colVars(neighborhood, na.rm = TRUE)
-
-          names(means_vec) <- paste0(names(means_vec), "_mean")
-          names(var_vec) <- paste0(names(var_vec), "_var")
-
-          c(means_vec, var_vec)
-
-          # [[features_to_use[j]]]
-          # var_matrix[i, j] <- var(neighborhood, na.rm = TRUE)[1]
-          # mean_matrix[i, j] <- mean(neighborhood, na.rm = TRUE)[1]
-        # # Loop through each row in the nearest neighbor index matrix
-        # 
-        #         neighborhood <- spaQC[c(i, dnn.idx[dnn.idx != 0]), ][[features_to_use[j]]]
-        # 
-        #         var_matrix[i, j] <- var(neighborhood, na.rm = TRUE)[1]
-        #         mean_matrix[i, j] <- mean(neighborhood, na.rm = TRUE)[1]
-        #     }
-        # }
-        },
-        spaQC = spaQC,
-        features_to_use = features_to_use ) |>
-          t() # Transpose to spots-by-features
+        var_matrix <- matrix(NA, nrow(spaQC), length(features_to_use))
+        colnames(var_matrix) <- features_to_use
 
-        # Handle non-finite values
-        # var_matrix[!is.finite(var_matrix)] <- 0
-        # mean_matrix[!is.finite(mean_matrix)] <- 0
-        if(any(!is.finite(mean_var_mat))){
-          warning(
-            .sample_id,
-            " contains infinite value in local vars. Forcing to be 0s."
-            )
-          # NOTE to micheal: I'm not sure if this would cause problem....
-          mean_var_mat[!is.finite(mean_var_mat)] <- 0
+        mean_matrix <- matrix(NA, nrow(spaQC), length(features_to_use))
+        colnames(mean_matrix) <- features_to_use
+
+        # Loop through each row in the nearest neighbor index matrix
+        for (i in seq_len(nrow(dnn))) {
+            dnn.idx <- dnn[i, ]
+            for (j in seq_along(features_to_use)) {
+                neighborhood <- spaQC[c(i, dnn.idx[dnn.idx != 0]), ][[features_to_use[j]]]
+
+                var_matrix[i, j] <- var(neighborhood, na.rm = TRUE)[1]
+                mean_matrix[i, j] <- mean(neighborhood, na.rm = TRUE)[1]
+            }
         }
-
+
+        # Handle non-finite values
+        var_matrix[!is.finite(var_matrix)] <- 0
+        mean_matrix[!is.finite(mean_matrix)] <- 0
+
         # == Regress out mean-variance bias ==
-        # for (feature_idx in seq_along(features_to_use)) {
-        var_matrix <- sapply(
-          features_to_use,
-          FUN = function(feature, mean_var_mat){
-
+        for (feature_idx in seq_along(features_to_use)) {
             # Prepare data.frame for current feature
-            feat_var <- log2(mean_var_mat[, paste0(feature, "_var")])
-            feat_mean <- log2(mean_var_mat[, paste0(feature, "_mean")])
-            # Prepare data.frame for current feature
-            # mito_var_df <- data.frame(
-            #     mito_var = log2(var_matrix[, feature_idx]),
-            #     mito_mean = log2(mean_matrix[, feature_idx])
-            # )
+            mito_var_df <- data.frame(
+                mito_var = log2(var_matrix[, feature_idx]),
+                mito_mean = log2(mean_matrix[, feature_idx])
+            )
 
             # Perform robust linear regression (IRLS) of variance vs mean
-            # fit.irls <- MASS::rlm(mito_var ~ mito_mean, data = mito_var_df)
-            fit.irls <- MASS::rlm(feat_var ~ feat_mean)
+            fit.irls <- MASS::rlm(mito_var ~ mito_mean, data = mito_var_df)
 
             # Get residuals and update the variance matrix
             resid.irls <- resid(fit.irls)
 
             # Replace original variance values with residuals
-            resid.irls
-        },
-        mean_var_mat = mean_var_mat
-      )
+            var_matrix[, feature_idx] <- resid.irls
+        }
 
         # add local variance to spaQC dataframe
-        # if (!is.null(name)) {
-        #   stopifnot(
-        #     "name has different length as feature" = 
-        #               length(names) == length(features_to_use))
-        #     spaQC[name] <- var_matrix[, j]
-        # } else {
-        #     feature_var <- paste0(features[j], "_var")
-        #     spaQC[feature_var] <- var_matrix[, j]
-        # }
-
         if (!is.null(name)) {
-          stopifnot(
-            "name has different length as feature" =
-                      length(names) == length(features_to_use))
-          spaQC[name] <- var_matrix
+            spaQC[name] <- var_matrix[, j]
         } else {
-            feature_var <- paste0(features, "_var")
-            colnames(var_matrix) <- feature_var
-            spaQC[feature_var] <- var_matrix
+            feature_var <- paste0(features[j], "_var")
+            spaQC[feature_var] <- var_matrix[, j]
         }
 
         # Store the modified spaQC dataframe in the list
-        spaQC_list[[.sample_id]] <- spaQC
+        spaQC_list[[sample_id]] <- spaQC
     }
 
     # rbind the list of dataframes

diff --git a/man/localVariance.Rd b/man/localVariance.Rd