From f286f7ee71feed43bc60487ff460059990ec2b98 Mon Sep 17 00:00:00 2001
From: Boyi Guo <boyi.guo.work@gmail.com>
Date: Thu, 4 Apr 2024 15:40:30 -0400
Subject: [PATCH 1/4] rewrite for loop with apply (#12)

---
 R/localVariance.R | 163 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 120 insertions(+), 43 deletions(-)

diff --git a/R/localVariance.R b/R/localVariance.R
index eb87387..2b67b1a 100644
--- a/R/localVariance.R
+++ b/R/localVariance.R
@@ -51,6 +51,11 @@
 #'     n_neighbors = 36,
 #'     name = "local_mito_variance_k36"
 #'     )
+#'     
+#' spe2 <- localVariance(spe,
+#'     features = c("subsets_Mito_percent", "total"),
+#'     n_neighbors = 36
+#'     )
 #'
 #' plotOutliers(spe, metric="local_mito_variance_k36")
 #'
@@ -61,88 +66,160 @@ localVariance <- function(spe, n_neighbors = 36,
     # log2 transform specified features
     features_to_use <- character()
     if (log2) {
-        for (feature in features) {
-            feature_log2 <- paste0(feature, "_log2")
-            colData(spe)[feature_log2] <- log2(colData(spe)[[feature]])
-            features_to_use <- c(features_to_use, feature_log2)
-        }
+      log_mat <- apply(
+        colData(spe)[, features],
+        MARGIN = 2,
+        FUN = log2
+      )
+      colnames(log_mat) <- paste0(colnames(log_mat), "_log2")
+      # for (feature in features) {
+      #     # feature_log2 <- 
+      #     colData(spe)[paste0(feature, "_log2")] <- log2(colData(spe)[[feature]])
+      #     # features_to_use <- c(features_to_use, feature_log2)
+      # }
+      # features_to_use <- paste0(features, "_log2")
+      
+      # Merge to spe
+      colData(spe)[colnames(log_mat)] <- log_mat
+      features_to_use <- colnames(log_mat)
+      
     } else {
         features_to_use <- features
     }
 
+    if (!is.null(name)) {
+      stopifnot(
+        "name has different length as feature" = 
+          length(names) == length(features_to_use))
+    } 
+    
     # Get a list of unique sample IDs
     unique_sample_ids <- unique(colData(spe)[[samples]])
 
-    # Initialize list to store each spaQC dataframe
-    spaQC_list <- vector("list", length(unique_sample_ids))
+    # Initialize a named list equal the length of unique_sample_ids
+    # spaQC_list <- vector("list", length(unique_sample_ids))
+    spaQC_list <- sapply(unique_sample_ids, FUN = function(x) NULL)
 
     # Loop through each unique sample ID
-    for (sample_id in seq_along(unique_sample_ids)) {
+    # NOTE: use for loop to avoid copying any large spe object.
+    # for (sample_id in seq_along(unique_sample_ids)) {
+    for (.sample_id in unique_sample_ids) {
         # Subset the data for the current sample
-        sample <- unique_sample_ids[sample_id]
-        spe_subset <- subset(spe, , sample_id == sample)
+        # sample <- unique_sample_ids[sample_id]
+        spe_subset <- subset(spe, , sample_id == .sample_id)
 
         # Create a list of spatial coordinates and qc features
         spaQC <- colData(spe_subset)
-        spaQC$coords <- spatialCoords(spe_subset)
+        # NOTE: Doens't seem useful.
+        # spaQC$coords <- spatialCoords(spe_subset)
 
         # Find nearest neighbors
         dnn <- BiocNeighbors::findKNN(spatialCoords(spe_subset),
             k = n_neighbors,
             warn.ties = FALSE
         )$index
-
         #  === Get local variance ===
         # Initialize a matrix to store variance for each feature
-        var_matrix <- matrix(NA, nrow(spaQC), length(features_to_use))
-        colnames(var_matrix) <- features_to_use
-
-        mean_matrix <- matrix(NA, nrow(spaQC), length(features_to_use))
-        colnames(mean_matrix) <- features_to_use
-
-        # Loop through each row in the nearest neighbor index matrix
-        for (i in seq_len(nrow(dnn))) {
-            dnn.idx <- dnn[i, ]
-            for (j in seq_along(features_to_use)) {
-                neighborhood <- spaQC[c(i, dnn.idx[dnn.idx != 0]), ][[features_to_use[j]]]
-
-                var_matrix[i, j] <- var(neighborhood, na.rm = TRUE)[1]
-                mean_matrix[i, j] <- mean(neighborhood, na.rm = TRUE)[1]
-            }
-        }
+        # var_matrix <- matrix(NA, nrow(spaQC), length(features_to_use))
+        # colnames(var_matrix) <- features_to_use
+        # 
+        # mean_matrix <- matrix(NA, nrow(spaQC), length(features_to_use))
+        # colnames(mean_matrix) <- features_to_use
+
+        # for (i in seq_len(nrow(dnn))) {
+        #   dnn.idx <- dnn[i, ]
+        #   for (j in seq_along(features_to_use)) {
+        dnn_aug <- cbind(dnn, seq_len(nrow(dnn)))
+        mean_var_mat <- apply(dnn_aug, 1, FUN = function(.idx, spaQC, features_to_use ){
+          neighborhood <- spaQC[.idx, features_to_use, drop=FALSE] |> as.matrix()
+          means_vec <- colMeans(neighborhood, na.rm = TRUE)
+          var_vec <- colVars(neighborhood, na.rm = TRUE)
+          
+          names(means_vec) <- paste0(names(means_vec), "_mean")
+          names(var_vec) <- paste0(names(var_vec), "_var")
+          
+          c(means_vec, var_vec)
+          
+          # [[features_to_use[j]]]
+          # var_matrix[i, j] <- var(neighborhood, na.rm = TRUE)[1]
+          # mean_matrix[i, j] <- mean(neighborhood, na.rm = TRUE)[1]
+        # # Loop through each row in the nearest neighbor index matrix
+        # 
+        #         neighborhood <- spaQC[c(i, dnn.idx[dnn.idx != 0]), ][[features_to_use[j]]]
+        # 
+        #         var_matrix[i, j] <- var(neighborhood, na.rm = TRUE)[1]
+        #         mean_matrix[i, j] <- mean(neighborhood, na.rm = TRUE)[1]
+        #     }
+        # }
+        },
+        spaQC = spaQC,
+        features_to_use = features_to_use ) |>
+          t() # Transpose to spots-by-features
 
         # Handle non-finite values
-        var_matrix[!is.finite(var_matrix)] <- 0
-        mean_matrix[!is.finite(mean_matrix)] <- 0
-
+        # var_matrix[!is.finite(var_matrix)] <- 0
+        # mean_matrix[!is.finite(mean_matrix)] <- 0
+        if(any(!is.finite(mean_var_mat))){
+          warning(
+            .sample_id,
+            " contains infinite value in local vars. Forcing to be 0s."
+            )
+          # NOTE to micheal: I'm not sure if this would cause problem....
+          mean_var_mat[!is.finite(mean_var_mat)] <- 0
+        }
+          
         # == Regress out mean-variance bias ==
-        for (feature_idx in seq_along(features_to_use)) {
+        # for (feature_idx in seq_along(features_to_use)) {
+        var_matrix <- sapply(
+          features_to_use,
+          FUN = function(feature, mean_var_mat){
+          
             # Prepare data.frame for current feature
-            mito_var_df <- data.frame(
-                mito_var = log2(var_matrix[, feature_idx]),
-                mito_mean = log2(mean_matrix[, feature_idx])
-            )
+            feat_var <- log2(mean_var_mat[, paste0(feature, "_var")])
+            feat_mean <- log2(mean_var_mat[, paste0(feature, "_mean")])
+            # Prepare data.frame for current feature
+            # mito_var_df <- data.frame(
+            #     mito_var = log2(var_matrix[, feature_idx]),
+            #     mito_mean = log2(mean_matrix[, feature_idx])
+            # )
 
             # Perform robust linear regression (IRLS) of variance vs mean
-            fit.irls <- MASS::rlm(mito_var ~ mito_mean, data = mito_var_df)
+            # fit.irls <- MASS::rlm(mito_var ~ mito_mean, data = mito_var_df)
+            fit.irls <- MASS::rlm(feat_var ~ feat_mean)
 
             # Get residuals and update the variance matrix
             resid.irls <- resid(fit.irls)
 
             # Replace original variance values with residuals
-            var_matrix[, feature_idx] <- resid.irls
-        }
+            resid.irls
+        },
+        mean_var_mat = mean_var_mat
+      )
 
         # add local variance to spaQC dataframe
+        # if (!is.null(name)) {
+        #   stopifnot(
+        #     "name has different length as feature" = 
+        #               length(names) == length(features_to_use))
+        #     spaQC[name] <- var_matrix[, j]
+        # } else {
+        #     feature_var <- paste0(features[j], "_var")
+        #     spaQC[feature_var] <- var_matrix[, j]
+        # }
+        
         if (!is.null(name)) {
-            spaQC[name] <- var_matrix[, j]
+          stopifnot(
+            "name has different length as feature" =
+                      length(names) == length(features_to_use))
+          spaQC[name] <- var_matrix
         } else {
-            feature_var <- paste0(features[j], "_var")
-            spaQC[feature_var] <- var_matrix[, j]
+            feature_var <- paste0(features, "_var")
+            colnames(var_matrix) <- feature_var
+            spaQC[feature_var] <- var_matrix
         }
 
         # Store the modified spaQC dataframe in the list
-        spaQC_list[[sample_id]] <- spaQC
+        spaQC_list[[.sample_id]] <- spaQC
     }
 
     # rbind the list of dataframes

From 00c107754c5203d5979f07c4aaae822a5f45baf4 Mon Sep 17 00:00:00 2001
From: Boyi Guo <boyi.guo.work@gmail.com>
Date: Thu, 4 Apr 2024 16:19:14 -0400
Subject: [PATCH 2/4] turn 1 for loop to apply, kept the other regarding memory
 concern (#12)

---
 R/localOutliers.R | 47 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 37 insertions(+), 10 deletions(-)

diff --git a/R/localOutliers.R b/R/localOutliers.R
index 611a639..005df67 100644
--- a/R/localOutliers.R
+++ b/R/localOutliers.R
@@ -84,8 +84,9 @@ localOutliers <- function(
     # Get a list of unique sample IDs
     unique_sample_ids <- unique(colData(spe)[[samples]])
 
-    # Initialize list to store each spaQC dataframe
-    spaQC_list <- vector("list", length(unique_sample_ids))
+    # Initialize named list to store each spaQC dataframe
+    # spaQC_list <- vector("list", length(unique_sample_ids))
+    spaQC_list <- sapply(unique_sample_ids, FUN = function(x) NULL)
 
     # Loop through each unique sample ID
     for (sample in unique_sample_ids) {
@@ -101,20 +102,46 @@ localOutliers <- function(
             k = n_neighbors, warn.ties = FALSE
         )$index
 
+        dnn_aug <- cbind(
+          seq_len(nrow(dnn)), # Index of the center spot
+          dnn
+          )
+        mod_z_matrix <- apply(
+          dnn_aug, MARGIN = 1,
+          FUN = function(.idx, spaQC, metric_to_use ){
+            # browser()
+            .idx <- .idx[.idx!=0]
+            neighborhood <- spaQC[.idx, metric_to_use] |> as.matrix()
+            return(
+              spatialEco::outliers(neighborhood)[1] # The spot among neighbors
+            )
+          },
+          spaQC = spaQC,
+          metric_to_use = metric_to_use
+        )
+        
+        # browser()
         # Initialize a matrix to store z-scores
-        mod_z_matrix <- array(NA, nrow(dnn))
+        # mod_z_matrix <- array(NA, nrow(dnn))
 
         # Loop through each row in the nearest neighbor index matrix
-        for (i in seq_len(nrow(dnn))) {
-            dnn.idx <- dnn[i, ]
-            neighborhood <- spaQC[c(i, dnn.idx[dnn.idx != 0]), ][[metric_to_use]]
+        # for (i in seq_len(nrow(dnn))) {
+        #     dnn.idx <- dnn[i, ]
+        #     neighborhood <- spaQC[c(i, dnn.idx[dnn.idx != 0]), ][[metric_to_use]]
+        # 
+        #     # modified z-score
+        #     mod_z_matrix[i] <- spatialEco::outliers(neighborhood)[1]
+        # }
 
-            # modified z-score
-            mod_z_matrix[i] <- spatialEco::outliers(neighborhood)[1]
+        # Handle non-finite values
+        if(any(!is.finite(mod_z_matrix))){
+          warning(
+            sample_id,
+            " contains infinite value mod_z_matrix. Forcing to be 0s."
+          )
+          mod_z_matrix[!is.finite(mod_z_matrix)] <- 0
         }
 
-        # Handle non-finite values
-        mod_z_matrix[!is.finite(mod_z_matrix)] <- 0
 
         # find outliers based on cutoff, store in colData
         metric_outliers <- paste0(metric, "_outliers")

From 50a367b75731244b1e3db78377136ab2be5af51c Mon Sep 17 00:00:00 2001
From: Boyi Guo <boyi.guo.work@gmail.com>
Date: Thu, 4 Apr 2024 16:46:09 -0400
Subject: [PATCH 3/4] Fix for loops. (#12)

---
 R/findArtifacts.R | 70 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 50 insertions(+), 20 deletions(-)

diff --git a/R/findArtifacts.R b/R/findArtifacts.R
index 0d4ef24..b250fb4 100644
--- a/R/findArtifacts.R
+++ b/R/findArtifacts.R
@@ -60,11 +60,21 @@ findArtifacts <- function(
     features <- c(mito_percent, mito_sum)
     features_to_use <- character()
     if (log2) {
-        for (feature in features) {
-            feature_log2 <- paste0(feature, "_log2")
-            colData(spe)[feature_log2] <- log2(colData(spe)[[feature]])
-            features_to_use <- c(features_to_use, feature_log2)
-        }
+        # for (feature in features) {
+        #     feature_log2 <- paste0(feature, "_log2")
+        #     colData(spe)[feature_log2] <- log2(colData(spe)[[feature]])
+        #     features_to_use <- c(features_to_use, feature_log2)
+        # }
+      log_mat <- apply(
+        colData(spe)[, features],
+        MARGIN = 2,
+        FUN = log2
+      )
+      colnames(log_mat) <- paste0(colnames(log_mat), "_log2")
+      # Merge to spe
+      colData(spe)[colnames(log_mat)] <- log_mat
+      features_to_use <- colnames(log_mat)
+      
     } else {
         features_to_use <- features
     }
@@ -80,23 +90,43 @@ findArtifacts <- function(
         spe.temp <- spe[, colData(spe)[[samples]] == sample]
 
         # ======= Calculate local mito variance ========
-        var_matrix <- c()
-        for (i in seq_len(n_rings)) {
-            # ==== local mito variance ==== get n neighbors for i rings
+        
+        var_mat <- sapply(
+          seq_len(n_rings),
+          FUN = function(i){
             n_neighbors <- 3 * i * (i + 1)
             tmp.name <- paste0("k", n_neighbors)
-
-            # get local variance of mito ratio
-            spe.temp <- localVariance(spe.temp,
-                features = mito_percent,
-                n_neighbors = n_neighbors,
-                name = tmp.name
-            )
-
-            # add to var_matrix
-            var_matrix <- cbind(var_matrix, colData(spe.temp)[[tmp.name]])
-        }
-
+            
+                spe.temp <- localVariance(spe.temp,
+                    features = mito_percent,
+                    n_neighbors = n_neighbors,
+                    name = tmp.name
+                )
+
+                colData(spe.temp)[[tmp.name]]
+                # # add to var_matrix
+                # var_matrix <- cbind(var_matrix, colData(spe.temp)[[tmp.name]])
+          }
+        )
+        
+        # var_matrix <- c()
+        # for (i in seq_len(n_rings)) {
+        #     # ==== local mito variance ==== get n neighbors for i rings
+        #     n_neighbors <- 3 * i * (i + 1)
+        #     tmp.name <- paste0("k", n_neighbors)
+        # 
+        #     # get local variance of mito ratio
+        #     spe.temp <- localVariance(spe.temp,
+        #         features = mito_percent,
+        #         n_neighbors = n_neighbors,
+        #         name = tmp.name
+        #     )
+        # 
+        #     # add to var_matrix
+        #     var_matrix <- cbind(var_matrix, colData(spe.temp)[[tmp.name]])
+        # }
+
+        # browser()
 
         # ========== PCA and clustering ========== add mito and mito_percent to
         # var dataframe

From 8eef40599e9812878a60b7ce6802cda85e358b54 Mon Sep 17 00:00:00 2001
From: Boyi Guo <boyi.guo.work@gmail.com>
Date: Thu, 4 Apr 2024 16:55:36 -0400
Subject: [PATCH 4/4] Add Boyi's name as author.

---
 DESCRIPTION          | 8 ++++++--
 man/localVariance.Rd | 5 +++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 45acbbb..5efb572 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -2,9 +2,13 @@ Package: SpotSweeper
 Title: Spatially-aware quality control for spatial transcriptomics
 Version: 0.99.1
 Date: 2023-10-30
-Authors@R: 
+Authors@R: c(
     person("Michael", "Totty", ,"mictott@gmail.com", role = c("aut", "cre"),
-           comment = c(ORCID = "0000-0002-9292-8556"))
+           comment = c(ORCID = "0000-0002-9292-8556")),
+    person("Boyi", "Guo", ,email = "boyi.guo.work@gmail.com", role = c("aut"),
+           comment = c(ORCID = "0000-0003-2950-2349"))
+)
+
 Description: Spatially-aware quality control (QC) software for both spot-level 
     and artifact-level QC in spot-based spatial transcripomics, such as 10x 
     Visium. These methods calculate local (nearest-neighbors) mean and variance 
diff --git a/man/localVariance.Rd b/man/localVariance.Rd
index dd8bab6..e1dcf9a 100644
--- a/man/localVariance.Rd
+++ b/man/localVariance.Rd
@@ -67,6 +67,11 @@ spe <- localVariance(spe,
     n_neighbors = 36,
     name = "local_mito_variance_k36"
     )
+    
+spe2 <- localVariance(spe,
+    features = c("subsets_Mito_percent", "total"),
+    n_neighbors = 36
+    )
 
 plotOutliers(spe, metric="local_mito_variance_k36")