From f286f7ee71feed43bc60487ff460059990ec2b98 Mon Sep 17 00:00:00 2001 From: Boyi Guo Date: Thu, 4 Apr 2024 15:40:30 -0400 Subject: [PATCH 1/4] rewrite for loop with apply (#12) --- R/localVariance.R | 163 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 120 insertions(+), 43 deletions(-) diff --git a/R/localVariance.R b/R/localVariance.R index eb87387..2b67b1a 100644 --- a/R/localVariance.R +++ b/R/localVariance.R @@ -51,6 +51,11 @@ #' n_neighbors = 36, #' name = "local_mito_variance_k36" #' ) +#' +#' spe2 <- localVariance(spe, +#' features = c("subsets_Mito_percent", "total"), +#' n_neighbors = 36 +#' ) #' #' plotOutliers(spe, metric="local_mito_variance_k36") #' @@ -61,88 +66,160 @@ localVariance <- function(spe, n_neighbors = 36, # log2 transform specified features features_to_use <- character() if (log2) { - for (feature in features) { - feature_log2 <- paste0(feature, "_log2") - colData(spe)[feature_log2] <- log2(colData(spe)[[feature]]) - features_to_use <- c(features_to_use, feature_log2) - } + log_mat <- apply( + colData(spe)[, features], + MARGIN = 2, + FUN = log2 + ) + colnames(log_mat) <- paste0(colnames(log_mat), "_log2") + # for (feature in features) { + # # feature_log2 <- + # colData(spe)[paste0(feature, "_log2")] <- log2(colData(spe)[[feature]]) + # # features_to_use <- c(features_to_use, feature_log2) + # } + # features_to_use <- paste0(features, "_log2") + + # Merge to spe + colData(spe)[colnames(log_mat)] <- log_mat + features_to_use <- colnames(log_mat) + } else { features_to_use <- features } + if (!is.null(name)) { + stopifnot( + "name has different length as feature" = + length(names) == length(features_to_use)) + } + # Get a list of unique sample IDs unique_sample_ids <- unique(colData(spe)[[samples]]) - # Initialize list to store each spaQC dataframe - spaQC_list <- vector("list", length(unique_sample_ids)) + # Initialize a named list equal the length of unique_sample_ids + # spaQC_list <- vector("list", length(unique_sample_ids)) + spaQC_list <- sapply(unique_sample_ids, FUN = function(x) NULL) # Loop through each unique sample ID - for (sample_id in seq_along(unique_sample_ids)) { + # NOTE: use for loop to avoid copying any large spe object. + # for (sample_id in seq_along(unique_sample_ids)) { + for (.sample_id in unique_sample_ids) { # Subset the data for the current sample - sample <- unique_sample_ids[sample_id] - spe_subset <- subset(spe, , sample_id == sample) + # sample <- unique_sample_ids[sample_id] + spe_subset <- subset(spe, , sample_id == .sample_id) # Create a list of spatial coordinates and qc features spaQC <- colData(spe_subset) - spaQC$coords <- spatialCoords(spe_subset) + # NOTE: Doens't seem useful. + # spaQC$coords <- spatialCoords(spe_subset) # Find nearest neighbors dnn <- BiocNeighbors::findKNN(spatialCoords(spe_subset), k = n_neighbors, warn.ties = FALSE )$index - # === Get local variance === # Initialize a matrix to store variance for each feature - var_matrix <- matrix(NA, nrow(spaQC), length(features_to_use)) - colnames(var_matrix) <- features_to_use - - mean_matrix <- matrix(NA, nrow(spaQC), length(features_to_use)) - colnames(mean_matrix) <- features_to_use - - # Loop through each row in the nearest neighbor index matrix - for (i in seq_len(nrow(dnn))) { - dnn.idx <- dnn[i, ] - for (j in seq_along(features_to_use)) { - neighborhood <- spaQC[c(i, dnn.idx[dnn.idx != 0]), ][[features_to_use[j]]] - - var_matrix[i, j] <- var(neighborhood, na.rm = TRUE)[1] - mean_matrix[i, j] <- mean(neighborhood, na.rm = TRUE)[1] - } - } + # var_matrix <- matrix(NA, nrow(spaQC), length(features_to_use)) + # colnames(var_matrix) <- features_to_use + # + # mean_matrix <- matrix(NA, nrow(spaQC), length(features_to_use)) + # colnames(mean_matrix) <- features_to_use + + # for (i in seq_len(nrow(dnn))) { + # dnn.idx <- dnn[i, ] + # for (j in seq_along(features_to_use)) { + dnn_aug <- cbind(dnn, seq_len(nrow(dnn))) + mean_var_mat <- apply(dnn_aug, 1, FUN = function(.idx, spaQC, features_to_use ){ + neighborhood <- spaQC[.idx, features_to_use, drop=FALSE] |> as.matrix() + means_vec <- colMeans(neighborhood, na.rm = TRUE) + var_vec <- colVars(neighborhood, na.rm = TRUE) + + names(means_vec) <- paste0(names(means_vec), "_mean") + names(var_vec) <- paste0(names(var_vec), "_var") + + c(means_vec, var_vec) + + # [[features_to_use[j]]] + # var_matrix[i, j] <- var(neighborhood, na.rm = TRUE)[1] + # mean_matrix[i, j] <- mean(neighborhood, na.rm = TRUE)[1] + # # Loop through each row in the nearest neighbor index matrix + # + # neighborhood <- spaQC[c(i, dnn.idx[dnn.idx != 0]), ][[features_to_use[j]]] + # + # var_matrix[i, j] <- var(neighborhood, na.rm = TRUE)[1] + # mean_matrix[i, j] <- mean(neighborhood, na.rm = TRUE)[1] + # } + # } + }, + spaQC = spaQC, + features_to_use = features_to_use ) |> + t() # Transpose to spots-by-features # Handle non-finite values - var_matrix[!is.finite(var_matrix)] <- 0 - mean_matrix[!is.finite(mean_matrix)] <- 0 - + # var_matrix[!is.finite(var_matrix)] <- 0 + # mean_matrix[!is.finite(mean_matrix)] <- 0 + if(any(!is.finite(mean_var_mat))){ + warning( + .sample_id, + " contains infinite value in local vars. Forcing to be 0s." + ) + # NOTE to micheal: I'm not sure if this would cause problem.... + mean_var_mat[!is.finite(mean_var_mat)] <- 0 + } + # == Regress out mean-variance bias == - for (feature_idx in seq_along(features_to_use)) { + # for (feature_idx in seq_along(features_to_use)) { + var_matrix <- sapply( + features_to_use, + FUN = function(feature, mean_var_mat){ + # Prepare data.frame for current feature - mito_var_df <- data.frame( - mito_var = log2(var_matrix[, feature_idx]), - mito_mean = log2(mean_matrix[, feature_idx]) - ) + feat_var <- log2(mean_var_mat[, paste0(feature, "_var")]) + feat_mean <- log2(mean_var_mat[, paste0(feature, "_mean")]) + # Prepare data.frame for current feature + # mito_var_df <- data.frame( + # mito_var = log2(var_matrix[, feature_idx]), + # mito_mean = log2(mean_matrix[, feature_idx]) + # ) # Perform robust linear regression (IRLS) of variance vs mean - fit.irls <- MASS::rlm(mito_var ~ mito_mean, data = mito_var_df) + # fit.irls <- MASS::rlm(mito_var ~ mito_mean, data = mito_var_df) + fit.irls <- MASS::rlm(feat_var ~ feat_mean) # Get residuals and update the variance matrix resid.irls <- resid(fit.irls) # Replace original variance values with residuals - var_matrix[, feature_idx] <- resid.irls - } + resid.irls + }, + mean_var_mat = mean_var_mat + ) # add local variance to spaQC dataframe + # if (!is.null(name)) { + # stopifnot( + # "name has different length as feature" = + # length(names) == length(features_to_use)) + # spaQC[name] <- var_matrix[, j] + # } else { + # feature_var <- paste0(features[j], "_var") + # spaQC[feature_var] <- var_matrix[, j] + # } + if (!is.null(name)) { - spaQC[name] <- var_matrix[, j] + stopifnot( + "name has different length as feature" = + length(names) == length(features_to_use)) + spaQC[name] <- var_matrix } else { - feature_var <- paste0(features[j], "_var") - spaQC[feature_var] <- var_matrix[, j] + feature_var <- paste0(features, "_var") + colnames(var_matrix) <- feature_var + spaQC[feature_var] <- var_matrix } # Store the modified spaQC dataframe in the list - spaQC_list[[sample_id]] <- spaQC + spaQC_list[[.sample_id]] <- spaQC } # rbind the list of dataframes From 00c107754c5203d5979f07c4aaae822a5f45baf4 Mon Sep 17 00:00:00 2001 From: Boyi Guo Date: Thu, 4 Apr 2024 16:19:14 -0400 Subject: [PATCH 2/4] turn 1 for loop to apply, kept the other regarding memory concern (#12) --- R/localOutliers.R | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/R/localOutliers.R b/R/localOutliers.R index 611a639..005df67 100644 --- a/R/localOutliers.R +++ b/R/localOutliers.R @@ -84,8 +84,9 @@ localOutliers <- function( # Get a list of unique sample IDs unique_sample_ids <- unique(colData(spe)[[samples]]) - # Initialize list to store each spaQC dataframe - spaQC_list <- vector("list", length(unique_sample_ids)) + # Initialize named list to store each spaQC dataframe + # spaQC_list <- vector("list", length(unique_sample_ids)) + spaQC_list <- sapply(unique_sample_ids, FUN = function(x) NULL) # Loop through each unique sample ID for (sample in unique_sample_ids) { @@ -101,20 +102,46 @@ localOutliers <- function( k = n_neighbors, warn.ties = FALSE )$index + dnn_aug <- cbind( + seq_len(nrow(dnn)), # Index of the center spot + dnn + ) + mod_z_matrix <- apply( + dnn_aug, MARGIN = 1, + FUN = function(.idx, spaQC, metric_to_use ){ + # browser() + .idx <- .idx[.idx!=0] + neighborhood <- spaQC[.idx, metric_to_use] |> as.matrix() + return( + spatialEco::outliers(neighborhood)[1] # The spot among neighbors + ) + }, + spaQC = spaQC, + metric_to_use = metric_to_use + ) + + # browser() # Initialize a matrix to store z-scores - mod_z_matrix <- array(NA, nrow(dnn)) + # mod_z_matrix <- array(NA, nrow(dnn)) # Loop through each row in the nearest neighbor index matrix - for (i in seq_len(nrow(dnn))) { - dnn.idx <- dnn[i, ] - neighborhood <- spaQC[c(i, dnn.idx[dnn.idx != 0]), ][[metric_to_use]] + # for (i in seq_len(nrow(dnn))) { + # dnn.idx <- dnn[i, ] + # neighborhood <- spaQC[c(i, dnn.idx[dnn.idx != 0]), ][[metric_to_use]] + # + # # modified z-score + # mod_z_matrix[i] <- spatialEco::outliers(neighborhood)[1] + # } - # modified z-score - mod_z_matrix[i] <- spatialEco::outliers(neighborhood)[1] + # Handle non-finite values + if(any(!is.finite(mod_z_matrix))){ + warning( + sample_id, + " contains infinite value mod_z_matrix. Forcing to be 0s." + ) + mod_z_matrix[!is.finite(mod_z_matrix)] <- 0 } - # Handle non-finite values - mod_z_matrix[!is.finite(mod_z_matrix)] <- 0 # find outliers based on cutoff, store in colData metric_outliers <- paste0(metric, "_outliers") From 50a367b75731244b1e3db78377136ab2be5af51c Mon Sep 17 00:00:00 2001 From: Boyi Guo Date: Thu, 4 Apr 2024 16:46:09 -0400 Subject: [PATCH 3/4] Fix for loops. (#12) --- R/findArtifacts.R | 70 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/R/findArtifacts.R b/R/findArtifacts.R index 0d4ef24..b250fb4 100644 --- a/R/findArtifacts.R +++ b/R/findArtifacts.R @@ -60,11 +60,21 @@ findArtifacts <- function( features <- c(mito_percent, mito_sum) features_to_use <- character() if (log2) { - for (feature in features) { - feature_log2 <- paste0(feature, "_log2") - colData(spe)[feature_log2] <- log2(colData(spe)[[feature]]) - features_to_use <- c(features_to_use, feature_log2) - } + # for (feature in features) { + # feature_log2 <- paste0(feature, "_log2") + # colData(spe)[feature_log2] <- log2(colData(spe)[[feature]]) + # features_to_use <- c(features_to_use, feature_log2) + # } + log_mat <- apply( + colData(spe)[, features], + MARGIN = 2, + FUN = log2 + ) + colnames(log_mat) <- paste0(colnames(log_mat), "_log2") + # Merge to spe + colData(spe)[colnames(log_mat)] <- log_mat + features_to_use <- colnames(log_mat) + } else { features_to_use <- features } @@ -80,23 +90,43 @@ findArtifacts <- function( spe.temp <- spe[, colData(spe)[[samples]] == sample] # ======= Calculate local mito variance ======== - var_matrix <- c() - for (i in seq_len(n_rings)) { - # ==== local mito variance ==== get n neighbors for i rings + + var_mat <- sapply( + seq_len(n_rings), + FUN = function(i){ n_neighbors <- 3 * i * (i + 1) tmp.name <- paste0("k", n_neighbors) - - # get local variance of mito ratio - spe.temp <- localVariance(spe.temp, - features = mito_percent, - n_neighbors = n_neighbors, - name = tmp.name - ) - - # add to var_matrix - var_matrix <- cbind(var_matrix, colData(spe.temp)[[tmp.name]]) - } - + + spe.temp <- localVariance(spe.temp, + features = mito_percent, + n_neighbors = n_neighbors, + name = tmp.name + ) + + colData(spe.temp)[[tmp.name]] + # # add to var_matrix + # var_matrix <- cbind(var_matrix, colData(spe.temp)[[tmp.name]]) + } + ) + + # var_matrix <- c() + # for (i in seq_len(n_rings)) { + # # ==== local mito variance ==== get n neighbors for i rings + # n_neighbors <- 3 * i * (i + 1) + # tmp.name <- paste0("k", n_neighbors) + # + # # get local variance of mito ratio + # spe.temp <- localVariance(spe.temp, + # features = mito_percent, + # n_neighbors = n_neighbors, + # name = tmp.name + # ) + # + # # add to var_matrix + # var_matrix <- cbind(var_matrix, colData(spe.temp)[[tmp.name]]) + # } + + # browser() # ========== PCA and clustering ========== add mito and mito_percent to # var dataframe From 8eef40599e9812878a60b7ce6802cda85e358b54 Mon Sep 17 00:00:00 2001 From: Boyi Guo Date: Thu, 4 Apr 2024 16:55:36 -0400 Subject: [PATCH 4/4] Add Boyi's name as author. --- DESCRIPTION | 8 ++++++-- man/localVariance.Rd | 5 +++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 45acbbb..5efb572 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,9 +2,13 @@ Package: SpotSweeper Title: Spatially-aware quality control for spatial transcriptomics Version: 0.99.1 Date: 2023-10-30 -Authors@R: +Authors@R: c( person("Michael", "Totty", ,"mictott@gmail.com", role = c("aut", "cre"), - comment = c(ORCID = "0000-0002-9292-8556")) + comment = c(ORCID = "0000-0002-9292-8556")), + person("Boyi", "Guo", ,email = "boyi.guo.work@gmail.com", role = c("aut"), + comment = c(ORCID = "0000-0003-2950-2349")) +) + Description: Spatially-aware quality control (QC) software for both spot-level and artifact-level QC in spot-based spatial transcripomics, such as 10x Visium. These methods calculate local (nearest-neighbors) mean and variance diff --git a/man/localVariance.Rd b/man/localVariance.Rd index dd8bab6..e1dcf9a 100644 --- a/man/localVariance.Rd +++ b/man/localVariance.Rd @@ -67,6 +67,11 @@ spe <- localVariance(spe, n_neighbors = 36, name = "local_mito_variance_k36" ) + +spe2 <- localVariance(spe, + features = c("subsets_Mito_percent", "total"), + n_neighbors = 36 + ) plotOutliers(spe, metric="local_mito_variance_k36")