diff --git a/DESCRIPTION b/DESCRIPTION index 5efb572..45acbbb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,13 +2,9 @@ Package: SpotSweeper Title: Spatially-aware quality control for spatial transcriptomics Version: 0.99.1 Date: 2023-10-30 -Authors@R: c( +Authors@R: person("Michael", "Totty", ,"mictott@gmail.com", role = c("aut", "cre"), - comment = c(ORCID = "0000-0002-9292-8556")), - person("Boyi", "Guo", ,email = "boyi.guo.work@gmail.com", role = c("aut"), - comment = c(ORCID = "0000-0003-2950-2349")) -) - + comment = c(ORCID = "0000-0002-9292-8556")) Description: Spatially-aware quality control (QC) software for both spot-level and artifact-level QC in spot-based spatial transcripomics, such as 10x Visium. These methods calculate local (nearest-neighbors) mean and variance diff --git a/R/findArtifacts.R b/R/findArtifacts.R index 8e87309..1d0e57e 100644 --- a/R/findArtifacts.R +++ b/R/findArtifacts.R @@ -60,21 +60,11 @@ findArtifacts <- function( features <- c(mito_percent, mito_sum) features_to_use <- character() if (log2) { - # for (feature in features) { - # feature_log2 <- paste0(feature, "_log2") - # colData(spe)[feature_log2] <- log2(colData(spe)[[feature]]) - # features_to_use <- c(features_to_use, feature_log2) - # } - log_mat <- apply( - colData(spe)[, features], - MARGIN = 2, - FUN = log2 - ) - colnames(log_mat) <- paste0(colnames(log_mat), "_log2") - # Merge to spe - colData(spe)[colnames(log_mat)] <- log_mat - features_to_use <- colnames(log_mat) - + for (feature in features) { + feature_log2 <- paste0(feature, "_log2") + colData(spe)[feature_log2] <- log2(colData(spe)[[feature]]) + features_to_use <- c(features_to_use, feature_log2) + } } else { features_to_use <- features } @@ -90,43 +80,23 @@ findArtifacts <- function( spe.temp <- spe[, colData(spe)[[samples]] == sample] # ======= Calculate local mito variance ======== - - var_mat <- sapply( - seq_len(n_rings), - FUN = function(i){ + var_matrix <- c() + for (i in seq_len(n_rings)) { + # ==== local mito variance ==== get n neighbors for i rings n_neighbors <- 3 * i * (i + 1) tmp.name <- paste0("k", n_neighbors) - - spe.temp <- localVariance(spe.temp, - features = mito_percent, - n_neighbors = n_neighbors, - name = tmp.name - ) - - colData(spe.temp)[[tmp.name]] - # # add to var_matrix - # var_matrix <- cbind(var_matrix, colData(spe.temp)[[tmp.name]]) - } - ) - - # var_matrix <- c() - # for (i in seq_len(n_rings)) { - # # ==== local mito variance ==== get n neighbors for i rings - # n_neighbors <- 3 * i * (i + 1) - # tmp.name <- paste0("k", n_neighbors) - # - # # get local variance of mito ratio - # spe.temp <- localVariance(spe.temp, - # features = mito_percent, - # n_neighbors = n_neighbors, - # name = tmp.name - # ) - # - # # add to var_matrix - # var_matrix <- cbind(var_matrix, colData(spe.temp)[[tmp.name]]) - # } - - # browser() + + # get local variance of mito ratio + spe.temp <- localVariance(spe.temp, + features = mito_percent, + n_neighbors = n_neighbors, + name = tmp.name + ) + + # add to var_matrix + var_matrix <- cbind(var_matrix, colData(spe.temp)[[tmp.name]]) + } + # ========== PCA and clustering ========== add mito and mito_percent to # var dataframe diff --git a/R/localOutliers.R b/R/localOutliers.R index 005df67..611a639 100644 --- a/R/localOutliers.R +++ b/R/localOutliers.R @@ -84,9 +84,8 @@ localOutliers <- function( # Get a list of unique sample IDs unique_sample_ids <- unique(colData(spe)[[samples]]) - # Initialize named list to store each spaQC dataframe - # spaQC_list <- vector("list", length(unique_sample_ids)) - spaQC_list <- sapply(unique_sample_ids, FUN = function(x) NULL) + # Initialize list to store each spaQC dataframe + spaQC_list <- vector("list", length(unique_sample_ids)) # Loop through each unique sample ID for (sample in unique_sample_ids) { @@ -102,46 +101,20 @@ localOutliers <- function( k = n_neighbors, warn.ties = FALSE )$index - dnn_aug <- cbind( - seq_len(nrow(dnn)), # Index of the center spot - dnn - ) - mod_z_matrix <- apply( - dnn_aug, MARGIN = 1, - FUN = function(.idx, spaQC, metric_to_use ){ - # browser() - .idx <- .idx[.idx!=0] - neighborhood <- spaQC[.idx, metric_to_use] |> as.matrix() - return( - spatialEco::outliers(neighborhood)[1] # The spot among neighbors - ) - }, - spaQC = spaQC, - metric_to_use = metric_to_use - ) - - # browser() # Initialize a matrix to store z-scores - # mod_z_matrix <- array(NA, nrow(dnn)) + mod_z_matrix <- array(NA, nrow(dnn)) # Loop through each row in the nearest neighbor index matrix - # for (i in seq_len(nrow(dnn))) { - # dnn.idx <- dnn[i, ] - # neighborhood <- spaQC[c(i, dnn.idx[dnn.idx != 0]), ][[metric_to_use]] - # - # # modified z-score - # mod_z_matrix[i] <- spatialEco::outliers(neighborhood)[1] - # } + for (i in seq_len(nrow(dnn))) { + dnn.idx <- dnn[i, ] + neighborhood <- spaQC[c(i, dnn.idx[dnn.idx != 0]), ][[metric_to_use]] - # Handle non-finite values - if(any(!is.finite(mod_z_matrix))){ - warning( - sample_id, - " contains infinite value mod_z_matrix. Forcing to be 0s." - ) - mod_z_matrix[!is.finite(mod_z_matrix)] <- 0 + # modified z-score + mod_z_matrix[i] <- spatialEco::outliers(neighborhood)[1] } + # Handle non-finite values + mod_z_matrix[!is.finite(mod_z_matrix)] <- 0 # find outliers based on cutoff, store in colData metric_outliers <- paste0(metric, "_outliers") diff --git a/R/localVariance.R b/R/localVariance.R index 3e0bb41..c69edac 100644 --- a/R/localVariance.R +++ b/R/localVariance.R @@ -50,11 +50,6 @@ #' n_neighbors = 36, #' name = "local_mito_variance_k36" #' ) -#' -#' spe2 <- localVariance(spe, -#' features = c("subsets_Mito_percent", "total"), -#' n_neighbors = 36 -#' ) #' #' plotOutliers(spe, metric="local_mito_variance_k36") #' @@ -65,160 +60,88 @@ localVariance <- function(spe, n_neighbors = 36, # log2 transform specified features features_to_use <- character() if (log2) { - log_mat <- apply( - colData(spe)[, features], - MARGIN = 2, - FUN = log2 - ) - colnames(log_mat) <- paste0(colnames(log_mat), "_log2") - # for (feature in features) { - # # feature_log2 <- - # colData(spe)[paste0(feature, "_log2")] <- log2(colData(spe)[[feature]]) - # # features_to_use <- c(features_to_use, feature_log2) - # } - # features_to_use <- paste0(features, "_log2") - - # Merge to spe - colData(spe)[colnames(log_mat)] <- log_mat - features_to_use <- colnames(log_mat) - + for (feature in features) { + feature_log2 <- paste0(feature, "_log2") + colData(spe)[feature_log2] <- log2(colData(spe)[[feature]]) + features_to_use <- c(features_to_use, feature_log2) + } } else { features_to_use <- features } - if (!is.null(name)) { - stopifnot( - "name has different length as feature" = - length(names) == length(features_to_use)) - } - # Get a list of unique sample IDs unique_sample_ids <- unique(colData(spe)[[samples]]) - # Initialize a named list equal the length of unique_sample_ids - # spaQC_list <- vector("list", length(unique_sample_ids)) - spaQC_list <- sapply(unique_sample_ids, FUN = function(x) NULL) + # Initialize list to store each spaQC dataframe + spaQC_list <- vector("list", length(unique_sample_ids)) # Loop through each unique sample ID - # NOTE: use for loop to avoid copying any large spe object. - # for (sample_id in seq_along(unique_sample_ids)) { - for (.sample_id in unique_sample_ids) { + for (sample_id in seq_along(unique_sample_ids)) { # Subset the data for the current sample - # sample <- unique_sample_ids[sample_id] - spe_subset <- subset(spe, , sample_id == .sample_id) + sample <- unique_sample_ids[sample_id] + spe_subset <- subset(spe, , sample_id == sample) # Create a list of spatial coordinates and qc features spaQC <- colData(spe_subset) - # NOTE: Doens't seem useful. - # spaQC$coords <- spatialCoords(spe_subset) + spaQC$coords <- spatialCoords(spe_subset) # Find nearest neighbors dnn <- BiocNeighbors::findKNN(spatialCoords(spe_subset), k = n_neighbors, warn.ties = FALSE )$index + # === Get local variance === # Initialize a matrix to store variance for each feature - # var_matrix <- matrix(NA, nrow(spaQC), length(features_to_use)) - # colnames(var_matrix) <- features_to_use - # - # mean_matrix <- matrix(NA, nrow(spaQC), length(features_to_use)) - # colnames(mean_matrix) <- features_to_use - - # for (i in seq_len(nrow(dnn))) { - # dnn.idx <- dnn[i, ] - # for (j in seq_along(features_to_use)) { - dnn_aug <- cbind(dnn, seq_len(nrow(dnn))) - mean_var_mat <- apply(dnn_aug, 1, FUN = function(.idx, spaQC, features_to_use ){ - neighborhood <- spaQC[.idx, features_to_use, drop=FALSE] |> as.matrix() - means_vec <- colMeans(neighborhood, na.rm = TRUE) - var_vec <- colVars(neighborhood, na.rm = TRUE) - - names(means_vec) <- paste0(names(means_vec), "_mean") - names(var_vec) <- paste0(names(var_vec), "_var") - - c(means_vec, var_vec) - - # [[features_to_use[j]]] - # var_matrix[i, j] <- var(neighborhood, na.rm = TRUE)[1] - # mean_matrix[i, j] <- mean(neighborhood, na.rm = TRUE)[1] - # # Loop through each row in the nearest neighbor index matrix - # - # neighborhood <- spaQC[c(i, dnn.idx[dnn.idx != 0]), ][[features_to_use[j]]] - # - # var_matrix[i, j] <- var(neighborhood, na.rm = TRUE)[1] - # mean_matrix[i, j] <- mean(neighborhood, na.rm = TRUE)[1] - # } - # } - }, - spaQC = spaQC, - features_to_use = features_to_use ) |> - t() # Transpose to spots-by-features + var_matrix <- matrix(NA, nrow(spaQC), length(features_to_use)) + colnames(var_matrix) <- features_to_use - # Handle non-finite values - # var_matrix[!is.finite(var_matrix)] <- 0 - # mean_matrix[!is.finite(mean_matrix)] <- 0 - if(any(!is.finite(mean_var_mat))){ - warning( - .sample_id, - " contains infinite value in local vars. Forcing to be 0s." - ) - # NOTE to micheal: I'm not sure if this would cause problem.... - mean_var_mat[!is.finite(mean_var_mat)] <- 0 + mean_matrix <- matrix(NA, nrow(spaQC), length(features_to_use)) + colnames(mean_matrix) <- features_to_use + + # Loop through each row in the nearest neighbor index matrix + for (i in seq_len(nrow(dnn))) { + dnn.idx <- dnn[i, ] + for (j in seq_along(features_to_use)) { + neighborhood <- spaQC[c(i, dnn.idx[dnn.idx != 0]), ][[features_to_use[j]]] + + var_matrix[i, j] <- var(neighborhood, na.rm = TRUE)[1] + mean_matrix[i, j] <- mean(neighborhood, na.rm = TRUE)[1] + } } - + + # Handle non-finite values + var_matrix[!is.finite(var_matrix)] <- 0 + mean_matrix[!is.finite(mean_matrix)] <- 0 + # == Regress out mean-variance bias == - # for (feature_idx in seq_along(features_to_use)) { - var_matrix <- sapply( - features_to_use, - FUN = function(feature, mean_var_mat){ - + for (feature_idx in seq_along(features_to_use)) { # Prepare data.frame for current feature - feat_var <- log2(mean_var_mat[, paste0(feature, "_var")]) - feat_mean <- log2(mean_var_mat[, paste0(feature, "_mean")]) - # Prepare data.frame for current feature - # mito_var_df <- data.frame( - # mito_var = log2(var_matrix[, feature_idx]), - # mito_mean = log2(mean_matrix[, feature_idx]) - # ) + mito_var_df <- data.frame( + mito_var = log2(var_matrix[, feature_idx]), + mito_mean = log2(mean_matrix[, feature_idx]) + ) # Perform robust linear regression (IRLS) of variance vs mean - # fit.irls <- MASS::rlm(mito_var ~ mito_mean, data = mito_var_df) - fit.irls <- MASS::rlm(feat_var ~ feat_mean) + fit.irls <- MASS::rlm(mito_var ~ mito_mean, data = mito_var_df) # Get residuals and update the variance matrix resid.irls <- resid(fit.irls) # Replace original variance values with residuals - resid.irls - }, - mean_var_mat = mean_var_mat - ) + var_matrix[, feature_idx] <- resid.irls + } # add local variance to spaQC dataframe - # if (!is.null(name)) { - # stopifnot( - # "name has different length as feature" = - # length(names) == length(features_to_use)) - # spaQC[name] <- var_matrix[, j] - # } else { - # feature_var <- paste0(features[j], "_var") - # spaQC[feature_var] <- var_matrix[, j] - # } - if (!is.null(name)) { - stopifnot( - "name has different length as feature" = - length(names) == length(features_to_use)) - spaQC[name] <- var_matrix + spaQC[name] <- var_matrix[, j] } else { - feature_var <- paste0(features, "_var") - colnames(var_matrix) <- feature_var - spaQC[feature_var] <- var_matrix + feature_var <- paste0(features[j], "_var") + spaQC[feature_var] <- var_matrix[, j] } # Store the modified spaQC dataframe in the list - spaQC_list[[.sample_id]] <- spaQC + spaQC_list[[sample_id]] <- spaQC } # rbind the list of dataframes diff --git a/man/localVariance.Rd b/man/localVariance.Rd index 3bf9a1e..bdca9c8 100644 --- a/man/localVariance.Rd +++ b/man/localVariance.Rd @@ -65,11 +65,6 @@ spe <- localVariance(spe, n_neighbors = 36, name = "local_mito_variance_k36" ) - -spe2 <- localVariance(spe, - features = c("subsets_Mito_percent", "total"), - n_neighbors = 36 - ) plotOutliers(spe, metric="local_mito_variance_k36")