diff --git a/R/findArtifacts.R b/R/findArtifacts.R
index cb3ee65..ce52698 100644
--- a/R/findArtifacts.R
+++ b/R/findArtifacts.R
@@ -80,53 +80,38 @@ findArtifacts <- function(
         stop("'n_rings' must be a positive integer.")
     }
 
-
-    # log transform specified features
-    features <- c(mito_percent, mito_sum)
-    features_to_use <- character()
-    if (log) {
-        for (feature in features) {
-            feature_log <- paste0(feature, "_log")
-            colData(spe)[feature_log] <- log1p(colData(spe)[[feature]])
-            features_to_use <- c(features_to_use, feature_log)
-        }
-    } else {
-        features_to_use <- features
-    }
-
     # get unique sample IDs
-    sample_ids <- unique(colData(spe)[[samples]])
+    unique_sample_ids <- unique(colData(spe)[[samples]])
 
     # Initialize a list to store spe for each sample
-    colData_list <- list()
+    columnData_list <- sapply(unique_sample_ids, FUN = function(x) NULL)
 
-    for (sample in sample_ids) {
+    for (sample in unique_sample_ids) {
         # subset by sample
         spe.temp <- spe[, colData(spe)[[samples]] == sample]
 
         # ======= Calculate local mito variance ========
-        var_matrix <- c()
-        for (i in seq_len(n_rings)) {
-            # ==== local mito variance ==== get n neighbors for i rings
-            n_neighbors <- 3 * i * (i + 1)
-            tmp.name <- paste0("k", n_neighbors)
-
-            # get local variance of mito ratio
-            spe.temp <- localVariance(spe.temp,
-                features = mito_percent,
-                n_neighbors = n_neighbors,
-                name = tmp.name
-            )
-
-            # add to var_matrix
-            var_matrix <- cbind(var_matrix, colData(spe.temp)[[tmp.name]])
-        }
-
-
-        # ========== PCA and clustering ========== add mito and mito_percent to
-        # var dataframe
+        # Use sapply to iterate over each ring
+        var_matrix <- sapply(seq_len(n_rings), function(i) {
+            # Calculate n_neighbors for the current ring
+            n_neighbors <- 3 * i * (i + 1)
+            tmp.name <- paste0("k", n_neighbors)
+
+            # Apply local variance calculation
+            spe.temp <<- localVariance(spe.temp,
+                metric = mito_percent,
+                n_neighbors = n_neighbors,
+                name = tmp.name)
+
+            # Extract and return the column corresponding to tmp.name from colData
+            colData(spe.temp)[[tmp.name]]
+        })
+
+
+        # ========== PCA and clustering ==========
         var_matrix <- cbind(
-            var_matrix, colData(spe.temp)[[mito_percent]],
+            var_matrix,
+            colData(spe.temp)[[mito_percent]],
             colData(spe.temp)[[mito_sum]]
         )
 
@@ -134,6 +119,7 @@ findArtifacts <- function(
 
         # Run PCA and add to reduced dims
         pc <- prcomp(var_df, center = TRUE, scale. = TRUE)
+        rownames(pc$x) <- colnames(spe.temp) # assign rownames to match colnames(spe.temp) and avoid a reducedDim() error
         reducedDim(spe.temp, "PCA_artifacts") <- pc$x
 
         # Cluster using kmeans and add to temp sce
@@ -143,13 +129,9 @@ findArtifacts <- function(
 
         # =========== Artifact annotation ===========
         # calculate average local variance of the two clusters
-        clus1_mean <- mean(colData(spe.temp)[[paste0(
-            "k",
-            18
+        clus1_mean <- mean(colData(spe.temp)[[paste0("k", 18
         )]][spe.temp$Kmeans == 1])
-        clus2_mean <- mean(colData(spe.temp)[[paste0(
-            "k",
-            18
+        clus2_mean <- mean(colData(spe.temp)[[paste0("k", 18
         )]][spe.temp$Kmeans == 2])
 
         artifact_clus <- which.min(c(clus1_mean, clus2_mean))
@@ -158,16 +140,18 @@ findArtifacts <- function(
         # 1 to 'TRUE'
         spe.temp$artifact <- FALSE
         spe.temp$artifact[spe.temp$Kmeans == artifact_clus] <- TRUE
+        spe.temp$Kmeans <- NULL
 
         # add to sample list
-        colData_list[[sample]] <- colData(spe.temp)
+        columnData_list[[sample]] <- colData(spe.temp)
     }
 
     # rbind the list of dataframes
-    colData_aggregated <- do.call(rbind, colData_list)
+    columnData_aggregated <- do.call(rbind, columnData_list)
 
     # replace SPE column data with aggregated data
-    colData(spe) <- colData_aggregated
+    colData(spe) <- columnData_aggregated
+    reducedDims(spe) <- reducedDims(spe.temp)
 
     # return sce
     return(spe)
diff --git a/R/localVariance.R b/R/localVariance.R
index c577977..fb6bccf 100644
--- a/R/localVariance.R
+++ b/R/localVariance.R
@@ -6,12 +6,12 @@
 #'   sample_id, sum_umi, sum_gene
 #' @param n_neighbors Number of nearest neighbors to use for variance
 #'   calculation
-#' @param features Features to use for variance calculation
+#' @param metric Metric to use for variance calculation
 #' @param samples Column in colData to use for sample ID
-#' @param log Whether to log1p transform the features
+#' @param log Whether to log1p transform the metric
 #' @param name Name of the new column to add to colData
 #'
-#' @return SpatialExperiment object with feature variance added to colData
+#' @return SpatialExperiment object with metric variance added to colData
 #'
 #' @importFrom SummarizedExperiment colData
 #' @importFrom BiocNeighbors findKNN
@@ -42,12 +42,12 @@
 #' # Identifying the mitochondrial transcripts in our SpatialExperiment.
 #' is.mito <- rownames(spe)[grepl("^MT-", rownames(spe))]
 #'
-#' # Calculating QC metrics for each spot using scuttle
+#' # Calculating QC metric for each spot using scuttle
 #' spe <- scuttle::addPerCellQCMetrics(spe, subsets = list(Mito = is.mito))
 #' colnames(colData(spe))
 #'
 #' spe <- localVariance(spe,
-#'     features = "subsets_Mito_percent",
+#'     metric = "subsets_Mito_percent",
 #'     n_neighbors = 36,
 #'     name = "local_mito_variance_k36"
 #' )
@@ -55,7 +55,7 @@
 #' plotQC(spe, metric="local_mito_variance_k36")
 #'
 localVariance <- function(spe, n_neighbors = 36,
-    features = c("expr_chrM_ratio"),
+    metric = c("expr_chrM_ratio"),
     samples = "sample_id", log = FALSE, name = NULL) {
     # ===== Validity checks =====
 
@@ -64,14 +64,14 @@ localVariance <- function(spe, n_neighbors = 36,
         stop("Input data must be a SpatialExperiment object.")
     }
 
-    # Validate 'features' is a character vector
-    if (!is.character(features)) {
-        stop("'features' must be a character vector.")
+    # Validate 'metric' is a character vector
+    if (!is.character(metric)) {
+        stop("'metric' must be a character vector.")
     }
 
-    # validate 'features' are valid colData
-    if (!all(features %in% colnames(colData(spe)))) {
-        stop("All features must be present in colData.")
+    # validate 'metric' is valid colData
+    if (!all(metric %in% colnames(colData(spe)))) {
+        stop("Metric must be present in colData.")
     }
 
     # validate samples is valid colData
@@ -87,17 +87,13 @@ localVariance <- function(spe, n_neighbors = 36,
     }
 
     # ===== start function =====
-    # log1p transform specified features
+    # log1p transform specified metric
     if (log) {
-        features_log <- lapply(features, function(feature) {
-            feature_log <- paste0(feature, "_log")
-            colData(spe)[[feature_log]] <- log1p(colData(spe)[[feature]])
-            return(feature_log) # Return the new feature name
-        })
-
-        features_to_use <- c(features, unlist(features_log))
+        metric_log <- paste0(metric, "_log")
+        colData(spe)[metric_log] <- log1p(colData(spe)[[metric]])
+        metric_to_use <- metric_log
     } else {
-        features_to_use <- features
+        metric_to_use <- metric
     }
 
     # Get a list of unique sample IDs
@@ -112,7 +108,7 @@ localVariance <- function(spe, n_neighbors = 36,
         sample <- unique_sample_ids[sample_id]
         spe_subset <- subset(spe, , sample_id == sample)
 
-        # Create a list of spatial coordinates and qc features
+        # Create a list of spatial coordinates and qc metric
        columnData <- colData(spe_subset)
         columnData$coords <- spatialCoords(spe_subset)
 
@@ -122,53 +118,33 @@ localVariance <- function(spe, n_neighbors = 36,
             warn.ties = FALSE
         )$index
 
-        # === Get local variance ===
-        # Initialize a matrix to store variance for each feature
-        var_matrix <- matrix(NA, nrow(columnData), length(features_to_use))
-        colnames(var_matrix) <- features_to_use
-
-        mean_matrix <- matrix(NA, nrow(columnData), length(features_to_use))
-        colnames(mean_matrix) <- features_to_use
+        # === Compute local variance ===
+        # get neighborhood metric
+        neighborhoods <- lapply(seq_len(nrow(dnn)), function(i) {
+            indices <- dnn[i, ]
+            indices <- indices[indices != 0]
+            indices <- c(i, indices)
 
-        # Loop through each row in the nearest neighbor index matrix
-        for (i in seq_len(nrow(dnn))) {
-            dnn.idx <- dnn[i, ]
-            for (j in seq_along(features_to_use)) {
-                neighborhood <- columnData[c(i, dnn.idx[dnn.idx != 0]), ][[features_to_use[j]]]
-
-                var_matrix[i, j] <- var(neighborhood, na.rm = TRUE)[1]
-                mean_matrix[i, j] <- mean(neighborhood, na.rm = TRUE)[1]
-            }
-        }
+            columnData[indices, metric_to_use]
+        })
 
-        # Handle non-finite values
-        var_matrix[!is.finite(var_matrix)] <- 0
-        mean_matrix[!is.finite(mean_matrix)] <- 0
+        # Compute variance and mean
+        stats_matrix <- t(sapply(neighborhoods, function(x) {
+            c(var = var(x, na.rm = TRUE), mean = mean(x, na.rm = TRUE))
+        }))
 
-        # == Regress out mean-variance bias ==
-        for (feature_idx in seq_along(features_to_use)) {
-            # Prepare data.frame for current feature
-            mito_var_df <- data.frame(
-                mito_var = log2(var_matrix[, feature_idx]),
-                mito_mean = log2(mean_matrix[, feature_idx])
-            )
+        # Perform robust linear regression to regress out mean-var bias
+        fit.irls <- MASS::rlm(log2(var) ~ mean,
+            data = as.data.frame(stats_matrix))
 
-            # Perform robust linear regression (IRLS) of variance vs mean
-            fit.irls <- MASS::rlm(mito_var ~ mito_mean, data = mito_var_df)
-
-            # Get residuals and update the variance matrix
-            resid.irls <- resid(fit.irls)
-
-            # Replace original variance values with residuals
-            var_matrix[, feature_idx] <- resid.irls
-        }
+        var_resid <- resid(fit.irls) # Get residuals
 
         # add local variance to columnData dataframe
         if (!is.null(name)) {
-            columnData[name] <- var_matrix[, j]
+            columnData[name] <- var_resid
        } else {
-            feature_var <- paste0(features[j], "_var")
-            columnData[feature_var] <- var_matrix[, j]
+            metric_var <- paste0(metric, "_var")
+            columnData[metric_var] <- var_resid
        }
 
         # Store the modified columnData dataframe in the list
diff --git a/tests/testthat/test-findArtifacts.R b/tests/testthat/test-findArtifacts.R
index 2e99708..b96fbd2 100644
--- a/tests/testthat/test-findArtifacts.R
+++ b/tests/testthat/test-findArtifacts.R
@@ -21,5 +21,5 @@ test_that("example object contains artifacts colDta", {
 })
 
 test_that("examples gives correct number of artifact spots", {
-    expect_equal(sum(spe$artifact), 349)
+    expect_equal(sum(spe$artifact), 355)
 })
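Usage sketch (not part of the patch): a minimal example of how the renamed API might be exercised after this change, assuming a SpatialExperiment `spe` with spot-level mitochondrial QC columns. The column names, `n_rings` value, and loading step are illustrative assumptions, not taken from the diff.

# Assumes localVariance() and findArtifacts() from this package are loaded,
# e.g. via devtools::load_all(), and `spe` is a SpatialExperiment with spatial coords.
is.mito <- rownames(spe)[grepl("^MT-", rownames(spe))]
spe <- scuttle::addPerCellQCMetrics(spe, subsets = list(Mito = is.mito))

# 'metric' replaces the former 'features' argument of localVariance()
spe <- localVariance(spe,
    metric = "subsets_Mito_percent",
    n_neighbors = 36,
    name = "local_mito_variance_k36"
)

# findArtifacts() now passes metric = mito_percent to localVariance() internally;
# n_rings = 5 is an illustrative choice, not a documented default
spe <- findArtifacts(spe,
    mito_percent = "subsets_Mito_percent",
    mito_sum = "subsets_Mito_sum",
    samples = "sample_id",
    n_rings = 5
)
sum(spe$artifact) # number of spots flagged as artifacts (compare the updated test expectation above)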