Skip to content

Commit

Permalink
replaced variance for loop with apply in findArtifacts. Removed featu…
Browse files Browse the repository at this point in the history
…re for loops.
  • Loading branch information
MicTott committed Apr 10, 2024
1 parent fe8a56e commit 900c30e
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 109 deletions.
78 changes: 31 additions & 47 deletions R/findArtifacts.R
Original file line number Diff line number Diff line change
Expand Up @@ -80,60 +80,46 @@ findArtifacts <- function(
stop("'n_rings' must be a positive integer.")
}


# log transform specified features
features <- c(mito_percent, mito_sum)
features_to_use <- character()
if (log) {
for (feature in features) {
feature_log <- paste0(feature, "_log")
colData(spe)[feature_log] <- log1p(colData(spe)[[feature]])
features_to_use <- c(features_to_use, feature_log)
}
} else {
features_to_use <- features
}

# get unique sample IDs
sample_ids <- unique(colData(spe)[[samples]])
unique_sample_ids <- unique(colData(spe)[[samples]])

# Initialize a list to store spe for each sample
colData_list <- list()
columnData_list <- sapply(unique_sample_ids, FUN = function(x) NULL)

for (sample in sample_ids) {
for (sample in unique_sample_ids) {
# subset by sample
spe.temp <- spe[, colData(spe)[[samples]] == sample]

# ======= Calculate local mito variance ========
var_matrix <- c()
for (i in seq_len(n_rings)) {
# ==== local mito variance ==== get n neighbors for i rings
n_neighbors <- 3 * i * (i + 1)
tmp.name <- paste0("k", n_neighbors)

# get local variance of mito ratio
spe.temp <- localVariance(spe.temp,
features = mito_percent,
n_neighbors = n_neighbors,
name = tmp.name
)

# add to var_matrix
var_matrix <- cbind(var_matrix, colData(spe.temp)[[tmp.name]])
}


# ========== PCA and clustering ========== add mito and mito_percent to
# var dataframe
# Use vapply to iterate over rings_seq
var_matrix <- sapply(seq_len(n_rings), function(i) {
# Calculate n_neighbors for the current ring
n_neighbors <- 3 * i * (i + 1)
tmp.name <- paste0("k", n_neighbors)

# Apply local variance calculation
spe.temp <<- localVariance(spe.temp,
metric = mito_percent,
n_neighbors = n_neighbors,
name = tmp.name)

# Extract and return the column corresponding to tmp.name from colData
colData(spe.temp)[[tmp.name]]
})


# ========== PCA and clustering ==========
var_matrix <- cbind(
var_matrix, colData(spe.temp)[[mito_percent]],
var_matrix,
colData(spe.temp)[[mito_percent]],
colData(spe.temp)[[mito_sum]]
)

var_df <- data.frame(var_matrix)

# Run PCA and add to reduced dims
pc <- prcomp(var_df, center = TRUE, scale. = TRUE)
rownames(pc$x) <- colnames(spe.temp) # assign rownames to avoid error
reducedDim(spe.temp, "PCA_artifacts") <- pc$x

# Cluster using kmeans and add to temp sce
Expand All @@ -143,13 +129,9 @@ findArtifacts <- function(
# =========== Artifact annotation ===========

# calculate average local variance of the two clusters
clus1_mean <- mean(colData(spe.temp)[[paste0(
"k",
18
clus1_mean <- mean(colData(spe.temp)[[paste0("k",18
)]][spe.temp$Kmeans == 1])
clus2_mean <- mean(colData(spe.temp)[[paste0(
"k",
18
clus2_mean <- mean(colData(spe.temp)[[paste0("k",18
)]][spe.temp$Kmeans == 2])

artifact_clus <- which.min(c(clus1_mean, clus2_mean))
Expand All @@ -158,16 +140,18 @@ findArtifacts <- function(
# 1 to 'TRUE'
spe.temp$artifact <- FALSE
spe.temp$artifact[spe.temp$Kmeans == artifact_clus] <- TRUE
spe.temp$Kmeans <- NULL

# add to sample list
colData_list[[sample]] <- colData(spe.temp)
columnData_list[[sample]] <- colData(spe.temp)
}

# rbind the list of dataframes
colData_aggregated <- do.call(rbind, colData_list)
columnData_aggregated <- do.call(rbind, columnData_list)

# replace SPE column data with aggregated data
colData(spe) <- colData_aggregated
colData(spe) <- columnData_aggregated
reducedDims(spe) <- reducedDims(spe.temp)

# return sce
return(spe)
Expand Down
98 changes: 37 additions & 61 deletions R/localVariance.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
#' sample_id, sum_umi, sum_gene
#' @param n_neighbors Number of nearest neighbors to use for variance
#' calculation
#' @param features Features to use for variance calculation
#' @param metric metric to use for variance calculation
#' @param samples Column in colData to use for sample ID
#' @param log Whether to log1p transform the features
#' @param log Whether to log1p transform the metric
#' @param name Name of the new column to add to colData
#'
#' @return SpatialExperiment object with feature variance added to colData
#' @return SpatialExperiment object with metric variance added to colData
#'
#' @importFrom SummarizedExperiment colData
#' @importFrom BiocNeighbors findKNN
Expand Down Expand Up @@ -42,20 +42,20 @@
#' # Identifying the mitochondrial transcripts in our SpatialExperiment.
#' is.mito <- rownames(spe)[grepl("^MT-", rownames(spe))]
#'
#' # Calculating QC metrics for each spot using scuttle
#' # Calculating QC metric for each spot using scuttle
#' spe <- scuttle::addPerCellQCMetrics(spe, subsets = list(Mito = is.mito))
#' colnames(colData(spe))
#'
#' spe <- localVariance(spe,
#' features = "subsets_Mito_percent",
#' metric = "subsets_Mito_percent",
#' n_neighbors = 36,
#' name = "local_mito_variance_k36"
#' )
#'
#' plotQC(spe, metric="local_mito_variance_k36")
#'
localVariance <- function(spe, n_neighbors = 36,
features = c("expr_chrM_ratio"),
metric = c("expr_chrM_ratio"),
samples = "sample_id", log = FALSE, name = NULL) {

# ===== Validity checks =====
Expand All @@ -64,14 +64,14 @@ localVariance <- function(spe, n_neighbors = 36,
stop("Input data must be a SpatialExperiment object.")
}

# Validate 'features' is a character vector
if (!is.character(features)) {
stop("'features' must be a character vector.")
# Validate 'metric' is a character vector
if (!is.character(metric)) {
stop("'metric' must be a character vector.")
}

# validate 'features' are valid colData
if (!all(features %in% colnames(colData(spe)))) {
stop("All features must be present in colData.")
# validate 'metric' are valid colData
if (!all(metric %in% colnames(colData(spe)))) {
stop("Metric must be present in colData.")
}

# validate samples is valid colData
Expand All @@ -87,17 +87,13 @@ localVariance <- function(spe, n_neighbors = 36,
}

# ===== start function =====
# log1p transform specified features
# log1p transform specified metric
if (log) {
features_log <- lapply(features, function(feature) {
feature_log <- paste0(feature, "_log")
colData(spe)[[feature_log]] <- log1p(colData(spe)[[feature]])
return(feature_log) # Return the new feature name
})

features_to_use <- c(features, unlist(features_log))
metric_log <- paste0(metric, "_log")
colData(spe)[metric_log] <- log1p(colData(spe)[[metric]])
metric_to_use <- metric_log
} else {
features_to_use <- features
metric_to_use <- metric
}

# Get a list of unique sample IDs
Expand All @@ -112,7 +108,7 @@ localVariance <- function(spe, n_neighbors = 36,
sample <- unique_sample_ids[sample_id]
spe_subset <- subset(spe, , sample_id == sample)

# Create a list of spatial coordinates and qc features
# Create a list of spatial coordinates and qc metric
columnData <- colData(spe_subset)
columnData$coords <- spatialCoords(spe_subset)

Expand All @@ -122,53 +118,33 @@ localVariance <- function(spe, n_neighbors = 36,
warn.ties = FALSE
)$index

# === Get local variance ===
# Initialize a matrix to store variance for each feature
var_matrix <- matrix(NA, nrow(columnData), length(features_to_use))
colnames(var_matrix) <- features_to_use

mean_matrix <- matrix(NA, nrow(columnData), length(features_to_use))
colnames(mean_matrix) <- features_to_use
# === Compute local variance ===
# get neighborhood metric
neighborhoods <- lapply(seq_len(nrow(dnn)), function(i) {
indices <- dnn[i, ]
indices <- indices[indices != 0]
indices <- c(i, indices)

# Loop through each row in the nearest neighbor index matrix
for (i in seq_len(nrow(dnn))) {
dnn.idx <- dnn[i, ]
for (j in seq_along(features_to_use)) {
neighborhood <- columnData[c(i, dnn.idx[dnn.idx != 0]), ][[features_to_use[j]]]

var_matrix[i, j] <- var(neighborhood, na.rm = TRUE)[1]
mean_matrix[i, j] <- mean(neighborhood, na.rm = TRUE)[1]
}
}
columnData[indices, metric_to_use]
})

# Handle non-finite values
var_matrix[!is.finite(var_matrix)] <- 0
mean_matrix[!is.finite(mean_matrix)] <- 0
# Compute variance and mean
stats_matrix <- t(sapply(neighborhoods, function(x) {
c(var = var(x, na.rm = TRUE), mean = mean(x, na.rm = TRUE))
}))

# == Regress out mean-variance bias ==
for (feature_idx in seq_along(features_to_use)) {
# Prepare data.frame for current feature
mito_var_df <- data.frame(
mito_var = log2(var_matrix[, feature_idx]),
mito_mean = log2(mean_matrix[, feature_idx])
)
# Perform robust linear regression to regress out mean-var bias
fit.irls <- MASS::rlm(log2(var) ~ mean,
data = as.data.frame(stats_matrix))

# Perform robust linear regression (IRLS) of variance vs mean
fit.irls <- MASS::rlm(mito_var ~ mito_mean, data = mito_var_df)

# Get residuals and update the variance matrix
resid.irls <- resid(fit.irls)

# Replace original variance values with residuals
var_matrix[, feature_idx] <- resid.irls
}
var_resid <- resid(fit.irls) # Get residuals

# add local variance to columnData dataframe
if (!is.null(name)) {
columnData[name] <- var_matrix[, j]
columnData[name] <- var_resid
} else {
feature_var <- paste0(features[j], "_var")
columnData[feature_var] <- var_matrix[, j]
metric_var <- paste0(metric, "_var")
columnData[metric_var] <- var_resid
}

# Store the modified columnData dataframe in the list
Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test-findArtifacts.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,5 @@ test_that("example object contains artifacts colDta", {
})

test_that("examples gives correct number of artifact spots", {
expect_equal(sum(spe$artifact), 349)
expect_equal(sum(spe$artifact), 355)
})

0 comments on commit 900c30e

Please sign in to comment.