sample_level_pseudobulk_clustering.R

## ------------------------------------
##
## Script name: single-cell data sample level pseudobulk clustering
##
## Purpose of script: Generate sample-level pseudobulk clustering result and plot
##
## Author: Jiekun (Jackie) Yang
##
## Date Created: 2022-11-09
##
## Email: jkyang@mit.edu
##
##--------------------------------------

### input_file_directory is a vector of directory paths with all the sample-level r data objects saved, include "/" at the end of the path
### pheno_df is a data frame with the phenotypes you would like to add to the clustering plot with rownames matching the sample names
### pheno_colors is a list with each element corresponding to one phenotype provided in pheno_df
### target_folder is where you would like to store output files generated by this function
### file_name is the file name that you want to name the output plot

## library setup
library(Seurat)
options(future.globals.maxSize = 128000 * 1024^2)
library(future)
library(future.apply)
library(pheatmap)
library(RColorBrewer)
plan("multiprocess", workers = 16)

## function
sample_level_pseudobulk_cluster <- function(input_file_directory = NULL, pheno_df = NULL, pheno_colors = NULL, target_folder = NULL, file_name = NULL) {
  i <- 1
  for (temp.file in list.files(pattern = ".rds", path = input_file_directory, full.names = T)) {
    temp.data <- readRDS(temp.file)
    temp.name <- gsub(".rds", "", basename(temp.file))
    if (i == 1) {
      temp.all_sample_bulk <- as.data.frame(Matrix::rowMeans(temp.data@assays$SCT))
      colnames(temp.all_sample_bulk) <- temp.name
      temp.all_sample_bulk$Row.names <- rownames(temp.all_sample_bulk)
    } else {
      temp.all_sample_bulk <- merge(temp.all_sample_bulk, as.data.frame(Matrix::rowMeans(temp.data@assays$SCT)), by.x = "Row.names", by.y = "row.names")
      colnames(temp.all_sample_bulk)[i+1] <- temp.name
    }
    i <- i+1
  }
  rownames(temp.all_sample_bulk) <- temp.all_sample_bulk$Row.names
  temp.all_sample_bulk <- temp.all_sample_bulk[, -1]
  temp.corrs <- cor(as.matrix(temp.all_sample_bulk), method = "spearman")
  temp.corr.dists <- as.dist(1 - temp.corrs)
  temp.sample.num <- ncol(temp.corrs)
  pdf(paste0(target_folder, file_name, "_sample_level_pseudo_bulk_clustering.pdf"), width = 0.2*temp.sample.num, height = 0.15*temp.sample.num)
  print(pheatmap(temp.corrs, border_color = NA, clustering_method = "ward.D", annotation_col = pheno_df, clustering_distance_rows=temp.corr.dists, clustering_distance_cols=temp.corr.dists, col=colorRampPalette(brewer.pal(9, "Blues"))(99), show_rownames = FALSE, show_colnames = TRUE, annotation_colors = pheno_colors))
  dev.off()
}