diff --git a/analyses/data_preparation/01-multi-modal-clustering-prepare-data.R b/analyses/data_preparation/01-multi-modal-clustering-prepare-data.R index 9a796cf..d0d9d73 100644 --- a/analyses/data_preparation/01-multi-modal-clustering-prepare-data.R +++ b/analyses/data_preparation/01-multi-modal-clustering-prepare-data.R @@ -10,10 +10,7 @@ suppressPackageStartupMessages({ option_list <- list( make_option(c("--histology_file"), type = "character", help = "Histology file (.tsv)"), make_option(c("--short_histology"), type = "character", help = "Short histology of interest"), - make_option(c("--cancer_genes"), type = "character", help = "cancer gene list (.rds)"), make_option(c("--count_file"), type = "character", help = "Expression matrix, preferably counts (.rds)"), - make_option(c("--cnv_gainloss_file"), type = "character", help = "CNVkit gainloss file (.tsv)"), - make_option(c("--snv_file"), type = "character", help = "SNV consensus (.maf | tsv.gz)"), make_option(c("--methyl_file"), type = "character", help = "Methylation values, preferably beta values (.rds)"), make_option(c("--splice_file"), type = "character", help = "PSI values (.rds)"), make_option(c("--gtf_file"), type = "character", help = "Gencode gtf file"), @@ -29,9 +26,6 @@ dir.create(output_dir, showWarnings = F, recursive = T) # source functions source(file.path("utils", "filter_cnv.R")) -# cancer genes -cancer_genes <- readRDS(opt$cancer_genes) - # read histology file and filter to short histology of interest histology_file <- opt$histology_file histology_file <- read_tsv(file = histology_file) @@ -61,7 +55,7 @@ count_mat <- melt(as.matrix(count_mat), varnames = c("Gene", "Kids_First_Biospecimen_ID")) count_mat <- count_mat %>% dplyr::inner_join(histology_file %>% dplyr::select(Kids_First_Biospecimen_ID, sample_id), - by = 'Kids_First_Biospecimen_ID') + by = 'Kids_First_Biospecimen_ID') # get unique biospecimens per sample id unique_ids <- count_mat %>% @@ -86,152 +80,7 @@ count_mat <- count_mat %>% # print dimensions print(dim(count_mat)) -# 2) CNV -cnv_gainloss_file <- opt$cnv_gainloss_file -cnv_dat <- data.table::fread(file = cnv_gainloss_file) -cnv_dat <- cnv_dat %>% - dplyr::rename("Kids_First_Biospecimen_ID" = "BS_ID") %>% - dplyr::filter(Kids_First_Biospecimen_ID %in% histology_file$Kids_First_Biospecimen_ID) %>% - dplyr::select(Kids_First_Biospecimen_ID, gene, log2) %>% - dplyr::inner_join( - histology_file %>% dplyr::select( - Kids_First_Biospecimen_ID, - sample_id, - tumor_ploidy, - tumor_fraction - ), - by = "Kids_First_Biospecimen_ID" - ) %>% - unique() - -# get unique biospecimens per sample id -unique_ids <- cnv_dat %>% - dplyr::select(sample_id, Kids_First_Biospecimen_ID) %>% - unique() %>% - dplyr::arrange(Kids_First_Biospecimen_ID) %>% - dplyr::group_by(sample_id) %>% - dplyr::distinct(sample_id, .keep_all = T) -cnv_dat <- cnv_dat %>% - dplyr::filter( - sample_id %in% unique_ids$sample_id, - Kids_First_Biospecimen_ID %in% unique_ids$Kids_First_Biospecimen_ID - ) -cnv_samples <- cnv_dat %>% - dplyr::select(Kids_First_Biospecimen_ID, sample_id) %>% - unique() - -# convert to matrix -# function to adjust copy number and status -adjust_cn <- function(x) { - # get tumor fraction and ploidy - tumor_fraction <- unique(na.omit(x$tumor_fraction)) - tumor_ploidy <- unique(na.omit(x$tumor_ploidy)) - - if (length(tumor_fraction) == 1 & length(tumor_ploidy) == 1) { - # calculate adjusted copy number if tumor fraction and ploidy info is available - x$adjusted_cn <- (((2 ^ (x$log2) - ( - 1 - tumor_fraction - )) * tumor_ploidy) / tumor_fraction) - 0.5 - x$adjusted_cn <- round(x$adjusted_cn) - x$adjusted_status <- ifelse(x$adjusted_cn == 0, - "Complete Loss", - ifelse( - x$adjusted_cn == 1, - "Loss", - ifelse( - x$adjusted_cn %in% c(tumor_ploidy + 1:9), - "Gain", - ifelse(x$adjusted_cn >= 10, "Amplification", "Neutral") - ) - )) - - # replace old columns with new ones - x <- x %>% - dplyr::rename("status" = "adjusted_status", # rename new columns - "copy_number" = "adjusted_cn") - - } -} - -# apply function to all samples in the consensus file -cnv_dat <- plyr::ddply( - .data = cnv_dat, - .variables = "sample_id", - .fun = function(x) - adjust_cn(x = x) -) -cnv_dat <- cnv_dat %>% - dplyr::filter(status != "Neutral") %>% - dplyr::select(-c(copy_number)) %>% - unique() - -# subset to cancer genes -cnv_dat <- cnv_dat %>% - dplyr::rename("hgnc_symbol" = "gene") %>% - filter_cnv(myCancerGenes = cancer_genes) -cnv_dat <- acast( - cnv_dat, - sample_id ~ hgnc_symbol, - value.var = "log2", - fill = 0, - fun.aggregate = max -) - -cnv_dat <- abs(cnv_dat) - -# print dimensions -print(dim(cnv_dat)) - -# 3) SNV -snv_file <- opt$snv_file -snv_dat <- data.table::fread(file = snv_file) -maf_nonsynonymous <- c( - "Missense_Mutation", - "Frame_Shift_Del", - "In_Frame_Ins", - "Frame_Shift_Ins", - "Splice_Site", - "Nonsense_Mutation", - "In_Frame_Del", - "Nonstop_Mutation", - "Translation_Start_Site" -) -snv_dat <- snv_dat %>% - dplyr::filter( - Variant_Classification %in% maf_nonsynonymous, - Tumor_Sample_Barcode %in% histology_file$Kids_First_Biospecimen_ID - ) - -# combine with histology to map sample id -snv_dat <- snv_dat %>% - dplyr::rename("Kids_First_Biospecimen_ID" = "Tumor_Sample_Barcode") %>% - dplyr::inner_join(histology_file %>% dplyr::select(Kids_First_Biospecimen_ID, sample_id), - by = 'Kids_First_Biospecimen_ID') - -# get unique biospecimens per sample id -unique_ids <- snv_dat %>% - dplyr::select(sample_id, Kids_First_Biospecimen_ID) %>% - unique() %>% - dplyr::arrange(Kids_First_Biospecimen_ID) %>% - dplyr::group_by(sample_id) %>% - dplyr::distinct(sample_id, .keep_all = T) -snv_dat <- snv_dat %>% - dplyr::filter( - sample_id %in% unique_ids$sample_id, - Kids_First_Biospecimen_ID %in% unique_ids$Kids_First_Biospecimen_ID - ) -snv_samples <- snv_dat %>% - dplyr::select(Kids_First_Biospecimen_ID, sample_id) %>% - unique() - -# convert to matrix -snv_dat <- acast(snv_dat, sample_id ~ Hugo_Symbol, value.var = "Variant_Classification") -snv_dat[snv_dat > 0] <- 1 # anything > 0 should be coded as 1 - -# print dimensions -print(dim(snv_dat)) - -# 4) Methylation +# 2) Methylation # read beta-values methyl_file <- opt$methyl_file methyl_data <- readRDS(file = file.path(methyl_file)) @@ -268,7 +117,7 @@ methyl_samples <- hist_methyl %>% # print dimensions print(dim(methyl_data)) -# 5) Splice dataset +# 3) Splice dataset # read splice data splice_file <- opt$splice_file splice_mat <- readRDS(splice_file) @@ -280,7 +129,7 @@ splice_mat <- melt(as.matrix(splice_mat), # combine with histology to map sample id splice_mat <- splice_mat %>% dplyr::inner_join(histology_file %>% dplyr::select(Kids_First_Biospecimen_ID, sample_id), - by = 'Kids_First_Biospecimen_ID') + by = 'Kids_First_Biospecimen_ID') # get unique biospecimens per sample id unique_ids <- splice_mat %>% @@ -306,9 +155,7 @@ splice_mat <- splice_mat %>% print(dim(splice_mat)) # subset to samples of interest (n = 152) -samples_of_interest <- intersect(rownames(count_mat), rownames(cnv_dat)) -samples_of_interest <- intersect(samples_of_interest, rownames(snv_dat)) -samples_of_interest <- intersect(samples_of_interest, rownames(methyl_data)) +samples_of_interest <- intersect(rownames(count_mat), rownames(methyl_data)) samples_of_interest <- intersect(samples_of_interest, rownames(splice_mat)) # now final filter/transformation on samples of interest @@ -335,24 +182,7 @@ write_tsv( file = file.path(output_dir, "rna_data.tsv") ) -# 2) CNV -# filter for standard deviation (too many features) -cnv_dat <- cnv_dat[samples_of_interest, ] -cnv_dat <- as.data.frame(cnv_dat) -print(dim(cnv_dat)) # 1193 -write_tsv(as.data.frame(cnv_dat) %>% rownames_to_column(), - file = file.path(output_dir, "cnv_data.tsv")) - -# 3) SNV -# filter out genes with low mutation rate -snv_dat <- snv_dat[samples_of_interest, ] -mut.rate <- apply(X = snv_dat, MARGIN = 2, FUN = mean) -snv_dat <- snv_dat[, which(mut.rate > 0.02)] -print(dim(snv_dat)) # 170 -write_tsv(as.data.frame(snv_dat) %>% rownames_to_column(), - file = file.path(output_dir, "snv_data.tsv")) - -# 4) Methylation +# 2) Methylation methyl_data <- methyl_data[samples_of_interest, ] methyl_data <- methyl_data[, colSums(is.na(methyl_data)) < nrow(methyl_data)] @@ -368,7 +198,7 @@ write_tsv( file = file.path(output_dir, "methyl_data.tsv") ) -# 5) Splicing +# 3) Splicing # splice_mat <- t(splice_mat) %>% as.data.frame() splice_mat <- splice_mat[samples_of_interest, ] splice_mat <- splice_mat[, colSums(splice_mat) > 0] @@ -392,19 +222,11 @@ rna_samples <- count_samples %>% methyl_samples <- methyl_samples %>% dplyr::filter(sample_id %in% samples_of_interest) %>% dplyr::rename("Kids_First_Biospecimen_ID_Methyl" = "Kids_First_Biospecimen_ID") -cnv_samples <- cnv_samples %>% - dplyr::filter(sample_id %in% samples_of_interest) %>% - dplyr::rename("Kids_First_Biospecimen_ID_CNV" = "Kids_First_Biospecimen_ID") -snv_samples <- snv_samples %>% - dplyr::filter(sample_id %in% samples_of_interest) %>% - dplyr::rename("Kids_First_Biospecimen_ID_SNV" = "Kids_First_Biospecimen_ID") splice_samples <- splice_samples %>% dplyr::filter(sample_id %in% samples_of_interest) %>% dplyr::rename("Kids_First_Biospecimen_ID_Splice" = "Kids_First_Biospecimen_ID") sample_map <- rna_samples %>% dplyr::select(sample_id, Kids_First_Biospecimen_ID_RNA) %>% - inner_join(cnv_samples) %>% - inner_join(snv_samples) %>% inner_join(methyl_samples) %>% inner_join(splice_samples) write_tsv(sample_map, file = file.path(output_dir, "samples_map.tsv")) diff --git a/analyses/data_preparation/run_analysis.sh b/analyses/data_preparation/run_analysis.sh index 7077087..928f9f2 100644 --- a/analyses/data_preparation/run_analysis.sh +++ b/analyses/data_preparation/run_analysis.sh @@ -11,23 +11,19 @@ script_directory="$(perl -e 'use File::Basename; cd "$script_directory" || exit # update paths to new directory with subsetted data/matrices -data_dir="../../data" +data_dir="/sbgenomics/project-files/opc-v15" histology_file="${data_dir}/v15/histologies.tsv" short_histology="HGAT" count_file="${data_dir}/v15/gene-counts-rsem-expected_count-collapsed.rds" -cnv_gainloss_file="${data_dir}/v15/All.gainloss.txt.gz" -snv_file="${data_dir}/v15/snv-consensus-plus-hotspots.maf.tsv.gz" methyl_b_file="${data_dir}/v15/methyl-beta-values.rds" functional_sites_splice_file="${data_dir}/v15/splice_events_pan_cancer_functional_filter.rds" # filtered to functional sites by Ammar Naqvi using v12 +gtf_file="${data_dir}/v15/gencode.v39.primary_assembly.annotation.gtf.gz" # prepare input files for multi-modal clustering Rscript --vanilla 01-multi-modal-clustering-prepare-data.R \ --histology_file $histology_file \ --short_histology $short_histology \ ---cancer_genes $cancer_genes \ --count_file $count_file \ ---cnv_gainloss_file $cnv_gainloss_file \ ---snv_file $snv_file \ --methyl_file $methyl_b_file \ --splice_file $functional_sites_splice_file \ --gtf_file $gtf_file \