Skip to content

Commit

Permalink
update paths and remove snv/cnv
Browse files Browse the repository at this point in the history
  • Loading branch information
aadamk committed Sep 16, 2024
1 parent eca87bc commit 45c558f
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 191 deletions.
192 changes: 7 additions & 185 deletions analyses/data_preparation/01-multi-modal-clustering-prepare-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,7 @@ suppressPackageStartupMessages({
option_list <- list(
make_option(c("--histology_file"), type = "character", help = "Histology file (.tsv)"),
make_option(c("--short_histology"), type = "character", help = "Short histology of interest"),
make_option(c("--cancer_genes"), type = "character", help = "cancer gene list (.rds)"),
make_option(c("--count_file"), type = "character", help = "Expression matrix, preferably counts (.rds)"),
make_option(c("--cnv_gainloss_file"), type = "character", help = "CNVkit gainloss file (.tsv)"),
make_option(c("--snv_file"), type = "character", help = "SNV consensus (.maf | tsv.gz)"),
make_option(c("--methyl_file"), type = "character", help = "Methylation values, preferably beta values (.rds)"),
make_option(c("--splice_file"), type = "character", help = "PSI values (.rds)"),
make_option(c("--gtf_file"), type = "character", help = "Gencode gtf file"),
Expand All @@ -29,9 +26,6 @@ dir.create(output_dir, showWarnings = F, recursive = T)
# source functions
source(file.path("utils", "filter_cnv.R"))

# cancer genes
cancer_genes <- readRDS(opt$cancer_genes)

# read histology file and filter to short histology of interest
histology_file <- opt$histology_file
histology_file <- read_tsv(file = histology_file)
Expand Down Expand Up @@ -61,7 +55,7 @@ count_mat <- melt(as.matrix(count_mat),
varnames = c("Gene", "Kids_First_Biospecimen_ID"))
count_mat <- count_mat %>%
dplyr::inner_join(histology_file %>% dplyr::select(Kids_First_Biospecimen_ID, sample_id),
by = 'Kids_First_Biospecimen_ID')
by = 'Kids_First_Biospecimen_ID')

# get unique biospecimens per sample id
unique_ids <- count_mat %>%
Expand All @@ -86,152 +80,7 @@ count_mat <- count_mat %>%
# print dimensions
print(dim(count_mat))

# 2) CNV
cnv_gainloss_file <- opt$cnv_gainloss_file
cnv_dat <- data.table::fread(file = cnv_gainloss_file)
cnv_dat <- cnv_dat %>%
dplyr::rename("Kids_First_Biospecimen_ID" = "BS_ID") %>%
dplyr::filter(Kids_First_Biospecimen_ID %in% histology_file$Kids_First_Biospecimen_ID) %>%
dplyr::select(Kids_First_Biospecimen_ID, gene, log2) %>%
dplyr::inner_join(
histology_file %>% dplyr::select(
Kids_First_Biospecimen_ID,
sample_id,
tumor_ploidy,
tumor_fraction
),
by = "Kids_First_Biospecimen_ID"
) %>%
unique()

# get unique biospecimens per sample id
unique_ids <- cnv_dat %>%
dplyr::select(sample_id, Kids_First_Biospecimen_ID) %>%
unique() %>%
dplyr::arrange(Kids_First_Biospecimen_ID) %>%
dplyr::group_by(sample_id) %>%
dplyr::distinct(sample_id, .keep_all = T)
cnv_dat <- cnv_dat %>%
dplyr::filter(
sample_id %in% unique_ids$sample_id,
Kids_First_Biospecimen_ID %in% unique_ids$Kids_First_Biospecimen_ID
)
cnv_samples <- cnv_dat %>%
dplyr::select(Kids_First_Biospecimen_ID, sample_id) %>%
unique()

# convert to matrix
# function to adjust copy number and status
adjust_cn <- function(x) {
# get tumor fraction and ploidy
tumor_fraction <- unique(na.omit(x$tumor_fraction))
tumor_ploidy <- unique(na.omit(x$tumor_ploidy))

if (length(tumor_fraction) == 1 & length(tumor_ploidy) == 1) {
# calculate adjusted copy number if tumor fraction and ploidy info is available
x$adjusted_cn <- (((2 ^ (x$log2) - (
1 - tumor_fraction
)) * tumor_ploidy) / tumor_fraction) - 0.5
x$adjusted_cn <- round(x$adjusted_cn)
x$adjusted_status <- ifelse(x$adjusted_cn == 0,
"Complete Loss",
ifelse(
x$adjusted_cn == 1,
"Loss",
ifelse(
x$adjusted_cn %in% c(tumor_ploidy + 1:9),
"Gain",
ifelse(x$adjusted_cn >= 10, "Amplification", "Neutral")
)
))

# replace old columns with new ones
x <- x %>%
dplyr::rename("status" = "adjusted_status", # rename new columns
"copy_number" = "adjusted_cn")

}
}

# apply function to all samples in the consensus file
cnv_dat <- plyr::ddply(
.data = cnv_dat,
.variables = "sample_id",
.fun = function(x)
adjust_cn(x = x)
)
cnv_dat <- cnv_dat %>%
dplyr::filter(status != "Neutral") %>%
dplyr::select(-c(copy_number)) %>%
unique()

# subset to cancer genes
cnv_dat <- cnv_dat %>%
dplyr::rename("hgnc_symbol" = "gene") %>%
filter_cnv(myCancerGenes = cancer_genes)
cnv_dat <- acast(
cnv_dat,
sample_id ~ hgnc_symbol,
value.var = "log2",
fill = 0,
fun.aggregate = max
)

cnv_dat <- abs(cnv_dat)

# print dimensions
print(dim(cnv_dat))

# 3) SNV
snv_file <- opt$snv_file
snv_dat <- data.table::fread(file = snv_file)
maf_nonsynonymous <- c(
"Missense_Mutation",
"Frame_Shift_Del",
"In_Frame_Ins",
"Frame_Shift_Ins",
"Splice_Site",
"Nonsense_Mutation",
"In_Frame_Del",
"Nonstop_Mutation",
"Translation_Start_Site"
)
snv_dat <- snv_dat %>%
dplyr::filter(
Variant_Classification %in% maf_nonsynonymous,
Tumor_Sample_Barcode %in% histology_file$Kids_First_Biospecimen_ID
)

# combine with histology to map sample id
snv_dat <- snv_dat %>%
dplyr::rename("Kids_First_Biospecimen_ID" = "Tumor_Sample_Barcode") %>%
dplyr::inner_join(histology_file %>% dplyr::select(Kids_First_Biospecimen_ID, sample_id),
by = 'Kids_First_Biospecimen_ID')

# get unique biospecimens per sample id
unique_ids <- snv_dat %>%
dplyr::select(sample_id, Kids_First_Biospecimen_ID) %>%
unique() %>%
dplyr::arrange(Kids_First_Biospecimen_ID) %>%
dplyr::group_by(sample_id) %>%
dplyr::distinct(sample_id, .keep_all = T)
snv_dat <- snv_dat %>%
dplyr::filter(
sample_id %in% unique_ids$sample_id,
Kids_First_Biospecimen_ID %in% unique_ids$Kids_First_Biospecimen_ID
)
snv_samples <- snv_dat %>%
dplyr::select(Kids_First_Biospecimen_ID, sample_id) %>%
unique()

# convert to matrix
snv_dat <- acast(snv_dat, sample_id ~ Hugo_Symbol, value.var = "Variant_Classification")
snv_dat[snv_dat > 0] <- 1 # anything > 0 should be coded as 1

# print dimensions
print(dim(snv_dat))

# 4) Methylation
# 2) Methylation
# read beta-values
methyl_file <- opt$methyl_file
methyl_data <- readRDS(file = file.path(methyl_file))
Expand Down Expand Up @@ -268,7 +117,7 @@ methyl_samples <- hist_methyl %>%
# print dimensions
print(dim(methyl_data))

# 5) Splice dataset
# 3) Splice dataset
# read splice data
splice_file <- opt$splice_file
splice_mat <- readRDS(splice_file)
Expand All @@ -280,7 +129,7 @@ splice_mat <- melt(as.matrix(splice_mat),
# combine with histology to map sample id
splice_mat <- splice_mat %>%
dplyr::inner_join(histology_file %>% dplyr::select(Kids_First_Biospecimen_ID, sample_id),
by = 'Kids_First_Biospecimen_ID')
by = 'Kids_First_Biospecimen_ID')

# get unique biospecimens per sample id
unique_ids <- splice_mat %>%
Expand All @@ -306,9 +155,7 @@ splice_mat <- splice_mat %>%
print(dim(splice_mat))

# subset to samples of interest (n = 152)
samples_of_interest <- intersect(rownames(count_mat), rownames(cnv_dat))
samples_of_interest <- intersect(samples_of_interest, rownames(snv_dat))
samples_of_interest <- intersect(samples_of_interest, rownames(methyl_data))
samples_of_interest <- intersect(rownames(count_mat), rownames(methyl_data))
samples_of_interest <- intersect(samples_of_interest, rownames(splice_mat))

# now final filter/transformation on samples of interest
Expand All @@ -335,24 +182,7 @@ write_tsv(
file = file.path(output_dir, "rna_data.tsv")
)

# 2) CNV
# filter for standard deviation (too many features)
cnv_dat <- cnv_dat[samples_of_interest, ]
cnv_dat <- as.data.frame(cnv_dat)
print(dim(cnv_dat)) # 1193
write_tsv(as.data.frame(cnv_dat) %>% rownames_to_column(),
file = file.path(output_dir, "cnv_data.tsv"))

# 3) SNV
# filter out genes with low mutation rate
snv_dat <- snv_dat[samples_of_interest, ]
mut.rate <- apply(X = snv_dat, MARGIN = 2, FUN = mean)
snv_dat <- snv_dat[, which(mut.rate > 0.02)]
print(dim(snv_dat)) # 170
write_tsv(as.data.frame(snv_dat) %>% rownames_to_column(),
file = file.path(output_dir, "snv_data.tsv"))

# 4) Methylation
# 2) Methylation
methyl_data <- methyl_data[samples_of_interest, ]
methyl_data <- methyl_data[, colSums(is.na(methyl_data)) < nrow(methyl_data)]

Expand All @@ -368,7 +198,7 @@ write_tsv(
file = file.path(output_dir, "methyl_data.tsv")
)

# 5) Splicing
# 3) Splicing
# splice_mat <- t(splice_mat) %>% as.data.frame()
splice_mat <- splice_mat[samples_of_interest, ]
splice_mat <- splice_mat[, colSums(splice_mat) > 0]
Expand All @@ -392,19 +222,11 @@ rna_samples <- count_samples %>%
methyl_samples <- methyl_samples %>%
dplyr::filter(sample_id %in% samples_of_interest) %>%
dplyr::rename("Kids_First_Biospecimen_ID_Methyl" = "Kids_First_Biospecimen_ID")
cnv_samples <- cnv_samples %>%
dplyr::filter(sample_id %in% samples_of_interest) %>%
dplyr::rename("Kids_First_Biospecimen_ID_CNV" = "Kids_First_Biospecimen_ID")
snv_samples <- snv_samples %>%
dplyr::filter(sample_id %in% samples_of_interest) %>%
dplyr::rename("Kids_First_Biospecimen_ID_SNV" = "Kids_First_Biospecimen_ID")
splice_samples <- splice_samples %>%
dplyr::filter(sample_id %in% samples_of_interest) %>%
dplyr::rename("Kids_First_Biospecimen_ID_Splice" = "Kids_First_Biospecimen_ID")
sample_map <- rna_samples %>%
dplyr::select(sample_id, Kids_First_Biospecimen_ID_RNA) %>%
inner_join(cnv_samples) %>%
inner_join(snv_samples) %>%
inner_join(methyl_samples) %>%
inner_join(splice_samples)
write_tsv(sample_map, file = file.path(output_dir, "samples_map.tsv"))
8 changes: 2 additions & 6 deletions analyses/data_preparation/run_analysis.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,19 @@ script_directory="$(perl -e 'use File::Basename;
cd "$script_directory" || exit

# update paths to new directory with subsetted data/matrices
data_dir="../../data"
data_dir="/sbgenomics/project-files/opc-v15"
histology_file="${data_dir}/v15/histologies.tsv"
short_histology="HGAT"
count_file="${data_dir}/v15/gene-counts-rsem-expected_count-collapsed.rds"
cnv_gainloss_file="${data_dir}/v15/All.gainloss.txt.gz"
snv_file="${data_dir}/v15/snv-consensus-plus-hotspots.maf.tsv.gz"
methyl_b_file="${data_dir}/v15/methyl-beta-values.rds"
functional_sites_splice_file="${data_dir}/v15/splice_events_pan_cancer_functional_filter.rds" # filtered to functional sites by Ammar Naqvi using v12
gtf_file="${data_dir}/v15/gencode.v39.primary_assembly.annotation.gtf.gz"

# prepare input files for multi-modal clustering
Rscript --vanilla 01-multi-modal-clustering-prepare-data.R \
--histology_file $histology_file \
--short_histology $short_histology \
--cancer_genes $cancer_genes \
--count_file $count_file \
--cnv_gainloss_file $cnv_gainloss_file \
--snv_file $snv_file \
--methyl_file $methyl_b_file \
--splice_file $functional_sites_splice_file \
--gtf_file $gtf_file \
Expand Down

0 comments on commit 45c558f

Please sign in to comment.