From 8cec371d98842a664e0b918830e767224b503391 Mon Sep 17 00:00:00 2001 From: LY Date: Tue, 5 Sep 2023 16:56:53 +0800 Subject: [PATCH] add hrd score --- R/calc.ai_new.R | 4 +- R/calc.hrd.R | 3 +- R/calc.lst.R | 7 +- R/calc.lst.bak.R | 81 +++++++++++++++++ R/chrominfo_grch37.2.list | 25 ++++++ R/chrominfo_grch37.list | 25 ++++++ R/chrominfo_grch38.2.list | 25 ++++++ R/chrominfo_grch38.list | 25 ++++++ R/preprocess.hrd.R | 2 +- R/preprocess.seqz.R | 101 ++++++++++----------- R/scar_score.R | 179 ++++++++++++++++++++++---------------- R/scar_score.seqz.R | 104 ++++++++++++++++++++++ R/sysdata.rda | Bin 706 -> 726 bytes README.md | 3 +- 14 files changed, 453 insertions(+), 131 deletions(-) create mode 100644 R/calc.lst.bak.R create mode 100644 R/chrominfo_grch37.2.list create mode 100644 R/chrominfo_grch37.list create mode 100644 R/chrominfo_grch38.2.list create mode 100644 R/chrominfo_grch38.list create mode 100644 R/scar_score.seqz.R diff --git a/R/calc.ai_new.R b/R/calc.ai_new.R index d2122a0..a05b1e0 100644 --- a/R/calc.ai_new.R +++ b/R/calc.ai_new.R @@ -76,8 +76,10 @@ calc.ai_new<-function(seg, chrominfo, min.size=1e6, cont = 0,ploidyByChromosome= no.events <- matrix(0, nrow=length(samples), ncol=12) rownames(no.events) <- samples colnames(no.events) <- c("Telomeric AI", "Mean size", "Interstitial AI", "Mean Size", "Whole chr AI", "Telomeric LOH", "Mean size", "Interstitial LOH", "Mean Size", "Whole chr LOH", "Ploidy", "Aberrant cell fraction") + TAI_seg = "" for(j in samples){ sample.seg <- seg[seg[,1] %in% j,] + TAI_seg = sample.seg[sample.seg[,'AI'] == 1,] no.events[j,1] <- nrow(sample.seg[sample.seg[,'AI'] == 1,]) no.events[j,2] <- mean(sample.seg[sample.seg[,'AI'] == 1,4] - sample.seg[sample.seg[,'AI'] == 1,3]) no.events[j,3] <- nrow(sample.seg[sample.seg[,'AI'] == 2,]) @@ -93,5 +95,5 @@ calc.ai_new<-function(seg, chrominfo, min.size=1e6, cont = 0,ploidyByChromosome= no.events[j,9] <- mean(sample.seg[sample.seg[,'AI'] == 2,4] - sample.seg[sample.seg[,'AI'] == 2,3]) no.events[j,10] <- nrow(sample.seg[sample.seg[,'AI'] == 3,]) } - return(no.events) + return(list(no.events,TAI_seg)) } diff --git a/R/calc.hrd.R b/R/calc.hrd.R index 7571bf3..5cd9db1 100644 --- a/R/calc.hrd.R +++ b/R/calc.hrd.R @@ -37,7 +37,8 @@ calc.hrd<-function(seg, nA=7, return.loc=FALSE,sizelimit1){ if(return.loc){ return(out.seg) } else { - return(output) + out = list(output, segLOH) + return(out) } } diff --git a/R/calc.lst.R b/R/calc.lst.R index 51e5426..1ffd263 100644 --- a/R/calc.lst.R +++ b/R/calc.lst.R @@ -9,6 +9,7 @@ calc.lst<-function(seg, chrominfo=chrominfo,nA=7,chr.arm='no'){ nB <- nA+1 samples <- unique(seg[,1]) output <- setNames(rep(0,length(samples)), samples) + cnvseg <- NULL for(j in samples){ sample.seg <- seg[seg[,1] %in% j,] sample.lst <- c() @@ -52,6 +53,8 @@ calc.lst<-function(seg, chrominfo=chrominfo,nA=7,chr.arm='no'){ p.arm <- cbind(p.arm[,1:8], c(0,1)[match((p.arm[,4]-p.arm[,3]) >= 10e6, c('FALSE','TRUE'))]) for(k in 2:nrow(p.arm)){ if(p.arm[k,9] == 1 & p.arm[(k-1),9]==1 & (p.arm[k,3]-p.arm[(k-1),4]) < 3e6){ + cnvseg <- rbind(cnvseg,c(p.arm[k,1:8],paste(c(p.arm[k,2],p.arm[k,3],p.arm[k,4]),collapse="_"))) + cnvseg <- rbind(cnvseg,c(p.arm[k-1,1:8],paste(c(p.arm[k,2],p.arm[k,3],p.arm[k,4]),collapse="_"))) sample.lst <- c(sample.lst, 1) } } @@ -66,6 +69,8 @@ calc.lst<-function(seg, chrominfo=chrominfo,nA=7,chr.arm='no'){ q.arm <- cbind(q.arm[,1:8], c(0,1)[match((q.arm[,4]-q.arm[,3]) >= 10e6, c('FALSE','TRUE'))]) for(k in 2:nrow(q.arm)){ if(q.arm[k,9] == 1 & q.arm[(k-1),9]==1 & (q.arm[k,3]-q.arm[(k-1),4]) < 3e6){ + cnvseg <- rbind(cnvseg,c(q.arm[k,1:8],paste(c(q.arm[k,2],q.arm[k,3],q.arm[k,4]),collapse="_"))) + cnvseg <- rbind(cnvseg,c(q.arm[k-1,1:8],paste(c(q.arm[k,2],q.arm[k,3],q.arm[k,4]),collapse="_"))) sample.lst <- c(sample.lst, 1) } @@ -74,5 +79,5 @@ calc.lst<-function(seg, chrominfo=chrominfo,nA=7,chr.arm='no'){ } output[j] <- sum(sample.lst) } - return(output) + return(list(output, cnvseg)) } diff --git a/R/calc.lst.bak.R b/R/calc.lst.bak.R new file mode 100644 index 0000000..dd32095 --- /dev/null +++ b/R/calc.lst.bak.R @@ -0,0 +1,81 @@ +#' Determine large-scale transitions +#' +#' @param seg segmentation data +#' @param chrominfo reference genome version +#' @param nA column number of copy number of A allele +#' @param chr.arm option to use chromosome arms defined during segmentation +#' @return number of LSTs +calc.lst<-function(seg, chrominfo=chrominfo,nA=7,chr.arm='no'){ + nB <- nA+1 + samples <- unique(seg[,1]) + output <- setNames(rep(0,length(samples)), samples) + cnvseg <- NULL + for(j in samples){ + sample.seg <- seg[seg[,1] %in% j,] + sample.lst <- c() + chroms <- unique(sample.seg[,2]) + chroms <- chroms[!chroms %in% c(23,24,'chr23','chr24','chrX','chrx','chrY','chry')] + for(i in chroms){ + sample.chrom.seg <- sample.seg[sample.seg[,2] %in% i,,drop=F] + if(chr.arm !='no'){ + p.max <- if(any(sample.chrom.seg[,chr.arm] == 'p')){max(sample.chrom.seg[sample.chrom.seg[,chr.arm] == 'p',4])} + q.min <- min(sample.chrom.seg[sample.chrom.seg[,chr.arm] == 'q',3]) + } + if(nrow(sample.chrom.seg) < 2) {next} + sample.chrom.seg.new <- sample.chrom.seg + if(chr.arm == 'no'){ + p.arm <- sample.chrom.seg.new[sample.chrom.seg.new[,3] <= chrominfo[i,2],,drop=F] # split into chromosome arms + q.arm <- sample.chrom.seg.new[sample.chrom.seg.new[,4] >= chrominfo[i,3],,drop=F] + q.arm<- shrink.seg.ai(q.arm) + q.arm[1,3] <- chrominfo[i,3] + if(nrow(p.arm) > 0){ + p.arm<- shrink.seg.ai(p.arm) + p.arm[nrow(p.arm),4] <- chrominfo[i,2] + } + } + if(chr.arm != 'no'){ + q.arm <- sample.chrom.seg.new[sample.chrom.seg.new[,chr.arm] == 'q',,drop=F] + q.arm<- shrink.seg.ai(q.arm) + q.arm[1,3] <- q.min + if(any(sample.chrom.seg.new[,chr.arm] == 'p')){ + p.arm <- sample.chrom.seg.new[sample.chrom.seg.new[,chr.arm] == 'p',,drop=F] # split into chromosome arms + p.arm<- shrink.seg.ai(p.arm) + p.arm[nrow(p.arm),4] <- p.max + } + } + n.3mb <- which((p.arm[,4] - p.arm[,3]) < 3e6) + while(length(n.3mb) > 0){ + p.arm <- p.arm[-(n.3mb[1]),] + p.arm <- shrink.seg.ai(p.arm) + n.3mb <- which((p.arm[,4] - p.arm[,3]) < 3e6) + } + if(nrow(p.arm) >= 2){ + p.arm <- cbind(p.arm[,1:8], c(0,1)[match((p.arm[,4]-p.arm[,3]) >= 10e6, c('FALSE','TRUE'))]) + for(k in 2:nrow(p.arm)){ + if(p.arm[k,9] == 1 & p.arm[(k-1),9]==1 & (p.arm[k,3]-p.arm[(k-1),4]) < 3e6){ + cnvseg <- rbind(cnvseg,p.arm[k,1:8]) + sample.lst <- c(sample.lst, 1) + } + } + } + n.3mb <- which((q.arm[,4] - q.arm[,3]) < 3e6) + while(length(n.3mb) > 0){ + q.arm <- q.arm[-(n.3mb[1]),] + q.arm <- shrink.seg.ai(q.arm) + n.3mb <- which((q.arm[,4] - q.arm[,3]) < 3e6) + } + if(nrow(q.arm) >= 2){ + q.arm <- cbind(q.arm[,1:8], c(0,1)[match((q.arm[,4]-q.arm[,3]) >= 10e6, c('FALSE','TRUE'))]) + for(k in 2:nrow(q.arm)){ + if(q.arm[k,9] == 1 & q.arm[(k-1),9]==1 & (q.arm[k,3]-q.arm[(k-1),4]) < 3e6){ + cnvseg <- rbind(cnvseg,q.arm[k,1:8]) + sample.lst <- c(sample.lst, 1) + + } + } + } + } + output[j] <- sum(sample.lst) + } + return(list(output, cnvseg)) +} diff --git a/R/chrominfo_grch37.2.list b/R/chrominfo_grch37.2.list new file mode 100644 index 0000000..82778ba --- /dev/null +++ b/R/chrominfo_grch37.2.list @@ -0,0 +1,25 @@ +chrom centstart centend +chr1 121500000 128900000 +chr2 90500000 96800000 +chr3 87900000 93900000 +chr4 48200000 52700000 +chr5 46100000 50700000 +chr6 58700000 63300000 +chr7 58000000 61700000 +chr8 43100000 48100000 +chr9 47300000 50700000 +chr10 38000000 42300000 +chr11 51600000 55700000 +chr12 33300000 38200000 +chr13 16300000 19500000 +chr14 16100000 19100000 +chr15 15800000 20700000 +chr16 34600000 38600000 +chr17 22200000 25800000 +chr18 15400000 19000000 +chr19 24400000 28600000 +chr20 25600000 29400000 +chr21 10900000 14300000 +chr22 12200000 17900000 +chrX 58100000 63000000 +chrY 11600000 13400000 diff --git a/R/chrominfo_grch37.list b/R/chrominfo_grch37.list new file mode 100644 index 0000000..0056e24 --- /dev/null +++ b/R/chrominfo_grch37.list @@ -0,0 +1,25 @@ + chrom centstart centend +chr1 chr1 121500000 128900000 +chr2 chr2 90500000 96800000 +chr3 chr3 87900000 93900000 +chr4 chr4 48200000 52700000 +chr5 chr5 46100000 50700000 +chr6 chr6 58700000 63300000 +chr7 chr7 58000000 61700000 +chr8 chr8 43100000 48100000 +chr9 chr9 47300000 50700000 +chr10 chr10 38000000 42300000 +chr11 chr11 51600000 55700000 +chr12 chr12 33300000 38200000 +chr13 chr13 16300000 19500000 +chr14 chr14 16100000 19100000 +chr15 chr15 15800000 20700000 +chr16 chr16 34600000 38600000 +chr17 chr17 22200000 25800000 +chr18 chr18 15400000 19000000 +chr19 chr19 24400000 28600000 +chr20 chr20 25600000 29400000 +chr21 chr21 10900000 14300000 +chr22 chr22 12200000 17900000 +chrX chrX 58100000 63000000 +chrY chrY 11600000 13400000 diff --git a/R/chrominfo_grch38.2.list b/R/chrominfo_grch38.2.list new file mode 100644 index 0000000..502a6b2 --- /dev/null +++ b/R/chrominfo_grch38.2.list @@ -0,0 +1,25 @@ +chrom centstart centend +chr1 121700000 125100000 +chr2 91800000 96000000 +chr3 87800000 94000000 +chr4 48200000 51800000 +chr5 46100000 51400000 +chr6 58500000 62600000 +chr7 58100000 62100000 +chr8 43200000 47200000 +chr9 42200000 45500000 +chr10 38000000 41600000 +chr11 51000000 55800000 +chr12 33200000 37800000 +chr13 16500000 18900000 +chr14 16100000 18200000 +chr15 17500000 20500000 +chr16 35300000 38400000 +chr17 22700000 27400000 +chr18 15400000 21500000 +chr19 24200000 28100000 +chr20 25700000 30400000 +chr21 10900000 13000000 +chr22 13700000 17400000 +chrX 58100000 63800000 +chrY 10300000 10600000 diff --git a/R/chrominfo_grch38.list b/R/chrominfo_grch38.list new file mode 100644 index 0000000..dfb6c00 --- /dev/null +++ b/R/chrominfo_grch38.list @@ -0,0 +1,25 @@ + chrom centstart centend +chr1 chr1 121700000 125100000 +chr2 chr2 91800000 96000000 +chr3 chr3 87800000 94000000 +chr4 chr4 48200000 51800000 +chr5 chr5 46100000 51400000 +chr6 chr6 58500000 62600000 +chr7 chr7 58100000 62100000 +chr8 chr8 43200000 47200000 +chr9 chr9 42200000 45500000 +chr10 chr10 38000000 41600000 +chr11 chr11 51000000 55800000 +chr12 chr12 33200000 37800000 +chr13 chr13 16500000 18900000 +chr14 chr14 16100000 18200000 +chr15 chr15 17500000 20500000 +chr16 chr16 35300000 38400000 +chr17 chr17 22700000 27400000 +chr18 chr18 15400000 21500000 +chr19 chr19 24200000 28100000 +chr20 chr20 25700000 30400000 +chr21 chr21 10900000 13000000 +chr22 chr22 13700000 17400000 +chrX chrX 58100000 63800000 +chrY chrY 10300000 10600000 diff --git a/R/preprocess.hrd.R b/R/preprocess.hrd.R index 63f371d..48381da 100644 --- a/R/preprocess.hrd.R +++ b/R/preprocess.hrd.R @@ -5,13 +5,13 @@ preprocess.hrd<-function(seg){ seg <- seg[!seg[,2] %in% c(paste('chr',c('X','Y','x','y',23,24),sep=''),c('X','Y','x','y',23,24)),] seg[,1] <- as.character(seg[,1]) - if(! all(seg[,8] <= seg[,7]) ){ tmp <- seg seg[tmp[,8] > tmp[,7],7] <- tmp[tmp[,8] > tmp[,7],8] seg[tmp[,8] > tmp[,7],8] <- tmp[tmp[,8] > tmp[,7],7] } seg <- shrink.seg.ai.wrapper(seg) + print(seg) return(seg) } diff --git a/R/preprocess.seqz.R b/R/preprocess.seqz.R index bc3f086..6266c0c 100644 --- a/R/preprocess.seqz.R +++ b/R/preprocess.seqz.R @@ -1,50 +1,51 @@ -#' Preprocessing seqz files -#' -#' Preprocesses seqz files -#' @param seg A segmentation file -#' @param ploidy0 ploidy, optional -#' @return Output is -preprocess.seqz<-function(seg, ploidy0=NULL, chr.in.names=TRUE, outputdir=NULL){ - if (is.null(ploidy0)){ - ploidy01 = seq(1, 5.5, 0.1) - } else { - ploidy01= seq(ploidy0-0.5,ploidy0+0.5,0.1) - } - - if (is.null(outputdir)){ - outputdir = getwd() - } - - run_name<-gsub(".*/","",gsub("_small.seqz","",gsub("gz","",seg))) - if(chr.in.names){ - extract<-sequenza.extract(seg, chromosome.list=paste('chr',c(1:24),sep=''),gamma = 60, kmin = 50) - } else { - extract<-sequenza.extract(seg, chromosome.list=c(1:24),gamma = 60, kmin = 50) - } - extract.fit<-sequenza::sequenza.fit(extract, N.ratio.filter = 10, N.BAF.filter = 1, segment.filter = 3e6, mufreq.treshold = 0.10, ratio.priority = FALSE,ploidy=ploidy01, mc.cores = 1) - # sequenza.results(extract, extract.fit, out.dir = getwd(),sample.id =run_name) - - seg.tab <- do.call(rbind, extract$segments[extract$chromosomes]) - seg.len <- (seg.tab$end.pos - seg.tab$start.pos)/1e+06 - cint <- get.ci(extract.fit) - cellularity <- cint$max.cellularity - ploidy <- cint$max.ploidy - avg.depth.ratio <- mean(extract$gc$adj[, 2]) - info_seg<-c(cellularity,ploidy,avg.depth.ratio) - names(info_seg)<-c("cellularity","ploidy","avg.depth.ratio") - write.table(t(info_seg),paste0(outputdir,"/",run_name,"_info_seg.txt"),sep="\t",row.names=F) - allele.cn <- sequenza:::baf.bayes(Bf = seg.tab$Bf, CNt.max = 20, depth.ratio = seg.tab$depth.ratio, avg.depth.ratio = 1, - cellularity = cint$max.cellularity, ploidy = cint$max.ploidy, - sd.ratio = seg.tab$sd.ratio, weight.ratio = seg.len, sd.Bf = seg.tab$sd.BAF, - weight.Bf = 1, ratio.priority = FALSE, CNn = 2) - seg.tab$CN <- allele.cn[,1] - allele.cn <- as.data.table(allele.cn) - #Making imput file - seg <- data.frame(SampleID = as.character(run_name), Chromosome = seg.tab$chromosome, Start_position = seg.tab$start.pos, - End_position = seg.tab$end.pos, Nprobes = 1, total_cn = allele.cn$CNt, A_cn = allele.cn$B, - B_cn = allele.cn$A, ploidy = ploidy) - seg$contamination <- 1 - seg<-seg[!is.na(seg$A_cn),] - seg<-seg[!is.na(seg$B_cn),] - return(seg) -} +#' Preprocessing seqz files +#' +#' Preprocesses seqz files +#' @param seg A segmentation file +#' @param ploidy0 ploidy, optional +#' @return Output is +preprocess.seqz<-function(seg, ploidy0=NULL, chr.in.names=TRUE, outputdir=NULL){ + if (is.null(ploidy0)){ + ploidy01 = seq(1, 5.5, 0.1) + } else { + ploidy01= seq(ploidy0-0.5,ploidy0+0.5,0.1) + } + + if (is.null(outputdir)){ + outputdir = getwd() + } + + run_name<-gsub(".*/","",gsub("_small.seqz","",gsub("gz","",seg))) + if(chr.in.names){ + extract<-sequenza.extract(seg, chromosome.list=paste('chr',c(1:24),sep=''),gamma = 60, kmin = 50) + } else { + extract<-sequenza.extract(seg, chromosome.list=c(1:24),gamma = 60, kmin = 50) + } + extract.fit<-sequenza::sequenza.fit(extract, N.ratio.filter = 10, N.BAF.filter = 1, segment.filter = 3e6, mufreq.treshold = 0.10, ratio.priority = FALSE,ploidy=ploidy01, mc.cores = 1) + # sequenza.results(extract, extract.fit, out.dir = getwd(),sample.id =run_name) + + seg.tab <- do.call(rbind, extract$segments[extract$chromosomes]) + seg.len <- (seg.tab$end.pos - seg.tab$start.pos)/1e+06 + cint <- get.ci(extract.fit) + cellularity <- cint$max.cellularity + ploidy <- cint$max.ploidy + #avg.depth.ratio <- mean(extract$gc$adj[, 2]) + avg.depth.ratio <- extract$avg.depth.ratio + info_seg<-c(cellularity,ploidy,avg.depth.ratio) + names(info_seg)<-c("cellularity","ploidy","avg.depth.ratio") + write.table(t(info_seg),paste0(outputdir,"/",run_name,"_info_seg.txt"),sep="\t",row.names=F) + allele.cn <- sequenza:::baf.bayes(Bf = seg.tab$Bf, CNt.max = 20, depth.ratio = seg.tab$depth.ratio, avg.depth.ratio = 1, + cellularity = cint$max.cellularity, ploidy = cint$max.ploidy, + sd.ratio = seg.tab$sd.ratio, weight.ratio = seg.len, sd.Bf = seg.tab$sd.BAF, + weight.Bf = 1, ratio.priority = FALSE, CNn = 2) + seg.tab$CN <- allele.cn[,1] + allele.cn <- as.data.frame(allele.cn) + #Making imput file + seg <- data.frame(SampleID = as.character(run_name), Chromosome = seg.tab$chromosome, Start_position = seg.tab$start.pos, + End_position = seg.tab$end.pos, Nprobes = 1, total_cn = allele.cn$CNt, A_cn = allele.cn$B, + B_cn = allele.cn$A, ploidy = ploidy) + seg$contamination <- 1 + seg<-seg[!is.na(seg$A_cn),] + seg<-seg[!is.na(seg$B_cn),] + return(seg) +} diff --git a/R/scar_score.R b/R/scar_score.R index 95be90f..2e072f9 100644 --- a/R/scar_score.R +++ b/R/scar_score.R @@ -1,76 +1,103 @@ -#' Scar score -#' -#' Determining genomic scar score (telomeric allelic imbalance, loss-off heterozigosity, large-scle transitions), signs of homologous recombination deficiency -#' @param seg Imput file: either a binned sequenza out file (or any other segmentation file with the following columns: chromosome, position, base.ref, depth.normal, depth.tumor, depth.ratio, Af, Bf, zygosity.normal, GC.percent, good.reads, AB.normal, AB.tumor, tumor.strand) or an allele-specific segmentation file with the following columns: 1st column: sample name, 2nd column: chromosome, 3rd column: segmentation start, 4th column: segmentation end, 5th column: total copynumber, 6th column: copy number of A allele, 7th column: copy number of B allele -#' @param reference: reference genome: either grch38 or grch37. Default is grch38. -#' @param seqz Optional parameter, set to TRUE, if input file is a binned sequenza file. -#' @param ploidy Optional parameter, may be used if the ploidy of the sample is known. -#' @param chr.in.names Optional parameter, default: TRUE, set to FALSE if input file does not contain 'chr' in chromosome names. -#' @return Output is, with the following columns: HRD Telomeric AI Mean size Interstitial AI Mean Size Whole chr AI Telomeric LOH Mean size Interstitial LOH Mean Size Whole chr LOH Ploidy Aberrant cell fraction LST HRDscore adjustedHRDscore -#' @export -#' @import sequenza -#' @import data.table -scar_score<-function(seg,reference = "grch38", chr.in.names=TRUE, m,seqz=FALSE, ploidy=NULL, sizelimitLOH=15e6, outputdir=NULL){ - - if (is.null(outputdir)){ - outputdir=getwd() - } - if (reference == "grch38"){ - chrominfo = chrominfo_grch38 - if(!chr.in.names){ - chrominfo$chrom = gsub("chr","",chrominfo$chr) - rownames(chrominfo) = chrominfo$chr - } - } else if(reference == "grch37"){ - chrominfo = chrominfo_grch37 - if(!chr.in.names){ - chrominfo$chrom = gsub("chr","",chrominfo$chr) - rownames(chrominfo) = chrominfo$chr - } - } else if(reference == "mouse"){ - chrominfo = chrominfo_mouse - if(!chr.in.names){ - chrominfo$chrom = gsub("chr","",chrominfo$chr) - rownames(chrominfo) = chrominfo$chr - } - } else { - stop() - } - - if (seqz==TRUE){ - cat('Preprocessing started...\n') - seg<-preprocess.seqz(seg, ploidy0=ploidy, chr.in.names=chr.in.names) - cat('Preprocessing finished \n') - } else { - seg<-read.table(seg,header=T, check.names = F, stringsAsFactors = F, sep="\t") - seg[,9]<-seg[,8] - seg[,8]<-seg[,7] - seg[,7]<-seg[,6] - seg[,10]<-rep(1,dim(seg)[1]) - - } - #prep - cat('Determining HRD-LOH, LST, TAI \n') - seg<-preprocess.hrd(seg) - #Calculating the hrd score: - res_hrd <- calc.hrd(seg,sizelimit1=sizelimitLOH) - #Calculating the telomeric allelic imbalance score: - res_ai<- calc.ai_new(seg = seg, chrominfo = chrominfo) #<-- the first column is what I need - #Calculating the large scale transition scores: - res_lst <- calc.lst(seg = seg, chrominfo = chrominfo) #<-- You need to use the chrominfo.snp6 file! Nicolai sent it to you! - sum_HRD0<-res_lst+res_hrd+res_ai[1] - - if (is.null(ploidy)){ - sum_HRDc<-NA - } else { - sum_HRDc<-res_lst-15.5*ploidy+res_hrd+res_ai[1] - } - - #HRDresulst<-c(res_hrd,res_ai,res_lst,sum_HRD0,sum_HRDc) - #names(HRDresulst)<-c("HRD",colnames(res_ai),"LST", "HRD-sum","adjusted-HRDsum") - HRDresulst<-c(res_hrd,res_ai[1],res_lst,sum_HRD0) - names(HRDresulst)<-c("HRD",colnames(res_ai)[1],"LST", "HRD-sum") - run_name<-names(sum_HRD0) - write.table(t(HRDresulst),paste0(outputdir,"/",run_name,"_HRDresults.txt"),col.names=NA,sep="\t",row.names=unique(seg[,1])) - return(t(HRDresulst)) -} +#' Scar score +#' +#' Determining genomic scar score (telomeric allelic imbalance, loss-off heterozigosity, large-scle transitions), signs of homologous recombination deficiency +#' @param seg Imput file: either a binned sequenza out file (or any other segmentation file with the following columns: chromosome, position, base.ref, depth.normal, depth.tumor, depth.ratio, Af, Bf, zygosity.normal, GC.percent, good.reads, AB.normal, AB.tumor, tumor.strand) or an allele-specific segmentation file with the following columns: 1st column: sample name, 2nd column: chromosome, 3rd column: segmentation start, 4th column: segmentation end, 5th column: total copynumber, 6th column: copy number of A allele, 7th column: copy number of B allele +#' @param reference: reference genome: either grch38 or grch37. Default is grch38. +#' @param seqz Optional parameter, set to TRUE, if input file is a binned sequenza file. +#' @param ploidy Optional parameter, may be used if the ploidy of the sample is known. +#' @param chr.in.names Optional parameter, default: TRUE, set to FALSE if input file does not contain 'chr' in chromosome names. +#' @return Output is, with the following columns: HRD Telomeric AI Mean size Interstitial AI Mean Size Whole chr AI Telomeric LOH Mean size Interstitial LOH Mean Size Whole chr LOH Ploidy Aberrant cell fraction LST HRDscore adjustedHRDscore +#' @export +#' @import sequenza +#' @import data.table +#' +args <- commandArgs(T) +source("/mnt/GenePlus002/genecloud/Org_terminal/org_52/terminal/licq_18810756054/Research/HRD/07_we7h/013_HRD_CNV/code/scarHRD/R/calc.lst.bak.R") +source("/mnt/GenePlus002/genecloud/Org_terminal/org_52/terminal/licq_18810756054/Research/HRD/07_we7h/013_HRD_CNV/code/scarHRD/R/calc.ai_new.R") +source("/mnt/GenePlus002/genecloud/Org_terminal/org_52/terminal/licq_18810756054/Research/HRD/07_we7h/013_HRD_CNV/code/scarHRD/R/preprocess.hrd.R") +source("/mnt/GenePlus002/genecloud/Org_terminal/org_52/terminal/licq_18810756054/Research/HRD/07_we7h/013_HRD_CNV/code/scarHRD/R/shrink.seg.ai.wrapper.R") +source("/mnt/GenePlus002/genecloud/Org_terminal/org_52/terminal/licq_18810756054/Research/HRD/07_we7h/013_HRD_CNV/code/scarHRD/R/shrink.seg.ai.R") +source("/mnt/GenePlus002/genecloud/Org_terminal/org_52/terminal/licq_18810756054/Research/HRD/07_we7h/013_HRD_CNV/code/scarHRD/R/calc.hrd.R") +chrominfo_grch37 = read.table("/mnt/GenePlus002/genecloud/Org_terminal/org_52/terminal/licq_18810756054/Research/HRD/07_we7h/013_HRD_CNV/code/scarHRD/R/chrominfo_grch37.2.list", header = T) +chrominfo_grch38 = read.table("/mnt/GenePlus002/genecloud/Org_terminal/org_52/terminal/licq_18810756054/Research/HRD/07_we7h/013_HRD_CNV/code/scarHRD/R/chrominfo_grch38.2.list", header = T) + + +scar_score<-function(seg,reference = "grch38", seqz=FALSE, facets=FALSE, facets.stat=NULL, ploidy=NULL, sizelimitLOH=15e6, outputdir=NULL){ + + if (is.null(outputdir)){ + outputdir=getwd() + } + if (reference == "grch38"){ + chrominfo = chrominfo_grch38 + } else if(reference == "grch37"){ + chrominfo = chrominfo_grch37 + } else { + stop() + } + + purity <- NULL + emflags <- NULL + if (seqz==TRUE){ + cat('Preprocessing started...\n') + seg<-preprocess.seqz(seg,ploidy0=ploidy) + cat('Preprocessing finished \n') + } else if(facets==TRUE){ + cat('Using Facets result, Preprocessing started...\n') + if (! is.null(facets.stat)){ + tmp <- read.table(facets.stat,header=F,stringsAsFactors = F, sep="\t" ) + if (tmp$V1[3] == "ploidy" && tmp$V2[3] != "NA"){ + cat ("I:ploidy is ", tmp$V2[3], "\n") + ploidy <- as.numeric(tmp$V2[3]) + } + if (is.na(tmp$V2[5])){ + cat ("W:possible unreliable facets result, ", tmp$V2[5], "\n") + } + purity <- tmp$V2[2] + emflags <- tmp$V2[5] + } + seg<-preprocess.facets(seg, ploidy=ploidy) + cat('Preprocessing finished \n') + }else { + seg<-read.table(seg,header=T, check.names = F, stringsAsFactors = F, sep="\t") + seg[,9]<-seg[,8] + seg[,8]<-seg[,7] + seg[,7]<-seg[,6] + seg[,6]<-seg[,5] + seg[,10]<-rep(1,dim(seg)[1]) + + } + #prep + cat('Determining HRD-LOH, LST, TAI \n') + seg<-preprocess.hrd(seg) + #Calculating the hrd score: + out_loh <- calc.hrd(seg,sizelimit1=sizelimitLOH) + res_hrd = out_loh[[1]] + segLOH = out_loh[[2]] + #Calculating the telomeric allelic imbalance score: + out_tai<- calc.ai_new(seg = seg, chrominfo = chrominfo, min.size = 11e6) #<-- the first column is what I need + res_ai <- out_tai[[1]] + seg_ai <- out_tai[[2]] + #Calculating the large scale transition scores: + out_lst <- calc.lst(seg = seg, chrominfo = chrominfo) #<-- You need to use the chrominfo.snp6 file! Nicolai sent it to you! + res_lst <- out_lst[[1]] + seg_lst <- out_lst[[2]] + sum_HRD0<-res_lst+res_hrd+res_ai[1] + if (is.null(ploidy)){ + sum_HRDc<-NA + } else { + sum_HRDc<-res_lst-15.5*ploidy+res_hrd+res_ai[1] + } + + #HRDresulst<-c(res_hrd,res_ai,res_lst,sum_HRD0,sum_HRDc) + #names(HRDresulst)<-c("HRD",colnames(res_ai),"LST", "HRD-sum","adjusted-HRDsum") + HRDresulst<-as.character(c(res_hrd,res_ai[1],res_lst,sum_HRD0, toString(purity), toString(ploidy), toString(emflags))) + names(HRDresulst)<-c("HRD-LOH",colnames(res_ai)[1],"LST", "HRD-sum", "cnv-purity", "cnv-ploidy", "cnv-emflags") + run_name<-names(sum_HRD0) + write.table(t(HRDresulst),paste0(outputdir,"/",run_name,"_HRDresults.txt"),col.names=NA,sep="\t",row.names=unique(seg[,1]),quote = FALSE) + write.table(segLOH,paste0(outputdir,"/",run_name,"_cnv.LOH.txt"),sep="\t",quote = FALSE,row.names=FALSE) + write.table(seg_ai,paste0(outputdir,"/",run_name,"_cnv.TAI.txt"),sep="\t",quote = FALSE,row.names = FALSE) + write.table(seg_lst,paste0(outputdir,"/",run_name,"_cnv.LST.txt"),sep="\t",quote = FALSE,row.names = FALSE) + return(t(HRDresulst)) +} + +scar_score(args[1],reference = "grch37",outputdir=args[2]) diff --git a/R/scar_score.seqz.R b/R/scar_score.seqz.R new file mode 100644 index 0000000..71bdbc4 --- /dev/null +++ b/R/scar_score.seqz.R @@ -0,0 +1,104 @@ +#' Scar score +#' +#' Determining genomic scar score (telomeric allelic imbalance, loss-off heterozigosity, large-scle transitions), signs of homologous recombination deficiency +#' @param seg Imput file: either a binned sequenza out file (or any other segmentation file with the following columns: chromosome, position, base.ref, depth.normal, depth.tumor, depth.ratio, Af, Bf, zygosity.normal, GC.percent, good.reads, AB.normal, AB.tumor, tumor.strand) or an allele-specific segmentation file with the following columns: 1st column: sample name, 2nd column: chromosome, 3rd column: segmentation start, 4th column: segmentation end, 5th column: total copynumber, 6th column: copy number of A allele, 7th column: copy number of B allele +#' @param reference: reference genome: either grch38 or grch37. Default is grch38. +#' @param seqz Optional parameter, set to TRUE, if input file is a binned sequenza file. +#' @param ploidy Optional parameter, may be used if the ploidy of the sample is known. +#' @param chr.in.names Optional parameter, default: TRUE, set to FALSE if input file does not contain 'chr' in chromosome names. +#' @return Output is, with the following columns: HRD Telomeric AI Mean size Interstitial AI Mean Size Whole chr AI Telomeric LOH Mean size Interstitial LOH Mean Size Whole chr LOH Ploidy Aberrant cell fraction LST HRDscore adjustedHRDscore +#' @export +#' @import sequenza +#' @import data.table +#' +args <- commandArgs(T) +source("/mnt/GenePlus002/genecloud/Org_terminal/org_52/terminal/licq_18810756054/Research/HRD/07_we7h/013_HRD_CNV/code/scarHRD/R/calc.lst.bak.R") +source("/mnt/GenePlus002/genecloud/Org_terminal/org_52/terminal/licq_18810756054/Research/HRD/07_we7h/013_HRD_CNV/code/scarHRD/R/calc.ai_new.R") +source("/mnt/GenePlus002/genecloud/Org_terminal/org_52/terminal/licq_18810756054/Research/HRD/07_we7h/013_HRD_CNV/code/scarHRD/R/preprocess.hrd.R") +source("/mnt/GenePlus002/genecloud/Org_terminal/org_52/terminal/licq_18810756054/Research/HRD/07_we7h/013_HRD_CNV/code/scarHRD/R/preprocess.seqz.R") +source("/mnt/GenePlus002/genecloud/Org_terminal/org_52/terminal/licq_18810756054/Research/HRD/07_we7h/013_HRD_CNV/code/scarHRD/R/shrink.seg.ai.wrapper.R") +source("/mnt/GenePlus002/genecloud/Org_terminal/org_52/terminal/licq_18810756054/Research/HRD/07_we7h/013_HRD_CNV/code/scarHRD/R/shrink.seg.ai.R") +source("/mnt/GenePlus002/genecloud/Org_terminal/org_52/terminal/licq_18810756054/Research/HRD/07_we7h/013_HRD_CNV/code/scarHRD/R/calc.hrd.R") +chrominfo_grch37 = read.table("/mnt/GenePlus002/genecloud/Org_terminal/org_52/terminal/licq_18810756054/Research/HRD/07_we7h/013_HRD_CNV/code/scarHRD/R/chrominfo_grch37.2.list", header = T) +chrominfo_grch38 = read.table("/mnt/GenePlus002/genecloud/Org_terminal/org_52/terminal/licq_18810756054/Research/HRD/07_we7h/013_HRD_CNV/code/scarHRD/R/chrominfo_grch38.2.list", header = T) + + +scar_score<-function(seg,reference = "grch38", seqz=FALSE, facets=FALSE, facets.stat=NULL, ploidy=NULL, sizelimitLOH=15e6, outputdir=NULL){ + + if (is.null(outputdir)){ + outputdir=getwd() + } + if (reference == "grch38"){ + chrominfo = chrominfo_grch38 + } else if(reference == "grch37"){ + chrominfo = chrominfo_grch37 + } else { + stop() + } + + purity <- NULL + emflags <- NULL + if (seqz==TRUE){ + cat('Preprocessing started...\n') + seg<-preprocess.seqz(seg,ploidy0=ploidy) + cat('Preprocessing finished \n') + } else if(facets==TRUE){ + cat('Using Facets result, Preprocessing started...\n') + if (! is.null(facets.stat)){ + tmp <- read.table(facets.stat,header=F,stringsAsFactors = F, sep="\t" ) + if (tmp$V1[3] == "ploidy" && tmp$V2[3] != "NA"){ + cat ("I:ploidy is ", tmp$V2[3], "\n") + ploidy <- as.numeric(tmp$V2[3]) + } + if (is.na(tmp$V2[5])){ + cat ("W:possible unreliable facets result, ", tmp$V2[5], "\n") + } + purity <- tmp$V2[2] + emflags <- tmp$V2[5] + } + seg<-preprocess.facets(seg, ploidy=ploidy) + cat('Preprocessing finished \n') + }else { + seg<-read.table(seg,header=T, check.names = F, stringsAsFactors = F, sep="\t") + seg[,9]<-seg[,8] + seg[,8]<-seg[,7] + seg[,7]<-seg[,6] + seg[,6]<-seg[,5] + seg[,10]<-rep(1,dim(seg)[1]) + + } + #prep + cat('Determining HRD-LOH, LST, TAI \n') + seg<-preprocess.hrd(seg) + #Calculating the hrd score: + out_loh <- calc.hrd(seg,sizelimit1=sizelimitLOH) + res_hrd = out_loh[[1]] + segLOH = out_loh[[2]] + #Calculating the telomeric allelic imbalance score: + out_tai<- calc.ai_new(seg = seg, chrominfo = chrominfo, min.size = 11e6) #<-- the first column is what I need + res_ai <- out_tai[[1]] + seg_ai <- out_tai[[2]] + #Calculating the large scale transition scores: + out_lst <- calc.lst(seg = seg, chrominfo = chrominfo) #<-- You need to use the chrominfo.snp6 file! Nicolai sent it to you! + res_lst <- out_lst[[1]] + seg_lst <- out_lst[[2]] + sum_HRD0<-res_lst+res_hrd+res_ai[1] + if (is.null(ploidy)){ + sum_HRDc<-NA + } else { + sum_HRDc<-res_lst-15.5*ploidy+res_hrd+res_ai[1] + } + + #HRDresulst<-c(res_hrd,res_ai,res_lst,sum_HRD0,sum_HRDc) + #names(HRDresulst)<-c("HRD",colnames(res_ai),"LST", "HRD-sum","adjusted-HRDsum") + HRDresulst<-as.character(c(res_hrd,res_ai[1],res_lst,sum_HRD0, toString(purity), toString(ploidy), toString(emflags))) + names(HRDresulst)<-c("HRD-LOH",colnames(res_ai)[1],"LST", "HRD-sum", "cnv-purity", "cnv-ploidy", "cnv-emflags") + run_name<-names(sum_HRD0) + write.table(t(HRDresulst),paste0(outputdir,"/",run_name,"_HRDresults.txt"),col.names=NA,sep="\t",row.names=unique(seg[,1]),quote = FALSE) + write.table(segLOH,paste0(outputdir,"/",run_name,"_cnv.LOH.txt"),sep="\t",quote = FALSE,row.names=FALSE) + write.table(seg_ai,paste0(outputdir,"/",run_name,"_cnv.TAI.txt"),sep="\t",quote = FALSE,row.names = FALSE) + write.table(seg_lst,paste0(outputdir,"/",run_name,"_cnv.LST.txt"),sep="\t",quote = FALSE,row.names = FALSE) + return(t(HRDresulst)) +} + +scar_score(args[1],reference = "grch37",seqz=TRUE,outputdir=args[2]) diff --git a/R/sysdata.rda b/R/sysdata.rda index cadac9873f3babfc6d03dd8ae592e9df2c7c1454..1e25b175f46a2d46cbe1bd1837df8285ebbe281b 100644 GIT binary patch literal 726 zcmZ>Y%CIzaj8qGb+RHAw2Gql&;Nh9ondjia(e#l=kAZ9 zKS-3tA7F~TJbR~0;;MPGW-O3sUt#i<0SGeK8W?yP7?^z-7#J5^$Xu+}YkZ|S)c>%j zQ3FUCB!>htSu_|J_yZVH7>uOA!Z2yp5LNY8{h2JxZoWWs*cyPQUbw)JA(g>k0^~C* zFsO1(30ZYneUX48&u{l#H~%k|yg23c%{u-5uK}w%4t8mjXIza6by>osu(Cry;rfmT z?JFl8J;)-Ecqk)5h2f0kWRtWr4;>rTOIgNsaZBh-Vo8c&xV2!agl0iwj*p&4f~>cZ zrJV#Dk7vq~=GIHilb*#$JY-DeXkg))JW)W^rr)OK(h;%kPcP@yUPzrJs;Am1W|(ezKDc4~M(uNfN-iCaLar@GbRPLc z>#f(~YEto-Xd+%LEbK8cT*+-)#_P4UW%vH8E`Jt~y!W=mYU5>7jVye(Scx?KT=g^V zVO?j{q5~5ySTKLAGk2Di$O!GU*s<$b^tH%qOD?FqemUjZ$!l#9jzTI*4qY3ieAgCb zhkCE@v~K(MHC%WhlknoUOpkR}UKYi+cu!;Eo>N*kN8rP`H_NnF%e=}ATNUywZt60r zBL`lr7b?;SkkI+5?*E%Rs9=c`-&B?-+DVcT7efj@KF=~PuDfAQD)7oyW4ds zBJ{zB9T?8{-8tu;Gl%n?#x2cGN3#O}2#_4}l}rdZwG9=DO98N9lOeEy-L!?ijz~DT zC)5^hZIAdmoNhCyC=qFXGvAd-Kbz@SAz($oP61a6*d^d90lNk45%8uYmaJSGMN%OW zMI=soWN)tww?I$lxiQY9cs@vHMh&v2OeNAqgT&u6G7w|I?JSeBV4K;yv!`!;?w1Tk4*7|?i~t8-}-d;9+xfMqr6-duj*&0 zrPWC2zPpy9<7-($_hWg9c(|}ZP3O@j3~BH+Navb#Qqs;zVLc@eos`@@CF7TF!1wA@ zeBd#VrY?@@Q;KB2jEe7a^djU4s*CPOy`CL z|6G97Nhb9jkva_~8F&F$6b1>W!4P ogtd`g3fZAe5AD2lis!9U&AYoZ@TUc9?Fg9t1;NT1+zJc;0H4uTZ2$lO diff --git a/README.md b/README.md index de9eb03..9e6c86e 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ scarHRD R package Manual Introduction ============ +This is a adaptation of the original `scarHRD` R package (https://github.com/sztup/scarHRD). Some modifications have been done. `scarHRD` is an R package which determines the levels of homologous recombination deficiency (telomeric allelic imbalance, loss off heterozygosity, number of large-scale transitions) based on NGS (WES, WGS) data. @@ -55,7 +56,7 @@ Installation ``` r library(devtools) -install_github('sztup/scarHRD',build_vignettes = TRUE) +install_github('AmoydxGXP/scarHRD',build_vignettes = TRUE) ``` Running on GRCh38