From 710fc4b1e389f90e9a518d10fc589e276399c6a1 Mon Sep 17 00:00:00 2001 From: mnshgl0110 Date: Wed, 8 Mar 2023 11:46:08 +0100 Subject: [PATCH] added simple-event-annotation as a command line script --- src/main/r/simple-event-annotation.R | 64 ++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100755 src/main/r/simple-event-annotation.R diff --git a/src/main/r/simple-event-annotation.R b/src/main/r/simple-event-annotation.R new file mode 100755 index 000000000..dd3799ad2 --- /dev/null +++ b/src/main/r/simple-event-annotation.R @@ -0,0 +1,64 @@ +# if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") +#BiocManager::install("StructuralVariantAnnotation") +#install.packages("stringr") +suppressMessages(library(VariantAnnotation)) +suppressMessages(library(StructuralVariantAnnotation)) +suppressMessages(library(stringr)) +args=commandArgs(trailingOnly = TRUE) + +################################################################################ +# Check arguments +if(length(args)!=3){ + print("Incorrect number of arguments.\nUsage:\nRscript simple-event-annotation.R ") + quit() +} +if(!(file.exists(args[1]))){ + print(paste("Cannot open input file:", args[1])) + quit() +} +fin=args[1] +genome=args[2] +prefix=args[3] +################################################################################ +#' Simple SV type classifier +simpleEventType <- function(gr) { + pgr = partner(gr) + return(ifelse(seqnames(gr) != seqnames(pgr), "CTX", # inter-chromosomosal + ifelse(strand(gr) == strand(pgr), "INV", + ifelse(gr$insLen >= abs(gr$svLen) * 0.7, "INS", # TODO: improve classification of complex events + ifelse(xor(start(gr) < start(pgr), strand(gr) == "-"), "DEL", + "DUP"))))) +} +# using the example in the GRIDSS /example directory +vcf <- readVcf(file=fin, genome=genome) +info(header(vcf)) = unique(as(rbind(as.data.frame(info(header(vcf))), data.frame( + row.names=c("SIMPLE_TYPE"), + Number=c("1"), + Type=c("String"), + Description=c("Simple event type annotation based purely on breakend position and orientation."))), "DataFrame")) +gr <- breakpointRanges(vcf) +svtype <- simpleEventType(gr) +info(vcf)$SIMPLE_TYPE <- NA_character_ +info(vcf[gr$sourceId])$SIMPLE_TYPE <- svtype +# info(vcf[gr$sourceId])$SVLEN <- gr$svLen +vcf@info[gr$sourceId, 'SVLEN'] <- gr$svLen # Create a new entry for SVLEN and add corresponding data +writeVcf(vcf, paste0(prefix, ".annotated.vcf")) # generated by example/gridss.sh + +# TODO: perform event filtering here +# By default, GRIDSS is very sensitive but this comes at the cost of a high false discovery rate +gr <- gr[gr$FILTER == "PASS" & partner(gr)$FILTER == "PASS"] # Remove low confidence calls + +simplegr <- gr[simpleEventType(gr) %in% c("INS", "INV", "DEL", "DUP")] +simplebed <- data.frame( + chrom=seqnames(simplegr), + # call the centre of the homology/inexact interval + start=as.integer((start(simplegr) + end(simplegr)) / 2), + end=as.integer((start(partner(simplegr)) + end(partner(simplegr))) / 2), + name=simpleEventType(simplegr), + score=simplegr$QUAL, + strand="." +) +# Just the lower of the two breakends so we don't output everything twice +simplebed <- simplebed[simplebed$start < simplebed$end,] +write.table(simplebed, paste0(prefix,".simple.bed"), quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE) +