From d9f0f8bb30deb42d184fde4a74ccfa4382d7cdc1 Mon Sep 17 00:00:00 2001 From: Daniel Cameron Date: Mon, 11 Feb 2019 20:15:01 +1100 Subject: [PATCH] Working gridss_targeted.sh example gridss_targeted.sh now calculates metrics based on first 10,000,000 reads Models have been updated to be more robust to approximate metrics Version bumped to 2.2.0 --- docker/Dockerfile | 6 +- example/example_regions_of_interest.bed | 1 + example/gridss.sh | 2 +- ..._fully_integrated_fastq_to_vcf_pipeline.sh | 2 +- example/gridss_separate.sh | 2 +- example/gridss_targeted.sh | 132 ++++++++++++------ example/somatic.sh | 2 +- pom.xml | 2 +- scripts/cohort_analysis/0_README.txt | 4 +- scripts/cohort_analysis/1_setup.sh | 2 +- .../2_generate_analysis_scripts.py | 2 +- scripts/cohort_analysis/4_cleanup.py | 2 +- .../wehi/idsv/metrics/IdsvSamFileMetrics.java | 2 +- ...FastEmpiricalReferenceLikelihoodModel.java | 2 +- .../java/au/edu/wehi/idsv/model/Models.java | 9 +- ...llectGridssMetricsAndExtractFullReads.java | 99 +++++++++++-- ...CollectGridssMetricsAndExtractSVReads.java | 4 - src/main/java/gridss/ExtractFullReads.java | 2 +- .../java/gridss/IndexedExtractFullReads.java | 8 +- .../analysis/CigarSizeDistribution.java | 2 +- ...ucturalVariantReadsCommandLineProgram.java | 3 + 21 files changed, 203 insertions(+), 87 deletions(-) create mode 100644 example/example_regions_of_interest.bed diff --git a/docker/Dockerfile b/docker/Dockerfile index ce98bc7d0..52ae5e374 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -6,7 +6,7 @@ FROM ubuntu:16.04 LABEL base.image="biocontainers:latest" LABEL version="1" LABEL software="GRIDSS" -LABEL software.version="2.0.1" +LABEL software.version="2.2.0" LABEL about.summary="Genomic Rearrangement IDentification Software Suite" LABEL about.home="https://github.com/PapenfussLab/gridss" LABEL about.tags="Genomics" @@ -32,6 +32,6 @@ RUN chmod 777 /data USER gridss RUN mkdir /data/gridss WORKDIR /data/gridss -RUN wget https://github.com/PapenfussLab/gridss/releases/download/v2.0.1/gridss-2.0.1-gridss-jar-with-dependencies.jar +RUN wget https://github.com/PapenfussLab/gridss/releases/download/v2.2.0/gridss-2.2.0-gridss-jar-with-dependencies.jar -ENTRYPOINT ["java", "-ea", "-Xmx16g", "-Dsamjdk.create_index=true", "-Dsamjdk.use_async_io_read_samtools=true", "-Dsamjdk.use_async_io_write_samtools=true", "-Dsamjdk.use_async_io_write_tribble=true", "-Dgridss.gridss.output_to_temp_file=true", "-cp", "/data/gridss/gridss-2.0.1-gridss-jar-with-dependencies.jar", "gridss.CallVariants", "WORKER_THREADS=4"] +ENTRYPOINT ["java", "-ea", "-Xmx16g", "-Dsamjdk.create_index=true", "-Dsamjdk.use_async_io_read_samtools=true", "-Dsamjdk.use_async_io_write_samtools=true", "-Dsamjdk.use_async_io_write_tribble=true", "-Dgridss.gridss.output_to_temp_file=true", "-cp", "/data/gridss/gridss-2.2.0-gridss-jar-with-dependencies.jar", "gridss.CallVariants", "WORKER_THREADS=4"] diff --git a/example/example_regions_of_interest.bed b/example/example_regions_of_interest.bed new file mode 100644 index 000000000..b6bb0623a --- /dev/null +++ b/example/example_regions_of_interest.bed @@ -0,0 +1 @@ +chr12 1527326 1528326 \ No newline at end of file diff --git a/example/gridss.sh b/example/gridss.sh index 584f17a20..8ae6e69ff 100644 --- a/example/gridss.sh +++ b/example/gridss.sh @@ -7,7 +7,7 @@ BLACKLIST=wgEncodeDacMapabilityConsensusExcludable.bed REFERENCE=hg19.fa OUTPUT=${INPUT/.bam/.sv.vcf} ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam} -GRIDSS_JAR=../target/gridss-2.1.1-gridss-jar-with-dependencies.jar 
+GRIDSS_JAR=../target/gridss-2.2.0-gridss-jar-with-dependencies.jar
 
 if [[ ! -f "$INPUT" ]] ; then
 	echo "Missing $INPUT input file."
diff --git a/example/gridss_fully_integrated_fastq_to_vcf_pipeline.sh b/example/gridss_fully_integrated_fastq_to_vcf_pipeline.sh
index 38c8c96d8..1a8754bf7 100644
--- a/example/gridss_fully_integrated_fastq_to_vcf_pipeline.sh
+++ b/example/gridss_fully_integrated_fastq_to_vcf_pipeline.sh
@@ -41,7 +41,7 @@ REFERENCE=~/reference_genomes/human/hg19.fa
 INPUT=pipelined.example.input.bam
 OUTPUT=pipelined.example.sv.vcf
 RAW_GRIDSS_ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam}
-GRIDSS_JAR=~/bin/gridss-2.1.1-jar-with-dependencies.jar
+GRIDSS_JAR=~/bin/gridss-2.2.0-jar-with-dependencies.jar
 GRIDSS_JVM_ARGS="
 -Dsamjdk.use_async_io_read_samtools=true
 -Dsamjdk.use_async_io_write_samtools=true
diff --git a/example/gridss_separate.sh b/example/gridss_separate.sh
index 062e503cb..9eb640de1 100644
--- a/example/gridss_separate.sh
+++ b/example/gridss_separate.sh
@@ -7,7 +7,7 @@ BLACKLIST=wgEncodeDacMapabilityConsensusExcludable.bed
 REFERENCE=hg19.fa
 OUTPUT=${INPUT/.bam/.sv.vcf}
 ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam}
-GRIDSS_JAR=../target/gridss-2.1.1-gridss-jar-with-dependencies.jar
+GRIDSS_JAR=../target/gridss-2.2.0-gridss-jar-with-dependencies.jar
 
 WORKING_DIR=.
 
diff --git a/example/gridss_targeted.sh b/example/gridss_targeted.sh
index acfd0c19c..08c1bda44 100644
--- a/example/gridss_targeted.sh
+++ b/example/gridss_targeted.sh
@@ -2,25 +2,33 @@
 #
 # Runs GRIDSS on a set of targeted regions
 #
-# Note that to properly call compound breakpoints, this script should be run
+#
+# WARNING: reference read counts will only be correct in targeted regions.
+# If a breakpoint goes from a targeted region to a region not in the BED of
+# interest then the untargeted breakend will be reported as a homozygous
+# variant even when it is not (since the reference-supporting reads were not
+# included). This can be fixed by rerunning gridss.AnnotateReferenceReads
+# against the original input file directly, or as explained below.
+#
+# To properly call compound breakpoints, this script should be run
 # twice: once for the initial GRIDSS calls, then again on the GRIDSS calls.
 # This enables accurate calling of complex events for which only some of the
-# breakpoints involved fall within the initially targeted region
+# breakpoints involved fall within the initially targeted region, as well
+# as fixing the above issue with reference read counts.
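+#
+# For example, the second pass could be driven by converting the first-pass
+# calls back into a BED (an illustrative sketch, not part of this pipeline;
+# the VCF name below just follows this script's defaults):
+#   Rscript gridss_targeted_vcf_to_region_bed.R \
+#     --input chr12.1527326.DEL1024.targeted.sv.vcf \
+#     --output second_pass_regions.bed
+# then point BED_REGIONS_OF_INTEREST at second_pass_regions.bed and rerun.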
 #
-BED_REGIONS_OF_INTEREST=targetted_regions.bed # If it's already a bed then you don't need the next two lines
-VCF_REGIONS_OF_INTEREST=targetted_regions.vcf # If your input is a VCF you'll need to convert it to a BED file
-Rscript gridss_targeted_vcf_to_region_bed.R --input $VCF_REGIONS_OF_INTEREST --ouput #BED_REGIONS_OF_INTEREST
+BED_REGIONS_OF_INTEREST=example_regions_of_interest.bed
+# If you have a VCF file, you can use the following script to convert it to a
+# BED file that correctly handles symbolic SVs
+#VCF_REGIONS_OF_INTEREST=targetted_regions.vcf # If your input is a VCF you'll need to convert it to a BED file
+#Rscript gridss_targeted_vcf_to_region_bed.R --input $VCF_REGIONS_OF_INTEREST --output $BED_REGIONS_OF_INTEREST
 
-REGIONS_OF_INTEREST=colo829.region.bed
-REFERENCE=/data/refgenomes/Homo_sapiens.GRCh37.GATK.illumina/Homo_sapiens.GRCh37.GATK.illumina.fasta
-OUTPUT=colo829.lite.vcf
-ASSEMBLY=assembly.lite.bam
-GRIDSS_JAR=gridss-2.1.2-gridss-jar-with-dependencies.jar
-
-# This example uses a paired tumour/normal
-NORMAL=COLO829R_dedup.realigned.bam
-TUMOUR=COLO829T_dedup.realigned.bam
+REGIONS_OF_INTEREST=$BED_REGIONS_OF_INTEREST
+REFERENCE=hg19.fa
+INPUT=chr12.1527326.DEL1024.bam
+OUTPUT=${INPUT/.bam/.targeted.sv.vcf}
+ASSEMBLY=${INPUT/.bam/.targeted.assembly.bam}
+GRIDSS_JAR=../target/gridss-2.2.0-gridss-jar-with-dependencies.jar
 
 # Extract the reads flanking each breakpoint, not just overlapping.
 # 2k is chosen since it should be larger than the fragment size
@@ -29,18 +37,8 @@ TUMOUR=COLO829T_dedup.realigned.bam
 # to that of the full file.
 REGION_PADDING_SIZE=2000
 
-
-
-
-
-INPUT=../../temp/test.bam
-REFERENCE=~/hartwig/temp/Homo_sapiens.GRCh37.GATK.illumina.fa
-OUTPUT=${INPUT/.bam/.targeted.sv.vcf}
-ASSEMBLY=${OUTPUT/.sv.vcf/.targeted.gridss.assembly.bam}
-GRIDSS_JAR=../target/gridss-2.1.2-gridss-jar-with-dependencies.jar
-WORKING_DIR=../../temp/
-TMP_DIR=../../temp/
-
+# Number of reads from which to calculate metrics
+STOP_METRICS_AFTER=10000000
 JVM_ARGS="-ea \
 	-Dreference_fasta="$REFERENCE" \
 	-Dsamjdk.create_index=true \
@@ -50,36 +48,78 @@ JVM_ARGS="-ea \
 	-Dsamjdk.buffer_size=4194304 \
 	-Dgridss.gridss.output_to_temp_file=true \
 	-cp $GRIDSS_JAR "
-# Don't perform async read-ahead for gridss.IndexedExtractFullReads
-# as there's not point in performing read-ahead of reads we're going
-# ignore anyway
-JVM_ARGS_SINGLE_THREADED="-ea \
-	-Dreference_fasta="$REFERENCE" \
-	-Dsamjdk.create_index=true \
-	-Dsamjdk.use_async_io_write_samtools=true \
-	-Dsamjdk.use_async_io_write_tribble=true \
-	-Dgridss.gridss.output_to_temp_file=true \
-	-cp $GRIDSS_JAR "
-
-
-for F in $NORMAL $TUMOUR ; do
-	java -Xmx8g $JVM_ARGS_SINGLE_THREADED gridss.IndexedExtractFullReads B=$REGIONS_OF_INTEREST REGION_PADDING_SIZE=$REGION_PADDING_SIZE I=$F O=$F.lite.bam 2>&1 | tee log.$F.extract.log &
-	# If you have are extracting a large portion of genome then
+# for each input file
+for F in $INPUT ; do
+	TARGETED_BAM=$(dirname $INPUT)/$(basename $INPUT .bam).targeted.bam
+	GRIDSS_WORKING_DIR=./$(basename $TARGETED_BAM).gridss.working
+	TMP_DIR=./$(basename $TARGETED_BAM).gridss.working
+	echo "Calculating metrics for $F based on the first $STOP_METRICS_AFTER reads"
+	# estimate metrics
+	mkdir -p $GRIDSS_WORKING_DIR
+	java -Xmx8g $JVM_ARGS gridss.analysis.CollectGridssMetrics \
+		TMP_DIR=$GRIDSS_WORKING_DIR \
+		ASSUME_SORTED=true \
+		I=$F \
+		O=$GRIDSS_WORKING_DIR/$(basename $TARGETED_BAM) \
+		THRESHOLD_COVERAGE=25000 \
+		FILE_EXTENSION=null \
+		GRIDSS_PROGRAM=null \
+		GRIDSS_PROGRAM=CollectCigarMetrics \
+		GRIDSS_PROGRAM=CollectMapqMetrics \
+		GRIDSS_PROGRAM=CollectTagMetrics \
+		GRIDSS_PROGRAM=CollectIdsvMetrics \
+		GRIDSS_PROGRAM=ReportThresholdCoverage \
+		PROGRAM=null \
+		PROGRAM=CollectInsertSizeMetrics \
+		STOP_AFTER=$STOP_METRICS_AFTER \
+	| tee log.$F.collectgridssmetrics.log
+	java -Xmx8g $JVM_ARGS gridss.analysis.CollectStructuralVariantReadMetrics \
+		TMP_DIR=$TMP_DIR \
+		I=$F \
+		OUTPUT=$GRIDSS_WORKING_DIR/$(basename $TARGETED_BAM).sv_metrics \
+		INSERT_SIZE_METRICS=$GRIDSS_WORKING_DIR/$(basename $TARGETED_BAM).insert_size_metrics \
+		STOP_AFTER=$STOP_METRICS_AFTER \
+	| tee log.$F.collectsvmetrics.log
+	echo "Extracting all reads from fragments overlapping $REGIONS_OF_INTEREST from $F"
+
+	# IndexedExtractFullReads is much faster than ExtractFullReads when
+	# using a small region of interest. Unfortunately, if the regions of
+	# interest are all SVs, the insert size distribution will be biased.
+	## Don't perform async read-ahead for gridss.IndexedExtractFullReads
+	## as there's no point in performing read-ahead of reads we're going
+	## to ignore anyway
+	JVM_ARGS_SINGLE_THREADED="-ea \
+		-Dreference_fasta="$REFERENCE" \
+		-Dsamjdk.create_index=true \
+		-Dsamjdk.use_async_io_write_samtools=true \
+		-Dgridss.gridss.output_to_temp_file=true \
+		-cp $GRIDSS_JAR "
+	java -Xmx8g $JVM_ARGS_SINGLE_THREADED \
+		gridss.IndexedExtractFullReads \
+		B=$REGIONS_OF_INTEREST \
+		REGION_PADDING_SIZE=$REGION_PADDING_SIZE \
+		I=$F \
+		O=$TARGETED_BAM 2>&1 | tee log.$F.extract.log &
+
+	# If you are extracting a large portion of the genome then
 	# you should perform a full pass of the input file as it will
 	# be faster than doing millions of index seeks()
-	#java -Xmx8g $JVM_ARGS gridss.ExtractFullReads REGION_PADDING_SIZE=$REGION_PADDING_SIZE B=$REGIONS_OF_INTEREST I=$F O=$F.lite.bam 2>&1 | tee log.$F.extract.log &
+	#java -Xmx8g $JVM_ARGS gridss.CollectGridssMetricsAndExtractFullReads \
+	#	REGION_PADDING_SIZE=$REGION_PADDING_SIZE \
+	#	REGION_BED=$REGIONS_OF_INTEREST \
+	#	I=$F \
+	#	O=$TARGETED_BAM 2>&1 | tee log.$F.extract.log &
 done
 wait
+
 # Run GRIDSS on the extracted files
-# warning: if you haven't increased your OS file ulimit above 4096 then will probably crash.
-#
+# warning: if you haven't increased your OS file ulimit above 4096 then it will probably crash. Refer to the GRIDSS README for more details.
 java -Xmx31g $JVM_ARGS \
 	-cp $GRIDSS_JAR gridss.CallVariants \
 	TMP_DIR=. \
 	WORKING_DIR=. \
 	REFERENCE_SEQUENCE=$REFERENCE \
-	INPUT=$NORMAL.lite.bam \
-	INPUT=$TUMOUR.lite.bam \
+	INPUT=$(dirname $INPUT)/$(basename $INPUT .bam).targeted.bam \
 	OUTPUT=$OUTPUT \
 	ASSEMBLY=$ASSEMBLY \
 	WORKER_THREADS=16 \
diff --git a/example/somatic.sh b/example/somatic.sh
index 080539116..c178563f3 100644
--- a/example/somatic.sh
+++ b/example/somatic.sh
@@ -8,7 +8,7 @@ BLACKLIST=wgEncodeDacMapabilityConsensusExcludable.bed
 REFERENCE=~/reference_genomes/human/hg19.fa
 OUTPUT=somatic.sv.vcf
 ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam}
-GRIDSS_JAR=~/bin/gridss-2.0.1-jar-with-dependencies.jar
+GRIDSS_JAR=~/bin/gridss-2.2.0-jar-with-dependencies.jar
 
 if [[ ! -f "$NORMAL" ]] ; then
 	echo "Missing $NORMAL input file."
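
Note on the model changes later in this patch (the numbers below are
illustrative): metrics estimated from only the first STOP_AFTER reads are
approximate, so a ratio of two independently estimated counts can stray
outside [0,1]. In IdsvSamFileMetrics.readPairFoldedCumulativeDistribution(),
dpPairs is derived from the insert size distribution while totalPairs comes
from READ_PAIRS_BOTH_MAPPED; with truncated metrics dpPairs could come out
at, say, 102 against totalPairs = 100, giving a "probability" of 1.02 and a
negative Phred score (-10*log10(1.02) ~= -0.09). Hence the Math.min(1, ...)
clamp below, and the analogous Math.max(1, ...) guards in
FastEmpiricalReferenceLikelihoodModel and CigarSizeDistribution, which keep
the counts positive so the Phred conversion stays finite and non-negative.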
diff --git a/pom.xml b/pom.xml index 3f344834b..e3fabf8db 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ au.edu.wehi gridss jar - 2.1.2-gridss + 2.2.0-gridss gridss https://github.com/PapenfussLab/gridss diff --git a/scripts/cohort_analysis/0_README.txt b/scripts/cohort_analysis/0_README.txt index e20e4d979..dd1a11ed0 100644 --- a/scripts/cohort_analysis/0_README.txt +++ b/scripts/cohort_analysis/0_README.txt @@ -38,7 +38,7 @@ Instructions > ./1_setup.sh This will download the GRIDSS jar file and dependencies -(https://github.com/PapenfussLab/gridss/releases/download/v2.0.1/gridss-2.0.1-gridss-jar-with-dependencies.jar) +(https://github.com/PapenfussLab/gridss/releases/download/v2.2.0/gridss-2.2.0-gridss-jar-with-dependencies.jar) and download and decompress the ENCODE blacklist file (https://www.encodeproject.org/files/ENCFF001TDO/@@download/ENCFF001TDO.bed.gz). @@ -54,7 +54,7 @@ of the following files: BLACKLIST_FILENAME = "data/ENCODE_blacklist_hg19/ENCFF001TDO.bed" REFERENCE_GENOME = "data/hg19.fa" -GRIDSS_JARFILE = "./gridss-2.0.1-gridss-jar-with-dependencies.jar" +GRIDSS_JARFILE = "./gridss-2.2.0-gridss-jar-with-dependencies.jar" 5. Open & edit the samples file sample.csv using a text editor. The file diff --git a/scripts/cohort_analysis/1_setup.sh b/scripts/cohort_analysis/1_setup.sh index 29d9d0814..d81d07fdc 100644 --- a/scripts/cohort_analysis/1_setup.sh +++ b/scripts/cohort_analysis/1_setup.sh @@ -3,7 +3,7 @@ # Setup script # Download the GRIDSS jar file and dependencies from: -wget https://github.com/PapenfussLab/gridss/releases/download/v2.0.1/gridss-2.0.1-gridss-jar-with-dependencies.jar +wget https://github.com/PapenfussLab/gridss/releases/download/v2.2.0/gridss-2.2.0-gridss-jar-with-dependencies.jar # Download and decompress the ENCODE blacklist file: mkdir data diff --git a/scripts/cohort_analysis/2_generate_analysis_scripts.py b/scripts/cohort_analysis/2_generate_analysis_scripts.py index 53538bdc6..97a66b6b2 100644 --- a/scripts/cohort_analysis/2_generate_analysis_scripts.py +++ b/scripts/cohort_analysis/2_generate_analysis_scripts.py @@ -5,7 +5,7 @@ BLACKLIST_FILENAME = "data/ENCODE_blacklist_hg19/ENCFF001TDO.bed" REFERENCE_GENOME = "data/hg19/hg19.fa" -GRIDSS_JARFILE = "./gridss-2.0.1-gridss-jar-with-dependencies.jar" +GRIDSS_JARFILE = "./gridss-2.2.0-gridss-jar-with-dependencies.jar" # Read the script template diff --git a/scripts/cohort_analysis/4_cleanup.py b/scripts/cohort_analysis/4_cleanup.py index f323828b5..3d06382b6 100644 --- a/scripts/cohort_analysis/4_cleanup.py +++ b/scripts/cohort_analysis/4_cleanup.py @@ -4,7 +4,7 @@ rm scripts/* rm output/* -rm gridss-2.0.1-gridss-jar-with-dependencies.jar +rm gridss-2.2.0-gridss-jar-with-dependencies.jar rm data/ENCODE_blacklist_hg19/ENCFF001TDO.bed # rm data/hg19/* diff --git a/src/main/java/au/edu/wehi/idsv/metrics/IdsvSamFileMetrics.java b/src/main/java/au/edu/wehi/idsv/metrics/IdsvSamFileMetrics.java index 6d2e8defd..67a0b6365 100644 --- a/src/main/java/au/edu/wehi/idsv/metrics/IdsvSamFileMetrics.java +++ b/src/main/java/au/edu/wehi/idsv/metrics/IdsvSamFileMetrics.java @@ -153,7 +153,7 @@ public double readPairFoldedCumulativeDistribution(int fragmentSize) { } double totalPairs = idsvMetrics.READ_PAIRS_BOTH_MAPPED; double dpPairs = totalPairs - insertDistribution.getTotalMappedPairs() + pairsFromFragmentDistribution; - return dpPairs / totalPairs; + return Math.min(1, dpPairs / totalPairs); } /** diff --git a/src/main/java/au/edu/wehi/idsv/model/FastEmpiricalReferenceLikelihoodModel.java 
b/src/main/java/au/edu/wehi/idsv/model/FastEmpiricalReferenceLikelihoodModel.java
index 81234c7ce..61a769922 100644
--- a/src/main/java/au/edu/wehi/idsv/model/FastEmpiricalReferenceLikelihoodModel.java
+++ b/src/main/java/au/edu/wehi/idsv/model/FastEmpiricalReferenceLikelihoodModel.java
@@ -47,7 +47,7 @@ public double scoreReadPair(IdsvSamFileMetrics metrics, int fragmentSize, int ma
 	public double scoreUnmappedMate(IdsvSamFileMetrics metrics, int mapq) {
 		IdsvMetrics im = metrics.getIdsvMetrics();
 		// completely unmapped read pairs are excluded for consistency with sc and dp calculation
-		double score = MathUtil.prToPhred((double)im.READ_PAIRS_ONE_MAPPED / (double)(im.MAPPED_READS));
+		double score = MathUtil.prToPhred((double)Math.max(1, im.READ_PAIRS_ONE_MAPPED) / (double)Math.max(1, Math.max(im.READ_PAIRS_ONE_MAPPED, (im.MAPPED_READS))));
 		score = Math.min(score, mapq);
 		return score;
 	}
diff --git a/src/main/java/au/edu/wehi/idsv/model/Models.java b/src/main/java/au/edu/wehi/idsv/model/Models.java
index 1f72e6d3a..7d4ebbcf2 100644
--- a/src/main/java/au/edu/wehi/idsv/model/Models.java
+++ b/src/main/java/au/edu/wehi/idsv/model/Models.java
@@ -36,17 +36,10 @@ public static BreakendSummary calculateBreakend(LinearGenomicCoordinate lgc, Lis
 			public BreakendSummary apply(DirectedEvidence input) {
 				return input.getBreakendSummary();
 			}
-		}), Lists.transform(evidence, new Function<DirectedEvidence, Long>() {
-			@Override
-			public Long apply(DirectedEvidence input) {
-				return ScalingHelper.toScaledWeight(input.getBreakendQual());
-			}
-		}));
+		}), Lists.transform(evidence, (Function<DirectedEvidence, Long>) input -> ScalingHelper.toScaledWeight(input.getBreakendQual())));
 	}
 	/**
 	 * Calculates the most likely breakend interval for the given evidence
-	 * @param evidence
-	 * @return breakend interval with highest total evidence quality
 	 */
 	public static BreakendSummary calculateBreakend(LinearGenomicCoordinate lgc, List<BreakendSummary> bs, List<Long> weights) {
 		if (bs == null || bs.size() == 0) throw new IllegalArgumentException("No evidence supplied");
diff --git a/src/main/java/gridss/CollectGridssMetricsAndExtractFullReads.java b/src/main/java/gridss/CollectGridssMetricsAndExtractFullReads.java
index 58a6394ce..ba49272b8 100644
--- a/src/main/java/gridss/CollectGridssMetricsAndExtractFullReads.java
+++ b/src/main/java/gridss/CollectGridssMetricsAndExtractFullReads.java
@@ -2,6 +2,8 @@
 
 import au.edu.wehi.idsv.ReadPairConcordanceMethod;
 import gridss.analysis.CollectGridssMetrics;
+import gridss.analysis.CollectIdsvMetrics;
+import gridss.analysis.CollectStructuralVariantReadMetrics;
 import gridss.cmdline.CommandLineProgramHelper;
 import org.broadinstitute.barclay.argparser.Argument;
 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
@@ -25,26 +27,60 @@
 	programGroup = gridss.cmdline.programgroups.DataConversion.class
 )
 public class CollectGridssMetricsAndExtractFullReads extends CollectGridssMetrics {
+	// #region SV command line args
+	@Argument(doc="Minimum indel size", optional=true)
+	public int MIN_INDEL_SIZE = 1;
+	@Argument(doc="Minimum bases clipped", optional=true)
+	public int MIN_CLIP_LENGTH = 1;
+	@Argument(doc="Include hard and soft clipped reads in output", optional=true)
+	public boolean CLIPPED = true;
+	@Argument(doc="Include reads containing indels in output", optional=true)
+	public boolean INDELS = true;
+	@Argument(doc="Include split reads in output", optional=true)
+	public boolean SPLIT = true;
+	@Argument(doc="Include read pairs in which only one of the reads is aligned to the reference.", optional=true)
+	public boolean SINGLE_MAPPED_PAIRED = true;
+	@Argument(doc="Include read pairs that do not align in the expected orientation within the expected fragment size distribution.", optional=true)
+	public boolean DISCORDANT_READ_PAIRS = true;
+	@Argument(doc="Method of calculating read pair concordance. Valid values are SAM_FLAG, PERCENTAGE, and FIXED", optional=true)
+	public ReadPairConcordanceMethod READ_PAIR_CONCORDANCE_METHOD = ReadPairConcordanceMethod.SAM_FLAG;
+	@Argument(doc="Minimum concordant read pair fragment size if using the FIXED method of calculation", optional=true)
+	public int FIXED_READ_PAIR_CONCORDANCE_MIN_FRAGMENT_SIZE = 0;
+	@Argument(doc="Maximum concordant read pair fragment size if using the FIXED method of calculation", optional=true)
+	public int FIXED_READ_PAIR_CONCORDANCE_MAX_FRAGMENT_SIZE = 0;
+	@Argument(doc = "Percent (0.0-1.0) of read pairs considered concordant if using the PERCENTAGE method of calculation.", optional=true)
+	public Float READ_PAIR_CONCORDANT_PERCENT = 0.995f;
+	@Argument(doc="Picard tools insert size distribution metrics txt file. Required if using the PERCENTAGE read pair concordance calculation method.", optional=true)
+	public File INSERT_SIZE_METRICS = null;
+	@Argument(doc="Include unmapped reads", optional=true)
+	public boolean UNMAPPED_READS = true;
+	@Argument(doc="If true, also include reads marked as duplicates.")
+	public boolean INCLUDE_DUPLICATES = false;
+	@Argument(shortName="SVO", doc="Output file containing SV metrics", optional=true)
+	public File SV_METRICS_OUTPUT;
+	// #endregion
+	// #region extract command line args
 	@Argument(shortName="MO", doc="BED file containing regions to export", optional=true)
-	public File REGION_BED;
+	public File REGION_BED = new ExtractFullReads().REGION_BED;
 	@Argument(doc = "Extract reads whose mate maps to an export region. " +
 			"If the MC tag is not present, only the starting alignment position of the mate is considered. " +
 			"When determining whether the mate maps to an export region " +
 			"only the primary alignment of that mate is considered. Secondary " +
 			"and supplementary alignments are ignored.")
-	public boolean EXTRACT_MATES = true;
-	@Argument(doc = "Extract all records for reads that have a chimeric alignment mapping to an export region")
-	public boolean EXTRACT_SPLITS = true;
+	public boolean EXTRACT_MATES = new ExtractFullReads().EXTRACT_MATES;
+	@Argument(doc = "Extract all records for reads that have a chimeric alignment mapping to an export region", optional=true)
+	public boolean EXTRACT_SPLITS = new ExtractFullReads().EXTRACT_SPLITS;
 	@Argument(doc = "Number of bases surrounding each export region to include in the index query. " +
 			"Setting this to a value slightly greater than the 99.99% fragment size distribution will reduce the number of random access " +
 			"IO requests made. " +
-			"This parameter is not used if a linear scan is performed.")
-	public int REGION_PADDING_SIZE = 2000;
-	@Argument(doc = "File to write the output reads to.")
+			"This parameter is not used if a linear scan is performed.", optional=true)
+	public int REGION_PADDING_SIZE = new ExtractFullReads().REGION_PADDING_SIZE;
+	@Argument(doc = "File to write the output reads to.", optional=true)
 	public File READ_OUTPUT;
 	@Argument(doc="Number of worker threads to spawn. Defaults to number of cores available."
+ " Note that I/O threads are not included in this worker thread count so CPU usage can be higher than the number of worker thread.", shortName="THREADS") + // #endregion public int WORKER_THREADS = 1; public static void main(final String[] args) { new CollectGridssMetricsAndExtractFullReads().instanceMainWithExit(args); @@ -89,10 +125,57 @@ public boolean supportsMetricAccumulationLevel() { } }; } + protected CollectStructuralVariantReadMetrics getSVMetrics() { + CollectStructuralVariantReadMetrics extract = new CollectStructuralVariantReadMetrics(); + CommandLineProgramHelper.copyInputs(this, extract); + extract.MIN_INDEL_SIZE = MIN_INDEL_SIZE; + extract.MIN_CLIP_LENGTH = MIN_CLIP_LENGTH; + extract.CLIPPED = CLIPPED; + extract.INDELS = INDELS; + extract.SPLIT = SPLIT; + extract.SINGLE_MAPPED_PAIRED = SINGLE_MAPPED_PAIRED; + extract.DISCORDANT_READ_PAIRS = DISCORDANT_READ_PAIRS; + extract.READ_PAIR_CONCORDANCE_METHOD = READ_PAIR_CONCORDANCE_METHOD; + extract.FIXED_READ_PAIR_CONCORDANCE_MIN_FRAGMENT_SIZE = FIXED_READ_PAIR_CONCORDANCE_MIN_FRAGMENT_SIZE; + extract.FIXED_READ_PAIR_CONCORDANCE_MAX_FRAGMENT_SIZE = FIXED_READ_PAIR_CONCORDANCE_MAX_FRAGMENT_SIZE; + extract.READ_PAIR_CONCORDANT_PERCENT = READ_PAIR_CONCORDANT_PERCENT; + extract.INSERT_SIZE_METRICS = INSERT_SIZE_METRICS; + extract.UNMAPPED_READS = UNMAPPED_READS; + extract.INCLUDE_DUPLICATES = INCLUDE_DUPLICATES; + extract.OUTPUT = SV_METRICS_OUTPUT; + extract.INPUT = this.INPUT; + extract.ASSUME_SORTED = true; + return extract; + } + public ProgramInterface createSVMetrics() { + return new ProgramInterface() { + @Override + public SinglePassSamProgram makeInstance(final String outbase, + final String outext, + final File input, + final File reference, + final Set metricAccumulationLevel, + final File dbSnp, + final File intervals, + final File refflat, + final Set ignoreSequence) { + final CollectStructuralVariantReadMetrics program = getSVMetrics(); + return program.asSinglePassSamProgram(); + } + @Override + public boolean needsReferenceSequence() { + return false; + } + @Override + public boolean supportsMetricAccumulationLevel() { + return false; + } + }; + } @Override public void setProgramsToRun(Collection programsToRun) { - // Inject SV read extraction programsToRun.add(createExtractFullReads()); + programsToRun.add(createSVMetrics()); super.setProgramsToRun(programsToRun); } } diff --git a/src/main/java/gridss/CollectGridssMetricsAndExtractSVReads.java b/src/main/java/gridss/CollectGridssMetricsAndExtractSVReads.java index cca6a434f..00c924deb 100644 --- a/src/main/java/gridss/CollectGridssMetricsAndExtractSVReads.java +++ b/src/main/java/gridss/CollectGridssMetricsAndExtractSVReads.java @@ -61,10 +61,6 @@ public class CollectGridssMetricsAndExtractSVReads extends CollectGridssMetrics public static void main(final String[] args) { new CollectGridssMetricsAndExtractSVReads().instanceMainWithExit(args); } - @Override - protected String[] customCommandLineValidation() { - return super.customCommandLineValidation(); - } protected ExtractSVReads getExtractSVReads() { ExtractSVReads extract = new ExtractSVReads(); CommandLineProgramHelper.copyInputs(this, extract); diff --git a/src/main/java/gridss/ExtractFullReads.java b/src/main/java/gridss/ExtractFullReads.java index f5053eba6..cfbff64b3 100644 --- a/src/main/java/gridss/ExtractFullReads.java +++ b/src/main/java/gridss/ExtractFullReads.java @@ -33,7 +33,7 @@ public class ExtractFullReads extends SinglePassSamProgram { @Argument(doc = "Extract all records for reads that have a 
chimeric alignment mapping to an export region") public boolean EXTRACT_SPLITS = true; @Argument(doc = "Number of bases surrounding each export region to include in the index query. ") - public int REGION_PADDING_SIZE = 2000; + public int REGION_PADDING_SIZE = 0; private File tmpOut; private SAMFileWriter writer; diff --git a/src/main/java/gridss/IndexedExtractFullReads.java b/src/main/java/gridss/IndexedExtractFullReads.java index dbf104841..5b166bb9a 100644 --- a/src/main/java/gridss/IndexedExtractFullReads.java +++ b/src/main/java/gridss/IndexedExtractFullReads.java @@ -30,17 +30,17 @@ public class IndexedExtractFullReads extends CommandLineProgram { @Argument(shortName= StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc="Output file location.") public File OUTPUT; @Argument(shortName = "B", doc = "BED file containing regions to export") - public File REGION_BED; + public File REGION_BED = new ExtractFullReads().REGION_BED; @Argument(doc = "Extract reads whose mate maps to an export region. " + "If the MC tag is not present, only the starting alignment position of the mate is considered. " + "When determining whether the mate maps to an export region " + "only the primary alignment of that mate is considered. Secondary " + "and supplementary alignments are ignored.") - public boolean EXTRACT_MATES = true; + public boolean EXTRACT_MATES = new ExtractFullReads().EXTRACT_MATES; @Argument(doc = "Extract all records for reads that have a chimeric alignment mapping to an export region") - public boolean EXTRACT_SPLITS = true; + public boolean EXTRACT_SPLITS = new ExtractFullReads().EXTRACT_SPLITS; @Argument(doc = "Number of additional bases surrounding each export region to include in the index query. ") - public int REGION_PADDING_SIZE = 0; + public int REGION_PADDING_SIZE = new ExtractFullReads().REGION_PADDING_SIZE; @Argument(doc="Number of worker threads to spawn. Defaults to number of cores available." 
+ " Note that I/O threads are not included in this worker thread count so CPU usage can be higher than the number of worker thread.", shortName="THREADS") diff --git a/src/main/java/gridss/analysis/CigarSizeDistribution.java b/src/main/java/gridss/analysis/CigarSizeDistribution.java index 4c26a76e7..c9ff2b02f 100644 --- a/src/main/java/gridss/analysis/CigarSizeDistribution.java +++ b/src/main/java/gridss/analysis/CigarSizeDistribution.java @@ -33,7 +33,7 @@ private static double[] calcPhred(List sc) { CigarDetailMetrics m = sc.get(i); assert(m.LENGTH == i); if (cumsum > 0) { - score = -10 * Math.log10((double)cumsum / (double)total); + score = -10 * Math.log10((double)Math.max(1, cumsum) / (double)Math.max(1, total)); } phred[i] = score; cumsum -= m.COUNT; diff --git a/src/main/java/gridss/cmdline/ProcessStructuralVariantReadsCommandLineProgram.java b/src/main/java/gridss/cmdline/ProcessStructuralVariantReadsCommandLineProgram.java index e2db2c195..d949ea95d 100644 --- a/src/main/java/gridss/cmdline/ProcessStructuralVariantReadsCommandLineProgram.java +++ b/src/main/java/gridss/cmdline/ProcessStructuralVariantReadsCommandLineProgram.java @@ -18,6 +18,7 @@ */ public abstract class ProcessStructuralVariantReadsCommandLineProgram extends ByReadNameSinglePassSamProgram { private static final Log log = Log.getInstance(ProcessStructuralVariantReadsCommandLineProgram.class); + //#region SV read args @Argument(doc="Minimum indel size", optional=true) public int MIN_INDEL_SIZE = 1; @Argument(doc="Minimum bases clipped", optional=true) @@ -46,6 +47,8 @@ public abstract class ProcessStructuralVariantReadsCommandLineProgram extends By public boolean UNMAPPED_READS = true; @Argument(doc="If true, also include reads marked as duplicates.") public boolean INCLUDE_DUPLICATES = false; + //#endregion SV read args + @Override public String[] customCommandLineValidation() { if (READ_PAIR_CONCORDANCE_METHOD == ReadPairConcordanceMethod.PERCENTAGE && INSERT_SIZE_METRICS == null) {