From d9f0f8bb30deb42d184fde4a74ccfa4382d7cdc1 Mon Sep 17 00:00:00 2001 From: Daniel Cameron Date: Mon, 11 Feb 2019 20:15:01 +1100 Subject: [PATCH] Working gridss_targeted.sh example gridss_targeted.sh now calculates metrics based on first 10,000,000 reads Models have been updated to be more robust to approximate metrics Version bumped to 2.2.0 --- docker/Dockerfile | 6 +- example/example_regions_of_interest.bed | 1 + example/gridss.sh | 2 +- ..._fully_integrated_fastq_to_vcf_pipeline.sh | 2 +- example/gridss_separate.sh | 2 +- example/gridss_targeted.sh | 132 ++++++++++++------ example/somatic.sh | 2 +- pom.xml | 2 +- scripts/cohort_analysis/0_README.txt | 4 +- scripts/cohort_analysis/1_setup.sh | 2 +- .../2_generate_analysis_scripts.py | 2 +- scripts/cohort_analysis/4_cleanup.py | 2 +- .../wehi/idsv/metrics/IdsvSamFileMetrics.java | 2 +- ...FastEmpiricalReferenceLikelihoodModel.java | 2 +- .../java/au/edu/wehi/idsv/model/Models.java | 9 +- ...llectGridssMetricsAndExtractFullReads.java | 99 +++++++++++-- ...CollectGridssMetricsAndExtractSVReads.java | 4 - src/main/java/gridss/ExtractFullReads.java | 2 +- .../java/gridss/IndexedExtractFullReads.java | 8 +- .../analysis/CigarSizeDistribution.java | 2 +- ...ucturalVariantReadsCommandLineProgram.java | 3 + 21 files changed, 203 insertions(+), 87 deletions(-) create mode 100644 example/example_regions_of_interest.bed diff --git a/docker/Dockerfile b/docker/Dockerfile index ce98bc7d0..52ae5e374 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -6,7 +6,7 @@ FROM ubuntu:16.04 LABEL base.image="biocontainers:latest" LABEL version="1" LABEL software="GRIDSS" -LABEL software.version="2.0.1" +LABEL software.version="2.2.0" LABEL about.summary="Genomic Rearrangement IDentification Software Suite" LABEL about.home="https://github.com/PapenfussLab/gridss" LABEL about.tags="Genomics" @@ -32,6 +32,6 @@ RUN chmod 777 /data USER gridss RUN mkdir /data/gridss WORKDIR /data/gridss -RUN wget https://github.com/PapenfussLab/gridss/releases/download/v2.0.1/gridss-2.0.1-gridss-jar-with-dependencies.jar +RUN wget https://github.com/PapenfussLab/gridss/releases/download/v2.2.0/gridss-2.2.0-gridss-jar-with-dependencies.jar -ENTRYPOINT ["java", "-ea", "-Xmx16g", "-Dsamjdk.create_index=true", "-Dsamjdk.use_async_io_read_samtools=true", "-Dsamjdk.use_async_io_write_samtools=true", "-Dsamjdk.use_async_io_write_tribble=true", "-Dgridss.gridss.output_to_temp_file=true", "-cp", "/data/gridss/gridss-2.0.1-gridss-jar-with-dependencies.jar", "gridss.CallVariants", "WORKER_THREADS=4"] +ENTRYPOINT ["java", "-ea", "-Xmx16g", "-Dsamjdk.create_index=true", "-Dsamjdk.use_async_io_read_samtools=true", "-Dsamjdk.use_async_io_write_samtools=true", "-Dsamjdk.use_async_io_write_tribble=true", "-Dgridss.gridss.output_to_temp_file=true", "-cp", "/data/gridss/gridss-2.2.0-gridss-jar-with-dependencies.jar", "gridss.CallVariants", "WORKER_THREADS=4"] diff --git a/example/example_regions_of_interest.bed b/example/example_regions_of_interest.bed new file mode 100644 index 000000000..b6bb0623a --- /dev/null +++ b/example/example_regions_of_interest.bed @@ -0,0 +1 @@ +chr12 1527326 1528326 \ No newline at end of file diff --git a/example/gridss.sh b/example/gridss.sh index 584f17a20..8ae6e69ff 100644 --- a/example/gridss.sh +++ b/example/gridss.sh @@ -7,7 +7,7 @@ BLACKLIST=wgEncodeDacMapabilityConsensusExcludable.bed REFERENCE=hg19.fa OUTPUT=${INPUT/.bam/.sv.vcf} ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam} -GRIDSS_JAR=../target/gridss-2.1.1-gridss-jar-with-dependencies.jar 
+GRIDSS_JAR=../target/gridss-2.2.0-gridss-jar-with-dependencies.jar
 
 if [[ ! -f "$INPUT" ]] ; then
 	echo "Missing $INPUT input file."
diff --git a/example/gridss_fully_integrated_fastq_to_vcf_pipeline.sh b/example/gridss_fully_integrated_fastq_to_vcf_pipeline.sh
index 38c8c96d8..1a8754bf7 100644
--- a/example/gridss_fully_integrated_fastq_to_vcf_pipeline.sh
+++ b/example/gridss_fully_integrated_fastq_to_vcf_pipeline.sh
@@ -41,7 +41,7 @@ REFERENCE=~/reference_genomes/human/hg19.fa
 INPUT=pipelined.example.input.bam
 OUTPUT=pipelined.example.sv.vcf
 RAW_GRIDSS_ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam}
-GRIDSS_JAR=~/bin/gridss-2.1.1-jar-with-dependencies.jar
+GRIDSS_JAR=~/bin/gridss-2.2.0-jar-with-dependencies.jar
 GRIDSS_JVM_ARGS="
 -Dsamjdk.use_async_io_read_samtools=true
 -Dsamjdk.use_async_io_write_samtools=true
diff --git a/example/gridss_separate.sh b/example/gridss_separate.sh
index 062e503cb..9eb640de1 100644
--- a/example/gridss_separate.sh
+++ b/example/gridss_separate.sh
@@ -7,7 +7,7 @@ BLACKLIST=wgEncodeDacMapabilityConsensusExcludable.bed
 REFERENCE=hg19.fa
 OUTPUT=${INPUT/.bam/.sv.vcf}
 ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam}
-GRIDSS_JAR=../target/gridss-2.1.1-gridss-jar-with-dependencies.jar
+GRIDSS_JAR=../target/gridss-2.2.0-gridss-jar-with-dependencies.jar
 
 WORKING_DIR=.
 
diff --git a/example/gridss_targeted.sh b/example/gridss_targeted.sh
index acfd0c19c..08c1bda44 100644
--- a/example/gridss_targeted.sh
+++ b/example/gridss_targeted.sh
@@ -2,25 +2,33 @@
 #
 # Runs GRIDSS on a set of targeted regions
 #
-# Note that to properly call compound breakpoints, this script should be run
+#
+# WARNING: reference read counts will only be correct in targeted regions.
+# If a breakpoint goes from a targeted region to a region not in the BED of
+# interest then the untargeted breakend will be reported as a homozygous
+# variant even when it is not (since the reference-supporting reads were not
+# included). This can be fixed by rerunning gridss.AnnotateReferenceReads
+# against the original input file directly, or as explained below.
+#
+# To properly call compound breakpoints, this script should be run
 # twice: once for the initial GRIDSS calls, then again on the GRIDSS calls.
 # This enables accurate calling of complex events for which only some of the
-# breakpoints involved fall within the initially targeted region
+# breakpoints involved fall within the initially targeted region, as well
+# as fixing the above issue with reference read counts.
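+#
+# For example, the second pass could be driven by converting the first-pass
+# calls back into a BED (an illustrative sketch, not part of this pipeline;
+# the VCF name below just follows this script's defaults):
+#   Rscript gridss_targeted_vcf_to_region_bed.R \
+#     --input chr12.1527326.DEL1024.targeted.sv.vcf \
+#     --output second_pass_regions.bed
+# then point BED_REGIONS_OF_INTEREST at second_pass_regions.bed and rerun.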
 #
-BED_REGIONS_OF_INTEREST=targetted_regions.bed # If it's already a bed then you don't need the next two lines
-VCF_REGIONS_OF_INTEREST=targetted_regions.vcf # If your input is a VCF you'll need to convert it to a BED file
-Rscript gridss_targeted_vcf_to_region_bed.R --input $VCF_REGIONS_OF_INTEREST --ouput #BED_REGIONS_OF_INTEREST
+BED_REGIONS_OF_INTEREST=example_regions_of_interest.bed
+# If you have a VCF file, you can use the following script to convert it to a
+# BED file that correctly handles symbolic SVs
+#VCF_REGIONS_OF_INTEREST=targetted_regions.vcf # If your input is a VCF you'll need to convert it to a BED file
+#Rscript gridss_targeted_vcf_to_region_bed.R --input $VCF_REGIONS_OF_INTEREST --output $BED_REGIONS_OF_INTEREST
 
-REGIONS_OF_INTEREST=colo829.region.bed
-REFERENCE=/data/refgenomes/Homo_sapiens.GRCh37.GATK.illumina/Homo_sapiens.GRCh37.GATK.illumina.fasta
-OUTPUT=colo829.lite.vcf
-ASSEMBLY=assembly.lite.bam
-GRIDSS_JAR=gridss-2.1.2-gridss-jar-with-dependencies.jar
-
-# This example uses a paired tumour/normal
-NORMAL=COLO829R_dedup.realigned.bam
-TUMOUR=COLO829T_dedup.realigned.bam
+REGIONS_OF_INTEREST=$BED_REGIONS_OF_INTEREST
+REFERENCE=hg19.fa
+INPUT=chr12.1527326.DEL1024.bam
+OUTPUT=${INPUT/.bam/.targeted.sv.vcf}
+ASSEMBLY=${INPUT/.bam/.targeted.assembly.bam}
+GRIDSS_JAR=../target/gridss-2.2.0-gridss-jar-with-dependencies.jar
 
 # Extract the reads flanking each breakpoint, not just overlapping.
 # 2k is chosen since it should be larger than the fragment size
@@ -29,18 +37,8 @@ TUMOUR=COLO829T_dedup.realigned.bam
 # to that of the full file.
 REGION_PADDING_SIZE=2000
 
-
-
-
-
-INPUT=../../temp/test.bam
-REFERENCE=~/hartwig/temp/Homo_sapiens.GRCh37.GATK.illumina.fa
-OUTPUT=${INPUT/.bam/.targeted.sv.vcf}
-ASSEMBLY=${OUTPUT/.sv.vcf/.targeted.gridss.assembly.bam}
-GRIDSS_JAR=../target/gridss-2.1.2-gridss-jar-with-dependencies.jar
-WORKING_DIR=../../temp/
-TMP_DIR=../../temp/
-
+# Number of reads from which to calculate metrics
+STOP_METRICS_AFTER=10000000
 JVM_ARGS="-ea \
 	-Dreference_fasta="$REFERENCE" \
 	-Dsamjdk.create_index=true \
@@ -50,36 +48,78 @@ JVM_ARGS="-ea \
 	-Dsamjdk.buffer_size=4194304 \
 	-Dgridss.gridss.output_to_temp_file=true \
 	-cp $GRIDSS_JAR "
-# Don't perform async read-ahead for gridss.IndexedExtractFullReads
-# as there's not point in performing read-ahead of reads we're going
-# ignore anyway
-JVM_ARGS_SINGLE_THREADED="-ea \
-	-Dreference_fasta="$REFERENCE" \
-	-Dsamjdk.create_index=true \
-	-Dsamjdk.use_async_io_write_samtools=true \
-	-Dsamjdk.use_async_io_write_tribble=true \
-	-Dgridss.gridss.output_to_temp_file=true \
-	-cp $GRIDSS_JAR "
-
-
-for F in $NORMAL $TUMOUR ; do
-	java -Xmx8g $JVM_ARGS_SINGLE_THREADED gridss.IndexedExtractFullReads B=$REGIONS_OF_INTEREST REGION_PADDING_SIZE=$REGION_PADDING_SIZE I=$F O=$F.lite.bam 2>&1 | tee log.$F.extract.log &
-	# If you have are extracting a large portion of genome then
+# for each input file
+for F in $INPUT ; do
+	TARGETED_BAM=$(dirname $INPUT)/$(basename $INPUT .bam).targeted.bam
+	GRIDSS_WORKING_DIR=./$(basename $TARGETED_BAM).gridss.working
+	TMP_DIR=./$(basename $TARGETED_BAM).gridss.working
+	echo "Calculating metrics for $F based on the first $STOP_METRICS_AFTER reads"
+	# estimate metrics
+	mkdir -p $GRIDSS_WORKING_DIR
+	java -Xmx8g $JVM_ARGS gridss.analysis.CollectGridssMetrics \
+		TMP_DIR=$GRIDSS_WORKING_DIR \
+		ASSUME_SORTED=true \
+		I=$F \
+		O=$GRIDSS_WORKING_DIR/$(basename $TARGETED_BAM) \
+		THRESHOLD_COVERAGE=25000 \
+		FILE_EXTENSION=null \
+		GRIDSS_PROGRAM=null \
+		GRIDSS_PROGRAM=CollectCigarMetrics \
+		GRIDSS_PROGRAM=CollectMapqMetrics \
+		GRIDSS_PROGRAM=CollectTagMetrics \
+		GRIDSS_PROGRAM=CollectIdsvMetrics \
+		GRIDSS_PROGRAM=ReportThresholdCoverage \
+		PROGRAM=null \
+		PROGRAM=CollectInsertSizeMetrics \
+		STOP_AFTER=$STOP_METRICS_AFTER \
+	| tee log.$F.collectgridssmetrics.log
+	java -Xmx8g $JVM_ARGS gridss.analysis.CollectStructuralVariantReadMetrics \
+		TMP_DIR=$TMP_DIR \
+		I=$F \
+		OUTPUT=$GRIDSS_WORKING_DIR/$(basename $TARGETED_BAM).sv_metrics \
+		INSERT_SIZE_METRICS=$GRIDSS_WORKING_DIR/$(basename $TARGETED_BAM).insert_size_metrics \
+		STOP_AFTER=$STOP_METRICS_AFTER \
+	| tee log.$F.collectsvmetrics.log
+	echo "Extracting all reads from fragments overlapping $REGIONS_OF_INTEREST from $F"
+
+	# IndexedExtractFullReads is much faster than ExtractFullReads when
+	# using a small region of interest. Unfortunately, if the regions of
+	# interest are all SVs, the insert size distribution will be biased.
+	## Don't perform async read-ahead for gridss.IndexedExtractFullReads
+	## as there's no point in performing read-ahead of reads we're going
+	## to ignore anyway
+	JVM_ARGS_SINGLE_THREADED="-ea \
+		-Dreference_fasta="$REFERENCE" \
+		-Dsamjdk.create_index=true \
+		-Dsamjdk.use_async_io_write_samtools=true \
+		-Dgridss.gridss.output_to_temp_file=true \
+		-cp $GRIDSS_JAR "
+	java -Xmx8g $JVM_ARGS_SINGLE_THREADED \
+		gridss.IndexedExtractFullReads \
+		B=$REGIONS_OF_INTEREST \
+		REGION_PADDING_SIZE=$REGION_PADDING_SIZE \
+		I=$F \
+		O=$TARGETED_BAM 2>&1 | tee log.$F.extract.log &
+
+	# If you are extracting a large portion of the genome then
 	# you should perform a full pass of the input file as it will
 	# be faster than doing millions of index seeks()
-	#java -Xmx8g $JVM_ARGS gridss.ExtractFullReads REGION_PADDING_SIZE=$REGION_PADDING_SIZE B=$REGIONS_OF_INTEREST I=$F O=$F.lite.bam 2>&1 | tee log.$F.extract.log &
+	#java -Xmx8g $JVM_ARGS gridss.CollectGridssMetricsAndExtractFullReads \
+	#	REGION_PADDING_SIZE=$REGION_PADDING_SIZE \
+	#	REGION_BED=$REGIONS_OF_INTEREST \
+	#	I=$F \
+	#	O=$TARGETED_BAM 2>&1 | tee log.$F.extract.log &
 done
 wait
+
 # Run GRIDSS on the extracted files
-# warning: if you haven't increased your OS file ulimit above 4096 then will probably crash.
-#
+# warning: if you haven't increased your OS file ulimit above 4096 then it will probably crash. Refer to the GRIDSS README for more details.
 java -Xmx31g $JVM_ARGS \
 	-cp $GRIDSS_JAR gridss.CallVariants \
 	TMP_DIR=. \
 	WORKING_DIR=. \
 	REFERENCE_SEQUENCE=$REFERENCE \
-	INPUT=$NORMAL.lite.bam \
-	INPUT=$TUMOUR.lite.bam \
+	INPUT=$(dirname $INPUT)/$(basename $INPUT .bam).targeted.bam \
 	OUTPUT=$OUTPUT \
 	ASSEMBLY=$ASSEMBLY \
 	WORKER_THREADS=16 \
diff --git a/example/somatic.sh b/example/somatic.sh
index 080539116..c178563f3 100644
--- a/example/somatic.sh
+++ b/example/somatic.sh
@@ -8,7 +8,7 @@ BLACKLIST=wgEncodeDacMapabilityConsensusExcludable.bed
 REFERENCE=~/reference_genomes/human/hg19.fa
 OUTPUT=somatic.sv.vcf
 ASSEMBLY=${OUTPUT/.sv.vcf/.gridss.assembly.bam}
-GRIDSS_JAR=~/bin/gridss-2.0.1-jar-with-dependencies.jar
+GRIDSS_JAR=~/bin/gridss-2.2.0-jar-with-dependencies.jar
 
 if [[ ! -f "$NORMAL" ]] ; then
 	echo "Missing $NORMAL input file."
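
Note on the model changes later in this patch (the numbers below are
illustrative): metrics estimated from only the first STOP_AFTER reads are
approximate, so a ratio of two independently estimated counts can stray
outside [0,1]. In IdsvSamFileMetrics.readPairFoldedCumulativeDistribution(),
dpPairs is derived from the insert size distribution while totalPairs comes
from READ_PAIRS_BOTH_MAPPED; with truncated metrics dpPairs could come out
at, say, 102 against totalPairs = 100, giving a "probability" of 1.02 and a
negative Phred score (-10*log10(1.02) ~= -0.09). Hence the Math.min(1, ...)
clamp below, and the analogous Math.max(1, ...) guards in
FastEmpiricalReferenceLikelihoodModel and CigarSizeDistribution, which keep
the counts positive so the Phred conversion stays finite and non-negative.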
diff --git a/pom.xml b/pom.xml index 3f344834b..e3fabf8db 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ au.edu.wehi gridss jar - 2.1.2-gridss + 2.2.0-gridss gridss https://github.com/PapenfussLab/gridss diff --git a/scripts/cohort_analysis/0_README.txt b/scripts/cohort_analysis/0_README.txt index e20e4d979..dd1a11ed0 100644 --- a/scripts/cohort_analysis/0_README.txt +++ b/scripts/cohort_analysis/0_README.txt @@ -38,7 +38,7 @@ Instructions > ./1_setup.sh This will download the GRIDSS jar file and dependencies -(https://github.com/PapenfussLab/gridss/releases/download/v2.0.1/gridss-2.0.1-gridss-jar-with-dependencies.jar) +(https://github.com/PapenfussLab/gridss/releases/download/v2.2.0/gridss-2.2.0-gridss-jar-with-dependencies.jar) and download and decompress the ENCODE blacklist file (https://www.encodeproject.org/files/ENCFF001TDO/@@download/ENCFF001TDO.bed.gz). @@ -54,7 +54,7 @@ of the following files: BLACKLIST_FILENAME = "data/ENCODE_blacklist_hg19/ENCFF001TDO.bed" REFERENCE_GENOME = "data/hg19.fa" -GRIDSS_JARFILE = "./gridss-2.0.1-gridss-jar-with-dependencies.jar" +GRIDSS_JARFILE = "./gridss-2.2.0-gridss-jar-with-dependencies.jar" 5. Open & edit the samples file sample.csv using a text editor. The file diff --git a/scripts/cohort_analysis/1_setup.sh b/scripts/cohort_analysis/1_setup.sh index 29d9d0814..d81d07fdc 100644 --- a/scripts/cohort_analysis/1_setup.sh +++ b/scripts/cohort_analysis/1_setup.sh @@ -3,7 +3,7 @@ # Setup script # Download the GRIDSS jar file and dependencies from: -wget https://github.com/PapenfussLab/gridss/releases/download/v2.0.1/gridss-2.0.1-gridss-jar-with-dependencies.jar +wget https://github.com/PapenfussLab/gridss/releases/download/v2.2.0/gridss-2.2.0-gridss-jar-with-dependencies.jar # Download and decompress the ENCODE blacklist file: mkdir data diff --git a/scripts/cohort_analysis/2_generate_analysis_scripts.py b/scripts/cohort_analysis/2_generate_analysis_scripts.py index 53538bdc6..97a66b6b2 100644 --- a/scripts/cohort_analysis/2_generate_analysis_scripts.py +++ b/scripts/cohort_analysis/2_generate_analysis_scripts.py @@ -5,7 +5,7 @@ BLACKLIST_FILENAME = "data/ENCODE_blacklist_hg19/ENCFF001TDO.bed" REFERENCE_GENOME = "data/hg19/hg19.fa" -GRIDSS_JARFILE = "./gridss-2.0.1-gridss-jar-with-dependencies.jar" +GRIDSS_JARFILE = "./gridss-2.2.0-gridss-jar-with-dependencies.jar" # Read the script template diff --git a/scripts/cohort_analysis/4_cleanup.py b/scripts/cohort_analysis/4_cleanup.py index f323828b5..3d06382b6 100644 --- a/scripts/cohort_analysis/4_cleanup.py +++ b/scripts/cohort_analysis/4_cleanup.py @@ -4,7 +4,7 @@ rm scripts/* rm output/* -rm gridss-2.0.1-gridss-jar-with-dependencies.jar +rm gridss-2.2.0-gridss-jar-with-dependencies.jar rm data/ENCODE_blacklist_hg19/ENCFF001TDO.bed # rm data/hg19/* diff --git a/src/main/java/au/edu/wehi/idsv/metrics/IdsvSamFileMetrics.java b/src/main/java/au/edu/wehi/idsv/metrics/IdsvSamFileMetrics.java index 6d2e8defd..67a0b6365 100644 --- a/src/main/java/au/edu/wehi/idsv/metrics/IdsvSamFileMetrics.java +++ b/src/main/java/au/edu/wehi/idsv/metrics/IdsvSamFileMetrics.java @@ -153,7 +153,7 @@ public double readPairFoldedCumulativeDistribution(int fragmentSize) { } double totalPairs = idsvMetrics.READ_PAIRS_BOTH_MAPPED; double dpPairs = totalPairs - insertDistribution.getTotalMappedPairs() + pairsFromFragmentDistribution; - return dpPairs / totalPairs; + return Math.min(1, dpPairs / totalPairs); } /** diff --git a/src/main/java/au/edu/wehi/idsv/model/FastEmpiricalReferenceLikelihoodModel.java 
b/src/main/java/au/edu/wehi/idsv/model/FastEmpiricalReferenceLikelihoodModel.java
index 81234c7ce..61a769922 100644
--- a/src/main/java/au/edu/wehi/idsv/model/FastEmpiricalReferenceLikelihoodModel.java
+++ b/src/main/java/au/edu/wehi/idsv/model/FastEmpiricalReferenceLikelihoodModel.java
@@ -47,7 +47,7 @@ public double scoreReadPair(IdsvSamFileMetrics metrics, int fragmentSize, int ma
 	public double scoreUnmappedMate(IdsvSamFileMetrics metrics, int mapq) {
 		IdsvMetrics im = metrics.getIdsvMetrics();
 		// completely unmapped read pairs are excluded for consistency with sc and dp calculation
-		double score = MathUtil.prToPhred((double)im.READ_PAIRS_ONE_MAPPED / (double)(im.MAPPED_READS));
+		double score = MathUtil.prToPhred((double)Math.max(1, im.READ_PAIRS_ONE_MAPPED) / (double)Math.max(1, Math.max(im.READ_PAIRS_ONE_MAPPED, (im.MAPPED_READS))));
 		score = Math.min(score, mapq);
 		return score;
 	}
diff --git a/src/main/java/au/edu/wehi/idsv/model/Models.java b/src/main/java/au/edu/wehi/idsv/model/Models.java
index 1f72e6d3a..7d4ebbcf2 100644
--- a/src/main/java/au/edu/wehi/idsv/model/Models.java
+++ b/src/main/java/au/edu/wehi/idsv/model/Models.java
@@ -36,17 +36,10 @@ public static BreakendSummary calculateBreakend(LinearGenomicCoordinate lgc, Lis
 			public BreakendSummary apply(DirectedEvidence input) {
 				return input.getBreakendSummary();
 			}
-		}), Lists.transform(evidence, new Function<DirectedEvidence, Long>() {
-			@Override
-			public Long apply(DirectedEvidence input) {
-				return ScalingHelper.toScaledWeight(input.getBreakendQual());
-			}
-		}));
+		}), Lists.transform(evidence, (Function<DirectedEvidence, Long>) input -> ScalingHelper.toScaledWeight(input.getBreakendQual())));
 	}
 	/**
 	 * Calculates the most likely breakend interval for the given evidence
-	 * @param evidence
-	 * @return breakend interval with highest total evidence quality
 	 */
 	public static BreakendSummary calculateBreakend(LinearGenomicCoordinate lgc, List<BreakendSummary> bs, List<Long> weights) {
 		if (bs == null || bs.size() == 0) throw new IllegalArgumentException("No evidence supplied");
diff --git a/src/main/java/gridss/CollectGridssMetricsAndExtractFullReads.java b/src/main/java/gridss/CollectGridssMetricsAndExtractFullReads.java
index 58a6394ce..ba49272b8 100644
--- a/src/main/java/gridss/CollectGridssMetricsAndExtractFullReads.java
+++ b/src/main/java/gridss/CollectGridssMetricsAndExtractFullReads.java
@@ -2,6 +2,8 @@
 
 import au.edu.wehi.idsv.ReadPairConcordanceMethod;
 import gridss.analysis.CollectGridssMetrics;
+import gridss.analysis.CollectIdsvMetrics;
+import gridss.analysis.CollectStructuralVariantReadMetrics;
 import gridss.cmdline.CommandLineProgramHelper;
 import org.broadinstitute.barclay.argparser.Argument;
 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
@@ -25,26 +27,60 @@
 	programGroup = gridss.cmdline.programgroups.DataConversion.class
 )
 public class CollectGridssMetricsAndExtractFullReads extends CollectGridssMetrics {
+	// #region SV command line args
+	@Argument(doc="Minimum indel size", optional=true)
+	public int MIN_INDEL_SIZE = 1;
+	@Argument(doc="Minimum bases clipped", optional=true)
+	public int MIN_CLIP_LENGTH = 1;
+	@Argument(doc="Include hard and soft clipped reads in output", optional=true)
+	public boolean CLIPPED = true;
+	@Argument(doc="Include reads containing indels in output", optional=true)
+	public boolean INDELS = true;
+	@Argument(doc="Include split reads in output", optional=true)
+	public boolean SPLIT = true;
+	@Argument(doc="Include read pairs in which only one of the reads is aligned to the reference.", optional=true)
+	public boolean SINGLE_MAPPED_PAIRED = true;
+	@Argument(doc="Include read pairs that do not align in the expected orientation within the expected fragment size distribution.", optional=true)
+	public boolean DISCORDANT_READ_PAIRS = true;
+	@Argument(doc="Method of calculating read pair concordance. Valid values are SAM_FLAG, PERCENTAGE, and FIXED", optional=true)
+	public ReadPairConcordanceMethod READ_PAIR_CONCORDANCE_METHOD = ReadPairConcordanceMethod.SAM_FLAG;
+	@Argument(doc="Minimum concordant read pair fragment size if using the FIXED method of calculation", optional=true)
+	public int FIXED_READ_PAIR_CONCORDANCE_MIN_FRAGMENT_SIZE = 0;
+	@Argument(doc="Maximum concordant read pair fragment size if using the FIXED method of calculation", optional=true)
+	public int FIXED_READ_PAIR_CONCORDANCE_MAX_FRAGMENT_SIZE = 0;
+	@Argument(doc = "Percent (0.0-1.0) of read pairs considered concordant if using the PERCENTAGE method of calculation.", optional=true)
+	public Float READ_PAIR_CONCORDANT_PERCENT = 0.995f;
+	@Argument(doc="Picard tools insert size distribution metrics txt file. Required if using the PERCENTAGE read pair concordance calculation method.", optional=true)
+	public File INSERT_SIZE_METRICS = null;
+	@Argument(doc="Include unmapped reads", optional=true)
+	public boolean UNMAPPED_READS = true;
+	@Argument(doc="If true, also include reads marked as duplicates.")
+	public boolean INCLUDE_DUPLICATES = false;
+	@Argument(shortName="SVO", doc="Output file containing SV metrics", optional=true)
+	public File SV_METRICS_OUTPUT;
+	// #endregion
+	// #region extract command line args
 	@Argument(shortName="MO", doc="BED file containing regions to export", optional=true)
-	public File REGION_BED;
+	public File REGION_BED = new ExtractFullReads().REGION_BED;
 	@Argument(doc = "Extract reads whose mate maps to an export region. " +
 			"If the MC tag is not present, only the starting alignment position of the mate is considered. " +
 			"When determining whether the mate maps to an export region " +
 			"only the primary alignment of that mate is considered. Secondary " +
 			"and supplementary alignments are ignored.")
-	public boolean EXTRACT_MATES = true;
-	@Argument(doc = "Extract all records for reads that have a chimeric alignment mapping to an export region")
-	public boolean EXTRACT_SPLITS = true;
+	public boolean EXTRACT_MATES = new ExtractFullReads().EXTRACT_MATES;
+	@Argument(doc = "Extract all records for reads that have a chimeric alignment mapping to an export region", optional=true)
+	public boolean EXTRACT_SPLITS = new ExtractFullReads().EXTRACT_SPLITS;
 	@Argument(doc = "Number of bases surrounding each export region to include in the index query. " +
 			"Setting this to a value slightly greater than the 99.99% fragment size distribution will reduce the number of random access " +
 			"IO requests made. " +
-			"This parameter is not used if a linear scan is performed.")
-	public int REGION_PADDING_SIZE = 2000;
-	@Argument(doc = "File to write the output reads to.")
+			"This parameter is not used if a linear scan is performed.", optional=true)
+	public int REGION_PADDING_SIZE = new ExtractFullReads().REGION_PADDING_SIZE;
+	@Argument(doc = "File to write the output reads to.", optional=true)
 	public File READ_OUTPUT;
 	@Argument(doc="Number of worker threads to spawn. Defaults to number of cores available."
+ " Note that I/O threads are not included in this worker thread count so CPU usage can be higher than the number of worker thread.", shortName="THREADS") + // #endregion public int WORKER_THREADS = 1; public static void main(final String[] args) { new CollectGridssMetricsAndExtractFullReads().instanceMainWithExit(args); @@ -89,10 +125,57 @@ public boolean supportsMetricAccumulationLevel() { } }; } + protected CollectStructuralVariantReadMetrics getSVMetrics() { + CollectStructuralVariantReadMetrics extract = new CollectStructuralVariantReadMetrics(); + CommandLineProgramHelper.copyInputs(this, extract); + extract.MIN_INDEL_SIZE = MIN_INDEL_SIZE; + extract.MIN_CLIP_LENGTH = MIN_CLIP_LENGTH; + extract.CLIPPED = CLIPPED; + extract.INDELS = INDELS; + extract.SPLIT = SPLIT; + extract.SINGLE_MAPPED_PAIRED = SINGLE_MAPPED_PAIRED; + extract.DISCORDANT_READ_PAIRS = DISCORDANT_READ_PAIRS; + extract.READ_PAIR_CONCORDANCE_METHOD = READ_PAIR_CONCORDANCE_METHOD; + extract.FIXED_READ_PAIR_CONCORDANCE_MIN_FRAGMENT_SIZE = FIXED_READ_PAIR_CONCORDANCE_MIN_FRAGMENT_SIZE; + extract.FIXED_READ_PAIR_CONCORDANCE_MAX_FRAGMENT_SIZE = FIXED_READ_PAIR_CONCORDANCE_MAX_FRAGMENT_SIZE; + extract.READ_PAIR_CONCORDANT_PERCENT = READ_PAIR_CONCORDANT_PERCENT; + extract.INSERT_SIZE_METRICS = INSERT_SIZE_METRICS; + extract.UNMAPPED_READS = UNMAPPED_READS; + extract.INCLUDE_DUPLICATES = INCLUDE_DUPLICATES; + extract.OUTPUT = SV_METRICS_OUTPUT; + extract.INPUT = this.INPUT; + extract.ASSUME_SORTED = true; + return extract; + } + public ProgramInterface createSVMetrics() { + return new ProgramInterface() { + @Override + public SinglePassSamProgram makeInstance(final String outbase, + final String outext, + final File input, + final File reference, + final Set metricAccumulationLevel, + final File dbSnp, + final File intervals, + final File refflat, + final Set ignoreSequence) { + final CollectStructuralVariantReadMetrics program = getSVMetrics(); + return program.asSinglePassSamProgram(); + } + @Override + public boolean needsReferenceSequence() { + return false; + } + @Override + public boolean supportsMetricAccumulationLevel() { + return false; + } + }; + } @Override public void setProgramsToRun(Collection programsToRun) { - // Inject SV read extraction programsToRun.add(createExtractFullReads()); + programsToRun.add(createSVMetrics()); super.setProgramsToRun(programsToRun); } } diff --git a/src/main/java/gridss/CollectGridssMetricsAndExtractSVReads.java b/src/main/java/gridss/CollectGridssMetricsAndExtractSVReads.java index cca6a434f..00c924deb 100644 --- a/src/main/java/gridss/CollectGridssMetricsAndExtractSVReads.java +++ b/src/main/java/gridss/CollectGridssMetricsAndExtractSVReads.java @@ -61,10 +61,6 @@ public class CollectGridssMetricsAndExtractSVReads extends CollectGridssMetrics public static void main(final String[] args) { new CollectGridssMetricsAndExtractSVReads().instanceMainWithExit(args); } - @Override - protected String[] customCommandLineValidation() { - return super.customCommandLineValidation(); - } protected ExtractSVReads getExtractSVReads() { ExtractSVReads extract = new ExtractSVReads(); CommandLineProgramHelper.copyInputs(this, extract); diff --git a/src/main/java/gridss/ExtractFullReads.java b/src/main/java/gridss/ExtractFullReads.java index f5053eba6..cfbff64b3 100644 --- a/src/main/java/gridss/ExtractFullReads.java +++ b/src/main/java/gridss/ExtractFullReads.java @@ -33,7 +33,7 @@ public class ExtractFullReads extends SinglePassSamProgram { @Argument(doc = "Extract all records for reads that have a 
chimeric alignment mapping to an export region") public boolean EXTRACT_SPLITS = true; @Argument(doc = "Number of bases surrounding each export region to include in the index query. ") - public int REGION_PADDING_SIZE = 2000; + public int REGION_PADDING_SIZE = 0; private File tmpOut; private SAMFileWriter writer; diff --git a/src/main/java/gridss/IndexedExtractFullReads.java b/src/main/java/gridss/IndexedExtractFullReads.java index dbf104841..5b166bb9a 100644 --- a/src/main/java/gridss/IndexedExtractFullReads.java +++ b/src/main/java/gridss/IndexedExtractFullReads.java @@ -30,17 +30,17 @@ public class IndexedExtractFullReads extends CommandLineProgram { @Argument(shortName= StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc="Output file location.") public File OUTPUT; @Argument(shortName = "B", doc = "BED file containing regions to export") - public File REGION_BED; + public File REGION_BED = new ExtractFullReads().REGION_BED; @Argument(doc = "Extract reads whose mate maps to an export region. " + "If the MC tag is not present, only the starting alignment position of the mate is considered. " + "When determining whether the mate maps to an export region " + "only the primary alignment of that mate is considered. Secondary " + "and supplementary alignments are ignored.") - public boolean EXTRACT_MATES = true; + public boolean EXTRACT_MATES = new ExtractFullReads().EXTRACT_MATES; @Argument(doc = "Extract all records for reads that have a chimeric alignment mapping to an export region") - public boolean EXTRACT_SPLITS = true; + public boolean EXTRACT_SPLITS = new ExtractFullReads().EXTRACT_SPLITS; @Argument(doc = "Number of additional bases surrounding each export region to include in the index query. ") - public int REGION_PADDING_SIZE = 0; + public int REGION_PADDING_SIZE = new ExtractFullReads().REGION_PADDING_SIZE; @Argument(doc="Number of worker threads to spawn. Defaults to number of cores available." 
+ " Note that I/O threads are not included in this worker thread count so CPU usage can be higher than the number of worker thread.", shortName="THREADS") diff --git a/src/main/java/gridss/analysis/CigarSizeDistribution.java b/src/main/java/gridss/analysis/CigarSizeDistribution.java index 4c26a76e7..c9ff2b02f 100644 --- a/src/main/java/gridss/analysis/CigarSizeDistribution.java +++ b/src/main/java/gridss/analysis/CigarSizeDistribution.java @@ -33,7 +33,7 @@ private static double[] calcPhred(List sc) { CigarDetailMetrics m = sc.get(i); assert(m.LENGTH == i); if (cumsum > 0) { - score = -10 * Math.log10((double)cumsum / (double)total); + score = -10 * Math.log10((double)Math.max(1, cumsum) / (double)Math.max(1, total)); } phred[i] = score; cumsum -= m.COUNT; diff --git a/src/main/java/gridss/cmdline/ProcessStructuralVariantReadsCommandLineProgram.java b/src/main/java/gridss/cmdline/ProcessStructuralVariantReadsCommandLineProgram.java index e2db2c195..d949ea95d 100644 --- a/src/main/java/gridss/cmdline/ProcessStructuralVariantReadsCommandLineProgram.java +++ b/src/main/java/gridss/cmdline/ProcessStructuralVariantReadsCommandLineProgram.java @@ -18,6 +18,7 @@ */ public abstract class ProcessStructuralVariantReadsCommandLineProgram extends ByReadNameSinglePassSamProgram { private static final Log log = Log.getInstance(ProcessStructuralVariantReadsCommandLineProgram.class); + //#region SV read args @Argument(doc="Minimum indel size", optional=true) public int MIN_INDEL_SIZE = 1; @Argument(doc="Minimum bases clipped", optional=true) @@ -46,6 +47,8 @@ public abstract class ProcessStructuralVariantReadsCommandLineProgram extends By public boolean UNMAPPED_READS = true; @Argument(doc="If true, also include reads marked as duplicates.") public boolean INCLUDE_DUPLICATES = false; + //#endregion SV read args + @Override public String[] customCommandLineValidation() { if (READ_PAIR_CONCORDANCE_METHOD == ReadPairConcordanceMethod.PERCENTAGE && INSERT_SIZE_METRICS == null) {