From 54ed41908d248dcf0f9a04211f908249f4a91500 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Karl=20Nordstr=C3=B6m?=
Date: Mon, 13 Nov 2017 10:17:44 +0100
Subject: [PATCH] Decrease disk stress and processing time of the contig step
 by replacing temporary BAM files with pipes

The drawback is that it is harder to control the maximum number of
cores used.

The piped version of RGCRG-ANTISENSE ran ~9m on 2.4G mouse reads
compared to ~25m for the original version. Timing/testing was done
without the Python script and without running it through Nextflow.
The resulting *.bedgraph files were identical.

In the stranded templates the plus-strand coverage is computed by a
background genomeCoverageBed that reads from a named pipe (plus.fifo).
Each template waits for that job before calling contigsNew.py, so the
plus-strand bedgraph is complete when it is read and a failure of the
background job aborts the script.
---
Reviewer notes (not part of the commit message):
 * RGCRG-ANTISENSE applies the NH:1 filter like the other templates and
   writes the bedgraph names that contigsNew.py reads.
 * The timings above were presumably measured on a revision of
   RGCRG-ANTISENSE without the NH:1 filter and without the named pipe;
   re-measure before quoting them.
 * The post-image blob ids on the "index" lines were not regenerated.
 * Leading whitespace was reconstructed from a whitespace-collapsed copy
   of this patch; if context lines do not match, apply with
   "git apply --ignore-whitespace".

 templates/contig/RGCRG-ANTISENSE   | 24 +++++++++++++++---------
 templates/contig/RGCRG-MATE1_SENSE | 27 ++++++++++++++++++---------
 templates/contig/RGCRG-MATE2_SENSE | 25 +++++++++++++++++--------
 templates/contig/RGCRG-NONE        |  8 +++++---
 templates/contig/RGCRG-SENSE       | 17 ++++++++++++-----
 5 files changed, 67 insertions(+), 34 deletions(-)

diff --git a/templates/contig/RGCRG-ANTISENSE b/templates/contig/RGCRG-ANTISENSE
index 3b1b403..ea4fed1 100644
--- a/templates/contig/RGCRG-ANTISENSE
+++ b/templates/contig/RGCRG-ANTISENSE
@@ -1,15 +1,21 @@
 #!/bin/bash -eu
-samtools view -h -@ ${cpus} ${bam} | \
-awk 'BEGIN {OFS="\\t"} {if (\$1!~/^@/) {\$2=xor(\$2,0x10)}; print}' | \
-samtools view -@ ${cpus} -Sb - > tmp.bam
-
-bamtools filter -tag NH:1 -in tmp.bam -out tmp_unique.bam && rm tmp.bam
-genomeCoverageBed -strand + -split -bg -ibam tmp_unique.bam > ${prefix}.plusRaw.bedgraph
-genomeCoverageBed -strand - -split -bg -ibam tmp_unique.bam > ${prefix}.minusRaw.bedgraph
-rm tmp_unique.bam
+# Compute plus-strand coverage in the background from a named pipe; the
+# script waits for it below so the bedgraph is complete before it is read.
+mkfifo plus.fifo
+genomeCoverageBed -strand + -split -bg -ibam - < plus.fifo \
+  > ${prefix}.plusRaw.bedgraph &
+samtools view -h -@ ${cpus} ${bam} \
+  | awk 'BEGIN {OFS="\\t"} {if (\$1!~/^@/) {\$2=xor(\$2,0x10)}; print}' \
+  | samtools view -Sbu - \
+  | bamtools filter -tag NH:1 \
+  | tee plus.fifo \
+  | genomeCoverageBed -strand - -split -bg -ibam - \
+  > ${prefix}.minusRaw.bedgraph
+wait \$!
+rm plus.fifo
 
 contigsNew.py --chrFile ${genomeFai} \
               --fileP ${prefix}.plusRaw.bedgraph \
               --fileM ${prefix}.minusRaw.bedgraph \
               --sortOut \
-              > ${prefix}.bed
\ No newline at end of file
+              > ${prefix}.bed
diff --git a/templates/contig/RGCRG-MATE1_SENSE b/templates/contig/RGCRG-MATE1_SENSE
index 4cdcd74..9b51d88 100644
--- a/templates/contig/RGCRG-MATE1_SENSE
+++ b/templates/contig/RGCRG-MATE1_SENSE
@@ -1,15 +1,24 @@
 #!/bin/bash -eu
-samtools view -h -@ ${cpus} ${bam} | \
-awk 'BEGIN {OFS="\\t"} {if (\$1!~/^@/ && and(\$2,128)>0) {\$2=xor(\$2,0x10)}; print}' | \
-samtools view -@ ${cpus} -Sb - > tmp.bam
-
-bamtools filter -tag NH:1 -in tmp.bam -out tmp_unique.bam && rm tmp.bam
-genomeCoverageBed -strand + -split -bg -ibam tmp_unique.bam > ${prefix}.plusRaw.bedgraph
-genomeCoverageBed -strand - -split -bg -ibam tmp_unique.bam > ${prefix}.minusRaw.bedgraph
-rm tmp_unique.bam
+# Compute plus-strand coverage in the background from a named pipe; the
+# script waits for it below so the bedgraph is complete before it is read.
+mkfifo plus.fifo
+genomeCoverageBed -strand + -split -bg -ibam - < plus.fifo \
+  > ${prefix}.plusRaw.bedgraph &
+samtools view -h -@ ${cpus} ${bam} \
+  | awk '
+    BEGIN {OFS="\\t"}
+    {if (\$1!~/^@/ && and(\$2,128)>0) {\$2=xor(\$2,0x10)}; print}
+  ' \
+  | samtools view -Sbu - \
+  | bamtools filter -tag NH:1 \
+  | tee plus.fifo \
+  | genomeCoverageBed -strand - -split -bg -ibam - \
+  > ${prefix}.minusRaw.bedgraph
+wait \$!
+rm plus.fifo
 
 contigsNew.py --chrFile ${genomeFai} \
               --fileP ${prefix}.plusRaw.bedgraph \
               --fileM ${prefix}.minusRaw.bedgraph \
               --sortOut \
-              > ${prefix}.bed
\ No newline at end of file
+              > ${prefix}.bed
diff --git a/templates/contig/RGCRG-MATE2_SENSE b/templates/contig/RGCRG-MATE2_SENSE
index 4a46bea..54f79b6 100644
--- a/templates/contig/RGCRG-MATE2_SENSE
+++ b/templates/contig/RGCRG-MATE2_SENSE
@@ -1,12 +1,21 @@
 #!/bin/bash -eu
-samtools view -h -@ ${cpus} ${bam} | \
-awk 'BEGIN {OFS="\\t"} {if (\$1!~/^@/ && and(\$2,64)>0) {\$2=xor(\$2,0x10)}; print}' | \
-samtools view -@ ${cpus} -Sb - > tmp.bam
-
-bamtools filter -tag NH:1 -in tmp.bam -out tmp_unique.bam && rm tmp.bam
-genomeCoverageBed -strand + -split -bg -ibam tmp_unique.bam > ${prefix}.plusRaw.bedgraph
-genomeCoverageBed -strand - -split -bg -ibam tmp_unique.bam > ${prefix}.minusRaw.bedgraph
-rm tmp_unique.bam
+# Compute plus-strand coverage in the background from a named pipe; the
+# script waits for it below so the bedgraph is complete before it is read.
+mkfifo plus.fifo
+genomeCoverageBed -strand + -split -bg -ibam - < plus.fifo \
+  > ${prefix}.plusRaw.bedgraph &
+samtools view -h -@ ${cpus} ${bam} \
+  | awk '
+    BEGIN {OFS="\\t"}
+    {if (\$1!~/^@/ && and(\$2,64)>0) {\$2=xor(\$2,0x10)}; print}
+  ' \
+  | samtools view -Sbu - \
+  | bamtools filter -tag NH:1 \
+  | tee plus.fifo \
+  | genomeCoverageBed -strand - -split -bg -ibam - \
+  > ${prefix}.minusRaw.bedgraph
+wait \$!
+rm plus.fifo
 
 contigsNew.py --chrFile ${genomeFai} \
               --fileP ${prefix}.plusRaw.bedgraph \
diff --git a/templates/contig/RGCRG-NONE b/templates/contig/RGCRG-NONE
index c5b36f0..b311d6a 100644
--- a/templates/contig/RGCRG-NONE
+++ b/templates/contig/RGCRG-NONE
@@ -1,4 +1,6 @@
 #!/bin/bash -eu
-bamtools filter -tag NH:1 -in ${bam} -out tmp_unique.bam
-bamToBed -i tmp_unique.bam | sort -T. -k1,1 -k2,2n | mergeBed > ${prefix}.bed
-rm tmp_unique.bam
\ No newline at end of file
+bamtools filter -tag NH:1 -in ${bam} \
+  | bamToBed -i - \
+  | sort -T. -k1,1 -k2,2n \
+  | mergeBed \
+> ${prefix}.bed
diff --git a/templates/contig/RGCRG-SENSE b/templates/contig/RGCRG-SENSE
index 33045d3..6d09baa 100644
--- a/templates/contig/RGCRG-SENSE
+++ b/templates/contig/RGCRG-SENSE
@@ -1,11 +1,18 @@
 #!/bin/bash -eu
-bamtools filter -tag NH:1 -in ${bam} -out tmp_unique.bam
-genomeCoverageBed -strand + -split -bg -ibam tmp_unique.bam > ${prefix}.plusRaw.bedgraph
-genomeCoverageBed -strand - -split -bg -ibam tmp_unique.bam > ${prefix}.minusRaw.bedgraph
-rm tmp_unique.bam
+# Compute plus-strand coverage in the background from a named pipe; the
+# script waits for it below so the bedgraph is complete before it is read.
+mkfifo plus.fifo
+genomeCoverageBed -strand + -split -bg -ibam - < plus.fifo \
+  > ${prefix}.plusRaw.bedgraph &
+bamtools filter -tag NH:1 -in ${bam} \
+  | tee plus.fifo \
+  | genomeCoverageBed -strand - -split -bg -ibam - \
+  > ${prefix}.minusRaw.bedgraph
+wait \$!
+rm plus.fifo
 
 contigsNew.py --chrFile ${genomeFai} \
               --fileP ${prefix}.plusRaw.bedgraph \
               --fileM ${prefix}.minusRaw.bedgraph \
               --sortOut \
-              > ${prefix}.bed
\ No newline at end of file
+              > ${prefix}.bed