From 8df3f434e99e54989c2c20bf053cfeec0157a9fa Mon Sep 17 00:00:00 2001 From: Daniel Joseph Van Twisk Date: Mon, 7 Oct 2024 06:57:36 -0400 Subject: [PATCH] Updated pipeline and generation files. --- arriba_helper.sh | 10 ++++-- badread_helper.sh | 8 ++--- fusionseeker_helper.sh | 20 ++++++++++++ generate_arriba.sh | 55 +++++++++++++++++++++++++++++++ generate_fusionseeker.sh | 26 +++++++++++++++ generate_genion.sh | 24 ++++++++++++++ generate_jaffal.sh | 23 +++++++++++++ generate_longgf.sh | 26 +++++++++++++++ generate_mapping.sh | 67 ++++++++++++++++++++++++++++++++++++++ generate_pbfusion.sh | 23 +++++++++++++ generate_simulated_data.sh | 37 ++++++++++++--------- genion_helper.sh | 15 +++++---- genself_helper.sh | 24 ++++++++++---- jaffal_helper.sh | 19 ++++++----- longgf_helper.sh | 10 +++--- minimap2_helper.sh | 27 +++++++++------ pbfusion_helper.sh | 18 ++++++++++ pbmm2_helper.sh | 13 ++++++++ run_pipeline.sh | 23 +++++++------ star_helper2.sh | 49 ++++++++++++++++++++++++++++ 20 files changed, 448 insertions(+), 69 deletions(-) create mode 100644 fusionseeker_helper.sh create mode 100644 generate_arriba.sh create mode 100644 generate_fusionseeker.sh create mode 100644 generate_genion.sh create mode 100644 generate_jaffal.sh create mode 100644 generate_longgf.sh create mode 100644 generate_mapping.sh create mode 100644 generate_pbfusion.sh create mode 100644 pbfusion_helper.sh create mode 100644 pbmm2_helper.sh create mode 100644 star_helper2.sh diff --git a/arriba_helper.sh b/arriba_helper.sh index 6015891..b0a5f0b 100644 --- a/arriba_helper.sh +++ b/arriba_helper.sh @@ -5,11 +5,17 @@ #SBATCH --time 12:00:00 #SBATCH --mem=128G +DATADIR=/datastore/scratch/users/vantwisk/sim/shortreads_training${6}k +MAPPING_DIR=/datastore/scratch/users/vantwisk/sim/shortreads_training${6}k_mapping +ARRIBA_DIR=/datastore/scratch/users/vantwisk/sim/shortreads_training${6}k_arriba + +[ ! 
-d ${ARRIBA_DIR} ] && mkdir ${ARRIBA_DIR} + #for j in $(seq 1 10); do #singularity exec --pid --bind /datastore arriba_latest.sif \ /home/vantwisk/arriba_v2.2.1/arriba \ - -x i_hun/Star_Homo_sapiens_coverage-${3}-length-${2}-${4}Aligned.sortedByCoord.out.bam \ - -o i_hun/Star_Homo_sapiens_coverage-${3}-length-${2}-${4}.tsv -O i_hun/Star_Homo_sapiens_coverage-${3}-length-${2}-${4}.discarded.tsv \ + -x ${MAPPING_DIR}/fusions-${1}-${4}-${5}-Aligned.sortedByCoord.out.bam \ + -o ${ARRIBA_DIR}/fusions-${1}-${4}-${5}.tsv -O ${ARRIBA_DIR}/fusions-${1}-${4}-${5}.discarded.tsv \ -a Homo_sapiens.GRCh38.dna.primary_assembly.fa -g Homo_sapiens.GRCh38.105.chr.gtf \ -b /home/vantwisk/arriba_v2.2.1/database/blacklist_hg38_GRCh38_v2.2.1.tsv -k /home/vantwisk/arriba_v2.2.1/database/known_fusions_hg38_GRCh38_v2.2.1.tsv -t /home/vantwisk/arriba_v2.2.1/database/known_fusions_hg38_GRCh38_v2.2.1.tsv -p /home/vantwisk/arriba_v2.2.1/database/protein_domains_hg38_GRCh38_v2.2.1.gff3 #done diff --git a/badread_helper.sh b/badread_helper.sh index dca6790..f256947 100644 --- a/badread_helper.sh +++ b/badread_helper.sh @@ -5,8 +5,7 @@ #SBATCH -n 1 #SBATCH --time 24:00:00 -OUTDIR=longreads - +OUTDIR=/datastore/scratch/users/vantwisk/sim/longreads_training${8}k [ ! 
-d ${OUTDIR} ] && mkdir ${OUTDIR} echo $1 @@ -16,11 +15,12 @@ echo $4 echo $5 echo $6 echo $7 +echo $8 #for j in $(seq 1 10); do rustyread --threads 32 simulate --reference ${7} \ --quantity ${1}x \ - --qscore_model ${6} --glitches 0,0,0 --junk_reads 0 --random_reads 0 \ - --error_model ${6} --identity ${5} \ + --qscore_model /home/vantwisk/Badread/badread/qscore_models/${6} --glitches 0,0,0 --junk_reads 0 --random_reads 0 \ + --error_model /home/vantwisk/Badread/badread/error_models/${6} --identity ${5} \ --chimera 0 --seed $RANDOM | gzip > ${OUTDIR}/fusions-${1}-${5}-${6}-${4}.fq.gz #pfun/fuse-${1}-${4}.fq.gz #fuse-transcript-${1}-${4}.fq.gz #done diff --git a/fusionseeker_helper.sh b/fusionseeker_helper.sh new file mode 100644 index 0000000..4c218a3 --- /dev/null +++ b/fusionseeker_helper.sh @@ -0,0 +1,20 @@ +#!/bin/bash +#SBATCH --job-name fusionseeker +#SBATCH --partition allnodes +#SBATCH --time UNLIMITED +#SBATCH --cpus-per-task 1 +#SBATCH --mem-per-cpu 10g + +DATADIR_MINIMAP=/datastore/scratch/users/vantwisk/sim/longreads_training${12}k_minimap2 # BAMs written by minimap2_helper.sh +FUSIONSEEKER_DIR=/datastore/scratch/users/vantwisk/sim/longreads_training${12}k_fusionseeker + +[ !
-d ${FUSIONSEEKER_DIR} ] && mkdir ${FUSIONSEEKER_DIR} + +#singularity exec --pid --bind /datastore longgf_0.1.2--h05f6578_1.sif \ +fusionseeker \ + --thread 16 \ + --bam ${DATADIR_MINIMAP}/fusions-${1}-${5}-${6}-${4}-sorted.bam \ + --gtf Homo_sapiens.GRCh38.105.gtf \ + --ref ../hg38.fa \ + -o ${FUSIONSEEKER_DIR}/fusions-${1}-${5}-${6}-${4}-fusionseeker \ + -s 2 diff --git a/generate_arriba.sh b/generate_arriba.sh new file mode 100644 index 0000000..4f882c4 --- /dev/null +++ b/generate_arriba.sh @@ -0,0 +1,55 @@ + +TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa +FUSE_TRANSCRIPTS=training16k_fusions.fa +FUSE_TRANSCRIPTOME=training16k_transcriptome.fa +FUSE_META=training16k_fusions.txt +NFUSIONS=100 + +N_TRANSCRIPTS=('1') +REPLICATES=10 +COVERAGE=(3 5 10 30 50 100) +QUALITY=('75,90,8' '87,97,5' '95,100,4') +TECH=('pacbio2016' 'nanopore2020') + +READ_LENGTHS=(100 150) + +#module load R-4.0.3 + +#Rscript split_transcripts.R ${TRANSCRIPTOME} ${FUSE_TRANSCRIPTS} ${FUSE_META} ${NFUSIONS} + +#cp ${TRANSCRIPTOME} ${FUSE_TRANSCRIPTOME} +#cat ${FUSE_TRANSCRIPTS} >> ${FUSE_TRANSCRIPTOME} + +#for i in $(seq 1 ${REPLICATES}); do +# for q in ${!COVERAGE[@]}; do +# for j in ${!QUALITY[@]}; do +# for k in ${!TECH[@]}; do +# for n in ${!N_TRANSCRIPTS[@]}; do +# sbatch minimap2_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${N_TRANSCRIPTS[$n]} +# done +# done +# done +# done +#done + +#for i in $(seq 1 ${REPLICATES}); do +# for q in ${!COVERAGE[@]}; do +# for j in ${!QUALITY[@]}; do +# for k in ${!TECH[@]}; do +# for n in ${!N_TRANSCRIPTS[@]}; do +# sbatch genself_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${N_TRANSCRIPTS[$n]} +# done +# done +# done +# done +#done + +for i in $(seq 1 ${REPLICATES}); do + for q in ${!COVERAGE[@]}; do + for j in ${!READ_LENGTHS[@]}; do + for n in ${!N_TRANSCRIPTS[@]}; do + sbatch arriba_helper.sh ${COVERAGE[$q]} 1 1 ${READ_LENGTHS[$j]} ${i} ${N_TRANSCRIPTS[$n]} + done + done + done
+done diff --git a/generate_fusionseeker.sh b/generate_fusionseeker.sh new file mode 100644 index 0000000..62b3e07 --- /dev/null +++ b/generate_fusionseeker.sh @@ -0,0 +1,26 @@ +TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa +FUSE_TRANSCRIPTS=training16k_fusions.fa +FUSE_TRANSCRIPTOME=training16k_transcriptome.fa +FUSE_META=training16k_fusions.txt +NFUSIONS=100 + +N_TRANSCRIPTS=('1') +REPLICATES=10 +COVERAGE=(3 5 10 30 50 100) +QUALITY=('75,90,8' '87,97,5' '95,100,4') +TECH=('pacbio2016' 'nanopore2020') +MIN_OVERLAP_LEN=100 +BIN_SIZE=50 +MIN_MAP_LENGTH=100 + +for i in $(seq 1 ${REPLICATES}); do + for q in ${!COVERAGE[@]}; do + for j in ${!QUALITY[@]}; do + for k in ${!TECH[@]}; do + for n in ${!N_TRANSCRIPTS[@]}; do + sbatch fusionseeker_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${MIN_OVERLAP_LEN} ${BIN_SIZE} ${MIN_MAP_LENGTH} ${N_TRANSCRIPTS[$n]} + done + done + done + done +done diff --git a/generate_genion.sh b/generate_genion.sh new file mode 100644 index 0000000..6de10f6 --- /dev/null +++ b/generate_genion.sh @@ -0,0 +1,24 @@ +TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa +FUSE_TRANSCRIPTS=training16k_fusions.fa +FUSE_TRANSCRIPTOME=training16k_transcriptome.fa +FUSE_META=training16k_fusions.txt +NFUSIONS=100 + +N_TRANSCRIPTS=('1') +REPLICATES=10 +COVERAGE=(3 5 10 30 50 100) +QUALITY=('75,90,8' '87,97,5' '95,100,4') +TECH=('pacbio2016' 'nanopore2020') +MIN_SUPPORT=2 + +for i in $(seq 1 ${REPLICATES}); do + for q in ${!COVERAGE[@]}; do + for j in ${!QUALITY[@]}; do + for k in ${!TECH[@]}; do + for n in ${!N_TRANSCRIPTS[@]}; do + sbatch genion_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${MIN_SUPPORT} ${N_TRANSCRIPTS[$n]} + done + done + done + done +done diff --git a/generate_jaffal.sh b/generate_jaffal.sh new file mode 100644 index 0000000..6d852ce --- /dev/null +++ b/generate_jaffal.sh @@ -0,0 +1,23 @@ 
+TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa +FUSE_TRANSCRIPTS=training16k_fusions.fa +FUSE_TRANSCRIPTOME=training16k_transcriptome.fa +FUSE_META=training16k_fusions.txt +NFUSIONS=100 + +N_TRANSCRIPTS=('1') +REPLICATES=10 +COVERAGE=(3 5 10 30 50 100) +QUALITY=('75,90,8' '87,97,5' '95,100,4') +TECH=('pacbio2016' 'nanopore2020') + +for i in $(seq 1 ${REPLICATES}); do + for q in ${!COVERAGE[@]}; do + for j in ${!QUALITY[@]}; do + for k in ${!TECH[@]}; do + for n in ${!N_TRANSCRIPTS[@]}; do + sbatch jaffal_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${N_TRANSCRIPTS[$n]} + done + done + done + done +done diff --git a/generate_longgf.sh b/generate_longgf.sh new file mode 100644 index 0000000..4cf63d8 --- /dev/null +++ b/generate_longgf.sh @@ -0,0 +1,26 @@ +TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa +FUSE_TRANSCRIPTS=training16k_fusions.fa +FUSE_TRANSCRIPTOME=training16k_transcriptome.fa +FUSE_META=training16k_fusions.txt +NFUSIONS=100 + +N_TRANSCRIPTS=('1') +REPLICATES=10 +COVERAGE=(3 5 10 30 50 100) +QUALITY=('75,90,8' '87,97,5' '95,100,4') +TECH=('pacbio2016' 'nanopore2020') +MIN_OVERLAP_LEN=100 +BIN_SIZE=50 +MIN_MAP_LENGTH=100 + +for i in $(seq 1 ${REPLICATES}); do + for q in ${!COVERAGE[@]}; do + for j in ${!QUALITY[@]}; do + for k in ${!TECH[@]}; do + for n in ${!N_TRANSCRIPTS[@]}; do + sbatch longgf_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${MIN_OVERLAP_LEN} ${BIN_SIZE} ${MIN_MAP_LENGTH} ${N_TRANSCRIPTS[$n]} + done + done + done + done +done diff --git a/generate_mapping.sh b/generate_mapping.sh new file mode 100644 index 0000000..eedc0e9 --- /dev/null +++ b/generate_mapping.sh @@ -0,0 +1,67 @@ + +TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa +FUSE_TRANSCRIPTS=training16k_fusions.fa +FUSE_TRANSCRIPTOME=training16k_transcriptome.fa +FUSE_META=training16k_fusions.txt +NFUSIONS=100 + +N_TRANSCRIPTS=('1') +REPLICATES=10 +COVERAGE=(3 5 10 30 50 
100) +QUALITY=('75,90,8' '87,97,5' '95,100,4') +TECH=('pacbio2016' 'nanopore2020') + +READ_LENGTHS=(100 150) + +#module load R-4.0.3 + +#Rscript split_transcripts.R ${TRANSCRIPTOME} ${FUSE_TRANSCRIPTS} ${FUSE_META} ${NFUSIONS} + +#cp ${TRANSCRIPTOME} ${FUSE_TRANSCRIPTOME} +#cat ${FUSE_TRANSCRIPTS} >> ${FUSE_TRANSCRIPTOME} + +for i in $(seq 1 ${REPLICATES}); do + for q in ${!COVERAGE[@]}; do + for j in ${!QUALITY[@]}; do + for k in ${!TECH[@]}; do + for n in ${!N_TRANSCRIPTS[@]}; do + sbatch minimap2_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${N_TRANSCRIPTS[$n]} + done + done + done + done +done + +#for i in $(seq 1 ${REPLICATES}); do +# for q in ${!COVERAGE[@]}; do +# for j in ${!QUALITY[@]}; do +# for k in ${!TECH[@]}; do +# for n in ${!N_TRANSCRIPTS[@]}; do +# sbatch genself_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${N_TRANSCRIPTS[$n]} +# done +# done +# done +# done +#done + +#for i in $(seq 1 ${REPLICATES}); do +# for q in ${!COVERAGE[@]}; do +# for j in ${!QUALITY[@]}; do +# for k in ${!TECH[@]}; do +# for n in ${!N_TRANSCRIPTS[@]}; do +# sbatch pbmm2_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${N_TRANSCRIPTS[$n]} +# done +# done +# done +# done +#done + +#for i in $(seq 1 ${REPLICATES}); do +# for q in ${!COVERAGE[@]}; do +# for j in ${!READ_LENGTHS[@]}; do +# for n in ${!N_TRANSCRIPTS[@]}; do +# sbatch star_helper2.sh ${COVERAGE[$q]} 1 1 ${READ_LENGTHS[$j]} ${i} ${N_TRANSCRIPTS[$n]} +# done +# done +# done +#done diff --git a/generate_pbfusion.sh b/generate_pbfusion.sh new file mode 100644 index 0000000..e0e50df --- /dev/null +++ b/generate_pbfusion.sh @@ -0,0 +1,23 @@ +TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa +FUSE_TRANSCRIPTS=training16k_fusions.fa +FUSE_TRANSCRIPTOME=training16k_transcriptome.fa +FUSE_META=training16k_fusions.txt +NFUSIONS=100 + +N_TRANSCRIPTS=('1') +REPLICATES=10 +COVERAGE=(3 5 10 30 50 100) +QUALITY=('75,90,8' '87,97,5' 
'95,100,4') +TECH=('pacbio2016' 'nanopore2020') + +for i in $(seq 1 ${REPLICATES}); do + for q in ${!COVERAGE[@]}; do + for j in ${!QUALITY[@]}; do + for k in ${!TECH[@]}; do + for n in ${!N_TRANSCRIPTS[@]}; do + sbatch pbfusion_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${N_TRANSCRIPTS[$n]} + done + done + done + done +done diff --git a/generate_simulated_data.sh b/generate_simulated_data.sh index 01e0af7..af678d7 100644 --- a/generate_simulated_data.sh +++ b/generate_simulated_data.sh @@ -1,18 +1,19 @@ -TRANSCRIPTOME=Homo_sapiens.cdna_50k.fa -FUSE_TRANSCRIPTS=test_fusions1.fa -FUSE_TRANSCRIPTOME=test_transcriptome.fa -FUSE_META=test_fusions1.txt -NFUSIONS=500 +TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa +FUSE_TRANSCRIPTS=training16k_fusions.fa +FUSE_TRANSCRIPTOME=training16k_transcriptome.fa +FUSE_META=training16k_fusions.txt +NFUSIONS=100 +N_TRANSCRIPTS=('1') REPLICATES=10 -COVERAGE=(3 5 10 30 50) -QUALITY=('87,97,5') #('75,90,8' '87.5,97.5,5' '95,100,4') +COVERAGE=(3 5 10 30 50 100) +QUALITY=('75,90,8' '87,97,5' '95,100,4') TECH=('pacbio2016' 'nanopore2020') READ_LENGTHS=(100 150) -module load R-4.0.3 +#module load R-4.0.3 #Rscript split_transcripts.R ${TRANSCRIPTOME} ${FUSE_TRANSCRIPTS} ${FUSE_META} ${NFUSIONS} @@ -23,17 +24,21 @@ for i in $(seq 1 ${REPLICATES}); do for q in ${!COVERAGE[@]}; do for j in ${!QUALITY[@]}; do for k in ${!TECH[@]}; do - sbatch badread_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ${FUSE_TRANSCRIPTOME} + for n in ${!N_TRANSCRIPTS[@]}; do + sbatch badread_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} training${N_TRANSCRIPTS[$n]}k_transcriptome.fa ${N_TRANSCRIPTS[$n]} + done done done done done -for i in $(seq 1 ${REPLICATES}); do - for q in ${!COVERAGE[@]}; do - for j in ${!READ_LENGTHS[@]}; do - sbatch art_helper.sh ${COVERAGE[$q]} ${READ_LENGTHS[$j]} ${FUSE_TRANSCRIPTOME} ${i} - done - done -done +#for i in $(seq 1 ${REPLICATES}); do +# for q in 
${!COVERAGE[@]}; do +# for j in ${!READ_LENGTHS[@]}; do +# for n in ${!N_TRANSCRIPTS[@]}; do +# sbatch art_helper.sh ${COVERAGE[$q]} ${READ_LENGTHS[$j]} training${N_TRANSCRIPTS[$n]}k_transcriptome.fa ${i} ${N_TRANSCRIPTS[$n]} +# done +# done +# done +#done diff --git a/genion_helper.sh b/genion_helper.sh index 49eaab6..4ff5538 100644 --- a/genion_helper.sh +++ b/genion_helper.sh @@ -5,17 +5,18 @@ #SBATCH --cpus-per-task 1 #SBATCH --mem=128G -DATADIR=longreads -MINIMAP_DATADIR=longreads_mappings -GENION_DATADIR=longreads_genion +DATADIR=/datastore/scratch/users/vantwisk/sim/longreads_training${11}k +MINIMAP_DATADIR=/datastore/scratch/users/vantwisk/sim/longreads_training${11}k_minimap2 +GENION_DIR=/datastore/scratch/users/vantwisk/sim/longreads_training${11}k_genion + +[ ! -d ${GENION_DIR} ] && mkdir ${GENION_DIR} #singularity exec --pid --bind /datastore longgf_0.1.2--h05f6578_1.sif \ - genion \ - -t 32 \ - --min-support 2 \ +genion \ + --min-support ${10} \ -i ${DATADIR}/fusions-${1}-${5}-${6}-${4}.fq.gz \ --gtf Homo_sapiens.GRCh38.105.gtf \ --gpaf ${MINIMAP_DATADIR}/fusions-${1}-${5}-${6}-${4}.paf \ -s ${MINIMAP_DATADIR}/fusions-${1}-${5}-${6}-${4}-selfalign.tsv \ -d genomicSuperDups.txt \ - -o ${GENION_DATADIR}/fusions-${1}-${5}-${6}-${4}-genion.tsv + -o ${GENION_DIR}/fusions-${1}-${5}-${6}-${4}-genion-minsup-${10}.tsv diff --git a/genself_helper.sh b/genself_helper.sh index e307052..e6e5a14 100644 --- a/genself_helper.sh +++ b/genself_helper.sh @@ -5,12 +5,24 @@ #SBATCH -n 1 #SBATCH -t 04:00:00 -DATADIR=longreads -MINIMAP_DATADIR=longreads-mappings +DATADIR=/datastore/scratch/users/vantwisk/sim/longreads_training${9}k +DATADIR_MINIMAP=/datastore/scratch/users/vantwisk/sim/longreads_training${9}k_minimap2 + +[ ! 
-d ${DATADIR_MINIMAP} ] && mkdir ${DATADIR_MINIMAP} + +echo $1 #coverage +echo $2 #nothing +echo $3 #nothing +echo $4 #run +echo $5 #identity +echo $6 #tech +echo $7 #ax +echo $8 #sam +echo $9 #n #for j in $(seq 1 10); do -#/home/vantwisk/minimap2-2.21_x64-linux/minimap2 -cx ava-ont ${DATADIR}/fuse-${1}-${4}.fq.gz ${DATADIR}/fuse-${1}-${4}.fq.gz > ${DATADIR}/fuse-${1}-${4}-minimap2-splice.paf -/home/vantwisk/minimap2-2.21_x64-linux/minimap2 ${DATADIR}/fusions-${1}-${5}-${6}-${4}.fq.gz ${DATADIR}/fusions-${1}-${5}-${6}-${4}.fq.gz -X -t 32 -2 -c -o ${MINIMAP_DATADIR}/fusions-${1}-${5}-${6}-${4}-selfalign.paf -cat ${MINIMAP_DATADIR}/fusions-${1}-${5}-${6}-${4}-selfalign.paf | cut -f1,6 | sed 's/_/\t/g' | awk 'BEGIN{OFS=\"\\t\";}{print substr($1,1,15),substr($2,1,15),substr($3,1,15),substr($4,1,15);}' | awk '$1!=$3' | sort | uniq > ${MINIMAP_DATADIR}/fusions-${1}-${5}-${6}-${4}-selfalign.tsv -#/home/vantwisk/minimap2-2.21_x64-linux/minimap2 -t 32 --MD -cx splice ../hg38.fa ${DATADIR}/fuse-${1}-${4}.fq.gz > ${DATADIR}/fuse-${1}-${4}-minimap2-splice.paf +#/home/vantwisk/minimap2-2.21_x64-linux/minimap2 -cx ava-ont ${DATADIR}/fuse-${1}-${4}.fq.gz ${DATADIR}/fuse-${1}-${4}.fq.gz > ${DATADIR_MINIMAP}/fuse-${1}-${4}-minimap2-splice.paf +/home/vantwisk/minimap2-2.21_x64-linux/minimap2 ${DATADIR}/fusions-${1}-${5}-${6}-${4}.fq.gz ${DATADIR}/fusions-${1}-${5}-${6}-${4}.fq.gz -X -t 32 -2 -c -o ${DATADIR_MINIMAP}/fusions-${1}-${5}-${6}-${4}-selfalign.paf +cat ${DATADIR_MINIMAP}/fusions-${1}-${5}-${6}-${4}-selfalign.paf | cut -f1,6 | sed 's/_/\t/g' | awk 'BEGIN{OFS="\t";}{print substr($1,1,15),substr($2,1,15),substr($3,1,15),substr($4,1,15);}' | awk '$1!=$3' | sort | uniq > ${DATADIR_MINIMAP}/fusions-${1}-${5}-${6}-${4}-selfalign.tsv +#/home/vantwisk/minimap2-2.21_x64-linux/minimap2 -t 32 --MD -cx splice ../hg38.fa ${DATADIR}/fuse-${1}-${4}.fq.gz > ${DATADIR_MINIMAP}/fuse-${1}-${4}-minimap2-splice.paf #done diff --git a/jaffal_helper.sh b/jaffal_helper.sh index a84e1a3..d6f8b84 
100644 --- a/jaffal_helper.sh +++ b/jaffal_helper.sh @@ -5,16 +5,19 @@ #SBATCH --cpus-per-task 1 #SBATCH --mem=128G -DATADIR=longreads -JAFFAL_DATADIR=longreads_jaffal +DATADIR=/datastore/scratch/users/vantwisk/sim/longreads_training${9}k +JAFFAL_DATADIR=/datastore/scratch/users/vantwisk/sim/longreads${9}k_training_jaffal +#JAFFAL_DATADIR=/datastore/scratch/users/vantwisk/sim/longreads1k_training_jaffal OUTDIR=${JAFFAL_DATADIR}/fusions-${1}-${5}-${6}-${4}-jaffal_out +#OUTDIR=${JAFFAL_DATADIR}/fusions-${1}-${5}-${6}-${4}-jaffal_out + +[ ! -d ${JAFFAL_DATADIR} ] && mkdir ${JAFFAL_DATADIR} + +[ ! -d ${OUTDIR} ] && mkdir ${OUTDIR} -if [[ ! -d "${OUTDIR}" ]] -then - mkdir ${OUTDIR} -fi cd ${OUTDIR} #singularity exec --pid --bind /datastore longgf_0.1.2--h05f6578_1.sif \ - ~/JAFFA-version-2.2/tools/bin/bpipe run ~/JAFFA-version-2.2/JAFFAL.groovy \ - ../../${DATADIR}/fusions-${1}-${5}-${6}-*.fq.gz +echo ${DATADIR}/fusions-${1}-${5}-${6}-*.fq.gz +~/JAFFA-version-2.2/tools/bin/bpipe run ~/JAFFA-version-2.2/JAFFAL.groovy \ + ${DATADIR}/fusions-${1}-${5}-${6}-${4}.fq.gz diff --git a/longgf_helper.sh b/longgf_helper.sh index 9397f02..2173ec3 100644 --- a/longgf_helper.sh +++ b/longgf_helper.sh @@ -5,11 +5,13 @@ #SBATCH --cpus-per-task 1 #SBATCH --mem-per-cpu 10g -MINIMAP_DIR=${7} -LONGGF_DIR=${8} +DATADIR_MINIMAP=/datastore/scratch/users/vantwisk/sim/longreads_training${12}k_minimap2 +LONGGF_DIR=/datastore/scratch/users/vantwisk/sim/longreads_training${12}k_longgf + +[ ! 
-d ${LONGGF_DIR} ] && mkdir ${LONGGF_DIR} #singularity exec --pid --bind /datastore longgf_0.1.2--h05f6578_1.sif \ ~/LongGF/bin/LongGF \ - ${MINIMAP_DIR}/fusions-${1}-${5}-${6}-${4}-sorted-n.bam \ + ${DATADIR_MINIMAP}/fusions-${1}-${5}-${6}-${4}-n-sorted.bam \ Homo_sapiens.GRCh38.105.gtf \ - 40 50 100 > ${LONGGF_DIR}/fusions-${1}-${5}-${6}-${4}.log + ${9} ${10} ${11} > ${LONGGF_DIR}/fusions-${1}-${5}-${6}-${4}-${9}-${10}-${11}.log diff --git a/minimap2_helper.sh b/minimap2_helper.sh index 736eb79..6e102a5 100644 --- a/minimap2_helper.sh +++ b/minimap2_helper.sh @@ -5,19 +5,26 @@ #SBATCH -n 1 #SBATCH -t 12:00:00 -DATADIR=longreads -DATADIR_MINIMAP=longreads_mappings +DATADIR=/datastore/scratch/users/vantwisk/sim/longreads_training${9}k +DATADIR_MINIMAP=/datastore/scratch/users/vantwisk/sim/longreads_training${9}k_minimap2 -echo $1 -echo $2 -echo $3 -echo $4 -echo $5 -echo $6 -echo $7 -echo $8 +[ ! -d ${DATADIR_MINIMAP} ] && mkdir ${DATADIR_MINIMAP} + +echo $1 #coverage +echo $2 #nothing +echo $3 #nothing +echo $4 #run +echo $5 #identity +echo $6 #tech +echo $7 #ax +echo $8 #sam +echo $9 #n #for j in $(seq 1 10); do #/home/vantwisk/minimap2-2.21_x64-linux/minimap2 -cx ava-ont ${DATADIR}/fuse-${1}-${4}.fq.gz ${DATADIR}/fuse-${1}-${4}.fq.gz > ${DATADIR}/fuse-${1}-${4}-minimap2-splice.paf /home/vantwisk/minimap2-2.21_x64-linux/minimap2 -t 32 --MD -${7} splice ../hg38.fa ${DATADIR}/fusions-${1}-${5}-${6}-${4}.fq.gz > ${DATADIR_MINIMAP}/fusions-${1}-${5}-${6}-${4}.${8} #done + +samtools view ${DATADIR_MINIMAP}/fusions-${1}-${5}-${6}-${4}.sam -o ${DATADIR_MINIMAP}/fusions-${1}-${5}-${6}-${4}.bam +samtools sort ${DATADIR_MINIMAP}/fusions-${1}-${5}-${6}-${4}.bam -o ${DATADIR_MINIMAP}/fusions-${1}-${5}-${6}-${4}-sorted.bam +samtools sort -n ${DATADIR_MINIMAP}/fusions-${1}-${5}-${6}-${4}.bam -o ${DATADIR_MINIMAP}/fusions-${1}-${5}-${6}-${4}-n-sorted.bam diff --git a/pbfusion_helper.sh b/pbfusion_helper.sh new file mode 100644 index 0000000..b4a3e06 --- /dev/null +++ 
b/pbfusion_helper.sh @@ -0,0 +1,18 @@ +#!/bin/bash +#SBATCH --job-name pbfusion +#SBATCH --partition allnodes +#SBATCH --time UNLIMITED +#SBATCH --cpus-per-task 1 +#SBATCH --mem=128G + +DATADIR_MINIMAP=/datastore/scratch/users/vantwisk/sim/longreads_training${9}k_minimap2 +DATADIR_PBFUSION=/datastore/scratch/users/vantwisk/sim/longreads_training${9}k_pbfusion # pbfusion output directory; used by mkdir and --output-prefix + +[ ! -d ${DATADIR_PBFUSION} ] && mkdir ${DATADIR_PBFUSION} + +pbfusion discover \ + --bam ${DATADIR_MINIMAP}/fusions-${1}-${5}-${6}-${4}-pbmm2.bam \ + --gtf ../gencode.v38.annotation.gtf \ + --output-prefix ${DATADIR_PBFUSION}/fusions-${1}-${5}-${6}-${4}-pbmm2- \ + --min-coverage 1 \ + --threads 32 -v diff --git a/pbmm2_helper.sh b/pbmm2_helper.sh new file mode 100644 index 0000000..c00c4e3 --- /dev/null +++ b/pbmm2_helper.sh @@ -0,0 +1,13 @@ +#!/bin/bash +#SBATCH --job-name pbmm2 +#SBATCH --partition allnodes +#SBATCH --time UNLIMITED +#SBATCH --cpus-per-task 1 +#SBATCH --mem=128G + +DATADIR=/datastore/scratch/users/vantwisk/sim/longreads_training${9}k +DATADIR_MINIMAP=/datastore/scratch/users/vantwisk/sim/longreads_training${9}k_minimap2 + +[ !
-d ${DATADIR_MINIMAP} ] && mkdir ${DATADIR_MINIMAP} + +pbmm2 align -j 32 --preset ISOSEQ --sort ../hg38_gencode.mmi ${DATADIR}/fusions-${1}-${5}-${6}-${4}.fq.gz ${DATADIR_MINIMAP}/fusions-${1}-${5}-${6}-${4}-pbmm2.bam diff --git a/run_pipeline.sh b/run_pipeline.sh index 51cb324..7db409d 100644 --- a/run_pipeline.sh +++ b/run_pipeline.sh @@ -3,16 +3,15 @@ DATADIR=pfun-med COVERAGE=50 REPLICATES=10 -Rscript split_transcripts.R +bash generate_annotation_resources.sh +bash generate_breakpoints.sh +bash generate_simulated_data.sh +bash generate_mapping.sh +bash generate_arriba.sh +bash generate_longgf.sh +bash generate_jaffal.sh +bash generate_genion.sh +bash generate_fusionseeker.sh +bash generate_pbfusion.sh -bash badread.sh ${COVERAGE} 1 1 ${REPLICATES} -bash fqc.sh ${COVERAGE} 1 1 ${REPLCIATES} -bash minimap2.sh ${COVERAGE} 1 1 ${REPLICATES} -bash minimap2_paf.sh ${COVERAGE} 1 1 ${REPLICATES} -bash genself.sh ${COVERAGE} 1 1 ${REPLICATES} -bash sort.sh ${COVERAGE} 1 1 ${REPLICATES} -bash longread.sh ${COVERAGE} 1 1 ${REPLICATES} -bash genion.sh ${COVERAGE} 1 1 ${REPLICATES} -bash jaffal.sh ${COVERAGE} 1 1 ${REPLICATES} - -Rscript generate_figures +Rscript generate_figures.R diff --git a/star_helper2.sh b/star_helper2.sh new file mode 100644 index 0000000..0313f15 --- /dev/null +++ b/star_helper2.sh @@ -0,0 +1,49 @@ +#!/bin/sh + +#SBATCH --job-name star +#SBATCH --partition allnodes +#SBATCH --time 12:00:00 +#SBATCH --mem=128G + +DATADIR=/datastore/scratch/users/vantwisk/sim/shortreads_training${6}k +MAPPING_DIR=/datastore/scratch/users/vantwisk/sim/shortreads_training${6}k_mapping + +[ ! 
-d ${MAPPING_DIR} ] && mkdir ${MAPPING_DIR} + +#for j in $(seq 1 10); do +#singularity exec --pid --bind /datastore star_latest.sif \ +/home/vantwisk/STAR-2.7.10a/bin/Linux_x86_64_static/STAR \ + --runThreadN 16 \ + --genomeDir ../hg38_star_index --genomeLoad NoSharedMemory \ + --readFilesIn ${DATADIR}/fusions-${1}-${4}-${5}-1.fq ${DATADIR}/fusions-${1}-${4}-${5}-2.fq \ + --outFileNamePrefix ${MAPPING_DIR}/fusions-${1}-${4}-${5}- \ + --readFilesCommand zcat \ + --outSAMtype BAM SortedByCoordinate --outSAMunmapped Within --outBAMcompression 0 \ + --chimOutType WithinBAM \ + --chimSegmentMin 12 \ + --chimJunctionOverhangMin 8 \ + --chimOutJunctionFormat 1 \ + --alignSJDBoverhangMin 10 \ + --alignMatesGapMax 100000 \ + --alignIntronMax 100000 \ + --alignSJstitchMismatchNmax 5 -1 5 5 \ + --outSAMattrRGline ID:GRPundef \ + --chimMultimapScoreRange 3 \ + --chimScoreJunctionNonGTAG -4 \ + --chimMultimapNmax 20 \ + --chimNonchimScoreDropMin 10 \ + --peOverlapNbasesMin 12 \ + --peOverlapMMp 0.1 \ + --alignInsertionFlush Right \ + --alignSplicedMateMapLminOverLmate 0 \ + --alignSplicedMateMapLmin 30 +# STAR --runThreadN 8 \ +# --genomeDir hg38_star_index \ +# --readFilesIn i_hun/Homo_sapiens_coverage-${3}-length-${2}-${4}-1.fq i_hun/Homo_sapiens_coverage-${3}-length-${2}-${4}-2.fq \ +# --outFileNamePrefix i_hun/Star_Homo_sapiens_coverage-${3}-length-${2}-${4} \ +# --outSAMtype BAM SortedByCoordinate --outSAMunmapped Within --outBAMcompression 0 --outSAMattributes Standard \ +# --outFilterMultimapNmax 50 --peOverlapNbasesMin 10 --alignSplicedMateMapLminOverLmate 0.5 --alignSJstitchMismatchNmax 5 -1 5 5 \ +# --chimOutType SeparateSAMold +# --chimSegmentMin 10 --chimOutType Junctions --chimJunctionOverhangMin 10 --chimScoreDropMax 30 \ +# --chimScoreJunctionNonGTAG 0 --chimScoreSeparation 1 --chimSegmentReadGapMax 3 --chimMultimapNmax 50 +#done