Skip to content

Commit

Permalink
Updated pipeline and generation files.
Browse files Browse the repository at this point in the history
  • Loading branch information
Daniel Joseph Van Twisk committed Oct 7, 2024
1 parent 09dfb4a commit 8df3f43
Show file tree
Hide file tree
Showing 20 changed files with 448 additions and 69 deletions.
10 changes: 8 additions & 2 deletions arriba_helper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,17 @@
#SBATCH --time 12:00:00
#SBATCH --mem=128G

# Per-run directories. The sixth positional argument is inserted before the
# "k" suffix of each directory name. generate_arriba.sh calls this script as:
#   arriba_helper.sh <coverage> 1 1 <read length> <replicate> <n transcripts>
# NOTE(review): DATADIR is assigned but not referenced in the lines visible
# here — confirm it is still needed.
DATADIR=/datastore/scratch/users/vantwisk/sim/shortreads_training${6}k
MAPPING_DIR=/datastore/scratch/users/vantwisk/sim/shortreads_training${6}k_mapping
ARRIBA_DIR=/datastore/scratch/users/vantwisk/sim/shortreads_training${6}k_arriba

# Create the Arriba output directory if it does not exist yet.
[ ! -d ${ARRIBA_DIR} ] && mkdir ${ARRIBA_DIR}

# Run Arriba on one coordinate-sorted BAM and write the fusion calls (-o) and
# the discarded candidates (-O) as .tsv files.
# NOTE(review): two sets of -x/-o/-O arguments appear below (i_hun/... using
# $2/$3/$4 and ${MAPPING_DIR}/... using $1/$4/$5). They look like the pre- and
# post-change lines of a diff shown together — confirm that only one set is
# present in the real script.
# NOTE(review): -k and -t are both given the known_fusions file — confirm this
# is intended.
#for j in $(seq 1 10); do
#singularity exec --pid --bind /datastore arriba_latest.sif \
/home/vantwisk/arriba_v2.2.1/arriba \
-x i_hun/Star_Homo_sapiens_coverage-${3}-length-${2}-${4}Aligned.sortedByCoord.out.bam \
-o i_hun/Star_Homo_sapiens_coverage-${3}-length-${2}-${4}.tsv -O i_hun/Star_Homo_sapiens_coverage-${3}-length-${2}-${4}.discarded.tsv \
-x ${MAPPING_DIR}/fusions-${1}-${4}-${5}-Aligned.sortedByCoord.out.bam \
-o ${ARRIBA_DIR}/fusions-${1}-${4}-${5}.tsv -O ${ARRIBA_DIR}/fusions-${1}-${4}-${5}.discarded.tsv \
-a Homo_sapiens.GRCh38.dna.primary_assembly.fa -g Homo_sapiens.GRCh38.105.chr.gtf \
-b /home/vantwisk/arriba_v2.2.1/database/blacklist_hg38_GRCh38_v2.2.1.tsv -k /home/vantwisk/arriba_v2.2.1/database/known_fusions_hg38_GRCh38_v2.2.1.tsv -t /home/vantwisk/arriba_v2.2.1/database/known_fusions_hg38_GRCh38_v2.2.1.tsv -p /home/vantwisk/arriba_v2.2.1/database/protein_domains_hg38_GRCh38_v2.2.1.gff3
#done
8 changes: 4 additions & 4 deletions badread_helper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
#SBATCH -n 1
#SBATCH --time 24:00:00

OUTDIR=longreads

OUTDIR=/datastore/scratch/users/vantwisk/sim/longreads_training${8}k

[ ! -d ${OUTDIR} ] && mkdir ${OUTDIR}
echo $1
Expand All @@ -16,11 +15,12 @@ echo $4
echo $5
echo $6
echo $7
echo $8

# Simulate long reads with rustyread and write them, gzip-compressed, to
# ${OUTDIR}/fusions-$1-$5-$6-$4.fq.gz.
#   $1 - quantity, passed as "${1}x"
#   $4 - only used in the output file name
#   $5 - value given to --identity
#   $6 - model name, used for both the q-score model and the error model
#   $7 - file given to --reference
# Glitches, junk reads, random reads and chimeras are all set to 0.
# --seed is taken from $RANDOM, so each run uses a different seed.
# NOTE(review): two --qscore_model/--error_model line pairs appear below (bare
# ${6} and /home/vantwisk/Badread/... paths). They look like the pre- and
# post-change lines of a diff shown together — confirm that only one pair is
# present in the real script.
#for j in $(seq 1 10); do
rustyread --threads 32 simulate --reference ${7} \
--quantity ${1}x \
--qscore_model ${6} --glitches 0,0,0 --junk_reads 0 --random_reads 0 \
--error_model ${6} --identity ${5} \
--qscore_model /home/vantwisk/Badread/badread/qscore_models/${6} --glitches 0,0,0 --junk_reads 0 --random_reads 0 \
--error_model /home/vantwisk/Badread/badread/error_models/${6} --identity ${5} \
--chimera 0 --seed $RANDOM | gzip > ${OUTDIR}/fusions-${1}-${5}-${6}-${4}.fq.gz #pfun/fuse-${1}-${4}.fq.gz #fuse-transcript-${1}-${4}.fq.gz
#done
20 changes: 20 additions & 0 deletions fusionseeker_helper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
#SBATCH --job-name fusionseeker
#SBATCH --partition allnodes
#SBATCH --time UNLIMITED
#SBATCH --cpus-per-task 1
#SBATCH --mem-per-cpu 10g
#
# Run FusionSeeker on one sorted minimap2 BAM of simulated long reads.
#
# Positional arguments read by this script:
#   $1 $5 $6 $4 - combined, in that order, into the sample name
#                 fusions-$1-$5-$6-$4
#   $12         - inserted into the directory names longreads_training${12}k_*
# generate_fusionseeker.sh passes, in order: coverage, 1, 1, replicate,
# quality, technology, ax, sam, min overlap length, bin size, min map length,
# number of transcripts.
#
# NOTE(review): the job requests one CPU (--cpus-per-task 1) but FusionSeeker
# is started with 16 threads — confirm which of the two is intended.

DATADIR_MINIMAP="/datastore/scratch/users/vantwisk/sim/longreads_training${12}k_minimap2"
FUSIONSEEKER_DIR="/datastore/scratch/users/vantwisk/sim/longreads_training${12}k_fusionseeker"
sample="fusions-${1}-${5}-${6}-${4}"

# -p makes this a no-op when the directory already exists, so jobs that start
# at the same time do not fail on "File exists".
mkdir -p -- "${FUSIONSEEKER_DIR}"

#singularity exec --pid --bind /datastore longgf_0.1.2--h05f6578_1.sif \
fusionseeker \
  --thread 16 \
  --bam "${DATADIR_MINIMAP}/${sample}-sorted.bam" \
  --gtf Homo_sapiens.GRCh38.105.gtf \
  --ref ../hg38.fa \
  -o "${FUSIONSEEKER_DIR}/${sample}-fusionseeker" \
  -s 2
55 changes: 55 additions & 0 deletions generate_arriba.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@

# Submit arriba_helper.sh once per replicate x coverage x read length x
# transcript-set combination.

# Inputs of the transcript-splitting step (only used by the disabled commands
# further down).
TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa
FUSE_TRANSCRIPTS=training16k_fusions.fa
FUSE_TRANSCRIPTOME=training16k_transcriptome.fa
FUSE_META=training16k_fusions.txt
NFUSIONS=100

# Parameter grid. QUALITY and TECH are only referenced by the disabled
# long-read submissions.
N_TRANSCRIPTS=('1')
REPLICATES=10
COVERAGE=(3 5 10 30 50 100)
QUALITY=('75,90,8' '87,97,5' '95,100,4')
TECH=('pacbio2016' 'nanopore2020')

READ_LENGTHS=(100 150)

# Disabled: build the fusion transcriptome.
#module load R-4.0.3
#Rscript split_transcripts.R ${TRANSCRIPTOME} ${FUSE_TRANSCRIPTS} ${FUSE_META} ${NFUSIONS}
#cp ${TRANSCRIPTOME} ${FUSE_TRANSCRIPTOME}
#cat ${FUSE_TRANSCRIPTS} >> ${FUSE_TRANSCRIPTOME}

# Disabled: long-read submissions, first with minimap2_helper.sh and then with
# genself_helper.sh.
#for helper in minimap2_helper.sh genself_helper.sh; do
#  for ((rep = 1; rep <= REPLICATES; rep++)); do
#    for cov in "${COVERAGE[@]}"; do
#      for qual in "${QUALITY[@]}"; do
#        for tech in "${TECH[@]}"; do
#          for ntx in "${N_TRANSCRIPTS[@]}"; do
#            sbatch "${helper}" "${cov}" 1 1 "${rep}" "${qual}" "${tech}" ax sam "${ntx}"
#          done
#        done
#      done
#    done
#  done
#done

# Arriba submissions on the short-read mappings.
for ((rep = 1; rep <= REPLICATES; rep++)); do
  for cov in "${COVERAGE[@]}"; do
    for len in "${READ_LENGTHS[@]}"; do
      for ntx in "${N_TRANSCRIPTS[@]}"; do
        sbatch arriba_helper.sh "${cov}" 1 1 "${len}" "${rep}" "${ntx}"
      done
    done
  done
done
26 changes: 26 additions & 0 deletions generate_fusionseeker.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Submit fusionseeker_helper.sh once per replicate x coverage x quality x
# technology x transcript-set combination.

# Simulation inputs (not referenced by the submission loop below).
TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa
FUSE_TRANSCRIPTS=training16k_fusions.fa
FUSE_TRANSCRIPTOME=training16k_transcriptome.fa
FUSE_META=training16k_fusions.txt
NFUSIONS=100

# Parameter grid.
N_TRANSCRIPTS=('1')
REPLICATES=10
COVERAGE=(3 5 10 30 50 100)
QUALITY=('75,90,8' '87,97,5' '95,100,4')
TECH=('pacbio2016' 'nanopore2020')

# Fixed values forwarded to the helper as arguments 9-11.
MIN_OVERLAP_LEN=100
BIN_SIZE=50
MIN_MAP_LENGTH=100

for ((rep = 1; rep <= REPLICATES; rep++)); do
  for cov in "${COVERAGE[@]}"; do
    for qual in "${QUALITY[@]}"; do
      for tech in "${TECH[@]}"; do
        for ntx in "${N_TRANSCRIPTS[@]}"; do
          sbatch fusionseeker_helper.sh "${cov}" 1 1 "${rep}" "${qual}" "${tech}" ax sam \
            "${MIN_OVERLAP_LEN}" "${BIN_SIZE}" "${MIN_MAP_LENGTH}" "${ntx}"
        done
      done
    done
  done
done
24 changes: 24 additions & 0 deletions generate_genion.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Submit genion_helper.sh once per replicate x coverage x quality x
# technology x transcript-set combination.

# Simulation inputs (not referenced by the submission loop below).
TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa
FUSE_TRANSCRIPTS=training16k_fusions.fa
FUSE_TRANSCRIPTOME=training16k_transcriptome.fa
FUSE_META=training16k_fusions.txt
NFUSIONS=100

# Parameter grid.
N_TRANSCRIPTS=('1')
REPLICATES=10
COVERAGE=(3 5 10 30 50 100)
QUALITY=('75,90,8' '87,97,5' '95,100,4')
TECH=('pacbio2016' 'nanopore2020')

# Fixed value forwarded to the helper as argument 9.
MIN_SUPPORT=2

for ((rep = 1; rep <= REPLICATES; rep++)); do
  for cov in "${COVERAGE[@]}"; do
    for qual in "${QUALITY[@]}"; do
      for tech in "${TECH[@]}"; do
        for ntx in "${N_TRANSCRIPTS[@]}"; do
          sbatch genion_helper.sh "${cov}" 1 1 "${rep}" "${qual}" "${tech}" ax sam \
            "${MIN_SUPPORT}" "${ntx}"
        done
      done
    done
  done
done
23 changes: 23 additions & 0 deletions generate_jaffal.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Submit jaffal_helper.sh once per replicate x coverage x quality x
# technology x transcript-set combination.

# Simulation inputs (not referenced by the submission loop below).
TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa
FUSE_TRANSCRIPTS=training16k_fusions.fa
FUSE_TRANSCRIPTOME=training16k_transcriptome.fa
FUSE_META=training16k_fusions.txt
NFUSIONS=100

# Parameter grid.
N_TRANSCRIPTS=('1')
REPLICATES=10
COVERAGE=(3 5 10 30 50 100)
QUALITY=('75,90,8' '87,97,5' '95,100,4')
TECH=('pacbio2016' 'nanopore2020')

for ((rep = 1; rep <= REPLICATES; rep++)); do
  for cov in "${COVERAGE[@]}"; do
    for qual in "${QUALITY[@]}"; do
      for tech in "${TECH[@]}"; do
        for ntx in "${N_TRANSCRIPTS[@]}"; do
          sbatch jaffal_helper.sh "${cov}" 1 1 "${rep}" "${qual}" "${tech}" ax sam "${ntx}"
        done
      done
    done
  done
done
26 changes: 26 additions & 0 deletions generate_longgf.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Submit longgf_helper.sh once per replicate x coverage x quality x
# technology x transcript-set combination.

# Simulation inputs (not referenced by the submission loop below).
TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa
FUSE_TRANSCRIPTS=training16k_fusions.fa
FUSE_TRANSCRIPTOME=training16k_transcriptome.fa
FUSE_META=training16k_fusions.txt
NFUSIONS=100

# Parameter grid.
N_TRANSCRIPTS=('1')
REPLICATES=10
COVERAGE=(3 5 10 30 50 100)
QUALITY=('75,90,8' '87,97,5' '95,100,4')
TECH=('pacbio2016' 'nanopore2020')

# Fixed values forwarded to the helper as arguments 9-11.
MIN_OVERLAP_LEN=100
BIN_SIZE=50
MIN_MAP_LENGTH=100

for ((rep = 1; rep <= REPLICATES; rep++)); do
  for cov in "${COVERAGE[@]}"; do
    for qual in "${QUALITY[@]}"; do
      for tech in "${TECH[@]}"; do
        for ntx in "${N_TRANSCRIPTS[@]}"; do
          sbatch longgf_helper.sh "${cov}" 1 1 "${rep}" "${qual}" "${tech}" ax sam \
            "${MIN_OVERLAP_LEN}" "${BIN_SIZE}" "${MIN_MAP_LENGTH}" "${ntx}"
        done
      done
    done
  done
done
67 changes: 67 additions & 0 deletions generate_mapping.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@

# Submit minimap2_helper.sh once per replicate x coverage x quality x
# technology x transcript-set combination. Submissions for other mappers are
# kept below, disabled.

# Inputs of the transcript-splitting step (only used by the disabled commands
# further down).
TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa
FUSE_TRANSCRIPTS=training16k_fusions.fa
FUSE_TRANSCRIPTOME=training16k_transcriptome.fa
FUSE_META=training16k_fusions.txt
NFUSIONS=100

# Parameter grid. READ_LENGTHS is only referenced by the disabled STAR
# submissions.
N_TRANSCRIPTS=('1')
REPLICATES=10
COVERAGE=(3 5 10 30 50 100)
QUALITY=('75,90,8' '87,97,5' '95,100,4')
TECH=('pacbio2016' 'nanopore2020')

READ_LENGTHS=(100 150)

# Disabled: build the fusion transcriptome.
#module load R-4.0.3
#Rscript split_transcripts.R ${TRANSCRIPTOME} ${FUSE_TRANSCRIPTS} ${FUSE_META} ${NFUSIONS}
#cp ${TRANSCRIPTOME} ${FUSE_TRANSCRIPTOME}
#cat ${FUSE_TRANSCRIPTS} >> ${FUSE_TRANSCRIPTOME}

# Long-read mapping with minimap2.
for ((rep = 1; rep <= REPLICATES; rep++)); do
  for cov in "${COVERAGE[@]}"; do
    for qual in "${QUALITY[@]}"; do
      for tech in "${TECH[@]}"; do
        for ntx in "${N_TRANSCRIPTS[@]}"; do
          sbatch minimap2_helper.sh "${cov}" 1 1 "${rep}" "${qual}" "${tech}" ax sam "${ntx}"
        done
      done
    done
  done
done

# Disabled: the same long-read grid, first with genself_helper.sh and then
# with pbmm2_helper.sh.
#for helper in genself_helper.sh pbmm2_helper.sh; do
#  for ((rep = 1; rep <= REPLICATES; rep++)); do
#    for cov in "${COVERAGE[@]}"; do
#      for qual in "${QUALITY[@]}"; do
#        for tech in "${TECH[@]}"; do
#          for ntx in "${N_TRANSCRIPTS[@]}"; do
#            sbatch "${helper}" "${cov}" 1 1 "${rep}" "${qual}" "${tech}" ax sam "${ntx}"
#          done
#        done
#      done
#    done
#  done
#done

# Disabled: short-read mapping with star_helper2.sh over READ_LENGTHS.
#for ((rep = 1; rep <= REPLICATES; rep++)); do
#  for cov in "${COVERAGE[@]}"; do
#    for len in "${READ_LENGTHS[@]}"; do
#      for ntx in "${N_TRANSCRIPTS[@]}"; do
#        sbatch star_helper2.sh "${cov}" 1 1 "${len}" "${rep}" "${ntx}"
#      done
#    done
#  done
#done
23 changes: 23 additions & 0 deletions generate_pbfusion.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Submit pbfusion_helper.sh once per replicate x coverage x quality x
# technology x transcript-set combination.

# Simulation inputs (not referenced by the submission loop below).
TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa
FUSE_TRANSCRIPTS=training16k_fusions.fa
FUSE_TRANSCRIPTOME=training16k_transcriptome.fa
FUSE_META=training16k_fusions.txt
NFUSIONS=100

# Parameter grid.
N_TRANSCRIPTS=('1')
REPLICATES=10
COVERAGE=(3 5 10 30 50 100)
QUALITY=('75,90,8' '87,97,5' '95,100,4')
TECH=('pacbio2016' 'nanopore2020')

for ((rep = 1; rep <= REPLICATES; rep++)); do
  for cov in "${COVERAGE[@]}"; do
    for qual in "${QUALITY[@]}"; do
      for tech in "${TECH[@]}"; do
        for ntx in "${N_TRANSCRIPTS[@]}"; do
          sbatch pbfusion_helper.sh "${cov}" 1 1 "${rep}" "${qual}" "${tech}" ax sam "${ntx}"
        done
      done
    done
  done
done
37 changes: 21 additions & 16 deletions generate_simulated_data.sh
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@

TRANSCRIPTOME=Homo_sapiens.cdna_50k.fa
FUSE_TRANSCRIPTS=test_fusions1.fa
FUSE_TRANSCRIPTOME=test_transcriptome.fa
FUSE_META=test_fusions1.txt
NFUSIONS=500
TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa
FUSE_TRANSCRIPTS=training16k_fusions.fa
FUSE_TRANSCRIPTOME=training16k_transcriptome.fa
FUSE_META=training16k_fusions.txt
NFUSIONS=100

N_TRANSCRIPTS=('1')
REPLICATES=10
COVERAGE=(3 5 10 30 50)
QUALITY=('87,97,5') #('75,90,8' '87.5,97.5,5' '95,100,4')
COVERAGE=(3 5 10 30 50 100)
QUALITY=('75,90,8' '87,97,5' '95,100,4')
TECH=('pacbio2016' 'nanopore2020')

READ_LENGTHS=(100 150)

module load R-4.0.3
#module load R-4.0.3

#Rscript split_transcripts.R ${TRANSCRIPTOME} ${FUSE_TRANSCRIPTS} ${FUSE_META} ${NFUSIONS}

Expand All @@ -23,17 +24,21 @@ for i in $(seq 1 ${REPLICATES}); do
for q in ${!COVERAGE[@]}; do
for j in ${!QUALITY[@]}; do
for k in ${!TECH[@]}; do
sbatch badread_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ${FUSE_TRANSCRIPTOME}
for n in ${!N_TRANSCRIPTS[@]}; do
sbatch badread_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} training${N_TRANSCRIPTS[$n]}k_transcriptome.fa ${N_TRANSCRIPTS[$n]}
done
done
done
done
done


for i in $(seq 1 ${REPLICATES}); do
for q in ${!COVERAGE[@]}; do
for j in ${!READ_LENGTHS[@]}; do
sbatch art_helper.sh ${COVERAGE[$q]} ${READ_LENGTHS[$j]} ${FUSE_TRANSCRIPTOME} ${i}
done
done
done
#for i in $(seq 1 ${REPLICATES}); do
# for q in ${!COVERAGE[@]}; do
# for j in ${!READ_LENGTHS[@]}; do
# for n in ${!N_TRANSCRIPTS[@]}; do
# sbatch art_helper.sh ${COVERAGE[$q]} ${READ_LENGTHS[$j]} training${N_TRANSCRIPTS[$n]}k_transcriptome.fa ${i} ${N_TRANSCRIPTS[$n]}
# done
# done
# done
#done
Loading

0 comments on commit 8df3f43

Please sign in to comment.