Skip to content

Commit

Permalink
Added Sequential Run Files
Browse files Browse the repository at this point in the history
  • Loading branch information
dvantwisk committed Oct 17, 2024
1 parent 8df3f43 commit 7cc5f47
Show file tree
Hide file tree
Showing 20 changed files with 312 additions and 224 deletions.
96 changes: 96 additions & 0 deletions environment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@

## Shared settings for the generate_*.sh / *_helper.sh scripts: storage
## layout, reference file locations and per-tool options.
##
## This file creates the storage directories as a side effect of being run
## or sourced. The array settings near the bottom are only visible to scripts
## that *source* this file: bash does not pass arrays to child processes
## through the environment, even when they are marked for export.

## SYSTEM OPTIONS
# Launcher used for the helper scripts: "bash" runs them directly,
# "sbatch" submits them as batch jobs instead.
export TF_BASH=bash
#export TF_BASH=sbatch

export THREADS=32

## SCRATCH STORAGE DIRECTORY
export STORAGE_DIR=/home/vantwisk/vantwisk/fusions/seq_run
# mkdir -p is a no-op for an existing directory and also creates missing
# parents, so no separate existence test is needed.
mkdir -p -- "${STORAGE_DIR}"

## BASE STORAGE DIRECTORIES
export REF_STORAGE_DIR="${STORAGE_DIR}/ref"
mkdir -p -- "${REF_STORAGE_DIR}"
export SIM_STORAGE_DIR="${STORAGE_DIR}/sim"
mkdir -p -- "${SIM_STORAGE_DIR}"
export RESULTS_STORAGE_DIR="${STORAGE_DIR}/results"
mkdir -p -- "${RESULTS_STORAGE_DIR}"

## ALIGNMENT STORAGE DIRECTORY
export ALIGNMENT_STORAGE_DIR="${STORAGE_DIR}/alignments"
mkdir -p -- "${ALIGNMENT_STORAGE_DIR}"

## RESULTS STORAGE DIRECTORIES
export JAFFAL_STORAGE_DIR="${RESULTS_STORAGE_DIR}/jaffal"
mkdir -p -- "${JAFFAL_STORAGE_DIR}"
export LONGGF_STORAGE_DIR="${RESULTS_STORAGE_DIR}/longgf"
mkdir -p -- "${LONGGF_STORAGE_DIR}"
export GENION_STORAGE_DIR="${RESULTS_STORAGE_DIR}/genion"
mkdir -p -- "${GENION_STORAGE_DIR}"
export PBFUSION_STORAGE_DIR="${RESULTS_STORAGE_DIR}/pbfusion"
mkdir -p -- "${PBFUSION_STORAGE_DIR}"
export FUSIONSEEKER_STORAGE_DIR="${RESULTS_STORAGE_DIR}/fusionseeker"
mkdir -p -- "${FUSIONSEEKER_STORAGE_DIR}"

export ARRIBA_STORAGE_DIR="${RESULTS_STORAGE_DIR}/arriba"
mkdir -p -- "${ARRIBA_STORAGE_DIR}"
export STARFUSION_STORAGE_DIR="${RESULTS_STORAGE_DIR}/starfusion"
mkdir -p -- "${STARFUSION_STORAGE_DIR}"

## GRAPH STORAGE DIRECTORIES
export GRAPHS_STORAGE_DIR="${RESULTS_STORAGE_DIR}/graphs"
mkdir -p -- "${GRAPHS_STORAGE_DIR}"

## RESOURCES
export DNA_REFERENCE="${REF_STORAGE_DIR}/Homo_sapiens.GRCh38.dna.primary_assembly.fa"
export CDNA_REFERENCE="${REF_STORAGE_DIR}/Homo_sapiens.GRCh38.cdna.all.fa"
export GTF_REFERENCE="${REF_STORAGE_DIR}/Homo_sapiens.GRCh38.105.gtf"

export PBMM2_MMI="${REF_STORAGE_DIR}/hg38_gencode.mmi"

export STAR_INDEX="${REF_STORAGE_DIR}/hg38_star_index"

export GENOMIC_SUPER_DUPS="${REF_STORAGE_DIR}/genomicSuperDups.txt"

## Annotation Limit Settings
export TRANSCRIPT_LIMIT=1000
export TRANSCRIPT_LIMITED_FILE="${REF_STORAGE_DIR}/Homo_sapiens.GRCh38.cdna.limited_${TRANSCRIPT_LIMIT}.fa"

## FUSION SIMULATION SETTINGS
export NFUSIONS=100
export FUSIM_FASTA_FILE="${REF_STORAGE_DIR}/fusim_${NFUSIONS}.fasta"
# NOTE(review): the ".fxt" extension looks like a typo for ".txt"; left
# unchanged because existing files may already use this name -- confirm.
export FUSIM_TXT_FILE="${REF_STORAGE_DIR}/fusim_${NFUSIONS}.fxt"
export FUSION_TRANSCRIPTOME="${REF_STORAGE_DIR}/Homo_sapiens.GRCh38.cdna.limited_${TRANSCRIPT_LIMIT}_fusions_${NFUSIONS}.fa"

## LONGREAD AND SHORTREAD READ SIMULATION SETTINGS
# The trailing comments keep the larger parameter grids for reference.
# These are arrays: "export" does not make them reach child processes, so
# any script that iterates over them must source this file.
export N_TRANSCRIPTS=('1') #('1' '2' '4' '8' '16')
export REPLICATES=1 #10
export COVERAGE=(10) #(3 5 10 30 50 100)
export QUALITY=('95,100,4') #('75,90,8' '87,97,5' '95,100,4')
export TECH=('pacbio2016' 'nanopore2020')
export READ_LENGTHS=(100 150)

## LONGGF OPTIONS
export MIN_OVERLAP_LEN=100
export BIN_SIZE=50
export MIN_MAP_LENGTH=100

## JAFFAL OPTIONS

## GENION OPTIONS
export GENION_MIN_SUPPORT=2

## PBFUSION OPTIONS
export PBFUSION_MIN_COVERAGE=2

## TOOL LOCATIONS
# The original file ended with these names as bare "$NAME" lines. A bare
# expansion is executed as a command whenever the variable is non-empty and
# aborts the script under "set -u", so they are kept here as a list only.
# NOTE(review): presumably placeholders for per-tool paths that are meant to
# be assigned here -- confirm against the helper scripts.
#   ART_P  RUSTYREAD_P  MINIMAP2_P  SAMTOOLS_P  JAFFA_P
#   GENION_P  PBFUSION_P  STAR_P  STAR_FUSION_P  ARRIBA_P
26 changes: 18 additions & 8 deletions generate_annotation_resources.sh
Original file line number Diff line number Diff line change
@@ -1,14 +1,24 @@

FASTA_NAME=Homo_sapiens.cdna.gtf_limited.fa
if [ ! -f ${DNA_REFERENCE} ]; then
wget -O ${DNA_REFERENCE}.gz http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
gunzip -d ${DNA_REFERENCE}.gz
fi

if [ ! -f ${CDNA_REFERENCE} ]; then
wget -O ${CDNA_REFERENCE}.gz http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
gunzip -d ${CDNA_REFERENCE}.gz
fi

if [ ! -f Homo_sapiens.GRCh38.105.gtf ]; then
wget http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.gtf.gz
gunzip -d Homo_sapiens.GRCh38.105.gtf.gz
if [ ! -f ${GTF_REFERENCE} ]; then
wget -O ${GTF_REFERENCE}.gz http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.gtf.gz
gunzip -d ${GTF_REFERENCE}.gz
fi

if [ ! -f Homo_sapiens.GRCh38.cdna.all.fa ]; then
wget http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
gunzip -d Homo_sapiens.GRCh38.cdna.all.fa.gz
if [ ! -f ${GENOMIC_SUPER_DUPS} ]; then
wget -O ${GENOMIC_SUPER_DUPS}.gz ftp://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/genomicSuperDups.txt.gz
gunzip -d ${GENOMIC_SUPER_DUPS}.gz
fi

Rscript generate_breakpoints.R Homo_sapiens.GRCh38.105.gtf Homo_sapiens.GRCh38.cdna.all.fa ${FASTA_NAME}
if [ ! -f ${TRANSCRIPT_LIMITED_FILE} ]; then
Rscript generate_breakpoints.R ${GTF_REFERENCE} ${CDNA_REFERENCE} ${TRANSCRIPT_LIMITED_FILE} ${TRANSCRIPT_LIMIT}
fi
47 changes: 1 addition & 46 deletions generate_arriba.sh
Original file line number Diff line number Diff line change
@@ -1,54 +1,9 @@

TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa
FUSE_TRANSCRIPTS=training16k_fusions.fa
FUSE_TRANSCRIPTOME=training16k_transcriptome.fa
FUSE_META=training16k_fusions.txt
NFUSIONS=100

N_TRANSCRIPTS=('1')
REPLICATES=10
COVERAGE=(3 5 10 30 50 100)
QUALITY=('75,90,8' '87,97,5' '95,100,4')
TECH=('pacbio2016' 'nanopore2020')

READ_LENGTHS=(100 150)

#module load R-4.0.3

#Rscript split_transcripts.R ${TRANSCRIPTOME} ${FUSE_TRANSCRIPTS} ${FUSE_META} ${NFUSIONS}

#cp ${TRANSCRIPTOME} ${FUSE_TRANSCRIPTOME}
#cat ${FUSE_TRANSCRIPTS} >> ${FUSE_TRANSCRIPTOME}

#for i in $(seq 1 ${REPLICATES}); do
# for q in ${!COVERAGE[@]}; do
# for j in ${!QUALITY[@]}; do
# for k in ${!TECH[@]}; do
# for n in ${!N_TRANSCRIPTS[@]}; do
# sbatch minimap2_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${N_TRANSCRIPTS[$n]}
# done
# done
# done
# done
#done

#for i in $(seq 1 ${REPLICATES}); do
# for q in ${!COVERAGE[@]}; do
# for j in ${!QUALITY[@]}; do
# for k in ${!TECH[@]}; do
# for n in ${!N_TRANSCRIPTS[@]}; do
# sbatch genself_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${N_TRANSCRIPTS[$n]}
# done
# done
# done
# done
#done

for i in $(seq 1 ${REPLICATES}); do
for q in ${!COVERAGE[@]}; do
for j in ${!READ_LENGTHS[@]}; do
for n in ${!N_TRANSCRIPTS[@]}; do
sbatch arriba_helper.sh ${COVERAGE[$q]} 1 1 ${READ_LENGTHS[$j]} ${i} ${N_TRANSCRIPTS[$n]}
eval ${TF_BASH} arriba_helper.sh ${COVERAGE[$q]} 1 1 ${READ_LENGTHS[$j]} ${i} ${N_TRANSCRIPTS[$n]}
done
done
done
Expand Down
23 changes: 20 additions & 3 deletions generate_breakpoints.R
Original file line number Diff line number Diff line change
@@ -1,17 +1,30 @@
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager", repo='https://archive.linux.duke.edu/cran/')
if (!require("GenomicFeatures", quietly = TRUE))
BiocManager::install("GenomicFeatures")
if (!require("Biostrings", quietly = TRUE))
BiocManager::install("Biostrings")

library(GenomicFeatures)
library(Biostrings)

arg <- commandArgs(trailingOnly=TRUE)

message(arg[1])
message(arg[2])
message(arg[3])
message(arg[4])

fa <- readDNAStringSet(arg[2])
#fa <- Biostrings::readDNAStringSet('longread-fusion-transcript-pipeline/Homo_sapiens.GRCh38.cdna.all.fa')
#fa <- readDNAStringSet(arg[2])
fa <- Biostrings::readDNAStringSet(arg[2])

#txdb <- makeTxDbFromGFF('longread-fusion-transcript-pipeline/Homo_sapiens.GRCh38.105.gtf')

gtf <- rtracklayer::import(arg[1])
#gtf <- import(arg[1])

fasta_out <- arg[3]
#fasta_out <- arg[3]

ele <- elementMetadata(gtf)
ele <- ele[ele$type == 'transcript',]
Expand All @@ -29,7 +42,11 @@ tx_act <- vapply(nam, function(x) x[1], character(1))
vals1 <- ele[ele$full_tx %in% tx1,]

wh1 <- which(tx_act %in% ele$full_tx)
writeXStringSet(fa[wh1], fasta_out)
fa1 <- fa[wh1]
fa1 <- fa1[lengths(fa1) > 100]
fa1 <- sample(fa1, arg[4], replace=F)

writeXStringSet(fa1, fasta_out)
#gtf1 <- gtf[wh1,]

#fa1 <- fa[wh1]
Expand Down
16 changes: 1 addition & 15 deletions generate_fusionseeker.sh
Original file line number Diff line number Diff line change
@@ -1,24 +1,10 @@
TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa
FUSE_TRANSCRIPTS=training16k_fusions.fa
FUSE_TRANSCRIPTOME=training16k_transcriptome.fa
FUSE_META=training16k_fusions.txt
NFUSIONS=100

N_TRANSCRIPTS=('1')
REPLICATES=10
COVERAGE=(3 5 10 30 50 100)
QUALITY=('75,90,8' '87,97,5' '95,100,4')
TECH=('pacbio2016' 'nanopore2020')
MIN_OVERLAP_LEN=100
BIN_SIZE=50
MIN_MAP_LENGTH=100

for i in $(seq 1 ${REPLICATES}); do
for q in ${!COVERAGE[@]}; do
for j in ${!QUALITY[@]}; do
for k in ${!TECH[@]}; do
for n in ${!N_TRANSCRIPTS[@]}; do
sbatch fusionseeker_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${MIN_OVERLAP_LEN} ${BIN_SIZE} ${MIN_MAP_LENGTH} ${N_TRANSCRIPTS[$n]}
eval ${TF_BASH} fusionseeker_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${MIN_OVERLAP_LEN} ${BIN_SIZE} ${MIN_MAP_LENGTH} ${N_TRANSCRIPTS[$n]}
done
done
done
Expand Down
28 changes: 16 additions & 12 deletions generate_genion.sh
Original file line number Diff line number Diff line change
@@ -1,22 +1,26 @@
TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa
FUSE_TRANSCRIPTS=training16k_fusions.fa
FUSE_TRANSCRIPTOME=training16k_transcriptome.fa
FUSE_META=training16k_fusions.txt
NFUSIONS=100

N_TRANSCRIPTS=('1')
REPLICATES=10
COVERAGE=(3 5 10 30 50 100)
QUALITY=('75,90,8' '87,97,5' '95,100,4')
N_TRANSCRIPTS=('1') #('1' '2' '4' '8' '16')
REPLICATES=1 #10
COVERAGE=(10) #(3 5 10 30 50 100)
QUALITY=('95,100,4') #('75,90,8' '87,97,5' '95,100,4')
TECH=('pacbio2016' 'nanopore2020')
MIN_SUPPORT=2
READ_LENGTHS=(100 150)

## LONGGF OPTIONS
MIN_OVERLAP_LEN=100
BIN_SIZE=50
MIN_MAP_LENGTH=100

## JAFFAL OPTIONS

## GENION OPTIONS
GENION_MIN_SUPPORT=1

for i in $(seq 1 ${REPLICATES}); do
for q in ${!COVERAGE[@]}; do
for j in ${!QUALITY[@]}; do
for k in ${!TECH[@]}; do
for n in ${!N_TRANSCRIPTS[@]}; do
sbatch genion_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${MIN_SUPPORT} ${N_TRANSCRIPTS[$n]}
bash genion_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${GENION_MIN_SUPPORT} ${N_TRANSCRIPTS[$n]}
done
done
done
Expand Down
31 changes: 18 additions & 13 deletions generate_jaffal.sh
Original file line number Diff line number Diff line change
@@ -1,23 +1,28 @@
TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa
FUSE_TRANSCRIPTS=training16k_fusions.fa
FUSE_TRANSCRIPTOME=training16k_transcriptome.fa
FUSE_META=training16k_fusions.txt
NFUSIONS=100

N_TRANSCRIPTS=('1')
REPLICATES=10
COVERAGE=(3 5 10 30 50 100)
QUALITY=('75,90,8' '87,97,5' '95,100,4')
N_TRANSCRIPTS=('1') #('1' '2' '4' '8' '16')
REPLICATES=1 #10
COVERAGE=(10) #(3 5 10 30 50 100)
QUALITY=('95,100,4') #('75,90,8' '87,97,5' '95,100,4')
TECH=('pacbio2016' 'nanopore2020')
READ_LENGTHS=(100 150)

## LONGGF OPTIONS
MIN_OVERLAP_LEN=100
BIN_SIZE=50
MIN_MAP_LENGTH=100

## JAFFAL OPTIONS

## GENION OPTIONS
GENION_MIN_SUPPORT=2

for i in $(seq 1 ${REPLICATES}); do
#for i in $(seq 1 ${REPLICATES}); do
for q in ${!COVERAGE[@]}; do
for j in ${!QUALITY[@]}; do
for k in ${!TECH[@]}; do
for n in ${!N_TRANSCRIPTS[@]}; do
sbatch jaffal_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${N_TRANSCRIPTS[$n]}
sbatch jaffal_helper.sh ${COVERAGE[$q]} 1 1 1 ${QUALITY[$j]} ${TECH[$k]} ax sam ${N_TRANSCRIPTS[$n]}
done
done
done
done
done
#done
24 changes: 13 additions & 11 deletions generate_longgf.sh
Original file line number Diff line number Diff line change
@@ -1,24 +1,26 @@
TRANSCRIPTOME=Homo_sapiens.GRCh38.105.cdna.gtf_confirmed_new_16k.fa
FUSE_TRANSCRIPTS=training16k_fusions.fa
FUSE_TRANSCRIPTOME=training16k_transcriptome.fa
FUSE_META=training16k_fusions.txt
NFUSIONS=100

N_TRANSCRIPTS=('1')
REPLICATES=10
COVERAGE=(3 5 10 30 50 100)
QUALITY=('75,90,8' '87,97,5' '95,100,4')
N_TRANSCRIPTS=('1') #('1' '2' '4' '8' '16')
REPLICATES=1 #10
COVERAGE=(10) #(3 5 10 30 50 100)
QUALITY=('95,100,4') #('75,90,8' '87,97,5' '95,100,4')
TECH=('pacbio2016' 'nanopore2020')
READ_LENGTHS=(100 150)

## LONGGF OPTIONS
MIN_OVERLAP_LEN=100
BIN_SIZE=50
MIN_MAP_LENGTH=100

## JAFFAL OPTIONS

## GENION OPTIONS
GENION_MIN_SUPPORT=2

for i in $(seq 1 ${REPLICATES}); do
for q in ${!COVERAGE[@]}; do
for j in ${!QUALITY[@]}; do
for k in ${!TECH[@]}; do
for n in ${!N_TRANSCRIPTS[@]}; do
sbatch longgf_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${MIN_OVERLAP_LEN} ${BIN_SIZE} ${MIN_MAP_LENGTH} ${N_TRANSCRIPTS[$n]}
eval ${TF_BASH} longgf_helper.sh ${COVERAGE[$q]} 1 1 ${i} ${QUALITY[$j]} ${TECH[$k]} ax sam ${MIN_OVERLAP_LEN} ${BIN_SIZE} ${MIN_MAP_LENGTH} ${N_TRANSCRIPTS[$n]}
done
done
done
Expand Down
Loading

0 comments on commit 7cc5f47

Please sign in to comment.