-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path03_map_reads_to_genome_bowtie_usage.sh
134 lines (111 loc) · 5.15 KB
/
03_map_reads_to_genome_bowtie_usage.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/bin/bash
# # Load necessary modules
# module load CUTADAPT/1.16
# module load BEDTOOLS/2.24
# module load BOWTIE1/1.1.2
# module load BOWTIE2/2.2.5
# module load FASTQC/0.11.4
# module load SAMTOOLS/1.9
# Get input arguments
jobdir=$1 # "path_to_project_directory"
bowtieindexdir=$2 # "path_to_bowtie_index"
outputdir=$3 # "path_to_output_directory"
reffastafile=$4 # "path_to_reference_genome/dna/hsa_dna_mirna_unique.fa"
species=$5 # "hsa" or "mmu"
fastqdir=$6 # "path_to_fastq_files"
rnaClassDir=${7:-"input/annotation/rna_classes"} # input/annotation/rna_classes
queue=${8:-"fat"} # mpi or fat
# Set up relevant filenames and directories
bamdir="${outputdir}/bam/iteration3"
samdir="${outputdir}/sam/iteration3"
finalbamdir="${outputdir}/bamFiles"
unmappedFastqDir="${outputdir}/unmapped_fastq/iteration2"
countsdir="${outputdir}/counts"
mappingLogsDir="${outputdir}/mappingLogs/iteration3"
fastqcDirI3="${outputdir}/fastqc/iteration3"
fastqcDirIm="${outputdir}/fastqc/iteration{1,2,3}"
mergedSamdir="${outputdir}/mergedSam"
scriptsdir=${jobdir}/scripts/03_map_genome_bowtie/$(basename ${outputdir})
mkdir -p ${finalbamdir} ${samdir} ${bamdir} ${countsdir} ${scriptsdir} ${trimmedFastqDir} ${unmappedFastqDir} ${fastqcDirI3} ${mergedSamdir}
# Process and submit jobs
for f in ${fastqdir}/*.fastq.gz
do
# Get the fastq file names
ffname=$(basename ${f} .fastq.gz)
# Get unmapped fastq file
fastqFile="${unmappedFastqDir}/${ffname}.unmapped.fastq"
# Get the jobname to submit each job as a job array
jobname="3_$(basename ${outputdir})_${ffname}"
prevjobname="2_$(basename ${outputdir})_${ffname}"
# Set up output folders
bowtieoutputdir=${mappingLogsDir}/${ffname}
errorsFile="${bowtieoutputdir}/${ffname}_gwdg_mapping.err"
stdoutFile="${bowtieoutputdir}/${ffname}_gwdg_mapping.out"
# Create necessary dirs
mkdir -p ${bowtieoutputdir} ${scriptsdir}
# Initialize a script file
script="${scriptsdir}/${ffname}.sh"
touch ${script}
echo "#!/bin/bash" > "${script}"
echo "" >> "${script}"
# Unzip fastq.gz file before mapping with bowtie
echo "# Unzip fastq.gz file before mapping with bowtie" >> "${script}"
echo "gunzip ${fastqFile}.gz" >> "${script}"
echo "" >> "${script}"
# Get fastQC analysis for iteration 3
echo "# Get fastQC analysis for iteration3" >> "${script}"
echo "fastqc ${fastqFile} -o ${fastqcDirI3} -q" >> "${script}"
echo "" >> "${script}"
# Mapping all unmapped short reads to the reference genome using bowtie aligner
samfile="${samdir}/${ffname}.sam"
echo "# Mapping all unmapped short reads to the reference genome using bowtie aligner" >> "${script}"
echo "bowtie -q -v 1 -p 4 -S -a -m 1 ${bowtieindexdir}/$(basename ${bowtieindexdir}) ${fastqFile} ${samfile}" >> "${script}"
echo "" >> "${script}"
# Get the BAM files from the SAM files for iteration3
bamfile3="${bamdir}/${ffname}.bam"
echo "# Get the BAM files from the SAM files for iteration3" >> "${script}"
echo "samtools view -Sb ${samfile} > ${bamfile3}" >> "${script}"
echo "" >> "${script}"
# Sort the bam file before creating the index
echo "# Sort the bam file before creating the index" >> "${script}"
echo "samtools sort -o ${bamfile3}.sorted ${bamfile3} " >> "${script}"
echo "mv ${bamfile3}.sorted ${bamfile3}" >> "${script}"
echo "" >> "${script}"
# Create the index of the bam file
echo "# Create the index of the bam file" >> "${script}"
echo "samtools index ${bamfile3}" >> "${script}"
echo "" >> "${script}"
# Merge the bam files
# -n The input alignments are sorted by read names rather than by chromosomal coordinates
# -r Attach an RG tag to each alignment. The tag value is inferred from file names
# -O BAM or SAM
echo "# Get the merged bam file from all three iterations" >> "${script}"
bamfile1="${outputdir}/bam/iteration1/${ffname}.bam"
bamfile2="${outputdir}/bam/iteration2/${ffname}.bam"
mergedBam="${finalbamdir}/${ffname}.bam"
echo "samtools merge -n -r -O BAM ${mergedBam} ${bamfile1} ${bamfile2} ${bamfile3}" >> "${script}"
echo "" >> "${script}"
# Sort the final merged (mapped + unmapped) bam
echo "# Sort the final merged (mapped + unmapped) bam file before getting counts" >> "${script}"
echo "samtools sort -o ${mergedBam}.sorted ${mergedBam} " >> "${script}"
echo "mv ${mergedBam}.sorted ${mergedBam}" >> "${script}"
echo "" >> "${script}"
# Create the index of the final merged (mapped + unmapped) bam file
echo "# Create the index of the final merged (mapped + unmapped) bam file" >> "${script}"
echo "samtools index ${mergedBam}" >> "${script}"
echo "" >> "${script}"
# Get high quality (mapq >= 30) mapped sam files for counting
mergedSam=${mergedSamdir}/${ffname}.sam
echo "# Get high quality (mapq >= 30) mapped sam files for counting" >> "${script}"
echo "samtools view -F 4 -q 30 ${mergedBam} > ${mergedSam}" >> "${script}"
echo "" >> "${script}"
# Get the counts from SAM file
countsfile="${countsdir}/${ffname}_counts.txt"
echo "# Get the counts from SAM file" >> "${script}"
echo "python scripts/get_counts_from_sam.py -if=${reffastafile} -is=${mergedSam} -of=${countsfile}" >> "${script}"
echo "" >> "${script}"
chmod 775 "${script}"
echo "Submitting job for $jobname ..."
sh "${script}"
echo " "
done