-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path02.2_map_smncrna_bowtie_usage.sh
129 lines (107 loc) · 5.32 KB
/
02.2_map_smncrna_bowtie_usage.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/bin/bash
# # Load necessary modules
# module load CUTADAPT/1.16
# module load BEDTOOLS/2.24
# module load BOWTIE1/1.1.2
# module load BOWTIE2/2.2.5
# module load FASTQC/0.11.4
# module load SAMTOOLS/1.9
# Get input arguments
jobdir=$1 # "path_to_project_directory"
bowtieindexdir=$2 # "path_to_bowtie_index"
fastqdir=$3 # "path_to_fastq_files"
outputdir=$4 # "path_to_output_directory"
reffastafile=$5 # "path_to_reference_genome/dna/hsa_dna_mirna_unique.fa"
ifTRIM=${6:-1} # If you want to trim the reads (default is yes)
threePadapter=${7:-"TGGAATTCTCGGGTGCCAAGG"}
queue=${8:-"fat"} # mpi or fat
# Set up relevant filenames and directories
bamdir=${outputdir}/"bam/iteration2"
samdir=${outputdir}/"sam/iteration2"
countsdir=${outputdir}/"counts"
trimmedFastqDir=${outputdir}/"trimmed_fastq/iteration2"
unmappedFastqDir=${outputdir}/"unmapped_fastq/iteration2"
unmappedFastqDirI1=${outputdir}/"unmapped_fastq/iteration1"
mappingLogsDir="${outputdir}/mappingLogs/iteration2"
fastqcDirI2="${outputdir}/fastqc/iteration2"
scriptsdir=${jobdir}/scripts/02.2_map_smncrna/$(basename ${outputdir})
# Create necessary dirs
mkdir -p ${samdir} ${bamdir} ${countsdir} ${scriptsdir} ${trimmedFastqDir} ${unmappedFastqDir} ${fastqcDirI2}
# Initialize the counter to submit jobs as an array
i=0
for f in ${fastqdir}/*.fastq.gz
do
# Get the fastq file names
ffname=$(basename ${f} .fastq.gz)
fastqFile=${unmappedFastqDirI1}/${ffname}_greater_than_33bp_long.fastq
# Get the jobname to submit each job as a job array
# jobname="02.2_$(basename ${outputdir})_${ffname}"
i=$((i + 1))
jobname="2_$(basename ${outputdir})_${ffname}"
prevjobname="1_$(basename ${outputdir})_${ffname}"
# Set up output folders
bowtieoutputdir=${mappingLogsDir}/${ffname}
errorsFile="${bowtieoutputdir}/${ffname}_iteration2_mapping.err"
stdoutFile="${bowtieoutputdir}/${ffname}_iteration2_mapping.out"
# Create necessary dirs
mkdir -p ${bowtieoutputdir} ${scriptsdir}
# Initialize a script file
script="${scriptsdir}/${ffname}.sh"
touch ${script}
echo "#!/bin/bash" > "${script}"
echo "" >> "${script}"
# Unzip fastq.gz file before mapping with bowtie
echo "# Unzip fastq.gz file before mapping with bowtie" >> "${script}"
echo "gunzip ${fastqFile}.gz" >> "${script}"
echo "" >> "${script}"
# Get fastQC analysis for iteration 2
echo "# Get fastQC analysis for iteration 2" >> "${script}"
echo "fastqc ${fastqFile} -o ${fastqcDirI2} -q" >> "${script}"
echo "" >> "${script}"
# Get the output file names
bamfile="${bamdir}/${ffname}.bam"
samfile="${samdir}/${ffname}.sam"
unmapped_fastq="${unmappedFastqDir}/${ffname}.unmapped.fastq"
# Mapping short reads to the reference genome using bowtie aligner
# -q query input files are FASTQ .fq/.fastq (default)
# -v <int> report end-to-end hits w/ <=v mismatches; ignore qualities; No more than v mismatches in the entire length of the read
# -p/--threads <int> number of alignment threads to launch (default: 1)
# -S/--sam write hits in SAM format
# -k <int> report up to <int> good alignments per read (default: 1)
# -a/--all report all alignments per read (much slower than low -k)
# -m <int> suppress all alignments if > <int> exist (def: no limit)
# -m 3 instructs bowtie to refrain from reporting any alignments for reads having more than 3 reportable alignments. The -m option
# is useful when the user would like to guarantee that reported alignments are "unique", for some definition of unique.
echo "# Mapping short reads to the smncRNA reference genome using bowtie aligner" >> "${script}"
echo "bowtie -q -v 1 -p 4 -S -a -m 1 --un ${unmapped_fastq} ${bowtieindexdir}/$(basename ${bowtieindexdir}) ${fastqFile} ${samfile}" >> "${script}"
echo "" >> "${script}"
# Move unmapped fastq file to unmapped fastq dir
echo "# Zip unmapped_fastq files for next iteration" >> "${script}"
echo "gzip ${unmapped_fastq}" >> "${script}"
echo "" >> "${script}"
# After mapping add the previously unmapped reads from iteration 1 to the unmapped reads from iteration2 that will be used to map on genome in iteration3
echo "# After mapping add the previously unmapped reads from iteration 1 to the unmapped reads from iteration2 that will be used to map on genome in iteration3" >> "${script}"
echo "cat ${unmapped_fastq}.gz ${unmappedFastqDirI1}/${ffname}.unmapped.fastq.gz > ${unmapped_fastq}_tmp.fastq.gz" >> "${script}"
echo "mv ${unmapped_fastq}_tmp.fastq.gz ${unmapped_fastq}.gz" >> "${script}"
echo "" >> "${script}"
# Get the BAM files from the SAM files
echo "# Get the BAM files from the SAM files" >> "${script}"
echo "samtools view -Sb ${samfile} > ${bamfile}" >> "${script}"
echo "" >> "${script}"
# Sort the bam file before creating the index
echo "# Sort the bam file before creating the index" >> "${script}"
echo "samtools sort -o ${bamfile}.sorted ${bamfile}" >> "${script}"
echo "mv ${bamfile}.sorted ${bamfile}" >> "${script}"
echo "" >> "${script}"
# Create the index of the bam file
echo "# Create the index of the bam file" >> "${script}"
echo "samtools index ${bamfile}" >> "${script}"
echo "" >> "${script}"
# Remove the sam files
echo "# Remove the sam files" >> "${script}"
echo "rm -rf ${samfile} " >> "${script}"
echo "" >> "${script}"
chmod 775 "${script}"
sh "${script}"
echo " "
done