-
Notifications
You must be signed in to change notification settings - Fork 2
/
align-mem.sh
executable file
·224 lines (199 loc) · 5.75 KB
/
align-mem.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#!/usr/bin/env bash
# original author: Nick Stoler
# Automates all the steps to generate an indexed BAM file using BWA and SAMTools
set -ue
REQUIRED="bwa samtools" # must be in PATH
REF_THRES=2000000000
BWA_OPTS_DEFAULT="-M"
USAGE="USAGE:
\$ $(basename $0) [-d outdir] [-o \"bwa opts\"] [-b name.bam] [-l logfile]
ref.fa reads_1.fq [reads_2.fq]
Align FASTQ with BWA-MEM and generate an indexed BAM file.
This names the output BAM using the base derived from the first fastq file,
minus any .fq or .fastq extension, and minus any _1 or _2 suffix. It indexes the
reference if it hasn't been already, choosing bwtsw unless the file is smaller
than $REF_THRES bytes. It will not overwrite any existing SAM or BAM file.
Options:
-d outdir: The output directory. The BAM and all other new files (SAM, BAI)
will be created here. Default is the directory containing the
first FASTQ file.
-o \"bwa opts\": The options you want to pass to the \"bwa mem\" command. Make
sure to enclose your options in quotes.
Default: \"$BWA_OPTS_DEFAULT\"
-b name.bam: The filename to use for the output BAM. Don't use a full path;
this is only the filename, which will be created in the FASTQ
directory, or the directory specified with -d.
-l logfile: Output stderr of all commands to this file. Stdout will still
come out of the script's stdout.
-c: Do extra cleanup afterward: Delete any reference index files
created by this script."
function fail {
echo "$@"
exit 1
}
function echev {
echo "+ $1"
eval "$1"
}
# Check that it can find tools it needs like bwa and samtools
for cmd in $REQUIRED; do
if ! which $cmd >/dev/null; then
fail "Error: could not find $cmd in PATH"
fi
done
########## Gather settings ##########
#TODO: Allow giving a full path to the output bam.
# Keep -b as a filename, but add a different option, like -a.
# Get options
outdir=''
logfile=''
bam_name=''
cleanup=''
bwa_opts="$BWA_OPTS_DEFAULT"
while getopts ":d:b:o:l:ch" opt; do
case "$opt" in
d) outdir="$OPTARG";;
b) bam_name="$OPTARG";;
o) bwa_opts="$OPTARG";;
l) logfile="$OPTARG";;
c) cleanup="true";;
[h?]) fail "$USAGE";;
esac
done
# get positional arguments
narg=$OPTIND
while [[ $narg -le $# ]]; do
arg=${@:$narg:1}
if [[ ${arg:0:1} == '-' ]]; then
fail "Error: options like $arg must come before positional arguments."
fi
narg=$((narg+1))
done
positionals=$((narg-OPTIND))
if [[ $positionals -lt 2 ]]; then
fail "$USAGE"
fi
ref="${@:$OPTIND:1}"
fq1="${@:$OPTIND+1:1}"
fq2="${@:$OPTIND+2:1}"
to_be_cleaned=
if [[ "$bam_name" == */* ]]; then
fail "Error: Should only give a filename with -b, not a path."
fi
# set $outdir
if [[ -z $outdir ]]; then
outdir=$(dirname $fq1)
fi
# get basename from fastq filename
# remove .gz, if present
basename=$(basename $fq1 .gz)
# remove .fq or .fastq
if [[ ${basename:$((${#basename}-3))} == '.fq' ]]; then
basename=$(basename $basename .fq)
else
basename=$(basename $basename .fastq)
fi
# remove _1 or _2
if [[ $basename =~ _[12]$ ]]; then
lastchar=$((${#basename}-2))
basename=${basename:0:$lastchar}
fi
# create rest of filenames, print settings
if [[ $bam_name ]]; then
bam_name_base=$(basename $bam_name .bam)
bamtmp=$outdir/$bam_name_base.tmp.bam
bam=$outdir/$bam_name
if [[ $bamtmp == $bam ]]; then
fail "Error: Output bam name can't end in \".tmp.bam\"."
fi
else
bamtmp=$outdir/$basename.tmp.bam
bam=$outdir/$basename.bam
fi
if [[ -n $fq2 ]]; then
fq2line="
fq2: $fq2"
else
fq2line=''
fi
echo "Settings:
basename: $basename
ref: $ref
fq1: $fq1$fq2line
bamtmp: $bamtmp
bam: $bam
bwa opts: $bwa_opts"
# check that input files and output dir exist
files="$ref $fq1 $fq2"
for file in $files; do
if [[ ! -s $file ]]; then
fail "Error: File $file nonexistent, inaccessible, or empty."
fi
done
# stop if any of the output files exists already
for file in "$bam" "$bamtmp"; do
if [[ -e "$file" ]]; then
fail "Error: Output file $file already exists. Please rename either the \
existing file or the input fastq file."
fi
done
if [[ ! -d $outdir ]]; then
fail "Error: could not find output directory $outdir"
fi
if [[ $logfile ]] && [[ -e $logfile ]]; then
fail "Error: specified log file \"$logfile\" already exists"
fi
########## Do alignment ##########
if [[ $logfile ]]; then
logpipe="2>> $logfile"
else
logpipe=''
fi
# Does the reference look indexed?
index_missing=
for ext in amb ann bwt pac sa; do
if ! [[ -s "$ref.$ext" ]]; then
index_missing=true
if [[ "$cleanup" ]]; then
to_be_cleaned="$to_be_cleaned $ref.$ext"
fi
fi
done
if [[ "$index_missing" ]]; then
# use bwtsw (safe for any size) unless it's definitely smaller than 2GB
algo="bwtsw"
if [[ $(du -sb $ref | awk '{print $1}') -lt $REF_THRES ]]; then
algo="is"
fi
if [[ $logfile ]]; then
echo -e "\t\t\t::::: bwa index :::::" >> "$logfile";
fi
echev "bwa index -a $algo $ref $logpipe"
fi
# alignment
if [[ $logfile ]]; then
echo -e "\t\t\t::::: bwa mem | samtools view :::::" >> "$logfile";
fi
echev "bwa mem $bwa_opts $ref $fq1 $fq2 $logpipe | samtools view -Sb - > $bamtmp $logpipe"
to_be_cleaned="$to_be_cleaned $bamtmp"
# sort BAM
if [[ $logfile ]]; then
echo -e "\t\t\t::::: samtools sort :::::" >> "$logfile";
fi
echev "samtools sort -o $bamtmp dummy > $bam $logpipe"
# index BAM
if [[ $logfile ]]; then
echo -e "\t\t\t::::: samtools index :::::" >> "$logfile";
fi
echev "samtools index $bam $logpipe"
for path in $to_be_cleaned; do
echev "rm $path"
done
if [[ $logfile ]]; then
echo -e "\t\t\t::::: done :::::" >> "$logfile";
fi
echo -n "$fq1 "
if [[ -n $fq2 ]]; then
echo -n "and $fq2 "
fi
echo "have been aligned to $ref into $bam"