-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpacbioCCS.sh
executable file
·202 lines (175 loc) · 4.87 KB
/
pacbioCCS.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
#!/bin/bash
#helper function to print usage information
usage () {
cat << EOF
pacbioCCS.sh v0.0.1
by Jochen Weile <[email protected]> 2021
Runs parallel CCS analysis on a Pacbio BAM file using a SLURM HPC cluster.
Recommended number of CPUs to run this script: >12
Usage: pacbioCCS.sh [-d|--demuxIndices <FASTAFILE>] [-j|--jobcount <INTEGER>]
[-p|--passes <INTEGER>] [--] <BAMFILE>
-d|--demuxIndices : FASTA file containing demultiplexing indices. If none
is provided, then demultiplexing is skipped.
-j|--jobcount : Number of jobs to run in parallel. Defaults to 100
-p|--passes : Minimum number of CCS passes required to accept read.
Default 1
-r|--minRQ : Minimum read accuracy (RQ). Default 0.998
<BAMFILE> : The BAM file to process
EOF
exit $1
}
#number of jobs to run in parallel
JOBCOUNT=100
#minimum number of CCS passes
MINPASSES=1
#minimum Read Quality
MINRQ="0.998"
#FASTA file with barcode indices
INDICES=""
#Parse Arguments
PARAMS=""
while (( "$#" )); do
case "$1" in
-h|--help)
usage 0
shift
;;
-d|--demuxIndices)
if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then
INDICES=$2
shift 2
else
echo "ERROR: Argument for $1 is missing" >&2
usage 1
fi
;;
-j|--jobcount)
if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then
JOBCOUNT=$2
shift 2
else
echo "ERROR: Argument for $1 is missing" >&2
usage 1
fi
;;
-p|--passes)
if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then
MINPASSES=$2
shift 2
else
echo "ERROR: Argument for $1 is missing" >&2
usage 1
fi
;;
-r|--minRQ)
if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then
MINRQ=$2
shift 2
else
echo "ERROR: Argument for $1 is missing" >&2
usage 1
fi
;;
--) # end of options indicates that the main command follows
shift
PARAMS="$PARAMS $@"
eval set -- ""
;;
-*|--*=) # unsupported flags
echo "ERROR: Unsupported flag $1" >&2
usage 1
;;
*) # positional parameter
PARAMS="$PARAMS $1"
shift
;;
esac
done
#reset command arguments as only positional parameters
eval set -- "$PARAMS"
#input BAM file from Pacbio
BAMFILE=$1
#check if it exists
if [ -z "$BAMFILE" ]; then
echo "Must provide BAM file!"
usage 1
elif ! [ -r "$BAMFILE" ]; then
echo "Cannot read BAM file $BAMFILE !"
exit 1
fi
mkdir -p demux/chunks
# mkdir -p logs
OUTBAM=demux/$(basename "$BAMFILE"|sed -r "s/\\.bam$/_ccsMerged.bam/")
OUTFQ=demux/$(basename "$BAMFILE"|sed -r "s/\\.bam$/_ccsMergedDemuxed.fastq.gz/")
#helper script to schedule a CCS chunk job
scheduleJob() {
JOBNUM=$1
JOBCOUNT=$2
INFILE=$3
TIME=${4:-"48:00:00"}
THREADS=${5:-4}
MEM=${6:-"4GB"}
OUTFILE=demux/chunks/$(basename "$INFILE"|sed -r "s/\\.bam$/_${JOBNUM}.bam/")
RETVAL=$(submitjob.sh -n ccs$JOBNUM -t $TIME -c $THREADS -m $MEM \
ccs --min-passes $MINPASSES --chunk ${JOBNUM}/${JOBCOUNT} \
--min-rq $MINRQ --num-threads $THREADS $INFILE $OUTFILE)
JOBID=${RETVAL##* }
echo $JOBID
}
# #helper function to wait for the completion of jobs
# waitForJobs() {
# #first argument should be a comma-separated list of job ids.
# JOBS=$1
# echo "Waiting for jobs to finish..."
# #the number of currently active jobs (with 1 pseudo-job to begin with)
# CURRJOBNUM=1
# while (( $CURRJOBNUM > 0 )); do
# sleep 5
# if [ -z "$JOBS" ]; then
# CURRJOBNUM=$(squeue -hu $USER|wc -l)
# else
# CURRJOBNUM=$(squeue -hu $USER -j${JOBS}|wc -l)
# fi
# echo "$CURRJOBNUM jobs remaining"
# done
# }
#if necessary, index the bamfile to allow for running CCS in parallel jobs
PBIFILE="${BAMFILE}.pbi"
if ! [ -r "$PBIFILE" ]; then
echo "Indexing BAM file $BAMFILE ..."
pbindex $BAMFILE
fi
echo "Scheduling CCS jobs..."
#schedule parallel CCS jobs on cluster
JOBS=""
for (( JOBNUM = 1; JOBNUM <= $JOBCOUNT; JOBNUM++ )); do
JOBID=$(scheduleJob $JOBNUM $JOBCOUNT $BAMFILE)
JOBS=${JOBS},$JOBID
done
waitForJobs.sh -v "$JOBS"
# merge the final results
echo "Consolidating job outputs..."
# submitjob.sh -c 4 -m 4G -n pbmerge --
pbmerge -o $OUTBAM demux/chunks/*.bam
# merge text reports
tail -n +1 demux/chunks/*report.txt>demux/reports.txt
# echo "Waiting for job to complete..."
# waitForJobs.sh
#if there are no indices provided, no demuxing will happen
if [ -z "$INDICES" ]; then
# or convert to fastq directoy
echo "BAM2FASTQ conversion..."
# submitjob.sh -c 4 -m 4G -n bam2fastq --
bam2fastq -o $OUTFQ -c 6 $OUTBAM
else
# demultiplexing (if applicable)
echo "Demultiplexing..."
# submitjob.sh -c 12 -m 4G -n lima --
lima $OUTBAM $INDICES $OUTFQ --same --ccs --min-score 80 --num-threads 12 --split-named
fi
# echo "Waiting for job to complete..."
# waitForJobs.sh
#cleanup
rm demux/chunks/*
rmdir demux/chunks
echo "Success!"