-
Notifications
You must be signed in to change notification settings - Fork 0
/
pacbioFilterRQ.sh
executable file
·137 lines (117 loc) · 3.23 KB
/
pacbioFilterRQ.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/bin/bash
#Default Quality score cutoff = 16 (i.e. accept 17 and above)
RQ=0.998
OUTFILE=""
#helper function to print usage information
usage () {
cat << EOF
pacbioFilterRQ.sh v0.0.1
by Jochen Weile <[email protected]> 2021
Filter BAM files by quality score
Usage: pacbioFilterRQ.sh [-q|--qualityCutoff <INTEGER>] [-o|--outfile <OUTBAM>] <BAM>
<BAM> : The input BAM file
-o|--outfile : The output BAM file. <BAM>_RQ<RQ>.bam in current folder.
-q|--qualityCutoff : Only allow reads with RQ greater than this. Default: $RQ
EOF
exit $1
}
set -euo pipefail
echo "pacbioFilterRQ.sh v0.0.1"
#tokenize command line arguments
PARAMS=$(getopt -u -n "pacbioFilterRQ.sh" -o "q:o:h" -l "qualityCutoff:,outfile:,help" -- "$@")
#parse parameter tokens
eval set -- "$PARAMS"
PARAMS=""
NUMRX='^[0-9]+$'
while (( "$#" )); do
case "$1" in
-q|--qualityCutoff)
if ! [[ $2 =~ $NUMRX ]]; then
echo "Error: qualityCutoff must be an integer number!">&2
usage 1
elif [[ $2 < 0 || $2 > 20 ]]; then
echo "Error: qualityCutoff must be between 0 and 20"
fi
RQ=$2
shift 2
;;
-o|--outfile)
OUTFILE=$2
shift 2
;;
-h|--help)
usage 0
shift
;;
--) #end of options. dump the rest into PARAMS
shift
PARAMS="$@"
eval set -- ""
esac
done
eval set -- "$PARAMS"
BAMFILE=$1
BAMRX='\.bam$'
if [[ -z $BAMFILE ]]; then
echo "Error: No BAM file provided!">&2
usage 1;
elif ! [[ $BAMFILE =~ $BAMRX ]]; then
echo "Error: $BAMFILE is not a .bam file!">&2
usage 1;
elif ! [[ -r $BAMFILE ]]; then
echo "Error: BAMFILE does not exist or cannot be read!">&2
usage 1;
fi
if [[ -z "$OUTFILE" ]]; then
OUTFILE=$(basename "${BAMFILE%.bam}_RQ${RQ##*.}.bam")
elif [[ ! -d $(dirname $OUTFILE) ]]; then
echo "Error: Path to outfile $OUTFILE does not exist!"
usage 1;
fi
echo "Output will be written to ${OUTFILE}"
echo "Filtering for RQ=${RQ} ..."
# #Filter file will contain all values of QS to be filtered
# FILTERFILE=$(mktemp)
# #write numbers 0 through QS into filterfile, separated by newlines
# eval echo "{0..${QS}}"|sed -r "s/ /\n/g">$FILTERFILE
# echo "Filtering..."
# #invert filter by writing to "unoutput" and discarding main output
# samtools view $BAMFILE -b -D "qs:${FILTERFILE}" -U $OUTFILE >/dev/null
# echo "Done!"
# #delete temp file
# rm $FILTERFILE
# function filterByRQ_slow() {
# BAMFILE=$1
# RQCUTOFF=${2:-0.95}
# samtools view -H $BAMFILE
# samtools view "$BAMFILE"|while IFS="" read -r SAMLINE; do
# RQVAL=$(echo $SAMLINE|grep -oP 'rq:f:\K[\S]+')
# if (( $(echo "$RQVAL > $RQCUTOFF"|bc -l) )); then
# printf "%s\n" "$SAMLINE"
# fi
# done
# }
function filterByRQ() {
BAMFILE=$1
RQCUTOFF=${2:-0.998}
samtools view -H $BAMFILE
samtools view "$BAMFILE"|python3 -c '
import sys
import re
cutoff = float(sys.argv[1])
with sys.stdin as stream:
for line in stream:
line = line.rstrip()
matchObj = re.search(r"rq:f:(\S+)", line)
if matchObj:
rq = matchObj.group(1)
if (float(rq) > cutoff):
print(line)
' $RQCUTOFF
}
filterByRQ "$BAMFILE" "$RQ"|samtools view -b -o $OUTFILE
#index bam file
pbindex $OUTFILE
#translate to fastq
bam2fastq $OUTFILE -o ${OUTFILE%.bam} -c 6
echo "Done!"