-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathathlon.sh
160 lines (135 loc) · 9.5 KB
/
athlon.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#Athlon: Accurate typing of human leukocyte antigen (HLA) by Oxford Nanopore sequencing
#Copyright (c) 2018 Washington University
#Created by: Chang Liu
#Washington University hereby grants to you a non-transferable, non-exclusive, royalty-free, non-commercial, research license to use and copy the computer code that may be downloaded within this site (the “Software”). You agree to include this license and the above copyright notice in all copies of the Software. The Software may not be distributed, shared, or transferred to any third party. This license does not grant any rights or licenses to any other patents, copyrights, or other forms of intellectual property owned or controlled by Washington University. If interested in obtaining a commercial license, please contact Washington University's Office of Technology Management ([email protected]).
#YOU AGREE THAT THE SOFTWARE PROVIDED HEREUNDER IS EXPERIMENTAL AND IS PROVIDED “AS IS”, WITHOUT ANY WARRANTY OF ANY KIND, EXPRESSED OR IMPLIED, INCLUDING WITHOUT LIMITATION WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE, OR NON-INFRINGEMENT OF ANY THIRD-PARTY PATENT, COPYRIGHT, OR ANY OTHER THIRD-PARTY RIGHT. IN NO EVENT SHALL THE CREATORS OF THE SOFTWARE OR WASHINGTON UNIVERSITY BE LIABLE FOR ANY DIRECT, INDIRECT, SPECIAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR IN ANY WAY CONNECTED WITH THE SOFTWARE, THE USE OF THE SOFTWARE, OR THIS AGREEMENT, WHETHER IN BREACH OF CONTRACT, TORT OR OTHERWISE, EVEN IF SUCH PARTY IS ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
#!/bin/bash
{
start=$(date)
i=$1 # read number following ./athlon.sh
q=$1 # qual threshold for freebayes (proportional to the read number)
datafolder=data$i/
rsltfolder=rslt$i/ #will hold intermediate files after completion of the program.
reffolder=reference/
#make datafolder that contains sampled reads for each locus
mkdir $datafolder
for file in $(ls data) ; do seqtk sample data/$file $i > $datafolder/$file ; done
mkdir $rsltfolder
> $rsltfolder'rslt'.txt #will be moved to the pwd after completion of the program.
for sample in $(ls $datafolder | awk -F"[_]" '{print $1"_"$2}')
do
seqfile=$datafolder$sample'*.fastq'
locus=$(echo $sample | cut -d'-' -f 2)
ref='g_3f_hla3260_'$locus'_exon_2_3_introngap'
echo '*********||| sample: '$sample', locus: '$locus' |||*********'
echo $sample >> $rsltfolder'rslt'.txt
echo 'indexing reference '$reffolder$ref'.fasta ...'
samtools faidx $reffolder$ref'.fasta'
echo 'aligning reads to the reference...'
blasr $seqfile $reffolder$ref'.fasta' -minMatch 14 -hitPolicy randombest -nproc 4 -sam -clipping soft -out $rsltfolder$sample.sam
echo 'converting sam file to bam file, sorting, and indexing ...'
samtools view -b $rsltfolder$sample.sam -o $rsltfolder$sample.bam
samtools sort $rsltfolder$sample.bam $rsltfolder$sample.sorted
samtools index -b $rsltfolder$sample.sorted.bam
echo 'quantify coverage by bedtools ...'
echo 'generating bed file from the reference file ...'
faidx --transform bed $reffolder$ref.fasta > $reffolder$ref.bed
echo 'sum coverage of 3-field typing and sort in descending order ...'
coverageBed -abam $rsltfolder$sample.sorted.bam -b $reffolder$ref.bed -d | groupBy -g 1 -c 14 -o sum | sort -rV -k 2,2 > $rsltfolder$sample$'_3f_coverage'.csv
echo 'sum coverage of 2-field typing and sort in descending order ...'
awk -F":" '$1=$1":"$2' OFS=" \t" $rsltfolder$sample$'_3f_coverage'.csv | sort -k 1,1 | groupBy -g 1 -c 4 -o sum | sort -Vr -k 2,2 > $rsltfolder$sample$'_2f_coverage'.csv
echo 'sum coverage of 1-field typing and sort in descending order ...'
awk -F":" '$1=$1' OFS="\t" $rsltfolder$sample$'_3f_coverage'.csv | sort -k 1,1 | groupBy -g 1 -c 4 -o sum | sort -Vr -k 2,2 > $rsltfolder$sample$'_1f_coverage'.csv
echo '***top 10 1-field typings***'
head $rsltfolder$sample$'_1f_coverage'.csv
echo '***top 10 2-field typings***'
head $rsltfolder$sample$'_2f_coverage'.csv
echo '***top 10 3-field typings***'
head $rsltfolder$sample$'_3f_coverage'.csv
echo 'generating 1-field candidate allele pair ...'
allele0=$(awk 'NR==1 {print $1}' $rsltfolder$sample$'_1f_coverage'.csv) #1-field dominant allele
cover0=$(awk 'NR==1 {print $2}' $rsltfolder$sample$'_1f_coverage'.csv)
allele1=$(awk 'NR==2 {print $1}' $rsltfolder$sample$'_1f_coverage'.csv) #1-field minor allele
cover1=$(awk 'NR==2 {print $2}' $rsltfolder$sample$'_1f_coverage'.csv)
echo $allele0
if (( $(( 100*cover1 )) < $(( 23*cover0 )) )) #if the 1-field minor allele has < 15% coverage of the 1-field dominant allele, call the 1-field dominant allele only
then
echo '-'
echo 'generating 2-field candidate allele pair ...'
cat $rsltfolder$sample$'_2f_coverage'.csv | awk -v a="$allele0" -F":" '$1==a' | head -n 2 > $rsltfolder$sample'candi_2f'.csv
allele0=$(awk 'NR==1 {print $1}' $rsltfolder$sample'candi_2f'.csv) #2-field dominant allele
cover0=$(awk 'NR==1 {print $2}' $rsltfolder$sample'candi_2f'.csv)
allele1=$(awk 'NR==2 {print $1}' $rsltfolder$sample'candi_2f'.csv) #2-field minor allele
cover1=$(awk 'NR==2 {print $2}' $rsltfolder$sample'candi_2f'.csv)
echo $allele0
if (( $(( 100*cover1 )) < $(( 71*cover0 )) )) #if the 2-field minor allele has < 40% coverage of the 2-field dominant allele, call the 2-field dominant allele only
then
echo '-' #'there seems to be only one 2-field candidate allele: '
sed -i /$allele1/d $rsltfolder$sample'candi_2f'.csv #remove the 2-field minor allele
else
#do nothing, keep the 2-field minor allele
echo $allele1
fi
else
echo $allele1 #print the 1-field minor allele
echo 'generating 2-field candidate allele pair ...'
cat $rsltfolder$sample$'_2f_coverage'.csv | awk -v a=$allele0 -F":" '$1==a' | head -n 1 > $rsltfolder$sample'candi_2f'.csv
cat $rsltfolder$sample$'_2f_coverage'.csv | awk -v a=$allele1 -F":" '$1==a' | head -n 1 >> $rsltfolder$sample'candi_2f'.csv
cut -f1 $rsltfolder$sample'candi_2f'.csv
fi
echo 'generating 3-field candidate allele pair ...'
> $rsltfolder$sample'candi_3f.csv' #create an empty file to store 3f candidates
for allele in `cut -f1 $rsltfolder$sample'candi_2f'.csv`
do
grep $(echo "$allele" | sed "s/*/\\\*/g") -m 1 $rsltfolder$sample$'_3f_coverage'.csv | cut -f1 >> $rsltfolder$sample'candi_3f'.csv #Error prone, e.g. searching for A*02:01 results in A*24:02:01G; this is solved by the sed command to replace * with \*
done
cat $rsltfolder$sample'candi_3f'.csv
echo 'preparing fasta files for candidate allele pair ...'
> $rsltfolder$sample'candi_3f'.fasta
for allele in `cat $rsltfolder$sample'candi_3f'.csv`
do
samtools faidx $reffolder'g_3f_hla3260_'$locus.fasta $allele >> $rsltfolder$sample'candi_3f'.fasta
done
samtools faidx $rsltfolder$sample'candi_3f'.fasta
echo 'realign reads to 3-field candidate allele pair ...'
blasr $seqfile $rsltfolder$sample'candi_3f'.fasta -minMatch 14 -hitPolicy randombest -nproc 4 -sam -clipping soft -out $rsltfolder$sample'_3f'.sam
echo 'converting sam file to bam file, sorting, and indexing ...'
samtools view -b $rsltfolder$sample'_3f'.sam -o $rsltfolder$sample'_3f'.bam
samtools sort $rsltfolder$sample'_3f'.bam $rsltfolder$sample'_3f'.sorted
#replace PU with SM in the @RG tag to satisfy freebayes requirement
samtools view $rsltfolder$sample'_3f'.sorted.bam -H | sed 's,\tPU:.*,\tSM:None\tLB:None\tPL:ONT,g' | samtools reheader - $rsltfolder$sample'_3f'.sorted.bam > $rsltfolder$sample'_3fsm'.sorted.bam
samtools index -b $rsltfolder$sample'_3fsm'.sorted.bam
#echo 'quantify coverage for each candidate allele by bedtools ...'
cd $rsltfolder
echo 'generating consensus sequence ...'
#split bam file by reference:
bamtools split -in $sample'_3fsm'.sorted.bam -reference
an=$(cat $sample'candi_3f'.csv | wc -l) #number of candidate alleles
for allele in `cat $sample'candi_3f'.csv`
do
samtools faidx $sample'candi_3f'.fasta $allele > $sample'candi_3f_'$allele.fasta
freebayes -f $sample'candi_3f_'$allele.fasta -p 1 $sample'_3fsm'.sorted.'REF_'$allele.bam > $sample$allele'_unfiltered'.vcf
vcffilter -f "QUAL > $q" $sample$allele'_unfiltered'.vcf > $sample$allele.vcf
fn=$(cat $sample$allele.vcf | awk '/^[[:upper:]]/' | wc -l)
if [ $fn -eq 0 ]
then
blastn -query $sample'candi_3f_'$allele.fasta -subject '../'$reffolder'g_3f_hla3260_'$locus.fasta -outfmt 6 | sort -k1,1 -k12,12nr -k11,11n | sort -u -k1,1 --merge | cat | awk '{print $2" "$3" "$4" "$5" "}' > rslt
else
#vcf2fasta does not work if the vcf file does not have any variant in it.
vcf2fasta -f $sample'candi_3f_'$allele.fasta -P 1 -p $sample $sample$allele.vcf
blastn -query $sample'None_'$allele':0'.fasta -subject '../'$reffolder'g_3f_hla3260_'$locus.fasta -outfmt 6 | sort -k1,1 -k12,12nr -k11,11n | sort -u -k1,1 --merge | cat | awk '{print $2" "$3" "$4" "$5" "}' > rslt
fi
echo $(cat rslt) $fn >> 'rslt'.txt
done
if [ $an -eq 1 ]
then
echo "-" >> 'rslt'.txt
fi
cd ..
done
end=$(date)
echo 'program started at '$start
echo 'program ended at '$end
} > log
mv log $(date -d "today" +"%Y%m%d%H%M%S").data$i.log
mv $rsltfolder'rslt'.txt $(date -d "today" +"%Y%m%d%H%M%S").data$i.rslt