-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathv2ida.sh
138 lines (120 loc) · 7.48 KB
/
v2ida.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/bin/bash
# This pipeline was developed by Andrade, AAS(https://github.com/aandradebio) ([email protected]) at the National Laboratory for Scientific Computing - Bioinformatic Laboratory (LABINFO), with contributions from Soares, AER(https://github.com/aersoares81) and Vasconcelos, ATR.Developed by Andrade, AAS ([email protected]) at the National Laboratory for Scientific Computing - Bioinformatic Laboratory (LABINFO)
#
# This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
###STEP 01 - SETUP AND USER INPUT
## Set Paramters via Command Line
if [ "$1" == "-help" ] ; then
echo
echo " Usage: ./ `basename $0` id ref pair-end initial final parts"
echo
echo " id is the name of a .tab file containing a sample name, r1.fastq file and r2.fastq file in the same line. Other samples should be in the next lines."
echo " ref is the reference genome used for alignment (default: fasta format). Do not input extension, only the name of the file"
echo " pair-end or single-end"
echo " initial is the inicial nucleotide for quasispecies reconstruction"
echo " final is the final nucleotide for quasispecies reconstruction"
echo " parts is how many parts you would like to divide the genome for quasispecies reconstruction (eg. 1 if you dont want to divide)"
echo
echo " Please visit github.com/aandradebio/V2IDA for more information"
echo
exit 0
fi
## The name of a file containing the samples names (without the extension)
ID_NAMES=$1
## Reference genome (default: fasta format)
R=$2
REF=$R.fasta
## Pair-end or Single end alignment mode
MODE="$3"
## Inicial nucleotide for quasispecies reconstruction
GSI=$4
## Final nucleotide for quasispecies reconstruction
GSF=$5
## How many parts you would like to divide the genome (integer number)
PART=$6
## Get the current working directory
WD="$(pwd)"
#Default: the dependencies should be in PATH. If not, you can create an alias (eg. alias gatk='/path/to/gatk-package/gatk') or put the pre-compile executables in the Working directory
if ! [ -x "$(command -v bwa)" ]; then
echo 'Error: Bwa is not installed.' >&2
exit 1
fi
if ! [ -x "$(command -v samtools)" ]; then
echo 'Error: Samtools is not installed.' >&2
exit 1
fi
if ! [ -x "$(command -v ./picard.jar)" ]; then
echo 'Error: Picard is not installed.' >&2
exit 1
fi
if ! [ -x "$(command -v gatk)" ]; then
echo 'Error: GATK4 is not installed.' >&2
exit 1
fi
if ! [ -x "$(command -v ./QuasiRecomb.jar)" ]; then
echo 'Error: QuasiRecomb is not installed.' >&2
exit 1
fi
AL=$WD/01_Alignment
VC=$WD/02_Variant_Calling
QR=$WD/03_Quasispecies_Reconstruction
if [ ! -d $AL ]; then mkdir $AL; fi
if [ ! -d $VC ]; then mkdir $VC; fi
if [ ! -d $QR ]; then mkdir $QR; fi
## Indexing and creating the dictionary
INDEX=$WD/$REF.bwt
if [ ! -f $INDEX ]; then bwa index $REF; fi
FAI=$WD/$REF.fai
if [ ! -f $FAI ]; then samtools faidx $REF; fi
DICT=$WD/$R.dict
if [ ! -f $DICT ]; then java -jar picard.jar CreateSequenceDictionary R=$REF O=$R.dict; fi
ntd=$(((GSF-GSI)/PART))
u=$((GSF-ntd))
while read ID FWDID REVID;
do
echo "Starting $ID"
p=1
f=$GSF
## Alignment
if [[ $MODE != "pair-end" ]];
then
if [ ! -f $AL/aligned_$ID.sam ]; then bwa mem -R "@RG\tID:group1\tSM:ID\tPL:illumina\tLB:lib1\tPU:unit1" $REF $FWID > $AL/aligned_$ID.sam; fi
else
if [ ! -f $AL/aligned_$ID.sam ]; then bwa mem -R "@RG\tID:group1\tSM:ID\tPL:illumina\tLB:lib1\tPU:unit1" $REF $FWID $REVID > $AL/aligned_$ID.sam; fi; fi
if [ ! -f $AL/sorted_$ID.bam ]; then java -jar picard.jar SortSam INPUT=$AL/aligned_$ID.sam OUTPUT=$AL/sorted_$ID.bam SORT_ORDER=coordinate; fi
if [ ! -f $AL/dedup_$ID.bam ]; then java -jar picard.jar MarkDuplicates INPUT=$AL/sorted_$ID.bam OUTPUT=$AL/dedup_$ID.bam METRICS_FILE=$AL/metrics_duplicate_$ID.txt; fi
if [ ! -f $AL/dedup_$ID.bai ]; then samtools index $AL/dedup_$ID.bam; fi
if [ ! -f $AL/depth_$ID.txt ]; then samtools depth $AL/dedup_$ID.bam > $AL/depth_$ID.txt; fi
echo "Mapping step concluded for $ID"
## SNP Calling
if [ ! -f $VC/raw_variants_$ID.vcf ]; then gatk HaplotypeCaller -R $REF -I $AL/dedup_$ID.bam -O $VC/raw_variants_$ID.vcf; fi
if [ ! -f $VC/raw_SNPs_$ID.vcf ]; then gatk SelectVariants -R $REF -V $VC/raw_variants_$ID.vcf --select-type SNP -O $VC/raw_SNPs_$ID.vcf; fi
if [ ! -f $VC/filtered_SNPs_$ID.vcf ]; then gatk VariantFiltration -R $REF -V $VC/raw_SNPs_$ID.vcf -O $VC/filtered_SNPs_$ID.vcf --filter-expression "QD < 2.0 && FS > 60.0 && MQ < 40.0 && SOR > 4.0" --filter-name "filter_SNPs"; fi
if [ ! -f $VC/bqsr_snps_$ID.vcf ]; then gatk SelectVariants --exclude-filtered -V $VC/filtered_SNPs_$ID.vcf -O $VC/bqsr_snps_$ID.vcf; fi
if [ ! -f $VC/raw_indels_$ID.vcf ]; then gatk SelectVariants -R $REF -V $VC/raw_variants_$ID.vcf --select-type INDEL -O $VC/raw_indels_$ID.vcf; fi
if [ ! -f $VC/filtered_indels_$ID.vcf ]; then gatk VariantFiltration -R $REF -V $VC/raw_indels_$ID.vcf -O $VC/filtered_indels_$ID.vcf --filter-expression "QD < 0.2 && FS > 200.0 && SOR > 10.0" --filter-name "filter_indels"; fi
if [ ! -f $VC/bqsr_indels_$ID.vcf ]; then gatk SelectVariants --exclude-filtered -V $VC/filtered_indels_$ID.vcf -O $VC/bqsr_indels_$ID.vcf; fi
if [ ! -f $VC/recal_data_$ID.table ]; then gatk BaseRecalibrator -R $REF -I $AL/sorted_$ID.bam --known-sites $VC/bqsr_snps_$ID.vcf --known-sites $VC/bqsr_indels_$ID.vcf -O $VC/recal_data_$ID.table; fi
if [ ! -f $VC/recal_reads_$ID.bam ]; then gatk ApplyBQSR -R $REF -I $AL/sorted_$ID.bam --bqsr $VC/recal_data_$ID.table -O $VC/recal_reads_$ID.bam; fi
if [ ! -f $VC/raw_variants_recal_$ID.vcf ]; then gatk HaplotypeCaller -R $REF -I $VC/recal_reads_$ID.bam -O $VC/raw_variants_recal_$ID.vcf; fi
if [ ! -f $VC/recal_SNPs_$ID.vcf ]; then gatk SelectVariants -R $REF -V $VC/raw_variants_recal_$ID.vcf --select-type SNP -O $VC/recal_SNPs_$ID.vcf; fi
if [ ! -f $VC/final_SNPs_$ID.vcf ]; then gatk VariantFiltration -R $REF -V $VC/recal_SNPs_$ID.vcf -O $VC/final_SNPs_$ID.vcf --filter-expression "QD < 2.0 && FS > 60.0 && MQ < 40.0 && SOR > 4.0" --filter-name "filter_SNPs"; fi
if [ ! -f $VC/recal_indels_$ID.vcf ]; then gatk SelectVariants -R $REF -V $VC/raw_variants_recal_$ID.vcf --select-type INDEL -O $VC/recal_indels_$ID.vcf; fi
if [ ! -f $VC/final_indels_$ID.vcf ]; then gatk VariantFiltration -R $REF -V $VC/recal_indels_$ID.vcf -O $VC/final_indels_$ID.vcf --filter-expression "QD < 0.2 && FS > 200.0 && SOR > 10.0" --filter-name "filter_indels"; fi
echo "SNP calling step concluded for $ID"
## Quasispecies reconstruction
while [ $p -le $PART ];
do
if [ ! -f $QR/$ID/$u-$f/quasispecies.fasta ]; then java -jar QuasiRecomb.jar -i $AL/dedup_$ID.bam -o $QR/$ID/$u-$f -noGaps -r $u-$f; fi
((f=$u))
u=$((u-ntd))
((p=$p+1))
done
p=1
u=$((GSF-ntd))
f=$GSF
echo "Quasispecies reconstruction step concluded for $ID"
done < $ID_NAMES.tab