-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathexome_auto_gatk_v2.sh
141 lines (118 loc) · 8.75 KB
/
exome_auto_gatk_v2.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/bin/sh
#
# MODIFIED: Vivek Das.
# Exome-seq variant calling pieline accoring to GATK Best practices.
# https://www.broadinstitute.org/gatk/guide/article?id=3891
#
# This is just a germline automated workflow that will call SNPs and INDELs
# In order to make this somatic one can simply stop post Step 2 and use Mutect2 or VarScan2 with their own custom scripts and filters on the processed bam files.
# Call with following arguments
# sh exome_auto_gatk_v2..sh <output_basename> <fastq folder> <output_folder_loc> [cpus]
# you can run the script in above mentioned way through another processing script that will log the processing with time at each step
# Putting the bwa mem
#
# GATK bundle set : one can obtain these from gatk ftp (knonw as gatk bundle)
# ftp://ftp.broadinstitute.org/bundle/2.8/hg19/
# The bundles were downloaded from the GATK 2.8 while the GATK 3.5 was used for the analysis
# Please download all the required tools and install them in your HPC and modify the paths accordingly for effective running of the script
# Initializing the variables required for calling the automated script
bn=$1
floc=$2
outloc=$3
nproc=${4:-12}
#Path to reference genome and Index files (This code is based on hg19)
#mention the location of the indexed hg19 file
bwa_idx="/path/vdas/test_exome/exome/hg19"
#mention the location of the hg19 fasta file that will be used as the reference genome
ref="/path/vdas/test_exome/exome/hg19.fa"
#Tools and environment required to run this script
#Path to STAR,java,gatk and picard tools
java_home="/home/vdas/tools/jdk1.8.0_77/bin/java"
#location for bwa mem
bwamem="/path/softwares/bwa/bwa"
#location for the GATK toolkit jar file
gatk="/path/softwares/GATK-3.5/GenomeAnalysisTK.jar"
#location for picard for marking duplicates
picard="/home/vdas/tools/picard-tools-2.2.0/picard.jar"
#location for sambamba which is used for processing the bam files. can be found here http://lomereiter.github.io/sambamba/
sambamba="/home/vdas/tools/sambamba/build/sambamba"
#Path to gatk bundle set files ( these files are used for the different base quality calibration steps that are useful for the variant
#workflow. These bundles were downloaded from GATK 2.8 since they were same as for 3.5
millsIndels="/path/GATK_2.8_bundle/ftp.broadinstitute.org/bundle/2.8/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf"
KGIndels="/path/GATK_2.8_bundle/ftp.broadinstitute.org/bundle/2.8/hg19/1000G_phase1.indels.hg19.sites.vcf"
dbSNP138="/path/GATK_2.8_bundle/ftp.broadinstitute.org/bundle/2.8/hg19/dbsnp_138.hg19.vcf"
#Exon target baits from Agilent SSv4 set (should be upgraded to new version if new agilent kit is used)
ss4exonbaits="/path/vdas/referenceBed/hg19/ss_v4/Exon_SSV4_clean.bed"
#Create an output directory
mkdir -p "$outloc/${bn}_processed"
#fout="$outloc/${bn}_processed"
echo "output directory for fastq $outloc/${bn}"_processed" ..."
#fout=$outloc/$bn"_processed"
fout="$outloc/${bn}_processed"
echo "$fout ..."
echo "performing assembly to create one fastq file for each read mates ..."
zcat $floc/*1.fastq.gz > $fout/${bn}_R1.fastq
zcat $floc/*2.fastq.gz > $fout/${bn}_R2.fastq
# Aligning the paired end read mates to the reference genome hg19 with bwa mem. This is for paired-end data
echo -e "["$(date)"]\tAligning.."
$bwamem mem -t 8 -M -R "@RG\tID:${bn}\tLB:PairedEnd\tPL:Illumina\tPU:000000000-A442D\tSM:${bn}" $bwa_idx $fout/${bn}_R1.fastq $fout/${bn}_R2.fastq | samtools view -bS -F 0x04 - > $fout/${bn}"_Aligned.out.bam"
#sorting the bam files with sambamba
echo -e "["$(date)"]\tSorting.."
$sambamba sort -o $fout/${bn}"_sorted.bam" -p -t 12 $fout/${bn}"_Aligned.out.bam"
#rm $fout/${bn}"_Aligned.out.bam"
rm $fout/${bn}_R1.fastq
rm $fout/${bn}_R2.fastq
# indexing the sorted aligned bam file
echo -e "["$(date)"]\tIndexing.."
$sambamba index -p -t 12 $fout/${bn}"_sorted.bam"
#picard mark duplicates
#Remove the reads due to PCR duplicates or one can mark them with a flag
echo -e "["$(date)"]\tMarking duplicates.."
$java_home -Xmx2g -jar $picard MarkDuplicates I=$fout/${bn}"_sorted.bam" O=$fout/${bn}"_dupMarked.bam" M=$fout/${bn}"_dup.metrics" CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT 2>$fout/${bn}.MarkDuplicates.log
### Remove the aligned files which are not required downstream for memory management
rm $fout/${bn}"_Aligned.out.bam"
rm $fout/${bn}"_sorted.bam"
rm $fout/${bn}"_sorted.bam.bai"
#SplitNCigarReads
#echo -e "["$(date)"]\tSpliting reads.."
#$java_home -Xmx2g -jar $gatk -T SplitNCigarReads -R $ref -I $fout/${bn}"_dupMarked.bam" -o $fout/${bn}"_split.bam" -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS 2>$fout/${bn}.SplitNCigarReads.log
#Now the GATK 3.5 process handles starts from below
#Step 1 Local realignment around the INDELS (please refer to http://gatkforums.broadinstitute.org/gatk/discussion/38/local-realignment-around-indels#latest)
#Create targets for indel realignment
echo -e "["$(date)"]\tCreating targets for indel realignment.."
$java_home -Xmx2g -jar $gatk -T RealignerTargetCreator -R $ref -I $fout/${bn}"_dupMarked.bam" -o $fout/${bn}".intervals" -nt 12 -known $millsIndels -known $KGIndels -L $ss4exonbaits 2>$fout/${bn}.indel.log
#
#Perform indel realignment
echo -e "["$(date)"]\tPerforming Indel Realignment.."
$java_home -Xmx2g -jar $gatk -T IndelRealigner -R $ref -I $fout/${bn}"_dupMarked.bam" -targetIntervals $fout/${bn}".intervals" -known $millsIndels -known $KGIndels -o $fout/${bn}"_realigned.bam" 2>$fout/${bn}.indel2.log
#rm $fout/${bn}"_split.bam"
#rm $fout/${bn}"_split.bai"
rm $fout/${bn}"_dupMarked.bam"
rm $fout/${bn}"_dupMarked.bai"
#Step 2: Perform BQSR (base quality recalibration after realigment step. This is a standard test that is performed to Detect systematic errors
# in base quality scores.
#Please refer to https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_gatk_tools_walkers_bqsr_BaseRecalibrator.php)
echo -e "["$(date)"]\tPerforming BQSR.."
$java_home -Xmx2g -jar $gatk -T BaseRecalibrator -I $fout/${bn}"_realigned.bam" -R $ref -knownSites $KGIndels -knownSites $millsIndels -knownSites $dbSNP138 -o $fout/${bn}"_recal.table" -L $ss4exonbaits 2>$fout/${bn}.BQSR.log
#Print recalibrated reads. This handle writes out sequence read data (for filtering, merging, subsetting etc)
#Please refer to https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_gatk_tools_walkers_readutils_PrintReads.php
echo -e "["$(date)"]\tPrinting recalibrated reads.."
$java_home -Xmx2g -jar $gatk -T PrintReads -R $ref -I $fout/${bn}"_realigned.bam" -nct 12 -BQSR $fout/${bn}"_recal.table" -o $fout/${bn}"_recal.bam" 2>$fout/${bn}.BQSR2.log
#Step 3:Call the variants post realignment and recalibration with Haplotype caller. It calls germline SNPs and indels via local re-assembly of haplotypes
#Run HaplotypeCaller (please refer to https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_gatk_tools_walkers_haplotypecaller_HaplotypeCaller.php)
echo -e "["$(date)"]\tRunning HaplotypeCaller.."
$java_home -Xmx2g -jar $gatk -T HaplotypeCaller -R $ref -I $fout/${bn}"_recal.bam" -L $ss4exonbaits -dontUseSoftClippedBases -stand_call_conf 20.0 -stand_emit_conf 20.0 -o $fout/${bn}".vcf" 2>$fout/${bn}.HaplotypeCaller.log
# Seperate out the SNPs from the from INDELs followed by filtration
#Filter variants for SNPs and then filter for qaulity scores. Variant filtration methods
echo -e "["$(date)"]\tFiltering Point Variants.."
$java_home -Xmx2g -jar $gatk -T SelectVariants -R $ref -V $fout/${bn}".vcf" -selectType SNP -o $fout/${bn}"_raw_snps.vcf"
#### filter over snps for scores that includes the combination of different Variant filter expression
# Filter variant calls based on INFO and FORMAT annotations
# Please refer to https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_gatk_tools_walkers_filters_VariantFiltration.php
$java_home -Xmx2g -jar $gatk -T VariantFiltration -R $ref -V $fout/${bn}"_raw_snps.vcf" --filterExpression "DP < 5 || SB > -0.1 || QUAL < 50.0 || QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0 || DP > 500" --filterName "DAS_snp_filters" -o $fout/${bn}"_filtered_snps.vcf" 2>$fout/${bn}.VariantFilter_snps.log
echo -e "["$(date)"]\tFiltering INDEL Variants.."
###select indels with the SelectVariants handle and -selectType process to give raw indels
$java_home -Xmx2g -jar $gatk -T SelectVariants -R $ref -V $fout/${bn}".vcf" -selectType INDEL -o $fout/${bn}"_raw_indels.vcf"
#### filter over raw indel indel files based on ased on INFO and FORMAT annotations
$java_home -Xmx2g -jar $gatk -T VariantFiltration -R $ref -V $fout/${bn}"_raw_indels.vcf" --filterExpression "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0 || SOR > 10.0" --filterName "DAS_indel_filters" -o $fout/${bn}"_filtered_indels.vcf" 2>$fout/${bn}.VariantFilter_indels.log
echo -e "["$(date)"]\tDONE!"