-
Notifications
You must be signed in to change notification settings - Fork 0
/
intergenicVCFfilteringAllPurpose
64 lines (43 loc) · 3.11 KB
/
intergenicVCFfilteringAllPurpose
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
##Filter VCF by intergenic bed file
bcftools view -I -O z Schil_capture_variants_Chr1-12.reheader.filtered.vcf.gz \
-R ../bedFiles/intergenic_regions.bed -o Schil_capture_variants_Chr1-12.reheader.filtered.intergenic.vcf.gz
##Filter VCF by high depth intergenic regions
awk '$6=="yes"' intergenic_regions.bed > intergenic_regions_goodDP.bed
bcftools view -I -O z Schil_capture_variants_Chr1-12.reheader.filtered.vcf.gz \
-R ../bedFiles/intergenic_regions_goodDP.bed -o Schil_capture_variants_Chr1-12.reheader.filtered.intergenic.goodDP.vcf.gz
#######################Decompose complex variants################################
zcat Schil_capture_variants_Chr1-12.reheader.filtered.intergenic.goodDP.vcf.gz | \
vcfallelicprimitives > Schil_capture_variants_Chr1-12.reheader.filtered.intergenic.goodDP.decomposedVariants.vcf
bgzip Schil_capture_variants_Chr1-12.reheader.filtered.intergenic.goodDP.decomposedVariants.vcf
#######################Create VCFs with no indels###############################
vcftools --gzvcf Schil_capture_variants_Chr1-12.reheader.filtered.intergenic.goodDP.decomposedVariants.vcf.gz \
--remove-indels --out Schil_capture_variants_Chr1-12.reheader.filtered.intergenic.goodDP.decomposedVariants.noIndels --recode-INFO-all --recode
bgzip Schil_capture_variants_Chr1-12.reheader.filtered.intergenic.goodDP.decomposedVariants.noIndels.recode.vcf
bcftools index Schil_capture_variants_Chr1-12.reheader.filtered.intergenic.goodDP.decomposedVariants.noIndels.recode.vcf.gz
#######################Create files to measure VCF stats and apply filtering parameters based on results###########################
VCF=../VCFs/Schil_capture_variants_Chr1-12.reheader.filtered.intergenic.goodDP.decomposedVariants.noIndels.recode.vcf.gz;
OUT=intergenic.goodDP;
vcftools --gzvcf $VCF --freq2 --out $OUT --max-alleles 2;
vcftools --gzvcf $VCF --depth --out $OUT;
vcftools --gzvcf $VCF --site-mean-depth --out $OUT;
vcftools --gzvcf $VCF --site-quality --out $OUT;
vcftools --gzvcf $VCF --missing-indv --out $OUT;
vcftools --gzvcf $VCF --missing-site --out $OUT;
vcftools --gzvcf $VCF --het --out $OUT;
#######################Filter VCF based on R calculated stats###################################
##No MAF
vcftools --gzvcf Schil_capture_variants_Chr1-12.reheader.filtered.intergenic.goodDP.decomposedVariants.noIndels.recode.vcf.gz \
--remove-indels --minQ 30 --minDP 10 --max-missing 0.98 --out intergenicSNPs.noMAF --recode-INFO-all --recode
bcftools view -s ^Spenn_LA0716_ERR418107,\
S.chiCGN15532_1_ERR418097,\
S.chiCGN15530_1_ERR418098,\
Sper_LA1278_ERR418084,\
Sper_LA1954_ERR418094 intergenicSNPs.noMAF.recode.vcf > intergenicSNPs.noMAF.noOutgroup.vcf
##No singletons
vcftools --gzvcf Schil_capture_variants_Chr1-12.reheader.filtered.intergenic.goodDP.decomposedVariants.noIndels.recode.vcf.gz --remove-indels \
--minQ 30 --mac 2 --minDP 10 --max-missing 0.98 --out intergenicSNPs.noSingletons --recode-INFO-all --recode
bcftools view -s ^Spenn_LA0716_ERR418107,\
S.chiCGN15532_1_ERR418097,\
S.chiCGN15530_1_ERR418098,\
Sper_LA1278_ERR418084,\
Sper_LA1954_ERR418094 intergenicSNPs.noSingletons.recode.vcf > intergenicSNPs.noSingletons.noOutgroup.vcf