forked from sirselim/diagnostics_exome_reporting
-
Notifications
You must be signed in to change notification settings - Fork 1
/
assess_variants.sh
executable file
·131 lines (115 loc) · 4.84 KB
/
assess_variants.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/bin/bash
#
# Created: 2016/04/04
# Last modified: 2019/03/21
# Author: Miles Benton
#
# """
# This script forms the start of the diagnostic WES pipeline, finding the "most damaging" muations predicted by
# and MutationTaster and MutationAssessor and generating MutationAssessor html links for them.
#
# E.g. INPUT
# ./assess_variants.sh sample_file.vcf.gz genome_build
#
# """
## Notes:
# write a script that generates html links for variants listed as MutationAssessor_pred=H (high prediction to cause damage)
# here is the url structure required:
# http://mutationassessor.org/r3/?cm=var&var=<GENOME_BUILD>,<CHR>,<POSITION>,<REF>,<ALT>&fts=all
# real example html link
# http://mutationassessor.org/r3/?cm=var&var=hg19,1,201046184,A,G&fts=all
##
# define the sample being processed
# INPUTFILE="$1" # now defined initially
# INPUTFILE="$sampleID"
# GENOME_BUILD="$2" # now defined initially
# filename=$(echo "$INPUTFILE" | tr "/ && ." " " | tr "_" " " | awk '{printf $2}')
echo "...starting annotation of sample $filename..."
# date
DATE=$(date +%Y_%m_%d)
OUTFILE=$(paste <(echo "$filename") <(echo "MutationAssessor_links") <(echo "$DATE") -d '_')
OUTFILE=$(paste <(echo results/mutation_links/) <(echo "$OUTFILE") <(echo ".txt") -d '')
# create output file or delete contents if it exists
if [ -f "$OUTFILE" ]; then
echo "csv file found... deleting its contents!"
cat /dev/null > "$OUTFILE"
else
echo "output csv file created"
> "$OUTFILE"
fi
# add header to csv file (i.e. write one line)
header=$(paste <(echo "GENOME_BUILD") \
<(echo "CHR") \
<(echo "POSITION") \
<(echo "REF") \
<(echo "ALT") \
<(echo "RSNO") \
<(echo "GENESYM") \
<(echo "DP_coverage") \
<(echo "MutTaster") \
<(echo "MutAssesor") \
<(echo "CADD") \
<(echo "URL") \
--delimiters '\t')
echo "$header" >> "$OUTFILE"
# check for variables
[[ -z "$INPUTFILE" ]] && { echo "...Please provide a vcf file to analyse..." ; exit 1; }
[[ -z "$GENOME_BUILD" ]] && { echo "...Please provide a genome build (i.e. hg19, hg38, ...)..." ; exit 1; }
echo "...You are using $GENOME_BUILD..."
# extract all variants considered 'damaging' under Mutation Assessor and Mutation Taster
# zcat "$INPUTFILE" | grep 'MutationAssessor_pred=H\|MutationTaster_pred=A' | grep 'MutationTaster_pred=D\|MutationTaster_pred=A' | grep -P ';DP=[0-9]{1,}' > tmp_variants.txt
# the above is currently both, could look at moving to an OR situation (so it's damaging in at least one, doesn't have to be both)
# TO-DO: testing the below
zcat "$INPUTFILE" | grep 'MutationAssessor_pred=H\|MutationTaster_pred=A' > tmp_variants.txt
VARNO=$(wc -l tmp_variants.txt | awk '{print $1}')
GENOME_BUILD=$(for (( c=1; c<="$VARNO"; c++)) ; do echo "$GENOME_BUILD" ; done)
echo "...identified $VARNO variants that are MutationAssessor and MutationTaster damaging..."
# chromosome
CHR=$(awk '{print $1}' tmp_variants.txt | sed -e 's/^chr//g')
# echo "$CHR"
# position info
POSITION=$(awk '{print $2}' tmp_variants.txt)
POSITION=$(echo "$POSITION")
# ref geno
REF=$(awk '{print $4}' tmp_variants.txt)
# echo "$REF"
# alt geno
ALT=$(awk '{print $5}' tmp_variants.txt)
# echo "$ALT"
# get rs id (if present)
RSNO=$(cut -f 3 tmp_variants.txt)
# get coverage
DP_coverage=$(sed 's/^.*;DP=//' tmp_variants.txt | tr ";" " " | awk '{print $1}')
#
MutationTaster=$(sed 's/^.*MutationTaster_pred=//' tmp_variants.txt | tr "; && |" " " | awk '{print $1}' | sed -e 's/chr.*/./g' | sed 's/,.//g')
#
MutationAssessor=$(sed 's/^.*MutationAssessor_pred=//' tmp_variants.txt | tr "; && |" " " | awk '{print $1}' | sed -e 's/chr.*/./g')
#
CADD=$(sed 's/^.*CADD_phred=//' tmp_variants.txt | tr "; && |" " " | awk '{print $1}' | sed -e 's/chr.*/./g')
# extract gene symbol
gene_symbol=$(sed -e 's/^.*GENEINFO=//' tmp_variants.txt | tr "| && :" " " | awk '{print $1}' | sed -e 's/chr.*/./g')
gene_symbol2=$(sed -e 's/^.*GENE=//' tmp_variants.txt | tr "| && :" " " | awk '{print $1}' | sed -e 's/chr.*/./g')
GENESYM=$(paste -d' ' <(echo "$gene_symbol" | tr ' ' '\n') <(echo "$gene_symbol2" | tr ' ' '\n') | awk '{ while(++i<=NF) printf (!a[$i]++) ? $i FS : ""; i=split("",a); print ""}' | sed 's/\. //g' | sed 's/\b\s\b/;/g')
# generate url to Mutation Assessor
URL1=$(for (( c=1; c<="$VARNO"; c++)) ; do echo "http://mutationassessor.org/r3/?cm=var&var=" ; done)
URL2=$(for (( c=1; c<="$VARNO"; c++)) ; do echo "&fts=all" ; done)
URL3=$(paste <(echo "$GENOME_BUILD") <(echo "$CHR") <(echo "$POSITION") <(echo "$REF") <(echo "$ALT") -d ',')
URL=$(paste <(echo "$URL1") <(echo "$URL3") <(echo "$URL2") -d '')
# clean up
rm tmp_variants.txt
# create output
dataset=$(paste <(echo "$GENOME_BUILD") \
<(echo "$CHR") \
<(echo "$POSITION") \
<(echo "$REF") \
<(echo "$ALT") \
<(echo "$RSNO") \
<(echo "$GENESYM") \
<(echo "$DP_coverage") \
<(echo "$MutationTaster") \
<(echo "$MutationAssessor") \
<(echo "$CADD") \
<(echo "$URL") \
--delimiters '\t')
echo "$dataset" | sed 's/\.;//g' | sed 's/;\.//g' >> "$OUTFILE"
##/END