-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgenerate_vcfIndex.sh
82 lines (69 loc) · 1.86 KB
/
generate_vcfIndex.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/bin/bash
HPSD_PATH=$(pwd)
# export VARIOUS="one"
# Error message
error_msg ()
{
echo 1>&2 "Error: $1"
exit 1
}
# Usage
usage()
{
echo "Usage: load_vcf_to_hpds.sh [-s | --singlevcf <VCF filename>] [-v | --variousvcfs <text file with list of VCFs (one VCF per line)>] [-h | --help]
Options:
-v | --variousvcfs <list of VCFs>
This option takes the list of VCFs that need to be processed in form of text file; one line per VCF; example content of list file (list.txt):
HG00096
HG00097
HG00099
Command example:
./load_vcf_to_hpds.sh -v list.txt
-s | --singlevcf <git hash>
Provide the VCF file name to process, i.e.: 1000KG.vcf.gz
Command example:
./load_vcf_to_hpds.sh -s 1000KG.vcf.gz
-h | --help
Displays this menu"
exit 1
}
# Read input parameters
while [ "$1" != "" ]; do
case $1 in
-v|--variousvcfs) shift
VARIOUS="$1"
;;
-s|--singlevcf) shift
SINGLE="$1"
;;
-h|--help) usage
;;
-*)
error_msg "unrecognized option: $1"
;;
*) usage
esac
shift
done
echo "Generating hpds index file: "
CHR="ALL"
ANNOTATED="1"
FILE_PATH="$HPSD_PATH/$SINGLE"
EXT="${FILE_PATH##*.}"
GZIP="0"
if [ "$EXT" != "vcf" ] || [ "$EXT" != "VCF" ]; then
echo "The VCF is zipped"
GZIP="1"
fi
bcftools query -l $SINGLE > tmp.tsv
SAMPLES="$(wc -l tmp.tsv | awk '{ print $1 }')"
SAMPLEIDS="$(paste -sd, - < tmp.tsv)"
SEQ="$(seq -s , 1 $SAMPLES)"
OS=$(uname)
# Required to avoid trailing comma
if [ "$OS" = "Darwin" ]; then
echo "Sequence for Mac"
SEQ=${SEQ%%?}
fi
echo -e "filename\tchromosome\tannotated\tgzip\tsample_ids\tpatient_ids\tsample_relationship\trelated_sample_ids\n$FILE_PATH\t$CHR\t$ANNOTATED\t$GZIP\t$SAMPLEIDS\t$SEQ" > vcfIndex.tsv
rm tmp.tsv