-
Notifications
You must be signed in to change notification settings - Fork 4
/
make_group_file.sh
executable file
·416 lines (342 loc) · 16.1 KB
/
make_group_file.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
#!/bin/bash
version="v13 Last modified: 2020.Oct.09"
today=$(date "+%Y.%b.%d")
# Folder with the variant selector script:
scriptDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
regionSelector=${scriptDir}/"Burden_testing.pl"
#missing_cutoff=1 : Missingness threshold, individuals having missingness higher than this threshold will be excluded.
configFile=""
chunksTotal=1
chunkNo=""
#MAF=1.0
# --- print out help message and exit:
function display_help() {
echo "$1"
echo ""
echo "Create group file"
echo "version: ${version}"
echo ""
echo "Usage: $0 <parameters>"
echo ""
echo "Variant filter options:"
echo " -C - config file (reguired, no default)"
echo " -i - input variant list, tab-separated, bgzipped and TABIX indexed (required, no default, each line has to have 5 fields: chr,pos,ID,ref,alt)"
echo " -g - comma separated list of GENCODE features (gene, exon, transcript, CDS or UTR)"
echo " -e - comma separated list of GTEx features (promoter, CTCF, enhancer, promoterFlank, openChrom, TF_bind or allreg)"
echo " -l - comma separated list of overlap features (promoter, CTCF, enhancer, promoterFlank, openChrom, TF_bind or allreg)"
echo " -x - extend genomic regions by this amount (bp) (default: 0)"
echo " -o - include variants with severe consequences only (more severe than missense)"
echo ""
echo "Parameters to set up scores for variants:"
echo " -s - apply weighting; accepted values: CADD, EigenPhred"
echo " -t - the value with which the scores will be shifted (default value: if Eigen score weighting specified: 1, otherwise: 0)"
echo " -k - below the specified cutoff value, the variants will be excluded (default: 0)"
echo ""
echo "Gene list and chunking:"
echo " -L - file with gene IDs (if not specified all genes will be analyzed)."
echo " -d - total number of chunks (default: 1)."
echo " -c - chunk number (default: 1). Takes precedence over SLURM_ARRAY_TASK_ID etc."
echo ""
echo "General options:"
echo " -w - output directory (required, no default)"
echo ""
echo "Other options:"
echo " -h - print this message and exit"
echo ""
echo ""
exit 1
}
# --- Capture command line options --------------------------------------------
if [ $# == 0 ]; then display_help; fi
OPTIND=1
score=""
geneListFile=""
#while getopts ":hL:c:d:bg:m:s:l:e:x:k:t:ofw:jC:V:" optname; do
while getopts ":hL:c:d:bg:s:l:e:x:k:t:ow:C:i:" optname; do
case "$optname" in
# Gene list related parameters:
"L") geneListFile=${OPTARG} ;;
"c") chunkNo=${OPTARG} ;;
"d") chunksTotal=${OPTARG} ;;
# variant filter parameters:
"g") gencode=${OPTARG} ;;
"s") score=${OPTARG} ;;
"l") overlap=${OPTARG} ;;
"e") gtex=${OPTARG} ;;
"x") xtend=${OPTARG} ;;
"k") cutoff=${OPTARG} ;;
"t") scoreshift=${OPTARG} ;;
"o") lof=1 ;;
"C") configFile=${OPTARG} ;;
"i") inputFile=${OPTARG} ;;
# Other parameters:
"w") outputDir=${OPTARG} ;;
"h") display_help ;;
"?") display_help "[Error] Unknown option $OPTARG" ;;
":") display_help "[Error] No argument value for option $OPTARG" ;;
*) display_help "[Error] Unknown error while processing options" ;;
esac
done
if [[ -z "${inputFile}" ]]; then
"[Error] input file not specified!"
exit 1
fi
# -------------------------------- TESTING INPUT FILE FORMAT -------------------------------------
echo "Checking compression integrity"
if ! gzip -t ${inputFile};then
echo "[Error]: GZIP compression integrity failed for $inputFile"
exit 1
fi
echo "Checking format"
zcat $inputFile | awk 'BEGIN{FS="\t";x=0;}{if (NF!=5){print "Wrong number of fields (should be 5) in line" NR;print $0;print "";x=1;} if ($2 !~ /^[1-9][0-9]*$/){print "Coordinate (" $2 ") in line " NR " should be an integer";print "";x=1;} if ($4 !~ /^[ACGT]+$/){print "REF allele (" $4 ")in line " NR " has wrong format";print "";x=1;} if ($5 !~ /^[ACGT]+$/){print "ALT allele (" $5 ")in line " NR " has wrong format";print "";x=1;}}END{exit x;}'
if [[ $? != 0 ]];then
echo "[Error]: $inputFile has wrong format"
exit 1
fi
# ------------------------------------------------------------------------------------------------
if [[ -z "${outputDir}" ]]; then
echo `date "+%Y.%b.%d_%H:%M"` "[Error] Output directory not specified"
exit 1
fi
# full path
outputDir=`readlink -f $outputDir`
outputDir=${outputDir%/}
if [[ -z "${configFile}" ]]; then
echo `date "+%Y.%b.%d_%H:%M"` "[Error] Config file was not specified"
exit 1
fi
if [[ ! -e "${configFile}" ]]; then
echo `date "+%Y.%b.%d_%H:%M"` "[Error] Config file does not exist: $configFile"
exit 1
fi
no_list_warning=""
if [[ -z ${geneListFile} ]];then
gencode_file=$(grep "^gencode_file" ${configFile} | cut -f 2 -d '=')
if [[ ! -e ${gencode_file} ]];then
echo "[Error] GENCODE file (${gencode_file}) specified in the config file (${configFile} does not exist)"
exit 1
fi
totalGenes=$(zcat ${gencode_file} | cut -f 4 | wc -l)
no_list_warning="[Warning] No gene list specified; using all genes from $gencode_file"
else
totalGenes=$(cat ${geneListFile} | wc -l)
fi
if [ $totalGenes -lt $chunksTotal ];then
echo "[Error] Number of chunks ($chunksTotal) is larger than the number of genes in the gene list ($totalGenes) "
exit 1
fi
commandOptions=" --config ${configFile} --smmat ${inputFile} "
# -----------------------------------------------------------------------------------------------------------------------------
# setting chunkNo
chunk_warning=""
if [[ ! -z ${SLURM_ARRAY_TASK_ID} ]];then
if [[ ! -z ${chunkNo} ]];then
chunk_warning="WARNING: both SLURM_ARRAY_TASK_ID and chunkNo ( -c ) are defined; using chunkNo"
else
chunkNo=${SLURM_ARRAY_TASK_ID}
fi
elif [[ ! -z ${LSB_JOBINDEX} ]];then
if [[ ! -z ${chunkNo} ]];then
chunk_warning="WARNING: both LSB_JOBINDEX and chunkNo ( -c ) are defined; using chunkNo"
else
chunkNo=${LSB_JOBINDEX}
fi
else
if [[ -z ${chunkNo} ]];then
chunkNo=1 # default
fi
fi
if [[ ${chunkNo} -gt ${chunksTotal} ]];then
echo "ERROR: current chunk number ($chunkNo) is greater than the total number of chunks ($chunksTotal). EXIT"
exit 1
fi
# -----------------------------------------------------------------------------------------------------------------------------
outprefix="group_file"
# GENCODE features
if [[ ! -z "${gencode}" ]]; then
commandOptions="${commandOptions} --GENCODE ${gencode}"
str=$( echo "${gencode}" | perl -lane '$_ =~ s/^\.//;$_ =~ s/,/_/g; print $_;')
outprefix=${outprefix}"_GENCODE_"${str}
fi
# GTEx - expecting a list of feature names separeted by comma.
if [[ ! -z "$gtex" ]]; then
commandOptions="${commandOptions} --GTEx ${gtex}"
str=$( echo "${gtex}" | perl -lane '$_ =~ s/^\.//;$_ =~ s/,/_/g; print $_;')
outprefix=${outprefix}"_GTEx_"${str}
fi
# Overlap - expecting a list of features separeated by comma.
if [[ ! -z "${overlap}" ]]; then
commandOptions="${commandOptions} --overlap ${overlap}"
str=$( echo "${overlap}" | perl -lane '$_ =~ s/^\.//;$_ =~ s/,/_/g; print $_;')
outprefix=${outprefix}"_overlap_"${str}
fi
# If lof is set, only variants with severe consequences will be selected.
if [[ ! -z "$lof" ]]; then
commandOptions="${commandOptions} --lof "
outprefix=${outprefix}"_severe"
fi
warning1=""
warning2=""
score_tmp=""
# Score - If score is not given we apply no score. Otherwise we test the submitted value:
# Accepted scores:
if [[ ! -z "${score}" ]]; then
score="${score^^}"
case "${score}" in
EIGENPHRED ) score="EigenPhred";;
CADD ) score="CADD";;
* ) score_tmp="noweight";;
esac
else
score="noweight"
fi
if [[ ! -z ${score_tmp} ]];then
warning1="[Warning] Submitted score name ($score) is not recognized! Accepted scores: CADD, EigenPhred."
warning2="[Warning] No scoring will be applied."
score="noweight"
fi
# Only adding score to command line if score is requested:
if [[ "${score}" != "noweight" ]]; then
commandOptions="${commandOptions} --score ${score}";
fi
outprefix=${outprefix}"_score_"${score}
# If Eigen score is applied, we shift the scores by 1, if no other value is specified:
if [[ ("${score}" == "Eigen") && (-z "${scoreshift}" ) ]]; then scoreshift=1; fi
if [[ ! -z "${xtend}" ]]; then
commandOptions="${commandOptions} --extend ${xtend}"
outprefix=${outprefix}"_extend_"${xtend}
fi
# Setting score cutoff, below which the variant will be removed from the test:
if [[ ! -z "${cutoff}" ]]; then
commandOptions="${commandOptions} --cutoff ${cutoff}"
fi
# Setting score shift
if [[ ! -z "${scoreshift}" ]]; then
commandOptions="${commandOptions} --shift ${scoreshift}"
fi
outputDir2=${outputDir}"/"${outprefix}
mkdir -p ${outputDir2}
if [[ ! -d ${outputDir2} ]];then
echo `date "+%Y.%b.%d_%H:%M"` "[Error] Could not create ${outputDir2}"
exit 1
fi
# -----------------------------------------------------------------------------------------------------------------------------
outFile="group_file_gene_set."${chunkNo}
outputDir3=${outputDir2}"/chunk_${chunkNo}"
mkdir -p ${outputDir3}
if [[ ! -d ${outputDir3} ]];then
echo `date "+%Y.%b.%d_%H:%M"` "[Error] Could not create ${outputDir3}"
exit 1
fi
commandOptions="${commandOptions} --output-dir ${outputDir3} --output ${outFile}"
LOGFILE=${outputDir3}/"make_group_file_gene_set.${chunkNo}.log"
#--------------------------------------------------------------------------------------------
if [[ -z ${geneListFile} ]];then
gencode_file=$(grep "^gencode_file" ${configFile} | cut -f 2 -d '=')
geneListFile="${outputDir3}/gencode_gene_list.txt"
zcat ${gencode_file} | cut -f 4 > "${geneListFile}"
fi
if [[ ! -e "${geneListFile}" ]]; then
echo `date "+%Y.%b.%d_%H:%M"` "[Error] Gene list file could not be opened: $geneListFile"
echo `date "+%Y.%b.%d_%H:%M"` "[Info] MAKE GROUP FILE DONE" >> ${LOGFILE}
exit 1
fi
# -----------------------------------------------------------------------------------------------------------------------------
# creating gene list
rem=$(( totalGenes % chunksTotal ))
chunkSize=$(( totalGenes / chunksTotal ))
lastChunkSize=$(( chunkSize + rem ))
inputGeneList=${outputDir3}"/input_gene.list"
if [[ ${chunkNo} -eq ${chunksTotal} ]];then
tail -n ${lastChunkSize} ${geneListFile} > ${inputGeneList}
else
awk -v cn="${chunkNo}" -v cs="${chunkSize}" 'NR > (cn-1)*cs && NR <= cn*cs' ${geneListFile} > ${inputGeneList}
fi
n=$( cat ${inputGeneList} | wc -l)
if [[ $n -eq 0 ]];then
echo "[Error]: Chunk ${chunkNo} is empty" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "[Info] MAKE GROUP FILE DONE" >> ${LOGFILE}
exit 1
fi
# -----------------------------------------------------------------------------------------------------------------------------
if [[ ! -z ${warning1} ]];then
echo `date "+%Y.%b.%d_%H:%M"` ${warning1} >> ${LOGFILE}
fi
if [[ ! -z ${warning2} ]];then
echo `date "+%Y.%b.%d_%H:%M"` ${warning2} >> ${LOGFILE}
fi
if [[ ! -z ${chunk_warning} ]];then
echo `date "+%Y.%b.%d_%H:%M"` ${chunk_warning} >> ${LOGFILE}
fi
if [[ ! -z ${no_list_warning} ]];then
echo `date "+%Y.%b.%d_%H:%M"` ${no_list_warning} >> ${LOGFILE}
fi
# --- Reporting parameters ------------------------------------------------------
echo `date "+%Y.%b.%d_%H:%M"` "##" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "## Version ${version}" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "## Date: ${today}" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "##" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "[Info] General options:" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "Variant selector: ${regionSelector}" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "Output directory: ${outputDir}" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "[Info] Gene list options:" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "Gene list file: ${geneListFile}" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "Number of chunks the gene list is split into: ${chunksTotal}" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "Current chunk: ${chunkNo}" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "Number of genes in one chunk: ${chunkSize}" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "[Info] Variant filtering options:" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "input file: ${inputFile}" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "GENCODE feaures: ${gencode:--}" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "GTEx feaures: ${gtex:--}" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "Overlapping reg.features: ${overlap:-NA}" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "Features are extended by ${xtend:-0}bp" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "[Info] Weighting options:" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "Weighting: ${score}" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "Score cutoff: ${cutoff:-0}" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "Scores shifted by: ${scoreshift:-0}" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "[Info] command line options for the selector script: ${commandOptions}" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` "" >> ${LOGFILE}
# -----------------------------------------------------------------------------------------------------------------------------
selectorLog=${outputDir3}/"selector_chunk_"${chunkNo}.log
echo `date "+%Y.%b.%d_%H:%M"` "Calling ${regionSelector} --input ${inputGeneList} ${commandOptions} --verbose > ${selectorLog} 2 > ${selectorLog}" >> ${LOGFILE}
${regionSelector} --input ${inputGeneList} ${commandOptions} --verbose > ${selectorLog} 2 > ${selectorLog}
echo `date "+%Y.%b.%d_%H:%M"` "[Info] Checking output..." >> ${LOGFILE}
cd ${outputDir3}
# We have to check if both files are generated AND they have enough lines.
gene_notenough=$(cat ${selectorLog} | grep -c NOT_ENOUGH_VAR)
gene_toomany=$(cat ${selectorLog} | grep -c TOO_MANY_VAR)
gene_noremain=$(cat ${selectorLog} | grep -c NO_VAR_REMAIN)
gene_absent=$(cat ${selectorLog} | grep -c NO_GENE)
region_absent=$(cat ${selectorLog} | grep -c NO_REGION)
echo `date "+%Y.%b.%d_%H:%M"` -e "[Info] WARNING/ERROR REPORTING FROM VARIANT SELECTOR" >> ${LOGFILE}
echo `date "+%Y.%b.%d_%H:%M"` -e "[Info] =====================================" >> ${LOGFILE}
if [[ "$gene_notenough" -ne 0 ]]; then
echo `date "+%Y.%b.%d_%H:%M"` -e "[Warning] Not enough variants [NOT_ENOUGH_VAR]: $(cat ${selectorLog} | grep NOT_ENOUGH_VAR | sed 's/.*Gene.//;s/ .*//' | tr '\n' ' ')" >> ${LOGFILE}
fi
if [[ "$gene_toomany" -ne 0 ]]; then
echo `date "+%Y.%b.%d_%H:%M"` -e "[Warning] Too many variants [TOO_MANY_VAR]: $(cat ${selectorLog} | grep TOO_MANY_VAR | sed 's/.*Gene.//;s/ .*//'| tr '\n' ' ')" >> ${LOGFILE}
fi
if [[ "$gene_noremain" -ne 0 ]]; then
echo `date "+%Y.%b.%d_%H:%M"` -e "[Warning] No variants after scoring [NO_VAR_REMAIN] for genes: $(cat ${selectorLog} | grep NO_VAR_REMAIN | sed 's/.*Gene.//;s/ .*//'| tr '\n' ' ')" >> ${LOGFILE}
fi
if [[ "$gene_absent" -ne 0 ]]; then
echo `date "+%Y.%b.%d_%H:%M"` -e "[Warning] Gene name unknown [NO_GENE]: $(cat ${selectorLog} | grep NO_GENE | sed 's/.*Gene.//;s/ .*//'| tr '\n' ' ')" >> ${LOGFILE}
fi
if [[ "$region_absent" -ne 0 ]]; then
echo `date "+%Y.%b.%d_%H:%M"` -e "[Warning] No region in gene [NO_REGION]: $(cat ${selectorLog} | grep NO_REGION | sed 's/.*Gene.//;s/ .*//'| tr '\n' ' ')" >> ${LOGFILE}
fi
# true output group file name, as the --output to the Burden_testing.pl specifies output prefix only
outFile=${outputDir3}/${outFile}".txt"
if [[ ! -e ${outFile} ]]; then
echo `date "+%Y.%b.%d_%H:%M"` "[Error] Gene set ${chunkNo} has failed. No group file has been generated" >> ${LOGFILE}
# exit 1
elif [[ $(cat ${outFile} | wc -l ) -lt 1 ]]; then
echo `date "+%Y.%b.%d_%H:%M"` "[Error] Gene set ${chunkNo} has failed, group file is empty" >> ${LOGFILE}
# exit 1
fi
echo `date "+%Y.%b.%d_%H:%M"` "[Info] MAKE GROUP FILE DONE" >> ${LOGFILE}