-
Notifications
You must be signed in to change notification settings - Fork 133
/
scoreSets.sh
executable file
·336 lines (274 loc) · 14 KB
/
scoreSets.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
#!/bin/bash
# Copyright xmuspeech (Author:Snowdar 2018-11-07)
set -e
#####################
# Every processing stage below is tunable, and the boolean variables let you switch off
# the stages you do not need.
#
# All datasets must live under the same data directory and the same vector directory,
# and must use the same set name in both, for example
#
# data/plp_20_5.0/enroll -> exp/xvector/tdnn6/enroll
# data/plp_20_5.0/test -> exp/xvector/tdnn6/test
#
# If your layout differs, adapt it yourself before running.
# This script performs only limited sanity checks, so use it carefully.
#
# The assignments below are defaults; subtools/parse_options.sh is sourced after them
# (presumably so they can be overridden from the command line -- confirm in that file).
#####################
eval=false # if true, only the score files are generated and listed; no metric is computed
prefix=plp_20_5.0 # datasets are looked up in data/$prefix/$someset; if empty this becomes data/$someset
extra_name= # passed to get_params_for_score; it could be used to mark a trainset in the score name
trainset= # only used to build the default data configs; if empty, it is set to $enrollset
enrollset=train # should be one set only
testset=dev # should be one set only
trials= # if empty, $vectordir/$testset/trials is used and generated by get_trials (original note: from spk2utt of enrollset and utt2spk of testset, with one speaker as one class)
vectordir=exp/base_xv/tdnn6
vectortype= # xvector, ivector or any other type defined by yourself. If empty, it is found automatically, but only xvector and ivector are supported
# A process is a "-"-joined chain that may consist of "lda" "whiten" "norm" "mean" and "submean" for now.
enroll_process="mean-lda-submean-whiten-norm" # this process may contain "mean" for multi-utterance enrollment.
test_process="lda-submean-whiten-norm" # this process should not contain "mean" because every utterance of the testset must be kept.
#####################
# Some stages need a resource (such as a transform matrix) that is estimated from a dataset, and that
# dataset is itself processed before the resource is generated. It is suggested to keep this
# preprocessing consistent with the part of test_process that precedes the stage consuming the
# resource, for example:
#
#   test_process               lda - submean - whiten - norm
#                               ^       ^         ^
#                               |       |         |
#   lda_process  [norm -] trainlda      |         |
#                                       |         |
#   submean_process       lda - getmean           |
#                                                 |
#   whiten_process        lda - submean - trainwhiten
#
# There is one exception: adding an extra 'norm' before 'trainlda' in lda_process (and likewise before
# 'trainplda' and 'trainaplda') could be better, no matter whether test_process contains 'norm' before
# 'lda' (actually, adding 'norm' only at the end of test_process could perform better).
# The same may also work for 'submean' and 'whiten'.
#
# A data config has the form "SOURCE[TARGET ...]": the stage's process is recorded in the config of
# SOURCE, and the config of every TARGET is pointed at the config of SOURCE.
#####################
default_config=false # if true, the data configs of lda, submean and whiten are all set to "$trainset[$trainset $enrollset $testset]"
lda=false # if not "true", lda_data_config is cleared and no lda entries are written (original note: the lda process is forcibly ignored)
clda=10 # presumably the LDA output dimension; not consumed in this file -- confirm in subtools/score/process.sh
lda_process="norm-trainlda"
lda_data_config="train[train dev]" # if empty, it is set to "$trainset[$trainset $enrollset $testset]"
submean=false # if not "true", submean_data_config is cleared and no submean entries are written
submean_process="lda-getmean" # getmean means computing the global mean vector from a dataset
submean_data_config="train[train dev]" # if empty, it is set to "$trainset[$trainset $enrollset $testset]"
whiten=false # if not "true", whiten_data_config is cleared and no whiten entries are written (original note: the whiten process is forcibly ignored)
whiten_process="lda-submean-trainwhiten" # trainwhiten means training a ZCA whitening matrix and trainpcawhiten means a PCA one
whiten_data_config="train[train dev]" # if empty, it is set to "$trainset[$trainset $enrollset $testset]"
#####################
score="cosine" # cosine | plda | aplda | svm | gmm | lr ; several can be joined with "-" and are run one after another
metric="eer" # eer | Cavg ; several can be joined with "-"; cleared when eval=true
#####################
# SVM #
curve=rbf
Cvalue=0.1
## the svm trainset is just the enrollset, but it has its own process
svm_process="lda-submean-whiten-norm" # this process should not contain "mean", which would leave too little training data per speaker (only one vector).
# GMM #
nj=20
mmi=true # for mmi
init_mmi=true # for mmi
tau=400 # for mmi
weight_tau=10 # for mmi
smooth_tau=0 # for mmi
E=2 # for mmi
cnum=64 # number of Gaussians
num_iters_init=20
num_iters=4 # for every GMM
num_gselect=30 # Number of Gaussian-selection indices to use while training the model.
num_frames=500000 # for initialization
num_frames_den=500000 # for mmi
min_gaussian_weight=0.0001
adapt=false # if true and mmi=false, use adapt-gmm
## the gmm trainset is just the enrollset, but it has its own process
gmm_process="norm" # this process should not contain "mean", which would leave too little training data per speaker (only one vector).
# Logistic Regression #
max_steps=20
mix_up=0
apply_log=true
scale= # for example: "[ 0.6 0.4]" for two classes
lr_process="lda-submean-whiten-norm" # this process should not contain "mean", which would leave too little training data per speaker (only one vector).
# plda #
plda_smoothing=0.0 # used by plda or adapt-plda
plda_trainset="train" # should be one set only; cleared unless score contains "plda" (which "aplda" does too)
plda_process="lda-submean-whiten-norm-trainplda"
# adapt-plda #
aplda_smoothing=0.0
within_covar_scale=0.70
between_covar_scale=0.30
mean_diff_scale=1
aplda_trainset="train" # should be one set only; cleared unless score contains "aplda"
aplda_process="lda-submean-whiten-norm-trainaplda"
#####################
# whether to clear previously generated files #
force_clear= # "true" or "false" overrides all three flags below; if empty, they keep their individual values
process_force_clear=true
trials_force_clear=false
score_force_clear=true
# load option parsing and the scoring helpers #
################################################################################
source subtools/parse_options.sh
source subtools/score/process.sh
source subtools/score/score.sh
source subtools/path.sh
# validate every process chain, the score and the metric #
################################################################################
# check VALUE "ALLOWED WORDS" NAME comes from the sourced helpers; options that
# share the same allowed list are validated together, in the original order.
for _opt_name in enroll_process test_process;do
check "${!_opt_name}" "lda submean whiten norm mean" $_opt_name
done
check "$lda_process" "submean whiten norm trainlda" lda_process
check "$submean_process" "mean lda whiten norm getmean" submean_process
check "$whiten_process" "lda norm submean trainwhiten trainpcawhiten" whiten_process
for _opt_name in svm_process gmm_process lr_process;do
check "${!_opt_name}" "lda submean whiten norm" $_opt_name
done
check "$plda_process" "lda submean whiten norm trainplda" plda_process
check "$aplda_process" "lda submean whiten norm trainaplda" aplda_process
check "$score" "cosine svm plda aplda gmm lr" score
check "$metric" "eer Cavg" metric
# Auto-detect the vector type only when the user did not choose one, so that an
# explicitly given vectortype is no longer overwritten just because an
# xvector.scp or ivector.scp also exists in the enroll directory.
# Both candidates are probed in order, so ivector still wins when both exist.
if [ -z "$vectortype" ];then
for _candidate in xvector ivector;do
if [ -f "$vectordir/$enrollset/$_candidate.scp" ];then
vectortype=$_candidate
echo -e "$0: [Auto find] Your vectortype is $_candidate\n"
fi
done
fi
# Nothing given and nothing found: the type cannot be guessed.
if [ -z "$vectortype" ];then
echo "Don't find xvector or ivector type in $vectordir/$enrollset and please specify your own vectortype"
exit 1
fi
# trainset is only a convenience for the default data configs; fall back to the enrollset.
[ -n "$trainset" ] || trainset=$enrollset
# Resolve the data config of each resource stage (lda, submean, whiten).
# The fallback is the trainset as source, applied to trainset, enrollset and testset.
fallback_config="$trainset[$trainset $enrollset $testset]"
if [ "$default_config" != "true" ];then
# Only fill in the configs the user left empty.
if [ -z "$lda_data_config" ];then
lda_data_config=$fallback_config
echo "[Notice] It will set the default config $fallback_config for lda, if used."
fi
if [ -z "$submean_data_config" ];then
submean_data_config=$fallback_config
echo "[Notice] It will set the default config $fallback_config for submean, if used."
fi
if [ -z "$whiten_data_config" ];then
whiten_data_config=$fallback_config
echo "[Notice] It will set the default config $fallback_config for whiten, if used."
fi
else
# default_config overrides whatever was configured for all three stages.
echo "[Notice] It will set the default config $fallback_config for lda, submean and whiten, if used."
lda_data_config=$fallback_config
submean_data_config=$fallback_config
whiten_data_config=$fallback_config
fi
# A disabled stage references no dataset at all.
[ "$lda" == "true" ] || lda_data_config=""
[ "$submean" == "true" ] || submean_data_config=""
[ "$whiten" == "true" ] || whiten_data_config=""
# The plda trainset is kept for plda and for aplda (every score containing "aplda"
# also contains "plda"); the aplda trainset is kept for aplda only.
[[ "$score" == *"plda"* ]] || plda_trainset=""
[[ "$score" == *"aplda"* ]] || aplda_trainset=""
# flatten_data_config CONFIG
# Print CONFIG (for example "train[train dev]") with both brackets replaced by
# spaces, i.e. as a plain list of dataset names. The value is never expanded
# unquoted, so a config such as "train[dev]" cannot be mistaken for a glob
# pattern and matched against files in the current directory.
flatten_data_config() {
local flat=${1//\[/ }
printf '%s\n' "${flat//\]/ }"
}
# Every dataset referenced by the enroll/test sets, the (a)plda trainsets and the
# data configs of the enabled resource stages.
allsets="$enrollset $testset $plda_trainset $aplda_trainset \
$(flatten_data_config "$lda_data_config") \
$(flatten_data_config "$submean_data_config") \
$(flatten_data_config "$whiten_data_config")"
# Check each distinct dataset once and (re)create its base config file.
for set in $(printf '%s\n' $allsets | sort -u);do
if [ ! -d "data/$prefix/$set" ];then
echo "[exit] No such dir data/$prefix/$set"
exit 1
fi
if [ ! -d "$vectordir/$set" ];then
echo "[exit] No such dir $vectordir/$set"
exit 1
fi
errorNum=0
logNum=0
if [ -d "$vectordir/$set/log" ];then
logNum=$(find "$vectordir/$set/log/" -name "extract.*.log" | wc -l)
fi
if [[ "$logNum" -gt 0 ]];then
# "grep | wc -l" is kept on purpose: it yields one total over all log files and the
# pipeline still succeeds under "set -e" when grep finds no ERROR line.
errorNum=$(grep ERROR "$vectordir/$set/log"/extract.*.log | wc -l)
fi
if [[ "$errorNum" -gt 0 ]];then
echo "There are some ERRORS in $vectordir/$set/log/extract.*.log and it means you lose many vectors which is so bad thing and I suggest you to extract vectors of this dataset again."
exit 1
fi
# The config is overwritten here; later steps append their own entries to it.
printf 'name %s\ndata %s\ndir %s\ninput %s\n' \
"$set" "data/$prefix/$set" "$vectordir/$set" "$vectortype.scp" > "$vectordir/$set/config"
done
# _link_stage_conf VDIR STAGE RECIPE DATA_CONFIG
#
# Record a resource stage (lda, submean or whiten) in the per-dataset config files.
# DATA_CONFIG is one or more groups of the form "SOURCE[TARGET ...]". For each group:
#   * "<STAGE>_process RECIPE" is appended to VDIR/SOURCE/config;
#   * "<STAGE>_data_conf VDIR/SOURCE/config" is appended to VDIR/TARGET/config for
#     every TARGET that no earlier group has already claimed (first group wins).
# Returns the exit status of awk, so a missing dataset directory is a failure.
_link_stage_conf() {
local vdir=$1 stage=$2 recipe=$3 data_config=$4
# One group per line: "]" ends a group, "[" separates SOURCE from its TARGETs.
printf '%s\n' "$data_config" | tr ']' '\n' | tr '[' ' ' | \
awk -v vdir="$vdir" -v stage="$stage" -v recipe="$recipe" '
BEGIN { proc_key = stage "_process"; conf_key = stage "_data_conf" }
NF == 0 { next }   # blank remainder after the last "]"
{
src_conf = vdir "/" $1 "/config"
print proc_key, recipe >> src_conf
for (i = 2; i <= NF; i++) {
if (!($i in seen)) {
seen[$i] = 1
# Parenthesized: an unparenthesized concatenation after ">>" is not portable awk.
print conf_key, src_conf >> (vdir "/" $i "/config")
}
}
}'
}
# Only the enabled stages are written to the dataset configs.
if [ "$lda" == "true" ];then
_link_stage_conf "$vectordir" lda "$lda_process" "$lda_data_config"
fi
if [ "$submean" == "true" ];then
_link_stage_conf "$vectordir" submean "$submean_process" "$submean_data_config"
fi
if [ "$whiten" == "true" ];then
_link_stage_conf "$vectordir" whiten "$whiten_process" "$whiten_data_config"
fi
# Connect the score back-ends to the enrollset (cosine and svm do not need this;
# svm is handled directly in this top-level script instead of through a connection).
# writeconf KEY VALUE CONF comes from the sourced helpers; presumably it records
# KEY -> VALUE in the config file CONF (confirm in subtools/score/process.sh).
# The call order is kept as in the original, since several calls may hit the same file.
if [ -n "$plda_trainset" ];then
writeconf plda_process "$plda_process" "$vectordir/$plda_trainset/config"
fi
if [ -n "$aplda_trainset" ];then
writeconf aplda_process "$aplda_process" "$vectordir/$aplda_trainset/config"
fi
if [ -n "$plda_trainset" ];then
writeconf plda_data_conf "$vectordir/$plda_trainset/config" "$vectordir/$enrollset/config"
# The aplda trainset is only pointed at the plda model when aplda is in use.
# With plain plda, aplda_trainset is empty and the target would degenerate
# to "$vectordir//config", a stray file outside any dataset directory.
if [ -n "$aplda_trainset" ];then
writeconf plda_data_conf "$vectordir/$plda_trainset/config" "$vectordir/$aplda_trainset/config"
fi
fi
if [ -n "$aplda_trainset" ];then
writeconf aplda_data_conf "$vectordir/$aplda_trainset/config" "$vectordir/$enrollset/config"
fi
# force_clear, when set to "true" or "false", overrides the three individual flags:
# "false" speeds things up by reusing existing files, "true" clears files that may
# contain errors. Any other value leaves the individual flags untouched.
case "$force_clear" in
true|false)
process_force_clear=$force_clear
trials_force_clear=$force_clear
score_force_clear=$force_clear
;;
esac
# prepare trials and scratch state for scoring #
################################################################################
enroll_conf=$vectordir/$enrollset/config
test_conf=$vectordir/$testset/config
if [ "$trials" != "" ];then
# User-supplied trials: only record their path in the test config.
writeconf "trials" $trials $test_conf
else
# No trials given: use the default location inside the testset directory and
# (re)generate the file when it is missing or a refresh is forced.
trials=$vectordir/$testset/trials
writeconf "trials" $trials $test_conf
if [[ ! -f $trials || $trials_force_clear == "true" ]];then
get_trials $enroll_conf $test_conf
fi
fi
# A global scratch file which is used to avoid re-computation; start it empty.
list="$vectordir/$testset/list.tmp"
: > $list
# Works around a bug that shows up when this script is used in a bad way.
if [ "$process_force_clear" == "true" ];then
rm -f $vectordir/$enrollset/num_utts.ark
fi
# In eval mode no metric is computed at all.
if [ "$eval" == "true" ];then
metric=""
fi
outsets=""
outscores=""
# Run every requested back-end ("-"-separated in $score) and, for each one, every
# requested metric ("-"-separated in $metric). Global variable names are kept
# unchanged because the sourced helper functions may read them.
for the_classfier in ${score//-/ };do
echo "[ $the_classfier ]"
# svm, gmm and lr process the enrollset with their own chain; every other
# back-end uses the generic enroll chain.
case "$the_classfier" in
svm) enroll_chain=$svm_process ;;
gmm) enroll_chain=$gmm_process ;;
lr) enroll_chain=$lr_process ;;
*) enroll_chain=$enroll_process ;;
esac
enroll_file=$(process $enroll_conf $enroll_chain)
writeconf final $enroll_file $enroll_conf
test_file=$(process $test_conf $test_process)
writeconf final $test_file $test_conf
# The first field of the helper output is the score name; the rest are the
# arguments handed to the back-end function.
tmp=$(get_params_for_score $the_classfier $enroll_conf $test_conf $extra_name)
outname=$(awk '{print $1}' <<< "$tmp")
params=$(awk '{$1="";print $0}' <<< "$tmp")
# Score unless a score file already exists and clearing is not forced.
if [[ ! -f "${outname}.score" || "$score_force_clear" == "true" ]];then
$the_classfier $params
fi
outscores="$outscores ${outname}.score"
for the_metric in ${metric//-/ };do
# A metric file is only reported when its computation succeeded.
case "$the_metric" in
eer)
subtools/computeEER.sh --write-file ${outname}.eer $trials ${outname}.score && \
outsets="$outsets ${outname}.eer"
;;
Cavg)
subtools/computeCavg.py -pairs $trials ${outname}.score > ${outname}.Cavg && \
cat ${outname}.Cavg && outsets="$outsets ${outname}.Cavg"
;;
esac
done
done
# The scratch list is no longer needed once all back-ends have run.
rm -f $list
# report the results #
################################################################################
if [ "$eval" != "false" ];then
# Eval mode: no metric was computed, so just list the score files.
echo -e "\n[ Eval Mod ]-[ $testset ]"
for score_file in $outscores;do
echo "[ $score_file ]"
done
else
# Print the content of every metric file on one line, followed by its path.
echo -e "\n[ $testset ]"
for metric_file in $outsets;do
echo -e $(cat $metric_file)"\t$metric_file"
done
fi
echo -e "\n"
#### done ####