#!/bin/bash
# This script is adapted from the Kaldi Switchboard (swbd) run.sh recipe (https://github.com/kaldi-asr/kaldi)
# Copyright 2019 Kaldi developers (see: https://github.com/kaldi-asr/kaldi/blob/master/COPYING)
# Helper: convert a file from ISO-8859-1 to UTF-8 in place
utf8()
{
iconv -f ISO-8859-1 -t UTF-8 "$1" > "$1".tmp
mv "$1".tmp "$1"
}
export LC_ALL=C
export LANG=C
export LANGUAGE=C
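# Kaldi's data preparation and validation scripts assume C-locale (byte-wise)
# sort order, so the locale is pinned to C for everything that follows.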
# Symlink shared scripts from the Kaldi WSJ recipe (steps, utils) and the RNNLM scripts
[ ! -L "steps" ] && ln -s ../../wsj/steps
[ ! -L "utils" ] && ln -s ../../wsj/utils
[ ! -L "rnnlm" ] && ln -s ../../../scripts/rnnlm/
# Load command configuration, path settings, and option parsing
. cmd.sh
. path.sh
. utils/parse_options.sh
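# cmd.sh and path.sh are assumed to define $train_cmd, $decode_cmd, $nJobs,
# $nDecodeJobs and $sequitur_g2p and to put the Kaldi binaries and utils/ on
# PATH; utils/parse_options.sh allows the variables below to be overridden
# with --variable-name value on the command line.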
# Define constants
mfccdir=mfcc
dict_suffix=_300k4
FILTERBYNAME="*.xml"
lang_dir=data/lang${dict_suffix}
g2p_dir=data/local/g2p${dict_suffix}
g2p_model=${g2p_dir}/de_g2p_model
final_g2p_model=${g2p_model}-6
lm_dir=data/local/lm${dict_suffix}
arpa_lm=${lm_dir}/4gram-mincount/lm_pr10.0.gz
format_lang_out_dir=${lang_dir}_test
dict_dir=data/local/dict${dict_suffix}
local_lang_dir=data/local/lang${dict_suffix}
lang_dir_nosp=${lang_dir}_nosp${dict_suffix}
extra_words_file=local/filtered_300k_vocab_de_wiki.txt
RAWDATA=data/wav/german-speechdata-package-v2
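# The ${dict_suffix} "_300k4" presumably refers to the ~300k-word extra
# vocabulary (local/filtered_300k_vocab_de_wiki.txt) and the pruned 4-gram
# language model built below.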
# Create Kaldi's directory structure
python3 local/prepare_dir_structure.py
# Download and unpack the speech corpora
wget --directory-prefix=data/wav/ http://speech.tools/kaldi_tuda_de/german-speechdata-package-v2.tar.gz
cd data/wav/
tar xvfz german-speechdata-package-v2.tar.gz
cd ../../
mkdir -p data/wav/swc/
wget --directory-prefix=data/wav/swc/ https://www2.informatik.uni-hamburg.de/nats/pub/SWC/SWC_German.tar
cd data/wav/swc/
tar xvf SWC_German.tar
cd ../../../
wget --directory-prefix=data/ http://speech.tools/kaldi_tuda_de/swc_train_v2.tar.gz
cd data/
tar xvfz swc_train_v2.tar.gz
cd ../
mkdir -p data/wav/m_ailabs/
wget --directory-prefix=data/wav/m_ailabs/ http://speech.tools/kaldi_tuda_de/m-ailabs.bayern.de_DE.tgz
cd data/wav/m_ailabs/
tar xvfz m-ailabs.bayern.de_DE.tgz
cd ../../../
python3 local/prepare_m-ailabs_data.py
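# Three German corpora are now unpacked under data/wav/: the TUDA-De
# read-speech corpus (german-speechdata-package-v2), the German Spoken
# Wikipedia Corpus (SWC) and the M-AILABS de_DE audiobook data.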
# Move files that would later cause errors; they are kept in a backup location
python3 local/move_files_to_skip.py data/wav/german-speechdata-package-v2/train/
find $RAWDATA/*/$FILTERBYNAME -type f > data/waveIDs.txt
# Prepare data directories in Kaldi format
python3 local/data_prepare.py -f data/waveIDs.txt --separate-mic-dirs
local/get_utt2dur.sh data/tuda_train
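# Each resulting Kaldi data directory (e.g. data/tuda_train) holds the
# standard files wav.scp, text, utt2spk, spk2utt and optionally segments and
# utt2dur; illustrative entries only:
#   wav.scp:  utt0001 /path/to/utt0001.wav
#   text:     utt0001 guten morgen
#   utt2spk:  utt0001 spk01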
# Phoneme dictionary
wget --directory-prefix=data/lexicon/ https://raw.githubusercontent.com/marytts/marytts-lexicon-de/master/modules/de/lexicon/de.txt
echo "data/lexicon/de.txt" >> data/lexicon_ids.txt
# Build a combined lexicon and export it in Kaldi's lexiconp format
mkdir -p ${dict_dir}/
python3 local/build_big_lexicon.py -f data/lexicon_ids.txt -e data/local/combined.dict --export-dir ${dict_dir}/
python3 local/export_lexicon.py -f data/local/combined.dict -o ${dict_dir}/_lexiconp.txt
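# lexiconp.txt follows Kaldi's "<word> <pron-prob> <phone1> <phone2> ..."
# format, e.g. (illustrative entry): HALLO 1.0 h a l o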
# Sequitur Grapheme-to-Phoneme (G2P)
mkdir -p ${g2p_dir}/
train_file=${g2p_dir}/lexicon.txt
cut -d" " -f 1,3- ${dict_dir}/_lexiconp.txt > $train_file
cut -d" " -f 1 ${dict_dir}/_lexiconp.txt > ${g2p_dir}/lexicon_wordlist.txt
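# Sequitur G2P is trained incrementally: each --ramp-up pass starts from the
# previous model and increases the model order (wider graphone context), so
# ${g2p_model}-6 is the highest-order model and is used for OOV words below.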
$sequitur_g2p -e utf8 --train $train_file --devel 3% --write-model ${g2p_model}-1
$sequitur_g2p -e utf8 --model ${g2p_model}-1 --ramp-up --train $train_file --devel 3% --write-model ${g2p_model}-2
$sequitur_g2p -e utf8 --model ${g2p_model}-2 --ramp-up --train $train_file --devel 3% --write-model ${g2p_model}-3
$sequitur_g2p -e utf8 --model ${g2p_model}-3 --ramp-up --train $train_file --devel 3% --write-model ${g2p_model}-4
$sequitur_g2p -e utf8 --model ${g2p_model}-4 --ramp-up --train $train_file --devel 3% --write-model ${g2p_model}-5
$sequitur_g2p -e utf8 --model ${g2p_model}-5 --ramp-up --train $train_file --devel 3% --write-model ${g2p_model}-6
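# Collect all training transcripts plus the extra Wikipedia vocabulary, find
# the words missing from the lexicon (OOVs), generate pronunciations for them
# with the final G2P model, assign them a pronunciation probability of 1.0 and
# merge them into the final lexiconp.txt.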
cp data/tuda_train/text ${g2p_dir}/complete_text
cat data/swc_train/text >> ${g2p_dir}/complete_text
cat data/m_ailabs_train/text >> ${g2p_dir}/complete_text
gawk '{ printf("extra-word-%i %s\n", NR, $1) }' $extra_words_file | cat ${g2p_dir}/complete_text - > ${g2p_dir}/complete_text_new
mv ${g2p_dir}/complete_text_new ${g2p_dir}/complete_text
python3 local/find_oov.py -c ${g2p_dir}/complete_text -w ${g2p_dir}/lexicon_wordlist.txt -o ${g2p_dir}/oov.txt
$sequitur_g2p -e utf8 --model $final_g2p_model --apply ${g2p_dir}/oov.txt > ${dict_dir}/oov_lexicon.txt
gawk '{$1=$1" 1.0"; print}' ${dict_dir}/oov_lexicon.txt > ${dict_dir}/_oov_lexiconp.txt
gawk 'NF>=3' ${dict_dir}/_oov_lexiconp.txt > ${dict_dir}/oov_lexiconp.txt
sort -u ${dict_dir}/_lexiconp.txt ${dict_dir}/oov_lexiconp.txt > ${dict_dir}/lexiconp.txt
rm ${dict_dir}/lexicon.txt
unixtime=$(date +%s)
mkdir -p ${lang_dir}/old_$unixtime/
mv ${lang_dir}/* ${lang_dir}/old_$unixtime/
utils/prepare_lang.sh ${dict_dir} "<UNK>" ${local_lang_dir} ${lang_dir}
# Language Modelling
mkdir -p ${lm_dir}/
wget --directory-prefix=${lm_dir}/ http://speech.tools/kaldi_tuda_de/German_sentences_8mil_filtered_maryfied.txt.gz
mv ${lm_dir}/German_sentences_8mil_filtered_maryfied.txt.gz ${lm_dir}/cleaned.gz
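# build_lm.sh trains a count-based 4-gram language model on the filtered
# sentence dump; the name lm_pr10.0.gz suggests entropy pruning with threshold
# 10.0. format_data.sh is assumed to convert the ARPA LM into G.fst and to
# assemble the decoding lang directory ${format_lang_out_dir}.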
local/build_lm.sh --srcdir ${local_lang_dir} --dir ${lm_dir}
local/format_data.sh --arpa_lm $arpa_lm --lang_in_dir $lang_dir --lang_out_dir $format_lang_out_dir
# MFCC feature extraction
rm data/swc_train/spk2utt
sort data/swc_train/segments > data/swc_train/segments_sorted
sort data/swc_train/text | gawk 'NF>=2' > data/swc_train/text_sorted
sort data/swc_train/utt2spk > data/swc_train/utt2spk_sorted
sort data/swc_train/wav.scp > data/swc_train/wav.scp_sorted
mv data/swc_train/wav.scp_sorted data/swc_train/wav.scp
mv data/swc_train/utt2spk_sorted data/swc_train/utt2spk
mv data/swc_train/text_sorted data/swc_train/text
mv data/swc_train/segments_sorted data/swc_train/segments
utils/utt2spk_to_spk2utt.pl data/swc_train/utt2spk > data/swc_train/spk2utt
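# For each data set: fix_data_dir.sh drops entries that are inconsistent
# across the data files, make_mfcc.sh writes MFCC features to $mfccdir (and
# feats.scp into the data directory), and compute_cmvn_stats.sh adds
# per-speaker cepstral mean/variance normalization statistics.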
for x in swc_train tuda_train dev test; do
utils/fix_data_dir.sh data/$x # drop inconsistent entries before feature extraction
steps/make_mfcc.sh --cmd "$train_cmd" --nj $nJobs data/$x exp/make_mfcc/$x $mfccdir
utils/fix_data_dir.sh data/$x # some utterances may fail MFCC extraction and are removed
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
utils/fix_data_dir.sh data/$x
done
utils/combine_data.sh data/train data/tuda_train data/swc_train
mv data/train data/train_without_mailabs
x=m_ailabs_train
utils/fix_data_dir.sh data/$x # drop inconsistent entries before feature extraction
steps/make_mfcc.sh --cmd "$train_cmd" --nj $nJobs data/$x exp/make_mfcc/$x $mfccdir
utils/fix_data_dir.sh data/$x # some utterances may fail MFCC extraction and are removed
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
utils/fix_data_dir.sh data/$x
utils/combine_data.sh data/train data/train_without_mailabs data/m_ailabs_train
# Acoustic Modelling
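# Split the combined training data: the first 4000 utterances become a
# held-out train_dev set, the shortest utterances bootstrap the early GMM
# stages, and remove_dup_utts.sh keeps at most 1000 utterances per identical
# transcript so repeated prompts do not dominate training. The "_nosp" lang
# directory is the variant without pronunciation/silence probabilities, used
# until those are estimated after tri3.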
utils/subset_data_dir.sh --first data/train 4000 data/train_dev # first 4000 utterances as a held-out dev set
if [ -f data/train/segments ]; then
n=$(($(wc -l < data/train/segments) - 4000))
else
n=$(($(wc -l < data/train/wav.scp) - 4000))
fi
utils/subset_data_dir.sh --last data/train $n data/train_nodev
utils/subset_data_dir.sh --shortest data/train_nodev 150000 data/train_100kshort
utils/subset_data_dir.sh data/train_100kshort 50000 data/train_30kshort
utils/subset_data_dir.sh --first data/train_nodev 100000 data/train_100k
utils/data/remove_dup_utts.sh 1000 data/train_100k data/train_100k_nodup
utils/data/remove_dup_utts.sh 1000 data/train_nodev data/train_nodup
cp -R ${lang_dir} ${lang_dir_nosp}
# Monophone training
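# Flat-start monophone training uses the 30k shortest utterances, where the
# initial uniform alignment assumption is least harmful.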
steps/train_mono.sh --nj $nJobs --cmd "$train_cmd" \
data/train_30kshort ${lang_dir_nosp} exp/mono
# Triphone-1 training (delta features)
steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \
data/train_100k_nodup ${lang_dir_nosp} exp/mono exp/mono_ali
steps/train_deltas.sh --cmd "$train_cmd" \
3200 30000 data/train_100k_nodup ${lang_dir_nosp} exp/mono_ali exp/tri1
graph_dir=exp/tri1/graph_nosp
$train_cmd $graph_dir/mkgraph.log \
utils/mkgraph.sh ${lang_dir}_test exp/tri1 $graph_dir
for dset in dev test; do
steps/decode_si.sh --nj $nDecodeJobs --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/${dset} exp/tri1/decode_${dset}_nosp
done
# Triphone-2 training (delta features)
steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \
data/train_100k_nodup ${lang_dir_nosp} exp/tri1 exp/tri1_ali
steps/train_deltas.sh --cmd "$train_cmd" \
4000 70000 data/train_100k_nodup ${lang_dir_nosp} exp/tri1_ali exp/tri2
graph_dir=exp/tri2/graph_nosp
$train_cmd $graph_dir/mkgraph.log \
utils/mkgraph.sh ${lang_dir}_test exp/tri2 $graph_dir
for dset in dev test; do
steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/${dset} exp/tri2/decode_${dset}_nosp
done
# Triphone-3 training (LDA+MLLT)
steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \
data/train_100k_nodup ${lang_dir_nosp} exp/tri2 exp/tri2_ali_100k_nodup
steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \
data/train_nodup ${lang_dir_nosp} exp/tri2 exp/tri2_ali_nodup
steps/train_lda_mllt.sh --cmd "$train_cmd" \
6000 140000 data/train_nodup ${lang_dir_nosp} exp/tri2_ali_nodup exp/tri3
graph_dir=exp/tri3/graph_nosp
$train_cmd $graph_dir/mkgraph.log \
utils/mkgraph.sh ${lang_dir}_test exp/tri3 $graph_dir
for dset in dev test; do
steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/${dset} exp/tri3/decode_${dset}_nosp
done
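# Estimate pronunciation and silence probabilities from the tri3 alignments
# (get_prons.sh), fold them into the dictionary (dict_dir_add_pronprobs.sh),
# rebuild the lang and graph directories ("_pron" variants) and re-decode
# dev and test with them.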
steps/get_prons.sh --cmd "$train_cmd" data/train_nodup ${lang_dir_nosp} exp/tri3
utils/dict_dir_add_pronprobs.sh --max-normalize true \
${dict_dir} exp/tri3/pron_counts_nowb.txt exp/tri3/sil_counts_nowb.txt \
exp/tri3/pron_bigram_counts_nowb.txt ${dict_dir}_pron
utils/prepare_lang.sh ${dict_dir}_pron "<UNK>" ${local_lang_dir} ${lang_dir}
./local/format_data.sh --arpa_lm $arpa_lm --lang_in_dir $lang_dir --lang_out_dir ${lang_dir}_test_pron
graph_dir=exp/tri3/graph_pron
$train_cmd $graph_dir/mkgraph.log \
utils/mkgraph.sh ${lang_dir}_test_pron exp/tri3 $graph_dir
for dset in dev test; do
steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" --config conf/decode.config \
$graph_dir data/${dset} exp/tri3/decode_${dset}_pron
done
# Triphone-4 training (SAT, fMLLR)
steps/align_fmllr.sh --nj $nJobs --cmd "$train_cmd" \
data/train ${lang_dir}_test_pron exp/tri3 exp/tri3_ali
steps/train_sat.sh --cmd "$train_cmd" \
11500 200000 data/train ${lang_dir} exp/tri3_ali exp/tri4
graph_dir=exp/tri4/graph_pron
$train_cmd $graph_dir/mkgraph.log \
utils/mkgraph.sh ${lang_dir}_test_pron exp/tri4 $graph_dir
for dset in dev test; do
steps/decode_fmllr.sh --nj $nDecodeJobs --cmd "$decode_cmd" \
--config conf/decode.config \
$graph_dir data/${dset} exp/tri4/decode_${dset}_pron
done
# TDNN training
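# local/run_cleanup_segmentation.sh is expected to clean and re-segment the
# training data with the current GMM system (steps/cleanup), and
# local/run_tdnn_1f.sh to train the final nnet3/chain TDNN acoustic model on
# the cleaned data; both are local scripts of this recipe.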
./local/run_cleanup_segmentation.sh --langdir ${lang_dir}
./local/run_tdnn_1f.sh --lang_dir ${lang_dir}