# build_lm.py -- forked from NVIDIA/OpenSeq2Seq.
import pandas as pd
import os
import argparse
def get_corpus(csv_files):
    '''
    Get text corpus from a list of CSV files.

    Every file must contain a ``transcript`` column. The transcripts of all
    files are collected in the order given and joined one per line.

    Args:
        csv_files: iterable of CSV file paths (anything ``pandas.read_csv``
            accepts as its first argument).

    Returns:
        str: the transcripts separated by newlines, with no trailing
        newline. An empty string when there are no transcripts at all.

    Raises:
        KeyError: if a file has no ``transcript`` column.
    '''
    SEP = '\n'
    lines = []
    for f in csv_files:
        # Read transcripts verbatim as strings. With pandas' defaults an
        # empty cell -- or a literal transcript such as "null" or "NA" --
        # is parsed as float NaN, and an all-digit column as integers;
        # either one makes str.join raise TypeError.
        df = pd.read_csv(f, dtype={'transcript': str}, keep_default_na=False)
        # Skip empty / whitespace-only transcripts so the corpus contains
        # no blank lines.
        lines.extend(t for t in df['transcript'] if t.strip())
    # Joining once avoids both the repeated string concatenation and the
    # need to strip a trailing separator afterwards.
    return SEP.join(lines)
def _run_command(argv):
    '''
    Echo a command line and execute it, stopping on failure.

    Args:
        argv: list of strings; the program followed by its arguments.

    Raises:
        subprocess.CalledProcessError: if the program exits with a
            non-zero status.
    '''
    # Local import: only the script entry point needs subprocess.
    import subprocess

    print(' '.join(argv))
    # An argument list (no shell) passes paths containing spaces or shell
    # metacharacters through untouched. check_call raises on a non-zero
    # exit status so later steps never run on missing artifacts.
    subprocess.check_call(argv)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Build N-gram LM model from CSV files')
    parser.add_argument('csv', metavar='csv', type=str, nargs='+',
                        help='DeepSpeech CSV file')
    parser.add_argument('--n', type=int, help='n for n-grams', default=3)
    args = parser.parse_args()

    corpus = get_corpus(args.csv)

    # Every artifact is written next to the first CSV file and shares its
    # base name, differing only in suffix.
    path_prefix, _ = os.path.splitext(args.csv[0])
    corpus_name = path_prefix + '.txt'
    arpa_name = path_prefix + '.arpa'
    lm_name = path_prefix + '-lm.binary'
    trie_name = path_prefix + '-lm.trie'

    with open(corpus_name, 'w') as f:
        f.write(corpus)

    # Step 1: build the order-n ARPA file from the plain-text corpus.
    _run_command(['lmplz', '--text', corpus_name, '--arpa', arpa_name,
                  '--o', str(args.n)])

    # Step 2: convert the ARPA file into the binary LM file.
    _run_command(['build_binary', '-s', arpa_name, lm_name])

    # Step 3: generate the trie from the alphabet, binary LM and corpus.
    # NOTE(review): both paths below are relative, so this presumably has
    # to be launched from the repository root -- confirm.
    _run_command(['ctc_decoder_with_lm/generate_trie',
                  'open_seq2seq/test_utils/toy_speech_data/alphabet.txt',
                  lm_name, corpus_name, trie_name])