Skip to content

Commit

Permalink
fix bugs and improve asrserver
Browse files Browse the repository at this point in the history
  • Loading branch information
nl8590687 committed May 11, 2018
1 parent 08f7033 commit fb89aab
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 38 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

__pycache__
*.wav
*.model_yaml
Test_Report_*

dataset
Expand Down
49 changes: 43 additions & 6 deletions SpeechModel22.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import numpy as np
import random

from keras.models import model_from_yaml
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Reshape # , Flatten,LSTM,Convolution1D,MaxPooling1D,Merge
from keras.layers import Conv1D,LSTM,MaxPooling1D, Lambda, TimeDistributed, Activation,Conv2D, MaxPooling2D #, Merge,Conv1D
Expand All @@ -40,7 +41,14 @@ def __init__(self, datapath):
self.label_max_string_length = 64
self.AUDIO_LENGTH = 1600
self.AUDIO_FEATURE_LENGTH = 200

self.model_name = 'm22'

#if(not os.path.exists(self.model_name + '.model_yaml')): # 判断保存模型的目录是否存在
self._model, self.base_model = self.CreateModel()
#else:
# self._model, self.base_model = self.load_model_yaml(self.model_name)


self.datapath = datapath
self.slash = ''
Expand Down Expand Up @@ -112,7 +120,7 @@ def CreateModel(self):

model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

model.summary()
#model.summary()

# clipnorm seems to speeds up convergence
#sgd = SGD(lr=0.0001, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
Expand All @@ -125,6 +133,9 @@ def CreateModel(self):
# captures output of softmax so we can decode the output during visualization
test_func = K.function([input_data], [y_pred])

#kr.utils.plot_model(model, to_file='model.png', show_shapes=False, show_layer_names=True) # 可视化展示模型
self.save_model_yaml(model, model_data)

print('[*提示] 创建模型成功,模型编译成功')
return model, model_data

Expand All @@ -135,7 +146,32 @@ def ctc_lambda_func(self, args):
#y_pred = y_pred[:, 2:, :]
return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

def save_model_yaml(self, model, model_data):
    '''
    Save the model architectures as YAML files.

    Writes two files named after self.model_name:
      <model_name>.model_yaml       — the full CTC training model
      <model_name>_base.model_yaml  — the base acoustic model
    Only the architecture is serialized (to_yaml carries no weights).
    '''
    str_yaml_model = model.to_yaml()
    str_yaml_model_data = model_data.to_yaml()
    # Context managers guarantee the files are closed even if a write fails
    # (the original opened/closed by hand and could leak the handle).
    with open(self.model_name + '.model_yaml', 'w') as f:
        f.write(str_yaml_model)
    with open(self.model_name + '_base.model_yaml', 'w') as f:
        f.write(str_yaml_model_data)

def load_model_yaml(self, model_name):
    '''
    Load the model architectures previously written by save_model_yaml().

    Parameters:
        model_name: base filename of the saved YAML pair.
            NOTE(review): the original body ignored this parameter and
            always read self.model_name; fixed to honor the argument
            (the only visible caller passes self.model_name, so this is
            behavior-compatible).
    Returns:
        (model, model_data): full CTC model and base acoustic model,
        reconstructed from their YAML architecture descriptions
        (weights are NOT restored by model_from_yaml).
    '''
    with open(model_name + '.model_yaml', 'r') as f:
        str_yaml_model = f.read()
    with open(model_name + '_base.model_yaml', 'r') as f:
        str_yaml_model_data = f.read()
    model = model_from_yaml(str_yaml_model)
    model_data = model_from_yaml(str_yaml_model_data)
    return model, model_data

def TrainModel(self, datapath, epoch = 2, save_step = 1000, batch_size = 32, filename = 'model_speech/speech_model2'):
'''
Expand Down Expand Up @@ -230,7 +266,7 @@ def TestModel(self, datapath='', str_dataset='dev', data_count = 32, out_report
txt += 'True:\t' + str(data_labels) + '\n'
txt += 'Pred:\t' + str(pre) + '\n'
txt += '\n'
txt_obj.write(txt)
txt_obj.write(txt)

print('*[测试结果] 语音识别 ' + str_dataset + ' 集语音单字错误率:', word_error_num / words_num * 100, '%')
if(out_report == True):
Expand Down Expand Up @@ -356,7 +392,7 @@ def model(self):

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
#进行配置,使用70%的GPU
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.7
Expand Down Expand Up @@ -385,9 +421,10 @@ def model(self):

ms = ModelSpeech(datapath)

#ms.LoadModel(modelpath + 'm22_2\\1\\speech_model22_e_0_step_159000.model')
ms.TrainModel(datapath, epoch = 50, batch_size = 4, save_step = 500)
#ms.TestModel(datapath, str_dataset='test', data_count = 128, out_report = True)
#ms.LoadModel(modelpath + 'm22_2\\1\\speech_model22_e_0_step_327500.model')
ms.LoadModel(modelpath + 'm22_2/1/speech_model22_e_0_step_327500.model')
#ms.TrainModel(datapath, epoch = 50, batch_size = 4, save_step = 500)
#ms.TestModel(datapath, str_dataset='train', data_count = 128, out_report = True)
#r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\ST-CMDS-20170001_1-OS\\20170001P00241I0053.wav')
#r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\ST-CMDS-20170001_1-OS\\20170001P00020I0087.wav')
#r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\wav\\train\\A11\\A11_167.WAV')
Expand Down
20 changes: 8 additions & 12 deletions asrserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@
from SpeechModel22 import ModelSpeech
from LanguageModel import ModelLanguage

datapath = 'data/'
modelpath = 'model_speech/'
ms = ModelSpeech(datapath)
ms.LoadModel(modelpath + 'speech_model22_e_0_step_216500.model')

ml = ModelLanguage('model_language')
ml.LoadModel()

class TestHTTPHandle(http.server.BaseHTTPRequestHandler):

Expand Down Expand Up @@ -66,32 +73,21 @@ def do_POST(self):

if(token == 'qwertasd'):
#buf = '成功\n'+'wavs:\n'+str(wavs)+'\nfs:\n'+str(fs)
buf = r[0]
buf = r
else:
buf = '403'

#print(datas)

self._set_response()





#buf = '<!DOCTYPE HTML> \n<html> \n<head>\n<title>Post page</title>\n</head> \n<body>Post Data:%s <br />Path:%s\n</body> \n</html>'%(datas,self.path)
buf = bytes(buf,encoding="utf-8")
self.wfile.write(buf)

def recognize(self, wavs, fs):
    '''
    Run speech recognition on raw wav sample data and return decoded text.

    Parameters:
        wavs: wav sample data as accepted by ModelSpeech.RecognizeSpeech
              — presumably (channels, samples); TODO confirm against caller.
        fs: sample rate of the audio — presumably 16000 Hz; verify.
    Returns:
        the result of ModelLanguage.SpeechToText on the acoustic output.
    '''
    # NOTE(review): both models are constructed and their weights loaded on
    # every request, which is very slow per call — consider loading them
    # once at server startup instead.
    datapath = 'data/'
    modelpath = 'model_speech/'
    ms = ModelSpeech(datapath)
    ms.LoadModel(modelpath + 'speech_model22_e_0_step_6500.model')
    # Acoustic model: wav samples -> pinyin symbol sequence.
    r_speech = ms.RecognizeSpeech(wavs, fs)
    # Language model: pinyin sequence -> text.
    ml = ModelLanguage('model_language')
    ml.LoadModel()
    str_pinyin = r_speech
    r = ml.SpeechToText(str_pinyin)
    return r
Expand Down
24 changes: 12 additions & 12 deletions dict.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
a1 阿啊呵腌吖锕雅
a1 阿啊呵腌吖锕
a2 啊呵嗄
a3 啊呵
a4 啊呵
a5 阿啊呵娃
a5 阿啊呵
ai1 哀挨埃唉哎捱锿诶
ai2 呆挨癌皑捱矮
ai3 矮哎蔼霭嗳
ai4 爱碍艾唉哎隘暧嗳瑷嗌嫒砹愛以
an1 安谙鞍氨庵桉鹌广厂
ai4 爱碍艾唉哎隘暧嗳瑷嗌嫒砹愛
an1 安谙鞍氨庵桉鹌
an3 俺铵揞埯
an4 案按暗岸黯胺犴
ang1 肮
Expand All @@ -19,7 +19,7 @@ ao3 袄拗媪
ao4 奥澳傲懊坳拗骜岙鏊
ba1 八吧巴叭芭扒疤笆粑岜捌
ba2 八拔跋茇菝魃
ba3 把靶钯靶星
ba3 把靶钯靶
ba4 把爸罢霸坝耙灞鲅壩
ba5 吧罢巴叭
bai1 掰
Expand All @@ -34,11 +34,11 @@ bang3 膀榜绑
bang4 棒膀傍磅谤镑蚌蒡
bao1 包胞炮剥褒苞孢煲龅
bao2 薄雹保
bao3 保宝饱堡葆褓鸨乖
bao4 报暴抱爆鲍曝刨瀑豹趵在
bao3 保宝饱堡葆褓鸨
bao4 报暴抱爆鲍曝刨瀑豹趵
bei1 背悲杯碑卑陂埤萆鹎
bei3 北
bei4 被备背辈倍贝蓓惫悖狈焙邶钡孛碚褙鐾鞴宝
bei4 被备背辈倍贝蓓惫悖狈焙邶钡孛碚褙鐾鞴
bei5 臂呗备
ben1 奔贲锛
ben3 本苯畚
Expand Down Expand Up @@ -82,7 +82,7 @@ cai1 猜
cai2 才财材裁采
cai3 采彩踩睬
cai4 采菜蔡
can1 参餐骖食
can1 参餐骖
can2 残惭蚕
can3 惨黪
can4 惨灿掺璨孱粲
Expand All @@ -95,7 +95,7 @@ ce4 策测侧厕册恻
cen1 参
cen2 岑涔
ceng1 噌
ceng2 曾层太
ceng2 曾层
ceng4 蹭
cha1 差插叉碴喳嚓杈馇锸
cha2 查察茶叉茬碴楂猹搽槎檫
Expand Down Expand Up @@ -484,8 +484,8 @@ kao3 考烤拷栲
kao4 靠铐犒
ke1 科颗柯呵棵苛磕坷嗑瞌轲稞疴蝌钶窠颏珂髁
ke2 咳壳颏可
ke3 可渴坷轲岢以
ke4 可克客刻课恪嗑溘骒缂氪锞蚵科谎
ke3 可渴坷轲岢
ke4 克客刻课恪嗑溘骒缂氪锞蚵科可
ken3 肯恳啃垦龈
ken4 裉
keng1 坑吭铿
Expand Down
51 changes: 44 additions & 7 deletions general_function/file_wav.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,37 @@ def GetFrequencyFeature2(wavsignal, fs):
#print(data_input.shape)
return data_input

def GetFrequencyFeature3(wavsignal, fs):
    '''
    Compute a log-scaled spectrogram feature from a wav signal.

    A 25 ms Hamming window is slid over channel 0 with a 10 ms hop; each
    frame is FFT'd, the first 200 magnitude bins are kept (the spectrum of
    a real frame is symmetric), and log(1 + x) is applied.

    Parameters:
        wavsignal: 2-D sequence (channels, samples); only channel 0 is used.
        fs: sample rate in Hz. NOTE(review): the frame/hop sizes are
            hard-coded to 400/160 samples, which matches fs == 16000 only —
            confirm callers always pass 16 kHz audio.
    Returns:
        numpy array of shape (num_frames, 200), dtype float64.
    '''
    time_window = 25  # window length in ms
    window_length = fs / 1000 * time_window  # 400 samples at 16 kHz

    wav_arr = np.array(wavsignal)
    wav_length = wav_arr.shape[1]  # samples per channel

    # Number of full windows that fit with a 10 ms hop.
    range0_end = int(len(wavsignal[0]) / fs * 1000 - time_window) // 10
    # np.float was removed in NumPy 1.24; the builtin float (== float64)
    # is the drop-in replacement and keeps the original dtype.
    data_input = np.zeros((range0_end, 200), dtype=float)

    # Hamming window — loop-invariant, so build it once outside the loop.
    x = np.linspace(0, 400 - 1, 400, dtype=np.int64)
    w = 0.54 - 0.46 * np.cos(2 * np.pi * (x) / (400 - 1))

    for i in range(0, range0_end):
        p_start = i * 160  # 10 ms hop
        p_end = p_start + 400

        data_line = wav_arr[0, p_start:p_end]
        data_line = data_line * w  # apply the window

        # Magnitude spectrum, normalized by the total signal length
        # (original normalization kept for behavior compatibility).
        data_line = np.abs(fft(data_line)) / wav_length

        # Keep the first half (200 of 400 bins) — real-signal symmetry.
        data_input[i] = data_line[0:200]

    data_input = np.log(data_input + 1)
    return data_input

def wav_scale(energy):
'''
语音信号能量归一化
Expand Down Expand Up @@ -178,13 +209,19 @@ def get_wav_symbol(filename):
return dic_symbol_list,list_symbolmark

if(__name__=='__main__'):
    # Ad-hoc manual test: load one wav file, extract the spectrogram
    # feature, and display it as an image.
    #dic=get_wav_symbol('E:\\语音数据集\\doc\\doc\\trans\\train.syllable.txt')
    #print(dic)
    #dic=get_wav_list('E:\\语音数据集\\doc\\doc\\list\\train.wav.lst')
    #for i in dic:
    #print(i,dic[i])

    # read_wav_data returns (sample_data, sample_rate); the wav file must
    # exist in the working directory for this demo to run.
    wave_data, fs = read_wav_data("A2_0.wav")
    #wave_data[0]=wav_scale(wave_data[0])
    #print(fs)

    # Plot the raw waveform of channel 0.
    wav_show(wave_data[0],fs)
    #t0=time.time()
    # Extract the log-spectrogram feature (frames x 200 frequency bins).
    freimg = GetFrequencyFeature3(wave_data,fs)
    #t1=time.time()
    #print('time cost:',t1-t0)

    # Transpose so frequency runs along the vertical axis of the image.
    freimg = freimg.T
    plt.subplot(111)

    plt.imshow(freimg)
    plt.colorbar(cax=None,ax=None,shrink=0.5)

    plt.show()
2 changes: 1 addition & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
ms.LoadModel(modelpath + 'm22_2/0/speech_model22_e_0_step_257000.model')

#ms.TestModel(datapath, str_dataset='test', data_count = 64, out_report = True)
r = ms.RecognizeSpeech_FromFile('E:\语音数据集\ST-CMDS-20170001_1-OS\\20170001P00241I0052.wav')
r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\ST-CMDS-20170001_1-OS\\20170001P00241I0052.wav')
#r = ms.RecognizeSpeech_FromFile('E:\语音数据集\ST-CMDS-20170001_1-OS\\20170001P00241I0053.wav')
#r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\ST-CMDS-20170001_1-OS\\20170001P00020I0087.wav')
#r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\wav\\train\\A11\\A11_167.WAV')
Expand Down

0 comments on commit fb89aab

Please sign in to comment.