From efce11ec23e54a1314da362a41844a3cf2ad00f6 Mon Sep 17 00:00:00 2001
From: Chris
Date: Tue, 30 Mar 2021 20:06:09 -0700
Subject: [PATCH] Add stt example

---
 requirements.txt | 20 +++++++++++
 setup_stt.sh     |  2 ++
 stt_example.py   | 92 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 114 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 setup_stt.sh
 create mode 100644 stt_example.py

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..1065127
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,20 @@
+deepspeech==0.9.3
+PyAudio==0.2.11
+
+gdown
+numpy>=1.16.0
+torch>=1.5
+librosa>=0.5.1
+Unidecode>=0.4.20
+matplotlib
+Pillow
+flask
+scipy
+tqdm
+soundfile
+phonemizer
+bokeh==1.4.0
+inflect==5.3.0
+sounddevice==0.4.1
+
+rasa==2.4.2
\ No newline at end of file
diff --git a/setup_stt.sh b/setup_stt.sh
new file mode 100644
index 0000000..4f5230c
--- /dev/null
+++ b/setup_stt.sh
@@ -0,0 +1,2 @@
+wget -P models "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm"
+wget -P models "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer"
\ No newline at end of file
diff --git a/stt_example.py b/stt_example.py
new file mode 100644
index 0000000..22b5b29
--- /dev/null
+++ b/stt_example.py
@@ -0,0 +1,92 @@
+
+import deepspeech
+import numpy as np
+import os
+import pyaudio
+import time
+
+# CUDA support for DeepSpeech is controlled at the pip package level:
+# pip install deepspeech-gpu
+
+# DeepSpeech decoder parameters
+BEAM_WIDTH = 700
+LM_ALPHA = 0.75
+LM_BETA = 1.85
+
+MODEL_FILE_PATH = os.path.join('models', 'deepspeech-0.9.3-models.pbmm')
+SCORER_PATH = os.path.join('models', 'deepspeech-0.9.3-models.scorer')
+
+
+class Transcriber:
+    def __init__(self, model):
+        self.model = model
+        self.model.enableExternalScorer(SCORER_PATH)
+        self.model.setScorerAlphaBeta(LM_ALPHA, LM_BETA)
+        self.model.setBeamWidth(BEAM_WIDTH)
+        # Pre-0.9 API: self.model.enableDecoderWithLM(LM_FILE_PATH, TRIE_FILE_PATH, LM_ALPHA, LM_BETA)
+
+        # Create a streaming session
+        self.ds_stream = self.model.createStream()
+
+        # State for feeding DeepSpeech from a PyAudio callback
+        self.text_so_far = ''
+        self.t_start = time.time()
+        self.t_wait = .5
+        self.final_text = None
+
+    def process_audio(self, in_data, frame_count, time_info, status):
+        data16 = np.frombuffer(in_data, dtype=np.int16)
+        self.ds_stream.feedAudioContent(data16)
+        text = self.ds_stream.intermediateDecode()
+        try:
+            if text != self.text_so_far:
+                if text not in ["i ", "he ", "the "]:
+                    print('Interim text = {};'.format(text))
+                self.text_so_far = text
+                self.t_start = time.time()
+            elif text != '' and (time.time() - self.t_start > self.t_wait):
+                if text not in ["i ", "he ", "the "]:
+                    print("Finishing stream")
+                    text = self.ds_stream.finishStream()
+                    print('Final text = {}.\n'.format(text))
+                    self.final_text = text
+                self.ds_stream = self.model.createStream()
+        except Exception as e:
+            print(f"Text: '{text}'; So far: '{self.text_so_far}'")
+            print(self.t_start)
+            raise e
+        return (in_data, pyaudio.paContinue)
+
+    def listen(self):
+        print("setting up to listen")
+        # Feed audio to DeepSpeech in a callback to PyAudio
+        self.audio = pyaudio.PyAudio()
+        self.stream = self.audio.open(
+            format=pyaudio.paInt16,
+            channels=1,
+            rate=16000,
+            input=True,
+            frames_per_buffer=1024,
+            stream_callback=self.process_audio
+        )
+
+        print('Please start speaking, when done press Ctrl-C ...')
+        self.stream.start_stream()
+        print("listening now")
+        return
+
+
+if __name__ == '__main__':
+    # Make the DeepSpeech model
+    model = deepspeech.Model(MODEL_FILE_PATH)
+    stt = Transcriber(model)
+    stt.listen()
+    try:
+        while stt.stream.is_active():
+            time.sleep(0.05)
+    except KeyboardInterrupt:
+        # Stop and clean up PyAudio
+        stt.stream.stop_stream()
+        stt.stream.close()
+        stt.audio.terminate()
+        print('Finished recording.')
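
Usage note (not part of the patch): from the repo root, `pip install -r requirements.txt` pulls in deepspeech and PyAudio, `bash setup_stt.sh` downloads the acoustic model and scorer into models/ (where stt_example.py expects them), and `python stt_example.py` then transcribes the default microphone until Ctrl-C.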
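For comparison with the streaming loop in process_audio, here is a minimal batch-mode sketch (also not part of the patch) that runs the same model files over a prerecorded clip; the path audio/test.wav is hypothetical, and the file is assumed to be in the 16 kHz mono 16-bit PCM format DeepSpeech expects:

    import wave

    import deepspeech
    import numpy as np

    model = deepspeech.Model('models/deepspeech-0.9.3-models.pbmm')
    model.enableExternalScorer('models/deepspeech-0.9.3-models.scorer')

    # Decode the whole clip in one model.stt() call, instead of the
    # createStream()/feedAudioContent()/intermediateDecode() loop above.
    with wave.open('audio/test.wav', 'rb') as wav:  # hypothetical test clip
        assert wav.getframerate() == 16000 and wav.getnchannels() == 1
        audio = np.frombuffer(wav.readframes(wav.getnframes()), dtype=np.int16)

    print(model.stt(audio))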