From efce11ec23e54a1314da362a41844a3cf2ad00f6 Mon Sep 17 00:00:00 2001
From: Chris
Date: Tue, 30 Mar 2021 20:06:09 -0700
Subject: [PATCH] Add stt example

---
 requirements.txt | 20 +++++++++++
 setup_stt.sh     |  2 ++
 stt_example.py   | 92 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 114 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 setup_stt.sh
 create mode 100644 stt_example.py

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..1065127
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,20 @@
+deepspeech==0.9.3
+PyAudio==0.2.11
+
+gdown
+numpy>=1.16.0
+torch>=1.5
+librosa>=0.5.1
+Unidecode>=0.4.20
+matplotlib
+Pillow
+flask
+scipy
+tqdm
+soundfile
+phonemizer
+bokeh==1.4.0
+inflect==5.3.0
+sounddevice==0.4.1
+
+rasa==2.4.2
\ No newline at end of file
diff --git a/setup_stt.sh b/setup_stt.sh
new file mode 100644
index 0000000..4f5230c
--- /dev/null
+++ b/setup_stt.sh
@@ -0,0 +1,2 @@
+wget -P models "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm"
+wget -P models "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer"
\ No newline at end of file
diff --git a/stt_example.py b/stt_example.py
new file mode 100644
index 0000000..22b5b29
--- /dev/null
+++ b/stt_example.py
@@ -0,0 +1,92 @@
+
+import deepspeech
+import numpy as np
+import os
+import pyaudio
+import time
+
+# CUDA support for DeepSpeech is controlled at the pip package level:
+# pip install deepspeech-gpu
+
+# DeepSpeech decoder parameters
+BEAM_WIDTH = 700
+LM_ALPHA = 0.75
+LM_BETA = 1.85
+
+MODEL_FILE_PATH = os.path.join('models', 'deepspeech-0.9.3-models.pbmm')
+SCORER_PATH = os.path.join('models', 'deepspeech-0.9.3-models.scorer')
+
+
+class Transcriber:
+    def __init__(self, model):
+        self.model = model
+        self.model.enableExternalScorer(SCORER_PATH)
+        self.model.setScorerAlphaBeta(LM_ALPHA, LM_BETA)
+        self.model.setBeamWidth(BEAM_WIDTH)
+        # Pre-0.9 API: self.model.enableDecoderWithLM(LM_FILE_PATH, TRIE_FILE_PATH, LM_ALPHA, LM_BETA)
+
+        # Create a streaming session
+        self.ds_stream = self.model.createStream()
+
+        # State for feeding DeepSpeech from a PyAudio callback
+        self.text_so_far = ''
+        self.t_start = time.time()
+        self.t_wait = .5
+        self.final_text = None
+
+    def process_audio(self, in_data, frame_count, time_info, status):
+        data16 = np.frombuffer(in_data, dtype=np.int16)
+        self.ds_stream.feedAudioContent(data16)
+        text = self.ds_stream.intermediateDecode()
+        try:
+            if text != self.text_so_far:
+                if text not in ["i ", "he ", "the "]:
+                    print('Interim text = {};'.format(text))
+                self.text_so_far = text
+                self.t_start = time.time()
+            elif text != '' and (time.time() - self.t_start > self.t_wait):
+                if text not in ["i ", "he ", "the "]:
+                    print("Finishing stream")
+                    text = self.ds_stream.finishStream()
+                    print('Final text = {}.\n'.format(text))
+                    self.final_text = text
+                self.ds_stream = self.model.createStream()
+        except Exception as e:
+            print(f"Text: '{text}'; So far: '{self.text_so_far}'")
+            print(self.t_start)
+            raise e
+        return (in_data, pyaudio.paContinue)
+
+    def listen(self):
+        print("setting up to listen")
+        # Feed audio to DeepSpeech in a callback to PyAudio
+        self.audio = pyaudio.PyAudio()
+        self.stream = self.audio.open(
+            format=pyaudio.paInt16,
+            channels=1,
+            rate=16000,
+            input=True,
+            frames_per_buffer=1024,
+            stream_callback=self.process_audio
+        )
+
+        print('Please start speaking, when done press Ctrl-C ...')
+        self.stream.start_stream()
+        print("listening now")
+        return
+
+
+if __name__ == '__main__':
+    # Make the DeepSpeech model
+    model = deepspeech.Model(MODEL_FILE_PATH)
+    stt = Transcriber(model)
+    stt.listen()
+    try:
+        while stt.stream.is_active():
+            time.sleep(0.05)
+    except KeyboardInterrupt:
+        # Stop and clean up PyAudio
+        stt.stream.stop_stream()
+        stt.stream.close()
+        stt.audio.terminate()
+        print('Finished recording.')
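
Usage note (not part of the patch): from the repo root, `pip install -r requirements.txt` pulls in deepspeech and PyAudio, `bash setup_stt.sh` downloads the acoustic model and scorer into models/ (where stt_example.py expects them), and `python stt_example.py` then transcribes the default microphone until Ctrl-C.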
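For comparison with the streaming loop in process_audio, here is a minimal batch-mode sketch (also not part of the patch) that runs the same model files over a prerecorded clip; the path audio/test.wav is hypothetical, and the file is assumed to be in the 16 kHz mono 16-bit PCM format DeepSpeech expects:

    import wave

    import deepspeech
    import numpy as np

    model = deepspeech.Model('models/deepspeech-0.9.3-models.pbmm')
    model.enableExternalScorer('models/deepspeech-0.9.3-models.scorer')

    # Decode the whole clip in one model.stt() call, instead of the
    # createStream()/feedAudioContent()/intermediateDecode() loop above.
    with wave.open('audio/test.wav', 'rb') as wav:  # hypothetical test clip
        assert wav.getframerate() == 16000 and wav.getnchannels() == 1
        audio = np.frombuffer(wav.readframes(wav.getnframes()), dtype=np.int16)

    print(model.stt(audio))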