Commit efce11e (1 parent: 4ac63e9), showing 3 changed files with 114 additions and 0 deletions.
@@ -0,0 +1,20 @@
deepspeech==0.9.3
PyAudio==0.2.11

gdown
numpy>=1.16.0
torch>=1.5
librosa>=0.5.1
Unidecode>=0.4.20
matplotlib
Pillow
flask
scipy
tqdm
soundfile
phonemizer
bokeh==1.4.0
inflect==5.3.0
sounddevice==0.4.1

rasa==2.4.2
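A brief installation note (a sketch, not part of the commit; the file name is not shown in this diff, so requirements.txt is an assumption): the pins above are typically installed into a virtual environment with

pip install -r requirements.txt

For GPU decoding, the deepspeech pin can be swapped for deepspeech-gpu==0.9.3, as the comment in the transcriber script further below also points out.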
@@ -0,0 +1,2 @@
wget -P models "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm"
wget -P models "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer"
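As a small sanity check (a sketch, not part of the commit): both files land in models/, which matches MODEL_FILE_PATH and SCORER_PATH in the transcriber script below, and the download can be verified by loading the model once:

# Sketch only: confirm the downloaded acoustic model and scorer load.
import deepspeech

model = deepspeech.Model('models/deepspeech-0.9.3-models.pbmm')
model.enableExternalScorer('models/deepspeech-0.9.3-models.scorer')
print(model.sampleRate())  # the 0.9.3 English model expects 16 kHz audio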
@@ -0,0 +1,92 @@

import deepspeech
import numpy as np
import os
import pyaudio
import time

# Cuda for deepspeech is controlled at the pip package level
# pip install deepspeech-gpu

# DeepSpeech parameters
BEAM_WIDTH = 700
LM_ALPHA = 0.75
LM_BETA = 1.85

MODEL_FILE_PATH = os.path.join('models', 'deepspeech-0.9.3-models.pbmm')
SCORER_PATH = os.path.join('models', 'deepspeech-0.9.3-models.scorer')


class Transcriber:
    def __init__(self, model):
        self.model = model
        self.model.enableExternalScorer(SCORER_PATH)
        self.model.setScorerAlphaBeta(LM_ALPHA, LM_BETA)
        self.model.setBeamWidth(BEAM_WIDTH)
        # self.model.enableDecoderWithLM(LM_FILE_PATH, TRIE_FILE_PATH, LM_ALPHA, LM_BETA)

        # Create a Streaming session
        self.ds_stream = self.model.createStream()

        # Encapsulate DeepSpeech audio feeding into a callback for PyAudio
        self.text_so_far = ''
        self.t_start = time.time()
        self.t_wait = .5
        self.final_text = None

    def process_audio(self, in_data, frame_count, time_info, status):
        # PyAudio hands the callback raw 16-bit mono PCM; feed it into the DeepSpeech stream
        data16 = np.frombuffer(in_data, dtype=np.int16)
        self.ds_stream.feedAudioContent(data16)
        text = self.ds_stream.intermediateDecode()
        try:
            if text != self.text_so_far:
                # Transcript grew: report it (unless it is a lone filler word) and reset the silence timer
                if text not in ["i ", "he ", "the "]:
                    print('Interim text = {};'.format(text))
                self.text_so_far = text
                self.t_start = time.time()
            elif text != '' and (time.time() - self.t_start > self.t_wait):
                # No change for t_wait seconds: treat the utterance as finished
                if text not in ["i ", "he ", "the "]:
                    print("Finishing stream")
                    text = self.ds_stream.finishStream()
                    print('Final text = {}.\n'.format(text))
                    self.final_text = text
                self.ds_stream = self.model.createStream()
        except Exception as e:
            print(f"Text: '{text}'; So far: '{self.text_so_far}'")
            print(self.t_start)
            raise e
        return (in_data, pyaudio.paContinue)

    def listen(self):
        print("setting up to listen")
        # Feed audio to deepspeech in a callback to PyAudio
        self.audio = pyaudio.PyAudio()
        self.stream = self.audio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=16000,
            input=True,
            frames_per_buffer=1024,
            stream_callback=self.process_audio
        )

        print('Please start speaking, when done press Ctrl-C ...')
        self.stream.start_stream()
        print("listening now")
        return


if __name__ == '__main__':
    # Make DeepSpeech Model
    model = deepspeech.Model(MODEL_FILE_PATH)
    stt = Transcriber(model)
    stt.listen()
    try:
        while stt.stream.is_active():
            time.sleep(0.05)
    except KeyboardInterrupt:
        # PyAudio
        stt.stream.stop_stream()
        stt.stream.close()
        stt.audio.terminate()
        print('Finished recording.')
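One possible way to consume the transcripts from another component (a sketch under assumptions, not part of this commit: the module name transcriber and the handle_utterance helper are hypothetical): process_audio stores each finished utterance in final_text, so a caller can poll that attribute and clear it between utterances.

# Sketch: poll Transcriber.final_text from the calling side.
import time
import deepspeech
from transcriber import Transcriber, MODEL_FILE_PATH  # module name is an assumption

stt = Transcriber(deepspeech.Model(MODEL_FILE_PATH))
stt.listen()
while stt.stream.is_active():
    if stt.final_text:
        handle_utterance(stt.final_text)  # hypothetical handler, e.g. forward the text onward
        stt.final_text = None             # clear so the next utterance is picked up
    time.sleep(0.1)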