-
Notifications
You must be signed in to change notification settings - Fork 1
/
mySTT.py
55 lines (44 loc) · 1.82 KB
/
mySTT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
import time
import openai
import soundfile as sf
from loguru import logger
# Import-time side effect: export the path of a Google credentials file (relative to the
# process working directory) through the GOOGLE_APPLICATION_CREDENTIALS environment variable.
# NOTE(review): presumably consumed by a Google client library elsewhere in the project;
# nothing in the visible code reads it — confirm it is still required here.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'google_key.json'
class MySTT:
    """Transcribe audio files with OpenAI's ``whisper-1`` model and cache recent results.

    Transcripts are remembered per input path in ``self.cached`` (shape:
    ``{voice_path: {'whisper': {'transcript': str}}}``) so that asking for the
    same path again does not trigger another API call.
    """

    def __init__(self, openai_key):
        """Configure the OpenAI client and create an empty transcript cache.

        Args:
            openai_key: API key; it is assigned to the module-global
                ``openai.api_key`` and therefore applies process-wide.
        """
        # Extensions that are uploaded unchanged; any other file is converted to WAV first.
        self.whisper_types = ['flac', 'mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'ogg', 'wav', 'webm']
        # Size threshold enforced by _clean_up() at the start of every stt_whisper() call.
        self.max_cache = 10
        self.cached = {}
        openai.api_key = openai_key

    def stt_whisper(self, voice_path):
        """Return the Whisper transcript of the audio file at ``voice_path``.

        A cached transcript for the same path is returned without touching the
        file or the API. Otherwise the file is uploaded (after conversion to a
        sibling ``.wav`` file via ``soundfile`` when its extension is not in
        ``self.whisper_types``), the result is cached and logged.

        Side effects: after a successful transcription the uploaded file is
        deleted -- that is the input file itself when no conversion was needed,
        or only the temporary WAV copy otherwise. If transcription fails, the
        temporary WAV copy is still removed but the input file is left alone.

        Args:
            voice_path: Path of the audio file to transcribe.

        Returns:
            The transcript text reported by the API.

        Raises:
            Whatever ``soundfile``, ``open`` or the OpenAI client raise; nothing
            is cached in that case.
        """
        self._clean_up()

        hit = self.cached.get(voice_path, {}).get('whisper')
        if hit:
            return hit['transcript']

        # splitext only looks at the final path component, so dots in directory
        # names (or a path without any extension) cannot corrupt the WAV path.
        stem, ext = os.path.splitext(voice_path)
        converted = ext[1:] not in self.whisper_types
        if converted:
            upload_path = stem + '.wav'
            data, samplerate = sf.read(voice_path)
            sf.write(upload_path, data, samplerate)
        else:
            upload_path = voice_path

        succeeded = False
        try:
            with open(upload_path, 'rb') as voice:
                time_start = time.time()
                transcript = openai.Audio.transcribe("whisper-1", voice)
                time_end = time.time()
            succeeded = True
        finally:
            # Never leak the temporary WAV; the caller's own file is only
            # consumed once the transcription actually succeeded.
            if converted or succeeded:
                os.remove(upload_path)

        text = transcript['text']
        # setdefault keeps any other keys already stored for this path.
        self.cached.setdefault(voice_path, {})['whisper'] = {'transcript': text}
        logger.info(f"[Whisper STT] Transcript: {text}")
        logger.info(f"[Whisper STT] Done in: {time_end - time_start} seconds")
        return text

    def _clean_up(self):
        """Evict the oldest cache entries until at most ``max_cache`` remain.

        This runs before a new entry is inserted, so the cache can hold
        ``max_cache + 1`` entries between two calls.
        """
        while self.cached and len(self.cached) > self.max_cache:
            # Dicts preserve insertion order, so the first key is the oldest entry.
            del self.cached[next(iter(self.cached))]