-
Notifications
You must be signed in to change notification settings - Fork 12
/
audioWhisper.py
87 lines (73 loc) · 3.42 KB
/
audioWhisper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import sounddevice as sd
import whisper
from scipy.io import wavfile
import numpy as np
from datetime import datetime as time
import argparse
import os
# Command-line interface. Arguments are parsed at module level and the
# resulting ``args`` namespace is read by main() and by the entry-point guard.
parser = argparse.ArgumentParser(description="parameters for audioWhisper.py")
# --devices is a string flag ("true"/"false", any case); it is converted with
# str2bool() in the entry-point guard.
parser.add_argument('--devices', default='False', type=str, help='print all available devices id')
# Whisper also ships 'base' and 'base.en' checkpoints, so they are accepted
# here alongside the other sizes. There is no English-only 'large' model.
parser.add_argument(
    '--model',
    type=str,
    choices=['tiny', 'tiny.en', 'base', 'base.en', 'small', 'small.en', 'medium', 'medium.en', 'large'],
    default='small',
    help='model to be use for generating audio transcribe',
)
parser.add_argument(
    '--task',
    type=str,
    choices=['transcribe', 'translate'],
    default='transcribe',
    help='task for the model whether to only transcribe the audio or translate the audio to english',
)
# Recording parameters passed to sounddevice in main().
parser.add_argument('--device_index', default=2, type=int, help='the id of the device ')
parser.add_argument('--channel', default=2, type=int, help='number of channels for the device')
parser.add_argument('--rate', default=44100, type=int, help="polling rate of the output device")
parser.add_argument('--audioseconds', default=5, type=int, help='the length of the audio for recording')
# main() cycles through clip indices 0..audiocounts inclusive before wrapping.
parser.add_argument('--audiocounts', default=5, type=int, help='counts to save audio files in audio folder')
parser.add_argument('--output_dir', default="audio", type=str, help='output directory for audio files recorded')
args = parser.parse_args()
def main():
    """Record audio in fixed-length clips and transcribe them in a loop.

    Settings come from the module-level ``args`` namespace. Every iteration:

    1. starts recording a new clip of ``audioseconds`` seconds,
    2. transcribes (or translates) the clip saved by the previous iteration
       and prints the text together with the elapsed time,
    3. waits for the recording to finish and writes it to
       ``<output_dir>/audio<index>.wav``.

    Clip indices run from 0 to ``audiocounts`` inclusive and then wrap to 0,
    so older clips are overwritten. The loop never returns on its own; stop
    it with Ctrl-C.

    Raises:
        ValueError: if the requested model is ``large.en``.
    """
    model: str = args.model
    task: str = args.task
    rate: int = args.rate
    channel: int = args.channel
    device_index: int = args.device_index
    seconds: int = args.audioseconds
    audio_counts: int = args.audiocounts
    output_dir: str = args.output_dir

    os.makedirs(output_dir, exist_ok=True)

    # For the English-only checkpoints ('*.en') the language is pinned;
    # otherwise None is passed through to transcribe().
    language = 'english' if model.endswith('.en') else None
    if model == 'large.en':
        raise ValueError("english model does not have large model, please choose from {tiny.en, small.en, medium.en}")

    audio_model = whisper.load_model(model)

    # Index 0 of sounddevice's default pairs is the input (recording) side.
    sd.default.device[0] = device_index
    sd.default.dtype[0] = np.float32

    index = 0
    while True:
        # Start capturing the next clip; sd.wait() below blocks until it is
        # complete, so the transcription in between overlaps the recording.
        recording = sd.rec(frames=rate * seconds, samplerate=rate, channels=channel, dtype=np.float32)

        # The clip to transcribe is the one written just before this slot;
        # slot 0 follows the last slot (audio_counts).
        previous_index = audio_counts if index == 0 else index - 1
        previous_path = os.path.join(output_dir, f"audio{previous_index}.wav")

        # NOTE: ``time`` is datetime.datetime (aliased at import), not the time module.
        started = time.now()
        if not os.path.isfile(previous_path):
            # Expected during the first pass, before that slot has been written.
            print("no audio track recorded yet")
        else:
            try:
                result = audio_model.transcribe(
                    previous_path,
                    task=task,
                    no_speech_threshold=0.6,
                    compression_ratio_threshold=2.0,
                    language=language,
                )
            except Exception as err:
                # Keep the loop alive on a failed clip, but report the real
                # cause. KeyboardInterrupt/SystemExit are deliberately not
                # caught so Ctrl-C stops the program.
                print(f"failed to {task} {previous_path}: {err}")
            else:
                print(f'\n {task}d text:', result.get('text'))
        print(f"time: {time.now() - started}")

        sd.wait()
        wavfile.write(os.path.join(output_dir, f"audio{index}.wav"), rate=rate, data=recording)

        # Advance to the next slot, wrapping after slot ``audio_counts``.
        index = 0 if index >= audio_counts else index + 1
def str2bool(string):
    """Convert the text "true" or "false" (case-insensitive) to a bool.

    Args:
        string: The text to convert.

    Returns:
        True for "true", False for "false", ignoring letter case.

    Raises:
        ValueError: if the lower-cased text is neither "true" nor "false".
    """
    lookup = {"true": True, "false": False}
    normalized = string.lower()
    if normalized not in lookup:
        raise ValueError(f"Expected one of {set(lookup.keys())}, got {string}")
    return lookup[normalized]
if __name__ == '__main__':
    # With --devices true, only list the audio devices known to sounddevice;
    # otherwise run the record/transcribe loop.
    list_devices_only = str2bool(args.devices)
    if not list_devices_only:
        main()
    else:
        print(sd.query_devices())