-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfasterwhisper.py
125 lines (109 loc) · 6.2 KB
/
fasterwhisper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import numpy as np
import speech_recognition as sr
from faster_whisper import WhisperModel
from datetime import datetime, timedelta
from queue import Queue
from time import sleep
from collections import deque
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
text_queue = Queue()
def initial_model():
global text_queue
print("开始初始化模型")
record_timeout = 5 # 默认录音超时时间,限制recognizer最长连续录制时间
phrase_timeout = 0.5 # 两个小段录音之间有多少空白时间间隔时,我们会将其视为新的一行。在record_timeout》phrase_timeout时,无作用
energy_threshold = 1000 # 默认能量阈值,禁止自动能量阈值,防止不能自动停止。
model_size = "small"
model_path = "./model/" + model_size + ".pt"
device = "cpu"
# device = "cuda" if torch.cuda.is_available() else "cpu"
print("加载模型中,如第一次使用会自动下载模型,预计300M大小。等到提示‘Model loaded,开始监听后’,则加载完毕。")
audio_model = WhisperModel(model_size, device="cpu", compute_type="int8")
print("Using_device:", device)
# sr一个有用的功能,即能够检测语音何时结束。
recorder = sr.Recognizer()
recorder.energy_threshold = energy_threshold
recorder.dynamic_energy_threshold = False
source = sr.Microphone(sample_rate=16000) # 麦克风输入源,采样率16000Hz,whisper模型要求16000Hz
# 自适应噪声,只设置一次,禁止动态
with source:
print("Adjusting for ambient noise...energy_threshold:", recorder.energy_threshold)
recorder.adjust_for_ambient_noise(source, duration=1)
print("Ready.energy_threshold:", recorder.energy_threshold)
transcription = deque(maxlen=10) # 用于保存识别结果的列表
transcription.append("") # 初始值为空,防止直接调用-1 索引异常
# transcription = [] # 用于保存识别结果的列表
# Thread safe Queue for passing data from the threaded recording callback.
data_queue = Queue()
# The last time a recording was retrieved from the queue.
phrase_time = None
def record_callback(_, audio: sr.AudioData) -> None :
# Grab the raw bytes and push it into the thread safe queue.
print("麦克风有新内容callback,队列新增数据", datetime.now())
data = audio.get_raw_data()
data_queue.put(data)
# 用于在后台持续监听音频源的函数。会将监听音频传入callback后加入转换队列
recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)
print("Model loaded,开始监听.\n")
phrase_time = datetime.now() # 初始时间戳
while True:
try:
now = datetime.now()
# print("循环监测中")
if not data_queue.empty() : # 队列中有数据,有麦克风监测的数据,则进行间隔超时判断
phrase_complete = False
if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
# 对列有数据未转写,且间隔超时,则认为是新的一句,
# 注意需要允许sr记录的时间record-timeout小于间隔时间phase——timeout,否则会记录到语句结束,每次记录肯定是新的句子
phrase_complete = True
print("队列中有数据,间隔超时,认为是一句结束,添加新的一行.")
phrase_time = now # 重新计时
# Combine audio data from queue
audio_data = b''.join(data_queue.queue)
data_queue.queue.clear()
# 如果是cpu,无法处理int16,需要转换为float32
# np.int16表示16位有符号整数,其取值范围是-32768到32767。需要归一化,避免使用模型归一化,实际不除不准确
print("开始转换格式开始时间", datetime.now())
if device == "cpu":
audio_np = np.frombuffer(audio_data, dtype=np.int16)/ 32768
# audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
# audio_np = np.frombuffer(audio_data, dtype=np.int16)
else:
audio_np = np.frombuffer(audio_data, dtype=np.int16) / 32768.0
print("准备转换开始时间", datetime.now())
segments, info = audio_model.transcribe(audio_np, beam_size=3, language="zh", without_timestamps=True, )
print("转换结束时间", datetime.now())
text_transformed = ""
for segment in segments:
# print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
text_transformed += segment.text
print("识别结果递归完毕", datetime.now())
# If we detected a pause between recordings, add a new item to our transcription.
# Other——wise edit the existing one.
if phrase_complete:
transcription.append(text_transformed)
text_queue.put(text_transformed)
print("队列加入数据",text_transformed, "队列是否为空", text_queue.empty())
else :
transcription[-1] = text_transformed # 断句没超时,说明是连续的句子,是加上新的词语重复转换,编辑最后一行,
# print("开始打印", datetime.now())
for line in transcription :
print(line)
# Flush stdout.
# print('', end='', flush=True)
# print("打印结束", datetime.now())
else:
# 无数据时间隔一段时间再循环,防止CPU占用过高。检测频率越高,越早触发循环
sleep(0.1)
except KeyboardInterrupt :
print("Ctrl+C,KeyboardInterrupt,退出程序.")
break
except Exception as e :
print("Error:", e)
break
print("\n\nTranscription:")
for line in transcription :
print(line)
if __name__ == "__main__" :
initial_model()