@@ -98,56 +98,56 @@ class WhisperFullParams(ctypes.Structure):
9898whisper .whisper_full_get_segment_text .restype = ctypes .c_char_p
9999ctx = whisper .whisper_init_from_file (fname_model .encode ('utf-8' ))
100100
101- data_30_secs = np .zeros (SAMPLES_30_SECS , dtype = np .float32 )
102- written_samples = 0 # nb. of samples written to data_30_secs for the cur. inference
103101
104-
def on_audio_frame(frame: livekit.AudioFrame):
    """Buffer one audio frame and run whisper once enough audio is collected.

    The frame is resampled to ``WHISPER_SAMPLE_RATE`` mono, converted to
    float32 and appended to the module-level ``data_30_secs`` buffer, after
    the ``SAMPLES_KEEP`` samples carried over from the previous window.
    Once at least ``SAMPLES_STEP`` new samples have been written,
    ``whisper_full`` is run on the buffer, every resulting segment is logged,
    and the last ``SAMPLES_KEEP`` samples of the window are copied to the
    front of the buffer for the next inference.

    Args:
        frame: the audio frame delivered by the audio stream.
    """
    global data_30_secs, written_samples

    # whisper requires 16kHz mono, so resample the data
    # also convert the samples from int16 to float32
    frame = frame.remix_and_resample(WHISPER_SAMPLE_RATE, 1)
    data = np.array(frame.data, dtype=np.float32) / 32768.0

    # new samples are appended after the carried-over SAMPLES_KEEP samples
    data_start = SAMPLES_KEEP + written_samples
    data_30_secs[data_start:data_start + len(data)] = data
    written_samples += len(data)

    if written_samples < SAMPLES_STEP:
        return

    params = whisper.whisper_full_default_params(
        WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY)
    params.print_realtime = False
    params.print_progress = False

    # number of valid samples in the buffer: carried-over + newly written
    window_end = SAMPLES_KEEP + written_samples

    ctx_ptr = ctypes.c_void_p(ctx)
    data_ptr = data_30_secs.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    res = whisper.whisper_full(ctx_ptr, params, data_ptr, window_end)

    if res != 0:
        # written_samples is left untouched, so the next frame retries with
        # the same (slightly longer) window
        logging.error("error while running inference: %s", res)
        return

    n_segments = whisper.whisper_full_n_segments(ctx_ptr)
    for i in range(n_segments):
        t0 = whisper.whisper_full_get_segment_t0(ctx_ptr, i)
        t1 = whisper.whisper_full_get_segment_t1(ctx_ptr, i)
        txt = whisper.whisper_full_get_segment_text(ctx_ptr, i)

        # NOTE(review): whisper.cpp seems to report segment times in 10 ms
        # units - confirm that dividing by 1000.0 is the intended scaling
        logging.info(
            f"{t0 / 1000.0:.3f} - {t1 / 1000.0:.3f} : {txt.decode('utf-8')}")

    # write old data to the beginning of the buffer (SAMPLES_KEEP): keep the
    # LAST SAMPLES_KEEP samples of the window that was just transcribed.
    # .copy() guards against source/destination overlap.
    data_30_secs[:SAMPLES_KEEP] = data_30_secs[
        window_end - SAMPLES_KEEP:window_end].copy()
    written_samples = 0
async def whisper_task(stream: livekit.AudioStream):
    """Transcribe an audio stream with whisper using a sliding window.

    Every frame read from *stream* is resampled to ``WHISPER_SAMPLE_RATE``
    mono, converted to float32 and appended to a task-local buffer, after
    the ``SAMPLES_KEEP`` samples carried over from the previous window.
    Once at least ``SAMPLES_STEP`` new samples have been written,
    ``whisper_full`` is run on the buffer, every resulting segment is logged,
    and the last ``SAMPLES_KEEP`` samples of the window are copied to the
    front of the buffer for the next inference.

    The task finishes when the stream is exhausted, or as soon as an
    inference returns a non-zero status (the error is logged).

    Args:
        stream: the audio stream to iterate frames from.
    """
    data_30_secs = np.zeros(SAMPLES_30_SECS, dtype=np.float32)
    written_samples = 0  # nb. of samples written to data_30_secs for the cur. inference

    async for frame in stream:
        # whisper requires 16kHz mono, so resample the data
        # also convert the samples from int16 to float32
        frame = frame.remix_and_resample(WHISPER_SAMPLE_RATE, 1)
        data = np.array(frame.data, dtype=np.float32) / 32768.0

        # new samples are appended after the carried-over SAMPLES_KEEP samples
        data_start = SAMPLES_KEEP + written_samples
        data_30_secs[data_start:data_start + len(data)] = data
        written_samples += len(data)

        if written_samples < SAMPLES_STEP:
            continue

        params = whisper.whisper_full_default_params(
            WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY)
        params.print_realtime = False
        params.print_progress = False

        # number of valid samples in the buffer: carried-over + newly written
        window_end = SAMPLES_KEEP + written_samples

        ctx_ptr = ctypes.c_void_p(ctx)
        data_ptr = data_30_secs.ctypes.data_as(
            ctypes.POINTER(ctypes.c_float))
        res = whisper.whisper_full(ctx_ptr, params, data_ptr, window_end)

        if res != 0:
            # returning here ends the whole task: no further frames of this
            # stream are transcribed
            logging.error("error while running inference: %s", res)
            return

        n_segments = whisper.whisper_full_n_segments(ctx_ptr)
        for i in range(n_segments):
            t0 = whisper.whisper_full_get_segment_t0(ctx_ptr, i)
            t1 = whisper.whisper_full_get_segment_t1(ctx_ptr, i)
            txt = whisper.whisper_full_get_segment_text(ctx_ptr, i)

            # NOTE(review): whisper.cpp seems to report segment times in
            # 10 ms units - confirm that dividing by 1000.0 is intended
            logging.info(
                f"{t0 / 1000.0:.3f} - {t1 / 1000.0:.3f} : {txt.decode('utf-8')}")

        # write old data to the beginning of the buffer (SAMPLES_KEEP): keep
        # the LAST SAMPLES_KEEP samples of the window that was just
        # transcribed. .copy() guards against source/destination overlap.
        data_30_secs[:SAMPLES_KEEP] = data_30_secs[
            window_end - SAMPLES_KEEP:window_end].copy()
        written_samples = 0
151151
152152
153153async def main ():
@@ -172,7 +172,7 @@ def on_track_subscribed(track: livekit.Track,
172172 logging .info ("starting listening to: %s" , participant .identity )
173173 nonlocal audio_stream
174174 audio_stream = livekit .AudioStream (track )
175- audio_stream . add_listener ( 'frame_received' , on_audio_frame )
175+ asyncio . create_task ( whisper_task ( audio_stream ) )
176176
177177 try :
178178 logging .info ("connecting to %s" , URL )
0 commit comments