From b8bdd8b2b4ac1daa109a1310078411470824738c Mon Sep 17 00:00:00 2001 From: boocmp Date: Thu, 22 Aug 2024 20:38:05 +0700 Subject: [PATCH] Vad options. Check for last chunk. --- src/stream_transcriber.py | 28 ++++++++++++++++------------ src/stt_api.py | 2 -- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/stream_transcriber.py b/src/stream_transcriber.py index ecacd44..5751291 100644 --- a/src/stream_transcriber.py +++ b/src/stream_transcriber.py @@ -42,12 +42,15 @@ def __init__(self): self._vad_detected_offset = 0 self._speech_audio_buffers = [] self._speech_timestamps = [] + self._last_chunk_received = False self._vad_options = VadOptions( - min_speech_duration_ms=60, min_silence_duration_ms=60 + min_speech_duration_ms=125, min_silence_duration_ms=125, speech_pad_ms=125 ) def consume(self, stream_data: bytes): + self._last_chunk_received = len(stream_data) == 0 + self._raw_stream_data += stream_data try: raw_audio_buffer = decode_audio(io.BytesIO(self._raw_stream_data)) @@ -64,16 +67,17 @@ def consume(self, stream_data: bytes): if not speech_timestamps: return - # remove the speech chunks which probably are not ended - while ( - speech_timestamps - and speech_timestamps[-1]["end"] - > len(raw_audio_buffer) - self._vad_options.min_silence_duration_ms * 16 - ): - del speech_timestamps[-1] + if not self._last_chunk_received: + # remove the speech chunks which probably are not ended + while ( + speech_timestamps + and speech_timestamps[-1]["end"] + > len(raw_audio_buffer) - self._vad_options.min_silence_duration_ms * 16 + ): + del speech_timestamps[-1] - if not speech_timestamps: - return + if not speech_timestamps: + return self._vad_detected_offset += speech_timestamps[-1]["end"] @@ -105,7 +109,7 @@ def consume(self, stream_data: bytes): [print(buf2secs(x)) for x in self._speech_audio_buffers] - print(len2secs(self._raw_stream_data_duration), self._vad_detected_offset) + print(self._raw_stream_data_duration, len2secs(self._vad_detected_offset)) def should_transcribe(self): if not self._speech_audio_buffers: @@ -114,7 +118,7 @@ def should_transcribe(self): return True if self._raw_stream_data_duration > 3: return True - return False + return self._last_chunk_received def get_speech_audio(self) -> bytes: assert self.should_transcribe() diff --git a/src/stt_api.py b/src/stt_api.py index fe1a2ac..9a6f1e0 100644 --- a/src/stt_api.py +++ b/src/stt_api.py @@ -64,8 +64,6 @@ async def handleUpstream( async with ipc.client.Publisher(pair) as pipe: try: async for chunk in request.stream(): - if len(chunk) == 0: - break stream.consume(chunk) while stream.should_transcribe():