VAD options. Check for last chunk.

brave-experiments · Aug 22, 2024 · b8bdd8b
1 parent 2741d3a
commit b8bdd8b
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 14 deletions.
diff --git a/src/stream_transcriber.py b/src/stream_transcriber.py
@@ -42,12 +42,15 @@ def __init__(self):
         self._vad_detected_offset = 0
         self._speech_audio_buffers = []
         self._speech_timestamps = []
+        self._last_chunk_received = False
 
         self._vad_options = VadOptions(
-            min_speech_duration_ms=60, min_silence_duration_ms=60
+            min_speech_duration_ms=125, min_silence_duration_ms=125, speech_pad_ms=125
         )
 
     def consume(self, stream_data: bytes):
+        self._last_chunk_received = len(stream_data) == 0
+
         self._raw_stream_data += stream_data
         try:
             raw_audio_buffer = decode_audio(io.BytesIO(self._raw_stream_data))
@@ -64,16 +67,17 @@ def consume(self, stream_data: bytes):
         if not speech_timestamps:
             return
 
-        # remove the speech chunks which probably are not ended
-        while (
-            speech_timestamps
-            and speech_timestamps[-1]["end"]
-            > len(raw_audio_buffer) - self._vad_options.min_silence_duration_ms * 16
-        ):
-            del speech_timestamps[-1]
+        if not self._last_chunk_received:
+            # remove the speech chunks which probably are not ended
+            while (
+                speech_timestamps
+                and speech_timestamps[-1]["end"]
+                > len(raw_audio_buffer) - self._vad_options.min_silence_duration_ms * 16
+            ):
+                del speech_timestamps[-1]
 
-        if not speech_timestamps:
-            return
+            if not speech_timestamps:
+                return
 
         self._vad_detected_offset += speech_timestamps[-1]["end"]
 
@@ -105,7 +109,7 @@ def consume(self, stream_data: bytes):
 
         [print(buf2secs(x)) for x in self._speech_audio_buffers]
 
-        print(len2secs(self._raw_stream_data_duration), self._vad_detected_offset)
+        print(self._raw_stream_data_duration, len2secs(self._vad_detected_offset))
 
     def should_transcribe(self):
         if not self._speech_audio_buffers:
@@ -114,7 +118,7 @@ def should_transcribe(self):
             return True
         if self._raw_stream_data_duration > 3:
             return True
-        return False
+        return self._last_chunk_received
 
     def get_speech_audio(self) -> bytes:
         assert self.should_transcribe()

diff --git a/src/stt_api.py b/src/stt_api.py
@@ -64,8 +64,6 @@ async def handleUpstream(
         async with ipc.client.Publisher(pair) as pipe:
             try:
                 async for chunk in request.stream():
-                    if len(chunk) == 0:
-                        break
                     stream.consume(chunk)
 
                     while stream.should_transcribe():