From b8bdd8b2b4ac1daa109a1310078411470824738c Mon Sep 17 00:00:00 2001
From: boocmp <boocmp@yandex.ru>
Date: Thu, 22 Aug 2024 20:38:05 +0700
Subject: [PATCH] Adjust VAD options. Check for the last chunk.

---
 src/stream_transcriber.py | 28 ++++++++++++++++------------
 src/stt_api.py            |  2 --
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/stream_transcriber.py b/src/stream_transcriber.py
index ecacd44..5751291 100644
--- a/src/stream_transcriber.py
+++ b/src/stream_transcriber.py
@@ -42,12 +42,15 @@ def __init__(self):
         self._vad_detected_offset = 0
         self._speech_audio_buffers = []
         self._speech_timestamps = []
+        self._last_chunk_received = False
 
         self._vad_options = VadOptions(
-            min_speech_duration_ms=60, min_silence_duration_ms=60
+            min_speech_duration_ms=125, min_silence_duration_ms=125, speech_pad_ms=125
         )
 
     def consume(self, stream_data: bytes):
+        self._last_chunk_received = len(stream_data) == 0
+
         self._raw_stream_data += stream_data
         try:
             raw_audio_buffer = decode_audio(io.BytesIO(self._raw_stream_data))
@@ -64,16 +67,17 @@ def consume(self, stream_data: bytes):
         if not speech_timestamps:
             return
 
-        # remove the speech chunks which probably are not ended
-        while (
-            speech_timestamps
-            and speech_timestamps[-1]["end"]
-            > len(raw_audio_buffer) - self._vad_options.min_silence_duration_ms * 16
-        ):
-            del speech_timestamps[-1]
+        if not self._last_chunk_received:
+            # drop trailing speech chunks that have probably not ended yet
+            while (
+                speech_timestamps
+                and speech_timestamps[-1]["end"]
+                > len(raw_audio_buffer) - self._vad_options.min_silence_duration_ms * 16
+            ):
+                del speech_timestamps[-1]
 
-        if not speech_timestamps:
-            return
+            if not speech_timestamps:
+                return
 
         self._vad_detected_offset += speech_timestamps[-1]["end"]
 
@@ -105,7 +109,7 @@ def consume(self, stream_data: bytes):
 
         [print(buf2secs(x)) for x in self._speech_audio_buffers]
 
-        print(len2secs(self._raw_stream_data_duration), self._vad_detected_offset)
+        print(self._raw_stream_data_duration, len2secs(self._vad_detected_offset))
 
     def should_transcribe(self):
         if not self._speech_audio_buffers:
@@ -114,7 +118,7 @@ def should_transcribe(self):
             return True
         if self._raw_stream_data_duration > 3:
             return True
-        return False
+        return self._last_chunk_received
 
     def get_speech_audio(self) -> bytes:
         assert self.should_transcribe()
diff --git a/src/stt_api.py b/src/stt_api.py
index fe1a2ac..9a6f1e0 100644
--- a/src/stt_api.py
+++ b/src/stt_api.py
@@ -64,8 +64,6 @@ async def handleUpstream(
         async with ipc.client.Publisher(pair) as pipe:
             try:
                 async for chunk in request.stream():
-                    if len(chunk) == 0:
-                        break
                     stream.consume(chunk)
 
                     while stream.should_transcribe():