rhasspy · synesthesiam · Jun 27, 2024 · Jun 27, 2024 · Jul 1, 2024 · Jul 1, 2024
diff --git a/README.md b/README.md
@@ -28,22 +28,23 @@ script/run --uri 'tcp://0.0.0.0:10400'
 
 See `script/run --help` for more options, including:
 
+* `--threshold [0-1]` - wake word detection threshold (default: 0.5); increase to reduce false activations
+* `--vad-threshold [0-1]` - default is 0 (VAD disabled); when greater than 0, use [Silero VAD](https://github.com/snakers4/silero-vad) to filter predictions
 * `--custom-model-dir <DIR>` - look for custom wake word models in `<DIR>`
-* `--debug` - print lots of information to console
+* `--debug` - print extra information to console
+* `--debug-probability` - print all wake word probabilities for each audio chunk (very noisy)
 
 
 ## Docker Image
 
 ``` sh
-docker run -it -p 10400:10400 rhasspy/wyoming-openwakeword \
-    --preload-model 'ok_nabu'
+docker run -it -p 10400:10400 rhasspy/wyoming-openwakeword
 ```
 
 ### Custom Models
 
 ```sh
 docker run -it -p 10400:10400 -v /path/to/custom/models:/custom rhasspy/wyoming-openwakeword \
-    --preload-model 'ok_nabu' \
     --custom-model-dir /custom
 ```
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,3 @@
-tflite-runtime
+openwakeword==0.6.0
 wyoming==1.5.3
 numpy<2
diff --git a/tests/test_openwakeword.py b/tests/test_openwakeword.py
@@ -26,9 +26,10 @@ async def test_openwakeword() -> None:
         "wyoming_openwakeword",
         "--uri",
         "stdio://",
+        "--vad-threshold",
+        "0.5",
         stdin=PIPE,
         stdout=PIPE,
-        stderr=PIPE,
     )
     assert proc.stdin is not None
     assert proc.stdout is not None
@@ -58,7 +59,7 @@ async def test_openwakeword() -> None:
                 model_found = True
                 break
 
-        assert model_found, "Expected 'ok nabu' model"
+        assert model_found, f"Expected 'ok nabu' model in {wake.models}"
         break
 
     # We want to use the 'ok nabu' model

diff --git a/wyoming_openwakeword/VERSION b/wyoming_openwakeword/VERSION
@@ -1 +1 @@
-1.10.0
+2.0.0
diff --git a/wyoming_openwakeword/__main__.py b/wyoming_openwakeword/__main__.py
@@ -4,53 +4,51 @@
 import logging
 from functools import partial
 from pathlib import Path
-from threading import Thread
 
+import openwakeword
 from wyoming.server import AsyncServer
 
 from . import __version__
-from .handler import OpenWakeWordEventHandler, ensure_loaded
-from .openwakeword import embeddings_proc, mels_proc
-from .state import State
+from .const import Settings
+from .handler import OpenWakeWordEventHandler
 
 _LOGGER = logging.getLogger()
 _DIR = Path(__file__).parent
 
 
 async def main() -> None:
     parser = argparse.ArgumentParser()
-    parser.add_argument("--uri", default="stdio://", help="unix:// or tcp://")
     parser.add_argument(
         "--models-dir",
         default=_DIR / "models",
         help="Path to directory with built-in models",
     )
+    parser.add_argument("--uri", default="stdio://", help="unix:// or tcp://")
     parser.add_argument(
         "--custom-model-dir",
         action="append",
         default=[],
         help="Path to directory with custom wake word models",
     )
-    parser.add_argument(
-        "--preload-model",
-        action="append",
-        default=[],
-        help="Name or path of wake word model(s) to pre-load",
-    )
     parser.add_argument(
         "--threshold",
         type=float,
         default=0.5,
         help="Wake word model threshold (0.0-1.0, default: 0.5)",
     )
+    parser.add_argument("--output-dir", help="Path to save audio and detections")
     parser.add_argument(
-        "--trigger-level",
-        type=int,
-        default=1,
-        help="Number of activations before detection (default: 1)",
+        "--refractory-seconds",
+        type=float,
+        default=0.5,
+        help="Seconds before the same wake word can be triggered again",
+    )
+    parser.add_argument(
+        "--vad-threshold",
+        type=float,
+        default=0,
+        help="Use Silero VAD model to filter predictions when greater than 0 (default: 0)",
     )
-    #
-    parser.add_argument("--output-dir", help="Path to save audio and detections")
     #
     parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
     parser.add_argument(
@@ -62,8 +60,6 @@ async def main() -> None:
         help="Log all wake word probabilities (VERY noisy)",
     )
     parser.add_argument("--version", action="store_true", help="Print version and exit")
-    #
-    parser.add_argument("--model", action="append", default=[], help="Deprecated")
 
     args = parser.parse_args()
 
@@ -85,51 +81,61 @@ async def main() -> None:
         args.output_dir.mkdir(parents=True, exist_ok=True)
         _LOGGER.info("Audio will be saved to %s", args.output_dir)
 
-    # Resolve wake word model paths
-    state = State(
-        models_dir=Path(args.models_dir),
-        custom_model_dirs=[Path(d) for d in args.custom_model_dir],
-        debug_probability=args.debug_probability,
-        output_dir=args.output_dir,
-    )
+    models_dir = Path(args.models_dir)
 
-    # Pre-load models
-    ensure_loaded(
-        state,
-        args.preload_model,
-        threshold=args.threshold,
-        trigger_level=args.trigger_level,
-    )
+    if args.vad_threshold > 0:
+        _LOGGER.debug("Using Silero VAD (threshold=%s)", args.vad_threshold)
+
+        # Patch VAD path
+        openwakeword.VAD_MODELS["silero_vad"]["model_path"] = str(
+            models_dir / "silero_vad.onnx"
+        )
+
+        # Patch VAD constructor to use configured model path
+        original_vad_init = openwakeword.VAD.__init__
+
+        def new_vad_init(self, **kwargs):
+            original_vad_init(
+                self,
+                model_path=openwakeword.VAD_MODELS["silero_vad"]["model_path"],
+                **kwargs,
+            )
+
+        openwakeword.VAD.__init__ = new_vad_init
 
-    # audio -> mels
-    mels_thread = Thread(target=mels_proc, daemon=True, args=(state,))
-    mels_thread.start()
+    # Patch model paths
+    for model_dict in (
+        openwakeword.FEATURE_MODELS,
+        openwakeword.VAD_MODELS,
+        openwakeword.MODELS,
+    ):
+        for model_value in model_dict.values():
+            model_path = Path(model_value["model_path"])
+            model_path = models_dir / model_path.name
+            model_value["model_path"] = str(model_path)
 
-    # mels -> embeddings
-    embeddings_thread = Thread(target=embeddings_proc, daemon=True, args=(state,))
-    embeddings_thread.start()
     _LOGGER.info("Ready")
 
     # Start server
     server = AsyncServer.from_uri(args.uri)
 
     try:
-        await server.run(partial(OpenWakeWordEventHandler, args, state))
+        await server.run(
+            partial(
+                OpenWakeWordEventHandler,
+                Settings(
+                    builtin_models_dir=models_dir,
+                    custom_model_dirs=[Path(d) for d in args.custom_model_dir],
+                    detection_threshold=args.threshold,
+                    vad_threshold=args.vad_threshold,
+                    refractory_seconds=args.refractory_seconds,
+                    output_dir=Path(args.output_dir) if args.output_dir else None,
+                    debug_probability=args.debug_probability,
+                ),
+            )
+        )
     except KeyboardInterrupt:
         pass
-    finally:
-        # Graceful shutdown
-        _LOGGER.debug("Shutting down")
-        state.is_running = False
-        state.audio_ready.release()
-        mels_thread.join()
-
-        state.mels_ready.release()
-        embeddings_thread.join()
-
-        for ww_name, ww_state in state.wake_words.items():
-            ww_state.embeddings_ready.release()
-            state.ww_threads[ww_name].join()
 
 
 # -----------------------------------------------------------------------------

diff --git a/wyoming_openwakeword/const.py b/wyoming_openwakeword/const.py
@@ -1,78 +1,14 @@
-from dataclasses import dataclass, field
-from typing import Dict, Final, Optional, Set, Tuple
-
-import numpy as np
-from wyoming.server import AsyncEventHandler
-
-_AUTOFILL_SECONDS: Final = 3
-_MAX_SECONDS: Final = 10
-
-_SAMPLE_RATE: Final = 16000  # 16Khz
-_SAMPLE_WIDTH: Final = 2  # 16-bit samples
-_MAX_SAMPLES: Final = _MAX_SECONDS * _SAMPLE_RATE
-
-SAMPLES_PER_CHUNK: Final = 1280  # 80 ms @ 16Khz
-_BYTES_PER_CHUNK: Final = SAMPLES_PER_CHUNK * _SAMPLE_WIDTH
-MS_PER_CHUNK: Final = SAMPLES_PER_CHUNK // _SAMPLE_RATE
-
-# window = 400, hop length = 160
-_MELS_PER_SECOND: Final = 97
-_MAX_MELS: Final = _MAX_SECONDS * _MELS_PER_SECOND
-MEL_SAMPLES: Final = 1760
-NUM_MELS: Final = 32
-
-EMB_FEATURES: Final = 76  # 775 ms
-EMB_STEP: Final = 8
-_MAX_EMB: Final = _MAX_SECONDS * EMB_STEP
-WW_FEATURES: Final = 96
-
-CLIENT_ID_TYPE = Tuple[str, int]
-
-
-@dataclass
-class WakeWordData:
-    new_embeddings: int = 0
-    embeddings: np.ndarray = field(
-        default_factory=lambda: np.zeros(
-            shape=(_MAX_EMB, WW_FEATURES), dtype=np.float32
-        )
-    )
-    embeddings_timestamp: int = 0
-    ww_windows: Optional[int] = None
-    is_detected: bool = False
-    activations: int = 0
-    threshold: float = 0.5
-    trigger_level: int = 1
-    is_processing: bool = False
-
-    def reset(self) -> None:
-        self.new_embeddings = 0
-        self.embeddings.fill(0)
-        self.is_detected = False
-        self.activations = 0
-        self.is_processing = False
-
-
-@dataclass
-class ClientData:
-    event_handler: AsyncEventHandler
-    new_audio_samples: int = _AUTOFILL_SECONDS * _SAMPLE_RATE
-    audio_timestamp: int = 0
-    audio: np.ndarray = field(
-        default_factory=lambda: np.zeros(shape=(_MAX_SAMPLES,), dtype=np.float32)
-    )
-    new_mels: int = 0
-    mels_timestamp: int = 0
-    mels: np.ndarray = field(
-        default_factory=lambda: np.zeros(shape=(_MAX_MELS, NUM_MELS), dtype=np.float32)
-    )
-    wake_words: Dict[str, WakeWordData] = field(default_factory=dict)
-    wake_word_names: Optional[Set[str]] = None
-
-    def reset(self) -> None:
-        self.audio.fill(0)
-        self.new_audio_samples = _AUTOFILL_SECONDS * _SAMPLE_RATE
-        self.mels.fill(0)
-        self.new_mels = 0
-        for ww_data in self.wake_words.values():
-            ww_data.reset()
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional
+
+
+@dataclass(frozen=True)
+class Settings:
+    builtin_models_dir: Path
+    custom_model_dirs: List[Path]
+    detection_threshold: float
+    vad_threshold: float
+    refractory_seconds: float
+    output_dir: Optional[Path]
+    debug_probability: bool