Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace Detected by Transcribe for awake_wav & debug rec #143

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ You can play a WAV file when the wake word is detected (locally or remotely), an
* `--awake-wav <WAV>` - played when the wake word is detected
* `--done-wav <WAV>` - played when the voice command is finished

If you want to play audio files other than WAV, use [event commands](#event-commands). Specifically, the `--detection-command` to replace `--awake-wav` and `--transcript-command` to replace `--done-wav`.
If you want to play audio files other than WAV, use [event commands](#event-commands). Specifically, use `--transcribe-command` in place of `--awake-wav` and `--transcript-command` in place of `--done-wav`.

## Audio Enhancements

Expand Down Expand Up @@ -169,6 +169,7 @@ Satellites can respond to events from the server by running commands:
* `--streaming-start-command` - audio has started streaming to server (no stdin)
* `--streaming-stop-command` - audio has stopped streaming to server (no stdin)
* `--detection-command` - wake word is detected (wake word name on stdin)
* `--transcribe-command` - speech-to-text transcription has started (no stdin)
* `--transcript-command` - speech-to-text transcript is returned (text on stdin)
* `--stt-start-command` - user started speaking (no stdin)
* `--stt-stop-command` - user stopped speaking (no stdin)
Expand Down
5 changes: 5 additions & 0 deletions wyoming_satellite/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,10 @@ async def main() -> None:
parser.add_argument(
"--detection-command", help="Command to run when wake word is detected"
)
parser.add_argument(
"--transcribe-command",
help="Command to run when speech to text transcribe is started",
)
parser.add_argument(
"--transcript-command",
help="Command to run when speech to text transcript is returned",
Expand Down Expand Up @@ -369,6 +373,7 @@ async def main() -> None:
detect=split_command(args.detect_command),
detection=split_command(args.detection_command),
played=split_command(args.tts_played_command),
transcribe=split_command(args.transcribe_command),
transcript=split_command(args.transcript_command),
stt_start=split_command(args.stt_start_command),
stt_stop=split_command(args.stt_stop_command),
Expand Down
130 changes: 58 additions & 72 deletions wyoming_satellite/satellite.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from typing import Callable, Dict, Final, List, Optional, Set, Union

from pyring_buffer import RingBuffer
from wyoming.asr import Transcript
from wyoming.asr import Transcribe, Transcript
from wyoming.audio import AudioChunk, AudioFormat, AudioStart, AudioStop
from wyoming.client import AsyncClient
from wyoming.error import Error
Expand Down Expand Up @@ -240,6 +240,27 @@ async def _stop(self) -> None:
async def stopped(self) -> None:
"""Called when satellite has stopped."""

def _debug_recording_start(self, writer: Optional[DebugAudioWriter]):
    """Start a debug recording on ``writer``.

    ``writer`` may be ``None``, in which case nothing happens.
    """
    if writer is None:
        return

    writer.start()

def _debug_recording_stop(self, writer: Optional[DebugAudioWriter]):
    """Stop the debug recording on ``writer``.

    ``writer`` may be ``None``, in which case nothing happens.
    """
    if writer is None:
        return

    writer.stop()

def _debug_recording_write(
    self,
    writer: Optional[DebugAudioWriter],
    event: Event,
    audio_bytes: Optional[bytes],
):
    """Write one piece of microphone audio to a debug recording.

    Nothing happens when ``writer`` is ``None``. When ``audio_bytes`` is
    ``None`` the audio is taken from ``event`` by unpacking it as an
    ``AudioChunk``; otherwise ``event`` is not inspected.
    """
    if writer is None:
        return

    if audio_bytes is None:
        # Caller did not supply raw audio, so unpack it from the event
        audio_bytes = AudioChunk.from_event(event).audio

    writer.write(audio_bytes)

async def event_from_server(self, event: Event) -> None:
"""Called when an event is received from the server."""
if Ping.is_type(event.type):
Expand Down Expand Up @@ -278,16 +299,24 @@ async def event_from_server(self, event: Event) -> None:
elif VoiceStopped.is_type(event.type):
# STT stop
await self.trigger_stt_stop()
elif Transcribe.is_type(event.type):
# STT start
self._debug_recording_start(self.stt_audio_writer)
await self.trigger_transcribe()
elif Transcript.is_type(event.type):
# STT text
self._debug_recording_stop(self.stt_audio_writer)
_LOGGER.debug(event)
await self.trigger_transcript(Transcript.from_event(event))
elif Synthesize.is_type(event.type):
# TTS request
_LOGGER.debug(event)
await self.trigger_synthesize(Synthesize.from_event(event))
elif PauseSatellite.is_type(event.type):
self._debug_recording_stop(self.stt_audio_writer)
elif Error.is_type(event.type):
_LOGGER.warning(event)
self._debug_recording_stop(self.stt_audio_writer)
await self.trigger_error(Error.from_event(event))

# Forward everything except audio to event service
Expand Down Expand Up @@ -813,16 +842,20 @@ async def trigger_detect(self) -> None:
async def trigger_detection(self, detection: Detection) -> None:
"""Called when wake word is detected."""
await run_event_command(self.settings.event.detection, detection.name)
await self._play_wav(
self.settings.snd.awake_wav,
mute_microphone=self.settings.mic.mute_during_awake_wav,
)

async def trigger_played(self) -> None:
    """Called when audio has stopped playing.

    Runs the configured ``played`` event command, then forwards a
    ``Played`` event.
    """
    played_command = self.settings.event.played
    await run_event_command(played_command)

    played_event = Played().event()
    await self.forward_event(played_event)

async def trigger_transcribe(self) -> None:
    """Called when speech-to-text is started.

    Runs the configured ``transcribe`` event command, then plays the
    awake WAV, passing along the ``mute_during_awake_wav`` microphone
    setting.
    """
    transcribe_command = self.settings.event.transcribe
    await run_event_command(transcribe_command)

    # Settings are read only after the event command has finished
    awake_wav = self.settings.snd.awake_wav
    mute_microphone = self.settings.mic.mute_during_awake_wav
    await self._play_wav(awake_wav, mute_microphone=mute_microphone)

async def trigger_transcript(self, transcript: Transcript) -> None:
"""Called when speech-to-text text is received."""
await run_event_command(self.settings.event.transcript, transcript.text)
Expand Down Expand Up @@ -935,22 +968,13 @@ async def event_from_server(self, event: Event) -> None:
elif PauseSatellite.is_type(event.type):
self.is_streaming = False
_LOGGER.info("Satellite paused")
elif Detection.is_type(event.type):
# Start debug recording
if self.stt_audio_writer is not None:
self.stt_audio_writer.start()
elif Transcript.is_type(event.type) or Error.is_type(event.type):
# Stop debug recording
if self.stt_audio_writer is not None:
self.stt_audio_writer.stop()

if Transcript.is_type(event.type):
# We're always streaming
_LOGGER.info("Streaming audio")

# Re-trigger streaming start even though we technically don't stop
# so the event service can reset LEDs, etc.
await self.trigger_streaming_start()
elif Transcript.is_type(event.type):
# We're always streaming
_LOGGER.info("Streaming audio")

# Re-trigger streaming start even though we technically don't stop
# so the event service can reset LEDs, etc.
await self.trigger_streaming_start()

async def event_from_mic(
self, event: Event, audio_bytes: Optional[bytes] = None
Expand All @@ -963,12 +987,7 @@ async def event_from_mic(
await self.event_to_server(event)

# Debug audio recording
if self.stt_audio_writer is not None:
if audio_bytes is None:
chunk = AudioChunk.from_event(event)
audio_bytes = chunk.audio

self.stt_audio_writer.write(audio_bytes)
self._debug_recording_write(self.stt_audio_writer, event, audio_bytes)


# -----------------------------------------------------------------------------
Expand Down Expand Up @@ -1010,10 +1029,6 @@ async def event_from_server(self, event: Event) -> None:
if RunSatellite.is_type(event.type):
self._is_paused = False
_LOGGER.info("Waiting for speech")
elif Detection.is_type(event.type):
# Start debug recording
if self.stt_audio_writer is not None:
self.stt_audio_writer.start()
elif (
Transcript.is_type(event.type)
or Error.is_type(event.type)
Expand All @@ -1025,10 +1040,6 @@ async def event_from_server(self, event: Event) -> None:

self.is_streaming = False

# Stop debug recording
if self.stt_audio_writer is not None:
self.stt_audio_writer.stop()

async def event_from_mic(
self, event: Event, audio_bytes: Optional[bytes] = None
) -> None:
Expand All @@ -1043,13 +1054,7 @@ async def event_from_mic(
chunk: Optional[AudioChunk] = None

# Debug audio recording
if self.stt_audio_writer is not None:
if audio_bytes is None:
# Need to unpack
chunk = AudioChunk.from_event(event)
audio_bytes = chunk.audio

self.stt_audio_writer.write(audio_bytes)
self._debug_recording_write(self.stt_audio_writer, event, audio_bytes)

if (
self.is_streaming
Expand All @@ -1060,9 +1065,7 @@ async def event_from_mic(
self.is_streaming = False
self.timeout_seconds = None

# Stop debug recording
if self.stt_audio_writer is not None:
self.stt_audio_writer.stop()
self._debug_recording_stop(self.stt_audio_writer)

# Stop pipeline
await self.event_to_server(AudioStop().event())
Expand Down Expand Up @@ -1162,6 +1165,11 @@ def __init__(self, settings: SatelliteSettings) -> None:
self._wake_info: Optional[Info] = None
self._wake_info_ready = asyncio.Event()

def _debug_recording_start(self, writer: Optional[DebugAudioWriter]):
    """Start a debug recording stamped with the stored timestamp.

    Overrides the base method so that ``writer.start`` receives
    ``self._debug_recording_timestamp``. A ``None`` writer is ignored.
    """
    if writer is None:
        return

    writer.start(timestamp=self._debug_recording_timestamp)

async def event_from_server(self, event: Event) -> None:
# Only check event types once
is_run_satellite = False
Expand All @@ -1185,10 +1193,6 @@ async def event_from_server(self, event: Event) -> None:
# play the "done" WAV.
self.is_streaming = False

# Stop debug recording (stt)
if self.stt_audio_writer is not None:
self.stt_audio_writer.stop()

await super().event_from_server(event)

if is_run_satellite or is_transcript or is_error or is_pause_satellite:
Expand All @@ -1209,19 +1213,14 @@ async def event_from_server(self, event: Event) -> None:

# Start debug recording (wake)
self._debug_recording_timestamp = time.monotonic_ns()
if self.wake_audio_writer is not None:
self.wake_audio_writer.start(
timestamp=self._debug_recording_timestamp
)
self._debug_recording_start(self.wake_audio_writer)

async def trigger_server_disonnected(self) -> None:
await super().trigger_server_disonnected()

self.is_streaming = False

# Stop debug recording (stt)
if self.stt_audio_writer is not None:
self.stt_audio_writer.stop()
self._debug_recording_stop(self.stt_audio_writer)

await self.trigger_streaming_stop()

Expand All @@ -1236,16 +1235,8 @@ async def event_from_mic(
return

# Debug audio recording
if (self.wake_audio_writer is not None) or (self.stt_audio_writer is not None):
if audio_bytes is None:
chunk = AudioChunk.from_event(event)
audio_bytes = chunk.audio

if self.wake_audio_writer is not None:
self.wake_audio_writer.write(audio_bytes)

if self.stt_audio_writer is not None:
self.stt_audio_writer.write(audio_bytes)
self._debug_recording_write(self.wake_audio_writer, event, audio_bytes)
self._debug_recording_write(self.stt_audio_writer, event, audio_bytes)

if self.is_streaming:
# Forward to server
Expand Down Expand Up @@ -1276,12 +1267,7 @@ async def event_from_wake(self, event: Event) -> None:
return

# Stop debug recording (wake)
if self.wake_audio_writer is not None:
self.wake_audio_writer.stop()

# Start debug recording (stt)
if self.stt_audio_writer is not None:
self.stt_audio_writer.start(timestamp=self._debug_recording_timestamp)
self._debug_recording_stop(self.wake_audio_writer)

_LOGGER.debug(detection)

Expand Down
1 change: 1 addition & 0 deletions wyoming_satellite/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ class EventSettings(ServiceSettings):
detect: Optional[List[str]] = None
detection: Optional[List[str]] = None
played: Optional[List[str]] = None
transcribe: Optional[List[str]] = None
transcript: Optional[List[str]] = None
stt_start: Optional[List[str]] = None
stt_stop: Optional[List[str]] = None
Expand Down