-
Notifications
You must be signed in to change notification settings - Fork 163
/
Copy pathmain_typer_assistant.py
100 lines (88 loc) · 3.86 KB
/
main_typer_assistant.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from RealtimeSTT import AudioToTextRecorder
from modules.assistant_config import get_config
from modules.typer_agent import TyperAgent
from modules.utils import create_session_logger_id, setup_logging
import logging
import typer
from typing import List
import os
# Typer application object; the functions decorated with @app.command() below
# are registered on it as CLI sub-commands.
app = typer.Typer()
# Minimal sub-command that just prints "pong".
# (Documented with comments rather than a docstring so the CLI help text
# produced from this function stays unchanged.)
@app.command()
def ping():
    print("pong")
@app.command()
def awaken(
    typer_file: str = typer.Option(
        ..., "--typer-file", "-f", help="Path to typer commands file"
    ),
    scratchpad: str = typer.Option(
        ..., "--scratchpad", "-s", help="Path to scratchpad file"
    ),
    context_files: List[str] = typer.Option(
        [], "--context", "-c", help="List of context files"
    ),
    mode: str = typer.Option(
        "default",
        "--mode",
        "-m",
        help="Options: ('default', 'execute', 'execute-no-scratch'). Execution mode: default (no exec), execute (exec + scratch), execute-no-scratch (exec only)",
    ),
):
    """Run STT interface that processes speech into typer commands"""
    # Flow:
    #   1. Build the assistant from the typer commands file and the scratchpad.
    #   2. Create a speech-to-text recorder.
    #   3. Loop forever, handing every transcription to process_text(), which
    #      only reacts when the configured assistant name is present in the text.
    #   4. On Ctrl+C (the advertised way to exit) leave the loop and always shut
    #      the recorder down so it is not left running.
    #
    # NOTE(review): `mode` is forwarded to the assistant as-is; it is not
    # validated here against the three values listed in the help text.

    # build_agent receives the scratchpad wrapped in a one-element list.
    # It may return a different typer_file value, which replaces the argument.
    assistant, typer_file, _ = TyperAgent.build_agent(typer_file, [scratchpad])

    print("🎤 Speak now... (press Ctrl+C to exit)")

    recorder = AudioToTextRecorder(
        spinner=False,
        # wake_words="deep" # specific wake words to trigger the assistant using the realtime-stt library. we do this manually below so we can use any word.
        # realtime_processing_pause=0.3,
        post_speech_silence_duration=1.5,  # how long to wait after speech ends before processing
        # compute_type="int8",
        compute_type="float32",
        model="tiny.en",  # VERY fast (.5s), but not accurate
        # model="small.en",  # decent speed (1.5s), improved accuracy
        # Beam size controls how many alternative transcription paths are explored
        # Higher values = more accurate but slower, lower values = faster but less accurate
        # beam_size=3,
        # beam_size=5,
        beam_size=8,
        # Batch size controls how many audio chunks are processed together
        # Higher values = faster processing but uses more memory, lower values = slower processing but uses less memory
        batch_size=25,
        # model="large-v3",  # very slow, but accurate
        # model="distil-large-v3",  # very slow (but faster than large-v3) but accurate
        # realtime_model_type="tiny.en",  # realtime models are used for the on_realtime_transcription_update() callback
        # realtime_model_type="large-v3",
        language="en",
        print_transcription_time=True,
        # enable_realtime_transcription=True,
        # on_realtime_transcription_update=lambda text: print(
        #     f"🎤 on_realtime_transcription_update(): {text}"
        # ),
        # on_realtime_transcription_stabilized=lambda text: print(
        #     f"🎤 on_realtime_transcription_stabilized(): {text}"
        # ),
        # on_recorded_chunk=lambda chunk: print(f"🎤 on_recorded_chunk(): {chunk}"),
        # on_transcription_start=lambda: print("🎤 on_transcription_start()"),
        # on_recording_stop=lambda: print("🎤 on_transcription_stop()"),
        # on_recording_start=lambda: print("🎤 on_recording_start()"),
    )

    def process_text(text):
        # Callback for each finished transcription.
        print(f"\n🎤 Heard: {text}")
        try:
            # Manual wake-word check: ignore anything that does not mention
            # the configured assistant name (case-insensitive substring match).
            assistant_name = get_config("typer_assistant.assistant_name")
            if assistant_name.lower() not in text.lower():
                print(f"🤖 Not {assistant_name} - ignoring")
                return

            # Stop recording while the assistant handles the request, then
            # start it again once the response has been printed.
            recorder.stop()
            output = assistant.process_text(
                text, typer_file, scratchpad, context_files, mode
            )
            print(f"🤖 Response:\n{output}")
            recorder.start()
        except Exception as e:
            # Best-effort: report the failure and keep the listening loop alive.
            print(f"❌ Error: {e}")

    try:
        while True:
            recorder.text(process_text)
    except KeyboardInterrupt:
        # Ctrl+C is the documented exit path, so treat it as a normal stop.
        print("\n👋 Stopping assistant...")
    finally:
        # Always release the recorder, whether the loop ended via Ctrl+C or an
        # unexpected error escaping recorder.text().
        recorder.shutdown()
# Script entry point: run the Typer app only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    app()