diff --git a/README.md b/README.md
index 27c4d9b..a6406aa 100644
--- a/README.md
+++ b/README.md
@@ -33,8 +33,8 @@ This assistant can run offline on your local machine, and it respects your priva
 ## TODO
 
 - [x] 🖼️ Support multimodal model: [moondream2](https://huggingface.co/vikhyatk/moondream2).
-- [ ] 🎙️ Add offline STT support: WhisperCPP.
-- [ ] 🗣️ Add wake word detection: "Hey Llama!".
+- [x] 🗣️ Add wake word detection: "Hey Llama!".
+- [ ] 🎙️ Add offline STT support: WhisperCPP. [Experimental Code](llama_assistant/speech_recognition_whisper_experimental.py).
 - [ ] 📚 Support 5 other text models.
 - [ ] 🖼️ Support 5 other multimodal models.
 - [ ] 🧠 Knowledge database: Langchain or LlamaIndex?.
diff --git a/llama_assistant/llama_assistant.py b/llama_assistant/llama_assistant.py
index 01d77d4..98e0c80 100644
--- a/llama_assistant/llama_assistant.py
+++ b/llama_assistant/llama_assistant.py
@@ -36,6 +36,7 @@
     QFont,
     QBitmap,
 )
+from llama_assistant.wake_word_detector import WakeWordDetector
 
 from llama_assistant.custom_plaintext_editor import CustomPlainTextEdit
 from llama_assistant.global_hotkey import GlobalHotkey
@@ -54,6 +55,7 @@ class LlamaAssistant(QMainWindow):
 
     def __init__(self):
         super().__init__()
+        self.wake_word_detector = None
         self.load_settings()
         self.init_ui()
         self.init_tray()
@@ -66,6 +68,18 @@ def __init__(self):
         self.current_text_model = self.settings.get("text_model")
         self.current_multimodal_model = self.settings.get("multimodal_model")
 
+    def init_wake_word_detector(self):
+        if self.wake_word_detector is not None:
+            self.deinit_wake_word_detector()
+        self.wake_word_detector = WakeWordDetector()
+        self.wake_word_detector.wakeword_detected.connect(self.on_wake_word_detected)
+        self.wake_word_detector.start()
+
+    def deinit_wake_word_detector(self):
+        if self.wake_word_detector.running:
+            self.wake_word_detector.stop()
+        self.wake_word_detector = None
+
     def load_settings(self):
         home_dir = Path.home()
         settings_dir = home_dir / "llama_assistant"
@@ -90,8 +104,14 @@ def load_settings(self):
                 "transparency": 90,
                 "text_model": "hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
                 "multimodal_model": "vikhyatk/moondream2",
+                "hey_llama_chat": False,
+                "hey_llama_mic": False,
             }
             self.save_settings()
+        if self.settings.get("hey_llama_chat", False) and self.wake_word_detector is None:
+            self.init_wake_word_detector()
+        if not self.settings.get("hey_llama_chat", False) and self.wake_word_detector is not None:
+            self.deinit_wake_word_detector()
 
         self.current_text_model = self.settings.get("text_model")
         self.current_multimodal_model = self.settings.get("multimodal_model")
@@ -577,6 +597,13 @@ def mouseMoveEvent(self, event):
         self.move(self.x() + delta.x(), self.y() + delta.y())
         self.oldPos = event.globalPosition().toPoint()
 
+    def on_wake_word_detected(self, model_name):
+        self.show()
+        self.activateWindow()
+        self.raise_()
+        if self.settings.get("hey_llama_mic", False):
+            self.start_voice_input()
+
     def toggle_voice_input(self):
         if not self.is_listening:
             self.start_voice_input()
@@ -627,9 +654,16 @@ def on_speech_recognized(self, text):
             self.input_field.setPlainText(f"{current_text}\n{text}")
         else:
             self.input_field.setPlainText(text)
+        self.stop_voice_input()
 
     def on_speech_error(self, error_message):
-        print(error_message)
+        print(f"Speech recognition error: {error_message}")
+        self.stop_voice_input()
+
+    def closeEvent(self, event):
+        if self.wake_word_detector is not None:
+            self.wake_word_detector.stop()
+        super().closeEvent(event)
 
 
 if __name__ == "__main__":
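The two new settings keys end up in the JSON file that `load_settings` reads and writes. A quick way to confirm they were picked up after a first run is to dump that file directly; this is a standalone sketch, not part of the patch, and it assumes the `~/llama_assistant/settings.json` path used by `load_settings` and `setting_dialog.py`:

    import json
    from pathlib import Path

    # Settings file written by save_settings() and read by load_settings()
    settings_file = Path.home() / "llama_assistant" / "settings.json"

    settings = json.loads(settings_file.read_text())
    # Both keys introduced by this change default to False
    print(settings.get("hey_llama_chat"), settings.get("hey_llama_mic"))
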
diff --git a/llama_assistant/resources/wk_hey_llama.onnx b/llama_assistant/resources/wk_hey_llama.onnx
new file mode 100644
index 0000000..177ad40
Binary files /dev/null and b/llama_assistant/resources/wk_hey_llama.onnx differ
diff --git a/llama_assistant/setting_dialog.py b/llama_assistant/setting_dialog.py
index d5d9ab3..ff8a5d7 100644
--- a/llama_assistant/setting_dialog.py
+++ b/llama_assistant/setting_dialog.py
@@ -11,6 +11,8 @@
     QVBoxLayout,
     QHBoxLayout,
     QWidget,
+    QCheckBox,
+    QGroupBox,
 )
 from PyQt6.QtCore import pyqtSignal
 from PyQt6.QtCore import Qt
@@ -28,52 +30,88 @@ def __init__(self, parent=None):
         self.setWindowTitle("Settings")
         self.main_layout = QVBoxLayout(self)
 
-        # Create a form layout for the settings
-        form_widget = QWidget()
-        self.layout = QFormLayout(form_widget)
-        self.layout.setFormAlignment(Qt.AlignmentFlag.AlignLeft)
-        self.layout.setLabelAlignment(Qt.AlignmentFlag.AlignLeft)
+        # General Settings Group
+        self.create_general_settings_group()
+
+        # Appearance Settings Group
+        self.create_appearance_settings_group()
+
+        # Model Settings Group
+        self.create_model_settings_group()
+
+        # Voice Activation Settings Group
+        self.create_voice_activation_settings_group()
+
+        # Create a horizontal layout for the save button
+        button_layout = QHBoxLayout()
+        self.save_button = QPushButton("Save")
+        self.save_button.clicked.connect(self.accept)
+        button_layout.addStretch()
+        button_layout.addWidget(self.save_button)
+
+        # Add the button layout to the main layout
+        self.main_layout.addLayout(button_layout)
+
+        self.load_settings()
+
+    def create_general_settings_group(self):
+        group_box = QGroupBox("General Settings")
+        layout = QFormLayout()
 
         self.shortcut_recorder = ShortcutRecorder()
-        self.layout.addRow("Shortcut:", self.shortcut_recorder)
+        layout.addRow("Shortcut:", self.shortcut_recorder)
 
         self.reset_shortcut_button = QPushButton("Reset Shortcut")
         self.reset_shortcut_button.clicked.connect(self.reset_shortcut)
-        self.layout.addRow(self.reset_shortcut_button)
+        layout.addRow(self.reset_shortcut_button)
+
+        group_box.setLayout(layout)
+        self.main_layout.addWidget(group_box)
+
+    def create_appearance_settings_group(self):
+        group_box = QGroupBox("Appearance Settings")
+        layout = QFormLayout()
 
         self.color_button = QPushButton("Choose Color")
         self.color_button.clicked.connect(self.choose_color)
-        self.layout.addRow("Background Color:", self.color_button)
+        layout.addRow("Background Color:", self.color_button)
 
         self.transparency_slider = QSlider(Qt.Orientation.Horizontal)
         self.transparency_slider.setRange(10, 100)
         self.transparency_slider.setValue(90)
-        self.layout.addRow("Transparency:", self.transparency_slider)
+        layout.addRow("Transparency:", self.transparency_slider)
+
+        group_box.setLayout(layout)
+        self.main_layout.addWidget(group_box)
+
+    def create_model_settings_group(self):
+        group_box = QGroupBox("Model Settings")
+        layout = QFormLayout()
 
-        # Text-only model selection
         self.text_model_combo = QComboBox()
         self.text_model_combo.addItems(self.get_model_names_by_type("text"))
-        self.layout.addRow("Text-only Model:", self.text_model_combo)
+        layout.addRow("Text-only Model:", self.text_model_combo)
 
-        # Multimodal model selection
         self.multimodal_model_combo = QComboBox()
         self.multimodal_model_combo.addItems(self.get_model_names_by_type("image"))
-        self.layout.addRow("Multimodal Model:", self.multimodal_model_combo)
+        layout.addRow("Multimodal Model:", self.multimodal_model_combo)
 
-        # Add the form widget to the main layout
-        self.main_layout.addWidget(form_widget)
+        group_box.setLayout(layout)
+        self.main_layout.addWidget(group_box)
 
-        # Create a horizontal layout for the save button
-        button_layout = QHBoxLayout()
-        self.save_button = QPushButton("Save")
-        self.save_button.clicked.connect(self.accept)
-        button_layout.addStretch()
-        button_layout.addWidget(self.save_button)
+    def create_voice_activation_settings_group(self):
+        group_box = QGroupBox("Voice Activation Settings")
+        layout = QVBoxLayout()
 
-        # Add the button layout to the main layout
-        self.main_layout.addLayout(button_layout)
+        self.hey_llama_chat_checkbox = QCheckBox('Say "Hey Llama" to open chat form')
+        self.hey_llama_chat_checkbox.stateChanged.connect(self.update_hey_llama_mic_state)
+        layout.addWidget(self.hey_llama_chat_checkbox)
 
-        self.load_settings()
+        self.hey_llama_mic_checkbox = QCheckBox('Say "Hey Llama" to activate microphone')
+        layout.addWidget(self.hey_llama_mic_checkbox)
+
+        group_box.setLayout(layout)
+        self.main_layout.addWidget(group_box)
 
     def accept(self):
         self.save_settings()
@@ -91,6 +129,9 @@ def choose_color(self):
     def reset_shortcut(self):
         self.shortcut_recorder.setText("++")
 
+    def update_hey_llama_mic_state(self, state):
+        self.hey_llama_mic_checkbox.setEnabled(state == Qt.CheckState.Checked.value)
+
     def load_settings(self):
         home_dir = Path.home()
         settings_file = home_dir / "llama_assistant" / "settings.json"
@@ -109,6 +150,10 @@ def load_settings(self):
             multimodal_model = settings.get("multimodal_model")
             if multimodal_model in self.get_model_names_by_type("image"):
                 self.multimodal_model_combo.setCurrentText(multimodal_model)
+
+            self.hey_llama_chat_checkbox.setChecked(settings.get("hey_llama_chat", False))
+            self.hey_llama_mic_checkbox.setChecked(settings.get("hey_llama_mic", False))
+            self.hey_llama_mic_checkbox.setEnabled(settings.get("hey_llama_chat", False))
         else:
             self.color = QColor("#1E1E1E")
             self.shortcut_recorder.setText("++")
@@ -120,6 +165,8 @@ def get_settings(self):
             "transparency": self.transparency_slider.value(),
             "text_model": self.text_model_combo.currentText(),
             "multimodal_model": self.multimodal_model_combo.currentText(),
+            "hey_llama_chat": self.hey_llama_chat_checkbox.isChecked(),
+            "hey_llama_mic": self.hey_llama_mic_checkbox.isChecked(),
         }
 
     def save_settings(self):
diff --git a/llama_assistant/speech_recognition_whisper_experimental.py b/llama_assistant/speech_recognition_whisper_experimental.py
new file mode 100644
index 0000000..1da51c4
--- /dev/null
+++ b/llama_assistant/speech_recognition_whisper_experimental.py
@@ -0,0 +1,126 @@
+import threading
+import queue
+import pyaudio
+import wave
+import os
+from pathlib import Path
+import datetime
+from whisper_cpp_python import Whisper
+import re
+import requests
+
+
+class SpeechRecognition:
+    def __init__(self):
+        # Audio settings
+        self.RATE = 16000
+        self.CHUNK = self.RATE
+        self.NB_CHANNELS = 1
+        self.RECORD_SECONDS = 1
+
+        # Whisper settings
+        self.WHISPER_LANGUAGE = "en"
+        self.WHISPER_THREADS = 1
+
+        # Initialize queues
+        self.audio_queue = queue.Queue()
+        self.text_queue = queue.Queue()
+
+        # Set up model path and download if necessary
+        self.model_dir = Path.home() / "llama-assistant" / "models" / "whisper-cpp"
+        self.model_path = self.model_dir / "ggml-tiny-fp16.bin"
+        self.download_model_if_needed()
+
+        # Initialize Whisper model
+        self.whisper = Whisper(model_path=str(self.model_path), n_threads=self.WHISPER_THREADS)
+
+        # Initialize PyAudio
+        self.audio = pyaudio.PyAudio()
+        self.stream = self.audio.open(
+            format=pyaudio.paInt16,
+            channels=self.NB_CHANNELS,
+            rate=self.RATE,
+            input=True,
+            frames_per_buffer=self.CHUNK,
+        )
+
+        # Create temporary folder for audio files
+        self.tmp_audio_folder = Path("./tmp_audio")
+        if not self.tmp_audio_folder.exists():
+            self.tmp_audio_folder.mkdir()
+
+        self.stop_listening = False
+
+    def download_model_if_needed(self):
+        if not self.model_path.exists():
+            print("Downloading Whisper model...")
+            self.model_dir.mkdir(parents=True, exist_ok=True)
+            url = "https://huggingface.co/danielus/ggml-whisper-models/resolve/main/ggml-tiny-fp16.bin"
+            response = requests.get(url)
+            with open(self.model_path, "wb") as f:
+                f.write(response.content)
+            print("Model downloaded successfully.")
+
+    def listen(self):
+        while not self.stop_listening:
+            audio_data = self.stream.read(self.CHUNK)
+            self.audio_queue.put(audio_data)
+
+    def transcribe(self):
+        while not self.stop_listening:
+            if not self.audio_queue.empty():
+                audio_data = self.audio_queue.get()
+
+                # Save audio data to temporary file
+                tmp_filepath = f"./tmp_audio/output_{datetime.datetime.now()}.wav"
+                with wave.open(tmp_filepath, "wb") as wf:
+                    wf.setnchannels(self.NB_CHANNELS)
+                    wf.setsampwidth(2)  # 16-bit audio
+                    wf.setframerate(self.RATE)
+                    wf.writeframes(audio_data)
+
+                # Transcribe audio
+                res = self.whisper.transcribe(file=tmp_filepath, language=self.WHISPER_LANGUAGE)
+                transcription = res["text"]
+
+                # Clean up transcription
+                transcription = re.sub(r"\[.*\]", "", transcription)
+                transcription = re.sub(r"\(.*\)", "", transcription)
+
+                # Add transcription to text queue
+                self.text_queue.put(transcription)
+
+                # Cleanup
+                os.remove(tmp_filepath)
+
+    def start(self):
+        self.stop_listening = False
+        threading.Thread(target=self.listen, daemon=True).start()
+        threading.Thread(target=self.transcribe, daemon=True).start()
+
+    def stop(self):
+        self.stop_listening = True
+        self.stream.stop_stream()
+        self.stream.close()
+        self.audio.terminate()
+
+    def get_transcription(self):
+        if not self.text_queue.empty():
+            return self.text_queue.get()
+        return None
+
+
+# Example usage
+if __name__ == "__main__":
+    recognizer = SpeechRecognition()
+    recognizer.start()
+
+    print("Speech recognition started. Press Ctrl+C to stop.")
+    try:
+        while True:
+            transcription = recognizer.get_transcription()
+            if transcription:
+                print(f"Transcription: {transcription}")
+    except KeyboardInterrupt:
+        print("Stopping speech recognition...")
+        recognizer.stop()
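The module above is still experimental and is not wired into the assistant yet (the README keeps WhisperCPP STT as an open TODO). One possible integration, sketched here purely for illustration, is to poll `get_transcription()` from a `QTimer` and feed the text into the existing `on_speech_recognized` slot; the `setup_whisper_stt` helper and the polling interval below are assumptions, not part of this patch:

    from PyQt6.QtCore import QTimer

    from llama_assistant.speech_recognition_whisper_experimental import SpeechRecognition


    def setup_whisper_stt(window):
        # Hypothetical wiring: start the background transcriber and poll it from the Qt event loop.
        recognizer = SpeechRecognition()
        recognizer.start()

        timer = QTimer(window)
        timer.setInterval(500)  # poll twice per second

        def poll():
            text = recognizer.get_transcription()
            if text:
                window.on_speech_recognized(text)

        timer.timeout.connect(poll)
        timer.start()
        return recognizer, timer
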
diff --git a/llama_assistant/wake_word_detector.py b/llama_assistant/wake_word_detector.py
new file mode 100644
index 0000000..b5e9ba7
--- /dev/null
+++ b/llama_assistant/wake_word_detector.py
@@ -0,0 +1,110 @@
+import pyaudio
+import numpy as np
+from openwakeword.model import Model
+from openwakeword.utils import download_models
+from importlib import resources
+from PyQt6.QtCore import QThread, pyqtSignal
+import time
+
+
+class WakeWordDetector(QThread):
+    wakeword_detected = pyqtSignal(str)
+
+    def __init__(self, chunk_size=1280, inference_framework="onnx"):
+        super().__init__()
+        self.chunk_size = chunk_size
+        self.inference_framework = inference_framework
+        self.rate = 16000
+        self.format = pyaudio.paInt16
+        self.channels = 1
+        self.running = False
+
+        self.audio = pyaudio.PyAudio()
+        self.mic_stream = self.audio.open(
+            format=self.format,
+            channels=self.channels,
+            rate=self.rate,
+            input=True,
+            frames_per_buffer=self.chunk_size,
+        )
+
+        download_models()
+        self.owwModel = None
+        self.n_models = 0
+
+    def load_model(self):
+        with resources.path("llama_assistant.resources", "wk_hey_llama.onnx") as path:
+            self.owwModel = Model(
+                wakeword_models=[str(path)],
+                inference_framework=self.inference_framework,
+            )
+        self.n_models = len(self.owwModel.models.keys())
+
+    def unload_model(self):
+        self.owwModel = None
+        self.n_models = 0
+
+    def run(self):
+        self.running = True
+        self.load_model()
+        while self.running:
+            try:
+                audio = np.frombuffer(self.mic_stream.read(self.chunk_size), dtype=np.int16)
+                prediction = self.owwModel.predict(audio)
+                self.process_prediction(prediction)
+                time.sleep(0.01)  # Small delay to prevent CPU overuse
+            except Exception as e:
+                print(f"Error: {e}")
+                self.stop()
+        self.unload_model()
+
+    def process_prediction(self, prediction):
+        for mdl, scores in self.owwModel.prediction_buffer.items():
+            if scores[-1] > 0.5:
+                self.wakeword_detected.emit(mdl)
+                print(f"Wakeword detected: {mdl}")
+
+    def stop(self):
+        self.running = False
+        self.wait()
+
+    def print_results(self):
+        if not self.owwModel:
+            print("Model not loaded")
+            return
+
+        n_spaces = 16
+        output_string_header = """
+            Model Name | Score | Wakeword Status
+            --------------------------------------
+            """
+
+        for mdl in self.owwModel.prediction_buffer.keys():
+            scores = list(self.owwModel.prediction_buffer[mdl])
+            curr_score = format(scores[-1], ".20f").replace("-", "")
+
+            output_string_header += f"""{mdl}{" "*(n_spaces - len(mdl))} | {curr_score[0:5]} | {"--"+" "*20 if scores[-1] <= 0.5 else "Wakeword Detected!"}
+            """
+
+        print("\033[F" * (4 * self.n_models + 1))
+        print(output_string_header, " ", end="\r")
+
+
+if __name__ == "__main__":
+    from PyQt6.QtWidgets import QApplication
+    import sys
+
+    app = QApplication(sys.argv)
+
+    detector = WakeWordDetector()
+    detector.wakeword_detected.connect(lambda mdl: print(f"Main thread: Wakeword detected - {mdl}"))
+
+    print("\n\n")
+    print("#" * 100)
+    print("Listening for wakewords...")
+    print("#" * 100)
+    print("\n" * (detector.n_models * 3))
+
+    detector.start()
+
+    sys.exit(app.exec())
diff --git a/pyproject.toml b/pyproject.toml
index 0e87b97..8605015 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "llama-assistant"
-version = "0.1.16"
+version = "0.1.17"
 authors = [
     {name = "Viet-Anh Nguyen", email = "vietanh.dev@gmail.com"},
 ]
@@ -32,6 +32,7 @@ dependencies = [
     "pynput",
     "SpeechRecognition",
     "huggingface_hub",
+    "openwakeword",
 ]
 dynamic = []
 
@@ -51,7 +52,7 @@ include = ["llama_assistant*"]
 exclude = ["tests*"]
 
 [tool.setuptools.package-data]
-"llama_assistant.resources" = ["*.png"]
+"llama_assistant.resources" = ["*.png", "*.onnx"]
 
 
 [tool.black]
"0.1.16" +version = "0.1.17" authors = [ {name = "Viet-Anh Nguyen", email = "vietanh.dev@gmail.com"}, ] @@ -32,6 +32,7 @@ dependencies = [ "pynput", "SpeechRecognition", "huggingface_hub", + "openwakeword", ] dynamic = [] @@ -51,7 +52,7 @@ include = ["llama_assistant*"] exclude = ["tests*"] [tool.setuptools.package-data] -"llama_assistant.resources" = ["*.png"] +"llama_assistant.resources" = ["*.png", "*.onnx"] [tool.black] diff --git a/requirements.txt b/requirements.txt index a093266..362d5f8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ SpeechRecognition==3.10.4 markdown==3.7 pynput==1.7.7 llama-cpp-python -huggingface_hub==0.25.1 \ No newline at end of file +huggingface_hub==0.25.1 +openwakeword==0.6.0 diff --git a/setup.cfg b/setup.cfg index 3ec93ec..781ed7b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,4 +30,4 @@ python_requires = >=3.8 where = . [options.package_data] -llama_assistant.resources = *.png \ No newline at end of file +llama_assistant.resources = *.png, *.onnx