diff --git a/README.md b/README.md
index 3fea9c4..946a89b 100644
--- a/README.md
+++ b/README.md
@@ -23,18 +23,17 @@ This assistant can run offline on your local machine, and it respects your priva
 
 ![Settings](https://raw.githubusercontent.com/vietanhdev/llama-assistant/refs/heads/main/docs/custom-models.png)
 
-
 ## Supported Models
 
 - 📝 Text-only models:
-  - [Llama 3.2](https://github.com/facebookresearch/llama) - 1B, 3B (4/8-bit quantized)
-  - [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF) (4-bit quantized)
+  - [Llama 3.2](https://github.com/facebookresearch/llama) - 1B, 3B (4/8-bit quantized).
+  - [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF) (4-bit quantized).
   - And other models that [LlamaCPP](https://github.com/ggerganov/llama.cpp) supports via custom models. [See the list](https://github.com/ggerganov/llama.cpp).
 - 🖼️ Multimodal models:
-  - [Moondream2](https://huggingface.co/vikhyatk/moondream2)
-  - [MiniCPM-v2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf)
-  - [LLaVA 1.5/1.6](https://llava-vl.github.io/)
+  - [Moondream2](https://huggingface.co/vikhyatk/moondream2).
+  - [MiniCPM-v2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf).
+  - [LLaVA 1.5/1.6](https://llava-vl.github.io/).
 - Besides supported models, you can try other variants via custom models.
 
 ## TODO
 
@@ -45,7 +44,7 @@ This assistant can run offline on your local machine, and it respects your priva
 - [x] 📚 Support 5 other text models.
 - [x] 🖼️ Support 5 other multimodal models.
 - [x] ⚡ Streaming support for response.
-- [ ] 🎙️ Add offline STT support: WhisperCPP (WIP - [Experimental Code](llama_assistant/speech_recognition_whisper_experimental.py)).
+- [x] 🎙️ Add offline STT support: WhisperCPP.
 - [ ] 🧠 Knowledge database: Langchain or LlamaIndex?.
 - [ ] 🔌 Plugin system for extensibility.
 - [ ] 📰 News and weather updates.
@@ -59,11 +58,11 @@ This assistant can run offline on your local machine, and it respects your priva
 
 ## Features
 
-- 🎙️ Voice recognition for hands-free interaction
-- 💬 Natural language processing with Llama 3.2
-- 🖼️ Image analysis capabilities (TODO)
-- ⚡ Global hotkey for quick access (Cmd+Shift+Space on macOS)
-- 🎨 Customizable UI with adjustable transparency
+- 🎙️ Voice recognition for hands-free interaction.
+- 💬 Natural language processing with Llama 3.2.
+- 🖼️ Image analysis capabilities (TODO).
+- ⚡ Global hotkey for quick access (Cmd+Shift+Space on macOS).
+- 🎨 Customizable UI with adjustable transparency.
 
 **Note:** This project is a work in progress, and new features are being added regularly.
 
@@ -89,17 +88,17 @@ pip install pyaudio
 1. Clone the repository:
 
-   ```bash
-   git clone https://github.com/vietanhdev/llama-assistant.git
-   cd llama-assistant
-   ```
+```bash
+git clone https://github.com/vietanhdev/llama-assistant.git
+cd llama-assistant
+```
 
 2. Install the required dependencies:
 
-   ```bash
-   pip install -r requirements.txt
-   pip install pyaudio
-   ```
+```bash
+pip install -r requirements.txt
+pip install pyaudio
+```
diff --git a/llama_assistant/llama_assistant.py b/llama_assistant/llama_assistant.py
index 02259fa..bd2c179 100644
--- a/llama_assistant/llama_assistant.py
+++ b/llama_assistant/llama_assistant.py
@@ -46,7 +46,7 @@ from llama_assistant.custom_plaintext_editor import CustomPlainTextEdit
 from llama_assistant.global_hotkey import GlobalHotkey
 from llama_assistant.setting_dialog import SettingsDialog
-from llama_assistant.speech_recognition import SpeechRecognitionThread
+from llama_assistant.speech_recognition_thread import SpeechRecognitionThread
 from llama_assistant.utils import image_to_base64_data_uri, load_image
 from llama_assistant.model_handler import handler as model_handler
 from llama_assistant.icons import (
diff --git a/llama_assistant/speech_recognition.py b/llama_assistant/speech_recognition.py
deleted file mode 100644
index 2a0c0c0..0000000
--- a/llama_assistant/speech_recognition.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from PyQt6.QtCore import QThread, pyqtSignal
-import speech_recognition as sr
-
-
-class SpeechRecognitionThread(QThread):
-    finished = pyqtSignal(str)
-    error = pyqtSignal(str)
-
-    def __init__(self):
-        super().__init__()
-        self.recognizer = sr.Recognizer()
-        self.microphone = sr.Microphone()
-        self.stop_listening = False
-
-    def run(self):
-        with self.microphone as source:
-            self.recognizer.adjust_for_ambient_noise(source)
-            while not self.stop_listening:
-                try:
-                    audio = self.recognizer.listen(source, timeout=1, phrase_time_limit=10)
-                    text = self.recognizer.recognize_google(audio)
-                    self.finished.emit(text)
-                except sr.WaitTimeoutError:
-                    continue
-                except sr.UnknownValueError:
-                    self.error.emit("Could not understand audio")
-                except sr.RequestError as e:
-                    self.error.emit(f"Could not request results; {e}")
-
-    def stop(self):
-        self.stop_listening = True
diff --git a/llama_assistant/speech_recognition_thread.py b/llama_assistant/speech_recognition_thread.py
new file mode 100644
index 0000000..93cc753
--- /dev/null
+++ b/llama_assistant/speech_recognition_thread.py
@@ -0,0 +1,138 @@
+import pkgutil
+from pathlib import Path
+import datetime
+import os
+import re
+import requests
+
+from PyQt6.QtCore import QThread, pyqtSignal
+import speech_recognition as sr
+
+# patch whisper on file not find error
+# https://github.com/carloscdias/whisper-cpp-python/pull/12
+try:
+    import whisper_cpp_python
+except FileNotFoundError:
+    regex = r"(\"darwin\":\n\s*lib_ext = \")\.so(\")"
+    subst = "\\1.dylib\\2"
+
+    print("fixing and re-importing whisper_cpp_python...")
+    # load whisper_cpp_python and substitute .so with .dylib for darwin
+    package = pkgutil.get_loader("whisper_cpp_python")
+    whisper_path = Path(package.path)
+    whisper_cpp_py = whisper_path.parent.joinpath("whisper_cpp.py")
+    content = whisper_cpp_py.read_text()
+    result = re.sub(regex, subst, content, 0, re.MULTILINE)
+    whisper_cpp_py.write_text(result)
+
+    import whisper_cpp_python
+
+
+class SpeechRecognitionThread(QThread):
+    finished = pyqtSignal(str)
+    error = pyqtSignal(str)
+    WHISPER_THREADS = 4
+    WHISPER_LANGUAGE = "en"
+
+    def __init__(self):
+        super().__init__()
+        self.stop_listening = False
+
+        # Set up model path and download if necessary
+        self.model_dir = Path.home() / "llama-assistant" / "models" / "whisper-cpp"
+        self.model_path = self.model_dir / "ggml-base-fp16.bin"
+        self.download_model_if_needed()
+
+        # Initialize Whisper model
+        self.whisper = whisper_cpp_python.Whisper(
+            model_path=str(self.model_path), n_threads=self.WHISPER_THREADS
+        )
+
+        # Create temporary folder for audio files
+        self.tmp_audio_folder = Path.home() / "llama-assistant" / "tmp_audio"
+        self.tmp_audio_folder.mkdir(parents=True, exist_ok=True)
+
+    def download_model_if_needed(self):
+        if not self.model_path.exists():
+            print("Downloading Whisper model...")
+            self.model_dir.mkdir(parents=True, exist_ok=True)
+            url = "https://huggingface.co/danielus/ggml-whisper-models/resolve/main/ggml-base-fp16.bin"
+            response = requests.get(url)
+            with open(self.model_path, "wb") as f:
+                f.write(response.content)
+            print("Model downloaded successfully.")
+
+    def run(self):
+        recognizer = sr.Recognizer()
+        microphone = sr.Microphone()
+        try:
+            with microphone as source:
+                recognizer.adjust_for_ambient_noise(source)
+                while not self.stop_listening:
+                    try:
+                        recognizer.pause_threshold = 1
+                        audio_data = recognizer.listen(source, timeout=1, phrase_time_limit=5)
+
+                        # Save audio data to temporary file
+                        tmp_filepath = (
+                            self.tmp_audio_folder / f"temp_audio_{datetime.datetime.now()}.wav"
+                        )
+                        with open(tmp_filepath, "wb") as f:
+                            f.write(audio_data.get_wav_data())
+
+                        # Transcribe audio
+                        res = self.whisper.transcribe(
+                            file=tmp_filepath, language=self.WHISPER_LANGUAGE
+                        )
+                        transcription = res["text"]
+
+                        # Clean up transcription
+                        transcription = re.sub(r"\[.*\]", "", transcription)
+                        transcription = re.sub(r"\(.*\)", "", transcription)
+
+                        print(f"Transcription: {transcription}")
+                        os.remove(tmp_filepath)
+
+                        self.finished.emit(transcription)
+                    except sr.WaitTimeoutError:
+                        print("timeout")
+                        continue
+                    except sr.UnknownValueError:
+                        print("Could not understand audio")
+                        self.error.emit("Could not understand audio")
+                    except sr.RequestError as e:
+                        print(f"Could not request results; {e}")
+                        self.error.emit(f"Could not request results; {e}")
+        except KeyboardInterrupt:
+            print("Keyboard interrupt detected. Stopping speech recognition.")
+            self.stop()
+
+    def stop(self):
+        self.stop_listening = True
+
+
+# Demo code
+if __name__ == "__main__":
+    from PyQt6.QtWidgets import QApplication
+    import sys
+
+    app = QApplication(sys.argv)
+
+    def on_finished(text):
+        print(f"Transcription: {text}")
+        thread.stop()
+        app.quit()
+
+    def on_error(error_message):
+        print(f"Error: {error_message}")
+        thread.stop()
+        app.quit()
+
+    thread = SpeechRecognitionThread()
+    thread.finished.connect(on_finished)
+    thread.error.connect(on_error)
+
+    print("Starting speech recognition. Speak into your microphone...")
+    thread.start()
+
+    sys.exit(app.exec())
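The transcription cleanup in `run()` above strips Whisper's inline annotations, which the model emits for non-speech audio as bracketed or parenthesized tags such as `[BLANK_AUDIO]` or `(music)`. A minimal standalone sketch of that step — the helper name and sample strings are hypothetical, not part of the diff:

```python
import re

def clean_transcription(text: str) -> str:
    # Same two substitutions as in SpeechRecognitionThread.run();
    # .strip() is added here only to tidy whitespace for the checks below.
    text = re.sub(r"\[.*\]", "", text)
    text = re.sub(r"\(.*\)", "", text)
    return text.strip()

# Hypothetical samples: annotations are removed, plain speech passes through.
assert clean_transcription("[BLANK_AUDIO] hello there") == "hello there"
assert clean_transcription("turn off the lights (music)") == "turn off the lights"
```

Note that both patterns are greedy, so a string with two bracketed spans also loses the text between them; for short voice commands that trade-off is usually acceptable.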
diff --git a/llama_assistant/speech_recognition_whisper_experimental.py b/llama_assistant/speech_recognition_whisper_experimental.py
deleted file mode 100644
index 1da51c4..0000000
--- a/llama_assistant/speech_recognition_whisper_experimental.py
+++ /dev/null
@@ -1,126 +0,0 @@
-import threading
-import queue
-import pyaudio
-import wave
-import os
-from pathlib import Path
-import datetime
-from whisper_cpp_python import Whisper
-import re
-import requests
-
-
-class SpeechRecognition:
-    def __init__(self):
-        # Audio settings
-        self.RATE = 16000
-        self.CHUNK = self.RATE
-        self.NB_CHANNELS = 1
-        self.RECORD_SECONDS = 1
-
-        # Whisper settings
-        self.WHISPER_LANGUAGE = "en"
-        self.WHISPER_THREADS = 1
-
-        # Initialize queues
-        self.audio_queue = queue.Queue()
-        self.text_queue = queue.Queue()
-
-        # Set up model path and download if necessary
-        self.model_dir = Path.home() / "llama-assistant" / "models" / "whisper-cpp"
-        self.model_path = self.model_dir / "ggml-tiny-fp16.bin"
-        self.download_model_if_needed()
-
-        # Initialize Whisper model
-        self.whisper = Whisper(model_path=str(self.model_path), n_threads=self.WHISPER_THREADS)
-
-        # Initialize PyAudio
-        self.audio = pyaudio.PyAudio()
-        self.stream = self.audio.open(
-            format=pyaudio.paInt16,
-            channels=self.NB_CHANNELS,
-            rate=self.RATE,
-            input=True,
-            frames_per_buffer=self.CHUNK,
-        )
-
-        # Create temporary folder for audio files
-        self.tmp_audio_folder = Path("./tmp_audio")
-        if not self.tmp_audio_folder.exists():
-            self.tmp_audio_folder.mkdir()
-
-        self.stop_listening = False
-
-    def download_model_if_needed(self):
-        if not self.model_path.exists():
-            print("Downloading Whisper model...")
-            self.model_dir.mkdir(parents=True, exist_ok=True)
-            url = "https://huggingface.co/danielus/ggml-whisper-models/resolve/main/ggml-tiny-fp16.bin"
-            response = requests.get(url)
-            with open(self.model_path, "wb") as f:
-                f.write(response.content)
-            print("Model downloaded successfully.")
-
-    def listen(self):
-        while not self.stop_listening:
-            audio_data = self.stream.read(self.CHUNK)
-            self.audio_queue.put(audio_data)
-
-    def transcribe(self):
-        while not self.stop_listening:
-            if not self.audio_queue.empty():
-                audio_data = self.audio_queue.get()
-
-                # Save audio data to temporary file
-                tmp_filepath = f"./tmp_audio/output_{datetime.datetime.now()}.wav"
-                with wave.open(tmp_filepath, "wb") as wf:
-                    wf.setnchannels(self.NB_CHANNELS)
-                    wf.setsampwidth(2)  # 16-bit audio
-                    wf.setframerate(self.RATE)
-                    wf.writeframes(audio_data)
-
-                # Transcribe audio
-                res = self.whisper.transcribe(file=tmp_filepath, language=self.WHISPER_LANGUAGE)
-                transcription = res["text"]
-
-                # Clean up transcription
-                transcription = re.sub(r"\[.*\]", "", transcription)
-                transcription = re.sub(r"\(.*\)", "", transcription)
-
-                # Add transcription to text queue
-                self.text_queue.put(transcription)
-
-                # Cleanup
-                os.remove(tmp_filepath)
-
-    def start(self):
-        self.stop_listening = False
-        threading.Thread(target=self.listen, daemon=True).start()
-        threading.Thread(target=self.transcribe, daemon=True).start()
-
-    def stop(self):
-        self.stop_listening = True
-        self.stream.stop_stream()
-        self.stream.close()
-        self.audio.terminate()
-
-    def get_transcription(self):
-        if not self.text_queue.empty():
-            return self.text_queue.get()
-        return None
-
-
-# Example usage
-if __name__ == "__main__":
-    recognizer = SpeechRecognition()
-    recognizer.start()
-
-    print("Speech recognition started. Press Ctrl+C to stop.")
-    try:
-        while True:
-            transcription = recognizer.get_transcription()
-            if transcription:
-                print(f"Transcription: {transcription}")
-    except KeyboardInterrupt:
-        print("Stopping speech recognition...")
-        recognizer.stop()
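Both `download_model_if_needed` implementations above fetch the GGML checkpoint with a bare `requests.get(url)`, which buffers the whole file in memory before writing it out. For checkpoints in the hundred-megabyte range, a streamed download keeps memory use flat and fails fast on HTTP errors — a sketch under the same URL and path conventions as the diff, not the project's actual code:

```python
from pathlib import Path
import requests

def download_model_streamed(url: str, dest: Path, chunk_size: int = 1 << 20) -> None:
    """Stream a checkpoint to disk in 1 MiB chunks instead of buffering it whole."""
    dest.parent.mkdir(parents=True, exist_ok=True)
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()  # surface 4xx/5xx instead of saving an error page
        with open(dest, "wb") as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                f.write(chunk)

# Example with the base-model URL used by SpeechRecognitionThread:
# download_model_streamed(
#     "https://huggingface.co/danielus/ggml-whisper-models/resolve/main/ggml-base-fp16.bin",
#     Path.home() / "llama-assistant" / "models" / "whisper-cpp" / "ggml-base-fp16.bin",
# )
```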
diff --git a/pyproject.toml b/pyproject.toml
index 2b51ad2..6bb0dc4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "llama-assistant"
-version = "0.1.26"
+version = "0.1.28"
 authors = [
     {name = "Viet-Anh Nguyen", email = "vietanh.dev@gmail.com"},
 ]