first commit

Katashynskyi committed Aug 23, 2024
0 parents commit f06a4fd
Showing 23 changed files with 645 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .gitignore
@@ -0,0 +1,7 @@
.conda
.vscode
__pycache__
data/model_data
data/HISTORY.json
flagged
src/__pycache__
48 changes: 48 additions & 0 deletions README.md
@@ -0,0 +1,48 @@
# UA-EN Voice Assistant
## No API keys | local | llama3.1 (12k-token prompt window: ~20 pages in a single request)
![GitHub](https://img.shields.io/github/license/Katashynskyi/voice_assistant_UA_EN)
![GitHub last commit](https://img.shields.io/github/last-commit/Katashynskyi/voice_assistant_UA_EN)
![Gif](data/media/gif.gif)


[![GUI version](data/media/GUI_V.png)](https://youtu.be/iw9P4Y7KXI4)
[![Console version](data/media/CONSOLE_V.png)](https://youtu.be/c-8Z4qzOcII)

## The Idea
This project is a proof-of-concept for a minimum viable product (MVP) inspired by the capabilities of OpenAI's GPT-4o ("omni") model. However, it offers a significant advantage: local deployment without restrictions. This empowers users to leverage its functionalities for various purposes, including:
- Translation across languages
- Learning Enhancement by practicing writing, reading, and audio skills
- Customization for tailored use cases

## Features & Tech stack

- **Language Classification**: classifies the input as UA or EN for automatic mode, using "lang-id-voxlingua107-ecapa" by speechbrain (supports 100+ languages); see the sketch below.
- **Google legacy recognizer**: uses a generic key that works out of the box; fast and reliable.
- **Wav2Vec2-Bert**: the best (for now) Ukrainian speech-to-text converter.
- **Edge-TTS**: the best (not self-generated) voices I could get for free.
- **Ollama-python**: library for downloading and running the most popular LLMs.
- **Streamlit**: for the GUI.
- **Dialogue saved as JSON**: HISTORY.json (main.py only; app.py keeps just short-term context-window memory).
- **config.py**: the system prompt for the best user experience (modify it for your own purposes).
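
A minimal sketch of the language-ID step, assuming speechbrain's `EncoderClassifier` API; this mirrors what `src/identify_lang.py` is expected to do, but it is an illustration, not the shipped module:

```python
# Hypothetical sketch of the VoxLingua107 language-ID step (not the shipped src/identify_lang.py).
from speechbrain.pretrained import EncoderClassifier

classifier = EncoderClassifier.from_hparams(
    source="speechbrain/lang-id-voxlingua107-ecapa",
    savedir="data/model_data/lang-id",  # assumed cache dir; matches the .gitignore entry
)


def identify_language(wav_path: str) -> list:
    """Return a label list like ['uk: Ukrainian'] for a 16 kHz mono WAV file."""
    # classify_file returns (posteriors, best score, index, text labels)
    _, _, _, text_lab = classifier.classify_file(wav_path)
    return text_lab


print(identify_language("./data/wav/UA_test.wav"))  # e.g. ['uk: Ukrainian']
```

This label format is what both `app.py` and `main.py` compare against (e.g. `["uk: Ukrainian"]`).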


## Getting Started
### Tested on
- WSL (Ubuntu 22.04.3)
- Geforce (mobile) GTX 1050Ti (4GB)
- RAM (32GB)
### Prerequisites
- Python 3.9+
- Virtual environment (conda, Python 3.9+)
- CUDA (optional)
### Installation

- Clone the repository
- Create a conda venv (Python 3.9)
- Install PortAudio: `sudo apt install portaudio19-dev`
- Install the required packages: `pip install -r requirements.txt`

## Usage

After installing the required libraries, run `python main.py` for the console experience or `streamlit run app.py` for the GUI.

184 changes: 184 additions & 0 deletions app.py
@@ -0,0 +1,184 @@
"""Python script for a voice assistant using Streamlit and Ollama
This script creates a user interface using Streamlit to interact with a large language model\
(LLM) from Ollama for voice-based and text-based communication.
Features:
- Audio recording using Streamlit's `audio_recorder` component.
- Speech recognition for Ukrainian (UA) and English (EN) using custom functions.
- Automatic language detection based on transcribed text.
- Text input for user prompts.
- Streamlit chat interface for displaying conversation history.
- Interaction with Ollama's LLM for generating responses.
- Text-to-speech functionality (not implemented in this code).
"""

import warnings
import asyncio
import ollama
import streamlit as st
from audio_recorder_streamlit import audio_recorder
from src.ukrainian_stt import ua_transcribe
from src.english_stt import en_transcribe
from src.transcribe_speak import transcribe_and_speak
from src.utils import convert_audio_to_wav, check_language
from src.identify_lang import identify_language
from config import SYS_MSG

# Suppress warnings
warnings.filterwarnings("ignore")

# File paths
RECORDED_WAV_FILE = "./data/wav/microphone_stereo.wav"
CONV_WAV_FILE = "./data/wav/converted_mono.wav"
WAV_FILE = "./data/wav/chunk.wav"

# Initial conversation history
HISTORY = [{"role": "system", "content": SYS_MSG}]

# Streamlit page configuration
st.set_page_config(
    page_title="Voice assistant UA-EN",
    page_icon=":trident:",
    layout="wide",
    initial_sidebar_state="auto",
    menu_items=None,
)


def ollama_prompt(model="llama3.1", messages=None):
    """
    Sends a prompt to the Ollama LLM and returns a stream of responses.
    Args:
        model (str, optional): The Ollama model to use. Defaults to "llama3.1".
        messages (list, optional): A list of dictionaries representing the conversation history.
            Defaults to None.
    Returns:
        stream: A generator of response chunks from the LLM.
    """
    stream = ollama.chat(model=model, messages=messages, stream=True)
    return stream


def stream_parser(stream):
    """
    Parses the stream of responses from the LLM and displays them in the Streamlit chat interface.
    Args:
        stream: A generator of response chunks from the LLM.
    Yields:
        str: Each chunk of the LLM's response.
    """
    sentence_chunks, response_text = "", ""
    st.session_state.messages.append({"role": "assistant", "content": response_text})
    print("Assistant: ", end="")
    for chunk in stream:
        print(chunk["message"]["content"], end="", flush=True)
        content = chunk["message"]["content"]
        sentence_chunks += content
        response_text += content
        st.session_state.messages[-1]["content"] += content
        # Speak a chunk as soon as it ends on a sentence boundary
        if sentence_chunks.endswith(
            ('."', "\n\n", "**:", ".", "!", "?", '?"', '!"', ":")
        ):
            # Cyrillic characters mean the chunk should use the UA voice
            if any("\u0400" <= char <= "\u04FF" for char in sentence_chunks):
                lang = "ua"
            else:
                lang = "en"
            asyncio.run(transcribe_and_speak(text=sentence_chunks, lang=lang))
            sentence_chunks = ""
        yield chunk["message"]["content"]


def stop_running():
    """Dummy handler for the Stop button; currently it only echoes the request."""
    with my_slot1.chat_message("user"):
        st.markdown("Stop!")


# Set up the order of page elements
my_slot0 = st.empty()  # most buttons
my_slot1 = st.empty()  # chat_message("user")
my_slot2 = st.empty()  # chat_message(message["role"]) & chat_message("assistant")


# Streamlit custom microphone
col1, col2 = my_slot0.columns([1, 8.5], vertical_alignment="bottom")
with col1:
    audio_bytes = audio_recorder(
        text="", energy_threshold=0.01, icon_size="5x"
    )  # with a negative energy_threshold, recording never stops
    # 44 bytes is just a WAV header, i.e. an empty recording
    if audio_bytes is not None and len(audio_bytes) != 44:
        st.audio(audio_bytes, format="audio/wav")
        with open(file=RECORDED_WAV_FILE, mode="wb") as f:
            f.write(audio_bytes)
        convert_audio_to_wav(audio_file=RECORDED_WAV_FILE, output_file=CONV_WAV_FILE)
# Language-choice buttons
with col2:
    PRMPT = None
    button0, button1, button2, button3 = st.columns(4)
    with button0:
        if st.button("Stop", use_container_width=True, type="primary"):
            stop_running()
    with button1:
        if st.button("Говорю (UA)", use_container_width=True):
            PRMPT = "ua:" + ua_transcribe(CONV_WAV_FILE)
            print(PRMPT)
    with button2:
        if st.button("Talking (EN)", use_container_width=True):
            PRMPT = "en:" + en_transcribe(CONV_WAV_FILE)
            print(PRMPT)
    with button3:
        if st.button("Automatic", use_container_width=True):
            DEF_LANG = identify_language(CONV_WAV_FILE)
            # Slavic detections are routed to the Ukrainian transcriber
            if DEF_LANG in [
                ["uk: Ukrainian"],
                ["pl: Polish"],
                ["ru: Russian"],
                ["be: Belarusian"],
            ]:
                PRMPT = "ua:" + ua_transcribe(CONV_WAV_FILE)
                print(PRMPT)
            else:
                TRANSCRIBED = en_transcribe(CONV_WAV_FILE)
                if TRANSCRIBED == "Didn't recognize that.":
                    print(TRANSCRIBED)
                    PRMPT = None
                else:
                    PRMPT = "en:" + TRANSCRIBED
                    print(PRMPT)
user_prompt = st.chat_input(placeholder="Краще напишу/I'll write instead")
if user_prompt is not None:
    user_prompt = check_language(user_prompt=user_prompt)
# Check for existing messages in session state
if "messages" not in st.session_state:
    st.session_state.messages = HISTORY

# Display chat messages from session state
for message in st.session_state.messages:
    with my_slot2.chat_message(message["role"]):
        st.markdown(message["content"])

if user_prompt is not None or PRMPT is not None:
    # Display the user prompt in the chat message widget
    with my_slot1.chat_message("user"):
        print()
        print("User:", end="")
        print(user_prompt or PRMPT)
        st.markdown(user_prompt or PRMPT)

    # Add the user's prompt to session state
    st.session_state.messages.append({"role": "user", "content": user_prompt or PRMPT})

    # Retrieve the response from the model
    LLM_STREAM = ollama_prompt(
        messages=st.session_state.messages,
    )
    with my_slot2.chat_message("assistant"):
        try:
            st.write(stream_parser(LLM_STREAM))
        except Exception:  # ignore errors when generation is interrupted (e.g. Stop button)
            pass
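
`src/transcribe_speak.py` is not shown in this diff. For orientation, here is a minimal sketch of what an Edge-TTS-based `transcribe_and_speak` could look like; the voice names and the saved MP3 path are assumptions, not the repository's actual implementation:

```python
# Hypothetical sketch of src/transcribe_speak.py (the real module is not in this diff).
import asyncio

import edge_tts

# Assumed voices; any uk-UA / en-US entry from `edge-tts --list-voices` would work.
VOICES = {"ua": "uk-UA-PolinaNeural", "en": "en-US-AriaNeural"}
MP3_FILE = "./data/wav/transcribed_piece.mp3"  # matches the file committed above


async def transcribe_and_speak(text: str, lang: str = "en") -> None:
    """Synthesize `text` with Edge-TTS and save it as an MP3 for playback."""
    communicate = edge_tts.Communicate(text=text, voice=VOICES[lang])
    await communicate.save(MP3_FILE)
    # Playback of MP3_FILE (e.g. via pydub) is omitted from this sketch.


if __name__ == "__main__":
    asyncio.run(transcribe_and_speak("Привіт! Як справи?", lang="ua"))
```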
11 changes: 11 additions & 0 deletions config.py
@@ -0,0 +1,11 @@
"""File with instructions what LLM must do, rewrite it as much as you wish"""

SYS_MSG = "You are a helpful AI voice assistant (your name is Bot). Generate the most useful\
and factual response possible, carefully considering all previous generated text in your response\
before adding new tokens to the response. Use all of the context of this conversation so your\
response is relevant to the conversation. Make your responses clear and concise, avoiding any\
verbosity. You'll mostly be asked questions in English or Ukrainian. It's mandatory to avoid mixing languages within a single\
sentence. You can provide your response entirely in English or entirely in Ukrainian.\
If you need to use words from both languages in the same sentence, consider transliterating \
(especially names!) one of them. If prompt contain 'ua:' answer in UA if contain 'en:' in english \
(both ways use transliteration if needed)"
1 change: 1 addition & 0 deletions data/chunk copy.wav
@@ -0,0 +1 @@
This is some sample data
1 change: 1 addition & 0 deletions data/chunk.wav
1 change: 1 addition & 0 deletions data/converted_mono.wav
Binary file added data/media/CONSOLE_V.png
Binary file not shown.
Binary file added data/media/GUI_V.png
Binary file not shown.
Binary file added data/media/gif.gif
Binary file not shown.
Binary file added data/wav/EN_test.wav
Binary file not shown.
Binary file added data/wav/UA_test.wav
Binary file not shown.
Binary file added data/wav/chunk.wav
Binary file not shown.
Binary file added data/wav/converted_mono.wav
Binary file not shown.
Binary file added data/wav/microphone_stereo.wav
Binary file not shown.
Binary file added data/wav/transcribed_piece.mp3
Binary file not shown.
107 changes: 107 additions & 0 deletions main.py
@@ -0,0 +1,107 @@
"""Main.py provide entry point to terminal based voice agent with memory and speech abilities \
in UA and EN voices. With this configuration it can be used for studying english or ukrainian, \
both writing and listening skills"""

import json
import sounddevice # fixing ALSA errors
import speech_recognition as sr
from src.english_stt import en_transcribe
from src.identify_lang import identify_language
from src.ollama_tts import ollama_prompt
from src.ukrainian_stt import ua_transcribe
from config import SYS_MSG


def main(memory=False):
    """
    The main entry point for the application.
    Args:
        memory (bool, optional): Whether to load and save conversation history. Defaults to False.
    Returns:
        None
    Automatically detects your language, transcribes it, and pronounces & writes the answer in UA & EN.
    Optionally saves the conversation history to a JSON file.
    """
    history = [{"role": "system", "content": SYS_MSG}]  # default when nothing is loaded
    if memory is True:
        try:
            with open("data/HISTORY.json", "r", encoding="UTF-8") as f:
                history = json.load(f)
        except FileNotFoundError:
            print(
                "No previous history is available, creating a new one.",
                "\t",
                "Попередньої розмови не знайдено, створюю розмову.",
            )
    try:
        listening = True
        while listening:
            with sr.Microphone() as source:
                recognizer = sr.Recognizer()
                recognizer.pause_threshold = 0.8
                recognizer.energy_threshold = 500
                recognizer.adjust_for_ambient_noise(source)  # , duration=1)
                # recognizer.dynamic_energy_threshold = 3000

                try:
                    print("Listening...\t\t Слухаю...")
                    audio = recognizer.listen(source, timeout=5.0)
                    print("Working on it...\t Обробка...")
                    # Save the audio data to a WAV file
                    wav_data = audio.get_wav_data(convert_rate=16000)
                    # Write the WAV data to a file
                    wav_file = "./data/wav/chunk.wav"
                    with open(wav_file, "wb") as file:
                        file.write(wav_data)
                except (sr.UnknownValueError, sr.WaitTimeoutError):
                    print(
                        "Could you repeat please? That wasn't recognized.",
                        "\t\t",
                        "Повторіть будь ласка, не розчула",
                    )
                    continue  # nothing usable was captured, so listen again
                def_lang = identify_language(wav_file)
                # Slavic detections are routed to the Ukrainian transcriber
                if def_lang in [
                    ["uk: Ukrainian"],
                    ["pl: Polish"],
                    ["ru: Russian"],
                    ["be: Belarusian"],
                ]:
                    print("Запит Солов'їною, обробка...")
                    prmpt = ua_transcribe(wav_file)
                    print("Користувач:", prmpt)
                    print("Дай подумати...")
                    ollama_prompt(prompt="ua: " + prmpt, history=history)
                else:
                    print("Detected as English, working on it...")
                    prmpt = en_transcribe(wav_file)
                    if prmpt == "Didn't recognize that.":
                        print("Didn't recognize that.\t\t Не зрозуміла.")
                    else:
                        print("User:", prmpt)
                        print("Wait for the LLM to answer... Зараз відповім...")
                        ollama_prompt(prompt="en: " + prmpt, history=history)
                print("\n", "\n")

    except KeyboardInterrupt:
        listening = False
        print("\n", "Stopped listening.\t Перервано.")
        print("\n", "\n")
        print("HISTORY:", history)
        # Also we can save history to reuse it later
        # Specify the file path and name
        history_path = "./data/HISTORY.json"

        # Serialize the list to JSON
        json_data = json.dumps(history)

        # Write the JSON data to the file
        with open(history_path, "w", encoding="utf-8") as file:
            file.write(json_data)


if __name__ == "__main__":
    main(memory=True)
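
For reference, `data/HISTORY.json` stores plain chat messages. Assuming `src/ollama_tts.py` (not shown in this diff) appends each turn to `history` the way `app.py` appends to session state, a saved file would look roughly like:

```python
# Illustrative HISTORY.json contents (shown as a Python literal); real turns depend on the run.
[
    {"role": "system", "content": "You are a helpful AI voice assistant (your name is Bot). ..."},
    {"role": "user", "content": "en: what's the capital of Ukraine?"},
    {"role": "assistant", "content": "The capital of Ukraine is Kyiv."},
]
```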
26 changes: 26 additions & 0 deletions src/english_stt.py
@@ -0,0 +1,26 @@
"""Module providing speech recognition"""

import speech_recognition as sr


def en_transcribe(wav_filename="./data/wav/EN_test.wav"):
    """Transcribe/recognize an English WAV file."""
    recognizer = sr.Recognizer()

    # Open the audio file and record it for recognition
    with sr.AudioFile(wav_filename) as source:
        audio_data = recognizer.record(source)
    try:
        # Uses Google's legacy web-speech endpoint with its built-in generic key
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "Didn't recognize that."
    except sr.RequestError as e:
        return f"Could not request results; {e}"


# Example usage
if __name__ == "__main__":
    SAMPLE = "./data/wav/EN_test.wav"
    response = en_transcribe(SAMPLE)
    print()
    print("Response: ", response)