first commit

Katashynskyi committed Aug 23, 2024
0 parents commit f06a4fd
Showing 23 changed files with 645 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .gitignore
@@ -0,0 +1,7 @@
.conda
.vscode
__pycache__
data/model_data
data/HISTORY.json
flagged
src/__pycache__
48 changes: 48 additions & 0 deletions README.md
@@ -0,0 +1,48 @@
# UA-EN Voice Assistant
## No API keys | local | llama3.1 (12k-token prompt window: ~20 pages in a single request)
![GitHub](https://img.shields.io/github/license/Katashynskyi/voice_assistant_UA_EN)
![GitHub last commit](https://img.shields.io/github/last-commit/Katashynskyi/voice_assistant_UA_EN)
![Gif](data/media/gif.gif)


[![GUI version](data/media/GUI_V.png)](https://youtu.be/iw9P4Y7KXI4)
[![Console version](data/media/CONSOLE_V.png)](https://youtu.be/c-8Z4qzOcII)

## The Idea
This project is a proof-of-concept for a minimum viable product (MVP) inspired by the capabilities of OpenAI's GPT-4o ("omni") model. However, it offers a significant advantage: local deployment without restrictions. This empowers users to leverage its functionalities for various purposes, including:
- Translation across languages
- Learning Enhancement by practicing writing, reading, and audio skills
- Customization for tailored use cases

## Features & Tech stack

- **Language Classification**: classifies the input as UA or EN for automatic mode, using "lang-id-voxlingua107-ecapa" by speechbrain (supports 100+ languages); see the sketch below.
- **Google legacy recognizer**: uses a generic key that works out of the box; fast and reliable.
- **Wav2Vec2-Bert**: the best (for now) Ukrainian speech-to-text converter.
- **Edge-TTS**: the best (not self-generated) voices I could get for free.
- **Ollama-python**: library for downloading and running the most popular LLMs.
- **Streamlit**: for the GUI.
- **Dialogue saved as JSON**: HISTORY.json (main.py only; app.py keeps just short-term context-window memory).
- **config.py**: the system prompt for the best user experience (modify it for your own purposes).
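
A minimal sketch of the language-ID step, assuming speechbrain's `EncoderClassifier` API; this mirrors what `src/identify_lang.py` is expected to do, but it is an illustration, not the shipped module:

```python
# Hypothetical sketch of the VoxLingua107 language-ID step (not the shipped src/identify_lang.py).
from speechbrain.pretrained import EncoderClassifier

classifier = EncoderClassifier.from_hparams(
    source="speechbrain/lang-id-voxlingua107-ecapa",
    savedir="data/model_data/lang-id",  # assumed cache dir; matches the .gitignore entry
)


def identify_language(wav_path: str) -> list:
    """Return a label list like ['uk: Ukrainian'] for a 16 kHz mono WAV file."""
    # classify_file returns (posteriors, best score, index, text labels)
    _, _, _, text_lab = classifier.classify_file(wav_path)
    return text_lab


print(identify_language("./data/wav/UA_test.wav"))  # e.g. ['uk: Ukrainian']
```

This label format is what both `app.py` and `main.py` compare against (e.g. `["uk: Ukrainian"]`).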


## Getting Started
### Tested on
- WSL (Ubuntu 22.04.3)
- Geforce (mobile) GTX 1050Ti (4GB)
- RAM (32GB)
### Prerequisites
- Python 3.9+
- Virtual environment (conda, Python 3.9+)
- CUDA (optional)
### Installation

- Clone the repository
- Create a conda venv (Python 3.9)
- Install PortAudio: `sudo apt install portaudio19-dev`
- Install the required packages: `pip install -r requirements.txt`

## Usage

After installing the required libraries, run `python main.py` for the console experience or `streamlit run app.py` for the GUI.

184 changes: 184 additions & 0 deletions app.py
@@ -0,0 +1,184 @@
"""Python script for a voice assistant using Streamlit and Ollama
This script creates a user interface using Streamlit to interact with a large language model\
(LLM) from Ollama for voice-based and text-based communication.
Features:
- Audio recording using Streamlit's `audio_recorder` component.
- Speech recognition for Ukrainian (UA) and English (EN) using custom functions.
- Automatic language detection based on transcribed text.
- Text input for user prompts.
- Streamlit chat interface for displaying conversation history.
- Interaction with Ollama's LLM for generating responses.
- Text-to-speech functionality (not implemented in this code).
"""

import warnings
import asyncio
import ollama
import streamlit as st
from audio_recorder_streamlit import audio_recorder
from src.ukrainian_stt import ua_transcribe
from src.english_stt import en_transcribe
from src.transcribe_speak import transcribe_and_speak
from src.utils import convert_audio_to_wav, check_language
from src.identify_lang import identify_language
from config import SYS_MSG

# Suppress warnings
warnings.filterwarnings("ignore")

# File paths
RECORDED_WAV_FILE = "./data/wav/microphone_stereo.wav"
CONV_WAV_FILE = "./data/wav/converted_mono.wav"
WAV_FILE = "./data/wav/chunk.wav"

# Initial conversation history
HISTORY = [{"role": "system", "content": SYS_MSG}]

# Streamlit page configuration
st.set_page_config(
    page_title="Voice assistant UA-EN",
    page_icon=":trident:",
    layout="wide",
    initial_sidebar_state="auto",
    menu_items=None,
)


def ollama_prompt(model="llama3.1", messages=None):
    """
    Sends a prompt to the Ollama LLM and returns a stream of responses.
    Args:
        model (str, optional): The Ollama model to use. Defaults to "llama3.1".
        messages (list, optional): A list of dictionaries representing the conversation history.
            Defaults to None.
    Returns:
        stream: A generator of response chunks from the LLM.
    """
    stream = ollama.chat(model=model, messages=messages, stream=True)
    return stream


def stream_parser(stream):
    """
    Parses the stream of responses from the LLM and displays them in the Streamlit chat interface.
    Args:
        stream: A generator of response chunks from the LLM.
    Yields:
        str: Each chunk of the LLM's response.
    """
    sentence_chunks, response_text = "", ""
    st.session_state.messages.append({"role": "assistant", "content": response_text})
    print("Assistant: ", end="")
    for chunk in stream:
        print(chunk["message"]["content"], end="", flush=True)
        content = chunk["message"]["content"]
        sentence_chunks += content
        response_text += content
        st.session_state.messages[-1]["content"] += content
        # Speak a chunk as soon as it ends on a sentence boundary
        if sentence_chunks.endswith(
            ('."', "\n\n", "**:", ".", "!", "?", '?"', '!"', ":")
        ):
            # Cyrillic characters mean the chunk should use the UA voice
            if any("\u0400" <= char <= "\u04FF" for char in sentence_chunks):
                lang = "ua"
            else:
                lang = "en"
            asyncio.run(transcribe_and_speak(text=sentence_chunks, lang=lang))
            sentence_chunks = ""
        yield chunk["message"]["content"]


def stop_running():
    """Dummy handler for the Stop button; currently it only echoes the request."""
    with my_slot1.chat_message("user"):
        st.markdown("Stop!")


# Set up the order of page elements
my_slot0 = st.empty()  # most buttons
my_slot1 = st.empty()  # chat_message("user")
my_slot2 = st.empty()  # chat_message(message["role"]) & chat_message("assistant")


# Streamlit custom microphone
col1, col2 = my_slot0.columns([1, 8.5], vertical_alignment="bottom")
with col1:
    audio_bytes = audio_recorder(
        text="", energy_threshold=0.01, icon_size="5x"
    )  # with a negative energy_threshold, recording never stops
    # 44 bytes is just a WAV header, i.e. an empty recording
    if audio_bytes is not None and len(audio_bytes) != 44:
        st.audio(audio_bytes, format="audio/wav")
        with open(file=RECORDED_WAV_FILE, mode="wb") as f:
            f.write(audio_bytes)
        convert_audio_to_wav(audio_file=RECORDED_WAV_FILE, output_file=CONV_WAV_FILE)
# Language-choice buttons
with col2:
    PRMPT = None
    button0, button1, button2, button3 = st.columns(4)
    with button0:
        if st.button("Stop", use_container_width=True, type="primary"):
            stop_running()
    with button1:
        if st.button("Говорю (UA)", use_container_width=True):
            PRMPT = "ua:" + ua_transcribe(CONV_WAV_FILE)
            print(PRMPT)
    with button2:
        if st.button("Talking (EN)", use_container_width=True):
            PRMPT = "en:" + en_transcribe(CONV_WAV_FILE)
            print(PRMPT)
    with button3:
        if st.button("Automatic", use_container_width=True):
            DEF_LANG = identify_language(CONV_WAV_FILE)
            # Slavic detections are routed to the Ukrainian transcriber
            if DEF_LANG in [
                ["uk: Ukrainian"],
                ["pl: Polish"],
                ["ru: Russian"],
                ["be: Belarusian"],
            ]:
                PRMPT = "ua:" + ua_transcribe(CONV_WAV_FILE)
                print(PRMPT)
            else:
                TRANSCRIBED = en_transcribe(CONV_WAV_FILE)
                if TRANSCRIBED == "Didn't recognize that.":
                    print(TRANSCRIBED)
                    PRMPT = None
                else:
                    PRMPT = "en:" + TRANSCRIBED
                    print(PRMPT)
user_prompt = st.chat_input(placeholder="Краще напишу/I'll write instead")
if user_prompt is not None:
    user_prompt = check_language(user_prompt=user_prompt)
# Check for existing messages in session state
if "messages" not in st.session_state:
    st.session_state.messages = HISTORY

# Display chat messages from session state
for message in st.session_state.messages:
    with my_slot2.chat_message(message["role"]):
        st.markdown(message["content"])

if user_prompt is not None or PRMPT is not None:
    # Display the user prompt in the chat message widget
    with my_slot1.chat_message("user"):
        print()
        print("User:", end="")
        print(user_prompt or PRMPT)
        st.markdown(user_prompt or PRMPT)

    # Add the user's prompt to session state
    st.session_state.messages.append({"role": "user", "content": user_prompt or PRMPT})

    # Retrieve the response from the model
    LLM_STREAM = ollama_prompt(
        messages=st.session_state.messages,
    )
    with my_slot2.chat_message("assistant"):
        try:
            st.write(stream_parser(LLM_STREAM))
        except Exception:  # ignore errors when generation is interrupted (e.g. Stop button)
            pass
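
`src/transcribe_speak.py` is not shown in this diff. For orientation, here is a minimal sketch of what an Edge-TTS-based `transcribe_and_speak` could look like; the voice names and the saved MP3 path are assumptions, not the repository's actual implementation:

```python
# Hypothetical sketch of src/transcribe_speak.py (the real module is not in this diff).
import asyncio

import edge_tts

# Assumed voices; any uk-UA / en-US entry from `edge-tts --list-voices` would work.
VOICES = {"ua": "uk-UA-PolinaNeural", "en": "en-US-AriaNeural"}
MP3_FILE = "./data/wav/transcribed_piece.mp3"  # matches the file committed above


async def transcribe_and_speak(text: str, lang: str = "en") -> None:
    """Synthesize `text` with Edge-TTS and save it as an MP3 for playback."""
    communicate = edge_tts.Communicate(text=text, voice=VOICES[lang])
    await communicate.save(MP3_FILE)
    # Playback of MP3_FILE (e.g. via pydub) is omitted from this sketch.


if __name__ == "__main__":
    asyncio.run(transcribe_and_speak("Привіт! Як справи?", lang="ua"))
```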
11 changes: 11 additions & 0 deletions config.py
@@ -0,0 +1,11 @@
"""File with instructions what LLM must do, rewrite it as much as you wish"""

SYS_MSG = "You are a helpful AI voice assistant (your name is Bot). Generate the most useful\
and factual response possible, carefully considering all previous generated text in your response\
before adding new tokens to the response. Use all of the context of this conversation so your\
response is relevant to the conversation. Make your responses clear and concise, avoiding any\
verbosity. You'll mostly be asked questions in English or Ukrainian. It's mandatory to avoid mixing languages within a single\
sentence. You can provide your response entirely in English or entirely in Ukrainian.\
If you need to use words from both languages in the same sentence, consider transliterating \
(especially names!) one of them. If prompt contain 'ua:' answer in UA if contain 'en:' in english \
(both ways use transliteration if needed)"
1 change: 1 addition & 0 deletions data/chunk copy.wav
@@ -0,0 +1 @@
This is some sample data
1 change: 1 addition & 0 deletions data/chunk.wav
1 change: 1 addition & 0 deletions data/converted_mono.wav
Binary file added data/media/CONSOLE_V.png
Binary file not shown.
Binary file added data/media/GUI_V.png
Binary file not shown.
Binary file added data/media/gif.gif
Binary file not shown.
Binary file added data/wav/EN_test.wav
Binary file not shown.
Binary file added data/wav/UA_test.wav
Binary file not shown.
Binary file added data/wav/chunk.wav
Binary file not shown.
Binary file added data/wav/converted_mono.wav
Binary file not shown.
Binary file added data/wav/microphone_stereo.wav
Binary file not shown.
Binary file added data/wav/transcribed_piece.mp3
Binary file not shown.
107 changes: 107 additions & 0 deletions main.py
@@ -0,0 +1,107 @@
"""Main.py provide entry point to terminal based voice agent with memory and speech abilities \
in UA and EN voices. With this configuration it can be used for studying english or ukrainian, \
both writing and listening skills"""

import json
import sounddevice # fixing ALSA errors
import speech_recognition as sr
from src.english_stt import en_transcribe
from src.identify_lang import identify_language
from src.ollama_tts import ollama_prompt
from src.ukrainian_stt import ua_transcribe
from config import SYS_MSG


def main(memory=False):
    """
    The main entry point for the application.
    Args:
        memory (bool, optional): Whether to load and save conversation history. Defaults to False.
    Returns:
        None
    Automatically detects your language, transcribes it, and pronounces & writes the answer in UA & EN.
    Optionally saves the conversation history to a JSON file.
    """
    history = [{"role": "system", "content": SYS_MSG}]  # default when nothing is loaded
    if memory is True:
        try:
            with open("data/HISTORY.json", "r", encoding="UTF-8") as f:
                history = json.load(f)
        except FileNotFoundError:
            print(
                "No previous history is available, creating a new one.",
                "\t",
                "Попередньої розмови не знайдено, створюю розмову.",
            )
    try:
        listening = True
        while listening:
            with sr.Microphone() as source:
                recognizer = sr.Recognizer()
                recognizer.pause_threshold = 0.8
                recognizer.energy_threshold = 500
                recognizer.adjust_for_ambient_noise(source)  # , duration=1)
                # recognizer.dynamic_energy_threshold = 3000

                try:
                    print("Listening...\t\t Слухаю...")
                    audio = recognizer.listen(source, timeout=5.0)
                    print("Working on it...\t Обробка...")
                    # Save the audio data to a WAV file
                    wav_data = audio.get_wav_data(convert_rate=16000)
                    # Write the WAV data to a file
                    wav_file = "./data/wav/chunk.wav"
                    with open(wav_file, "wb") as file:
                        file.write(wav_data)
                except (sr.UnknownValueError, sr.WaitTimeoutError):
                    print(
                        "Could you repeat please? That wasn't recognized.",
                        "\t\t",
                        "Повторіть будь ласка, не розчула",
                    )
                    continue  # nothing usable was captured, so listen again
                def_lang = identify_language(wav_file)
                # Slavic detections are routed to the Ukrainian transcriber
                if def_lang in [
                    ["uk: Ukrainian"],
                    ["pl: Polish"],
                    ["ru: Russian"],
                    ["be: Belarusian"],
                ]:
                    print("Запит Солов'їною, обробка...")
                    prmpt = ua_transcribe(wav_file)
                    print("Користувач:", prmpt)
                    print("Дай подумати...")
                    ollama_prompt(prompt="ua: " + prmpt, history=history)
                else:
                    print("Detected as English, working on it...")
                    prmpt = en_transcribe(wav_file)
                    if prmpt == "Didn't recognize that.":
                        print("Didn't recognize that.\t\t Не зрозуміла.")
                    else:
                        print("User:", prmpt)
                        print("Wait for the LLM to answer... Зараз відповім...")
                        ollama_prompt(prompt="en: " + prmpt, history=history)
                print("\n", "\n")

    except KeyboardInterrupt:
        listening = False
        print("\n", "Stopped listening.\t Перервано.")
        print("\n", "\n")
        print("HISTORY:", history)
        # Also we can save history to reuse it later
        # Specify the file path and name
        history_path = "./data/HISTORY.json"

        # Serialize the list to JSON
        json_data = json.dumps(history)

        # Write the JSON data to the file
        with open(history_path, "w", encoding="utf-8") as file:
            file.write(json_data)


if __name__ == "__main__":
    main(memory=True)
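
For reference, `data/HISTORY.json` stores plain chat messages. Assuming `src/ollama_tts.py` (not shown in this diff) appends each turn to `history` the way `app.py` appends to session state, a saved file would look roughly like:

```python
# Illustrative HISTORY.json contents (shown as a Python literal); real turns depend on the run.
[
    {"role": "system", "content": "You are a helpful AI voice assistant (your name is Bot). ..."},
    {"role": "user", "content": "en: what's the capital of Ukraine?"},
    {"role": "assistant", "content": "The capital of Ukraine is Kyiv."},
]
```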
26 changes: 26 additions & 0 deletions src/english_stt.py
@@ -0,0 +1,26 @@
"""Module providing speech recognition"""

import speech_recognition as sr


def en_transcribe(wav_filename="./data/wav/EN_test.wav"):
    """Transcribe/recognize an English WAV file."""
    recognizer = sr.Recognizer()

    # Open the audio file and record it for recognition
    with sr.AudioFile(wav_filename) as source:
        audio_data = recognizer.record(source)
    try:
        # Uses Google's legacy web-speech endpoint with its built-in generic key
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "Didn't recognize that."
    except sr.RequestError as e:
        return f"Could not request results; {e}"


# Example usage
if __name__ == "__main__":
    SAMPLE = "./data/wav/EN_test.wav"
    response = en_transcribe(SAMPLE)
    print()
    print("Response: ", response)