diff --git a/.gitignore b/.gitignore
index 5da200f..f1b6a50 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 __pycache__/
 *.wav
 keys.py
-.venv/
\ No newline at end of file
+.venv/
+.DS_Store
+ecout_env
\ No newline at end of file
diff --git a/AudioRecorder.py b/AudioRecorder.py
index 8c09227..7012c7a 100644
--- a/AudioRecorder.py
+++ b/AudioRecorder.py
@@ -1,11 +1,14 @@
-import custom_speech_recognition as sr
-import pyaudiowpatch as pyaudio
 from datetime import datetime
 
+import pyaudiowpatch as pyaudio
+
+import custom_speech_recognition as sr
+
 RECORD_TIMEOUT = 3
 ENERGY_THRESHOLD = 1000
 DYNAMIC_ENERGY_THRESHOLD = False
 
+
 class BaseRecorder:
     def __init__(self, source, source_name):
         self.recorder = sr.Recognizer()
@@ -25,23 +28,31 @@ def adjust_for_noise(self, device_name, msg):
         print(f"[INFO] Completed ambient noise adjustment for {device_name}.")
 
     def record_into_queue(self, audio_queue):
-        def record_callback(_, audio:sr.AudioData) -> None:
+        def record_callback(_, audio: sr.AudioData) -> None:
             data = audio.get_raw_data()
             audio_queue.put((self.source_name, data, datetime.utcnow()))
 
-        self.recorder.listen_in_background(self.source, record_callback, phrase_time_limit=RECORD_TIMEOUT)
+        self.recorder.listen_in_background(
+            self.source, record_callback, phrase_time_limit=RECORD_TIMEOUT
+        )
+
 
 class DefaultMicRecorder(BaseRecorder):
     def __init__(self):
         super().__init__(source=sr.Microphone(sample_rate=16000), source_name="You")
-        self.adjust_for_noise("Default Mic", "Please make some noise from the Default Mic...")
+        self.adjust_for_noise(
+            "Default Mic", "Please make some noise from the Default Mic..."
+        )
+
 
 class DefaultSpeakerRecorder(BaseRecorder):
     def __init__(self):
         with pyaudio.PyAudio() as p:
             wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI)
-            default_speakers = p.get_device_info_by_index(wasapi_info["defaultOutputDevice"])
-
+            default_speakers = p.get_device_info_by_index(
+                wasapi_info["defaultOutputDevice"]
+            )
+
             if not default_speakers["isLoopbackDevice"]:
                 for loopback in p.get_loopback_device_info_generator():
                     if default_speakers["name"] in loopback["name"]:
@@ -49,11 +60,16 @@ def __init__(self):
                         break
                 else:
                     print("[ERROR] No loopback device found.")
-
-        source = sr.Microphone(speaker=True,
-                               device_index= default_speakers["index"],
-                               sample_rate=int(default_speakers["defaultSampleRate"]),
-                               chunk_size=pyaudio.get_sample_size(pyaudio.paInt16),
-                               channels=default_speakers["maxInputChannels"])
+
+        source = sr.Microphone(
+            speaker=True,
+            device_index=default_speakers["index"],
+            sample_rate=int(default_speakers["defaultSampleRate"]),
+            chunk_size=pyaudio.get_sample_size(pyaudio.paInt16),
+            channels=default_speakers["maxInputChannels"],
+        )
         super().__init__(source=source, source_name="Speaker")
-        self.adjust_for_noise("Default Speaker", "Please make or play some noise from the Default Speaker...")
\ No newline at end of file
+        self.adjust_for_noise(
+            "Default Speaker",
+            "Please make or play some noise from the Default Speaker...",
+        )
diff --git a/AudioTranscriber.py b/AudioTranscriber.py
index b37eae8..9942d0a 100644
--- a/AudioTranscriber.py
+++ b/AudioTranscriber.py
@@ -1,19 +1,20 @@
-import whisper
-import torch
-import wave
+import io
 import os
 import threading
-import tempfile
-import custom_speech_recognition as sr
-import io
+import wave
 from datetime import timedelta
-import pyaudiowpatch as pyaudio
 from heapq import merge
+from tempfile import NamedTemporaryFile
+
+import pyaudiowpatch as pyaudio
+
+import custom_speech_recognition as sr
 
 PHRASE_TIMEOUT = 3.05
 
 MAX_PHRASES = 10
 
+
 class AudioTranscriber:
     def __init__(self, mic_source, speaker_source, model):
         self.transcript_data = {"You": [], "Speaker": []}
@@ -24,20 +25,20 @@ def __init__(self, mic_source, speaker_source, model):
                 "sample_rate": mic_source.SAMPLE_RATE,
                 "sample_width": mic_source.SAMPLE_WIDTH,
                 "channels": mic_source.channels,
-                "last_sample": bytes(),
+                "last_sample": b"",
                 "last_spoken": None,
                 "new_phrase": True,
-                "process_data_func": self.process_mic_data
+                "process_data_func": self.process_mic_data,
             },
             "Speaker": {
                 "sample_rate": speaker_source.SAMPLE_RATE,
                 "sample_width": speaker_source.SAMPLE_WIDTH,
                 "channels": speaker_source.channels,
-                "last_sample": bytes(),
+                "last_sample": b"",
                 "last_spoken": None,
                 "new_phrase": True,
-                "process_data_func": self.process_speaker_data
-            }
+                "process_data_func": self.process_speaker_data,
+            },
         }
 
     def transcribe_audio_queue(self, audio_queue):
@@ -46,40 +47,44 @@ def transcribe_audio_queue(self, audio_queue):
             self.update_last_sample_and_phrase_status(who_spoke, data, time_spoken)
             source_info = self.audio_sources[who_spoke]
 
-            text = ''
-            try:
-                fd, path = tempfile.mkstemp(suffix=".wav")
-                os.close(fd)
-                source_info["process_data_func"](source_info["last_sample"], path)
-                text = self.audio_model.get_transcription(path)
-            except Exception as e:
-                print(e)
-            finally:
-                os.unlink(path)
-
-            if text != '' and text.lower() != 'you':
+            text = ""
+            temp_file = NamedTemporaryFile(delete=False, suffix=".wav")
+            temp_file.close()
+
+            source_info["process_data_func"](source_info["last_sample"], temp_file.name)
+            text = self.audio_model.get_transcription(temp_file.name)
+
+            os.unlink(temp_file.name)
+
+            if text != "" and text.lower() != "you":
                 self.update_transcript(who_spoke, text, time_spoken)
                 self.transcript_changed_event.set()
 
     def update_last_sample_and_phrase_status(self, who_spoke, data, time_spoken):
         source_info = self.audio_sources[who_spoke]
-        if source_info["last_spoken"] and time_spoken - source_info["last_spoken"] > timedelta(seconds=PHRASE_TIMEOUT):
-            source_info["last_sample"] = bytes()
+        if source_info["last_spoken"] and time_spoken - source_info[
+            "last_spoken"
+        ] > timedelta(seconds=PHRASE_TIMEOUT):
+            source_info["last_sample"] = b""
             source_info["new_phrase"] = True
         else:
             source_info["new_phrase"] = False
 
         source_info["last_sample"] += data
-        source_info["last_spoken"] = time_spoken 
+        source_info["last_spoken"] = time_spoken
 
     def process_mic_data(self, data, temp_file_name):
-        audio_data = sr.AudioData(data, self.audio_sources["You"]["sample_rate"], self.audio_sources["You"]["sample_width"])
+        audio_data = sr.AudioData(
+            data,
+            self.audio_sources["You"]["sample_rate"],
+            self.audio_sources["You"]["sample_width"],
+        )
         wav_data = io.BytesIO(audio_data.get_wav_data())
-        with open(temp_file_name, 'w+b') as f:
+        with open(temp_file_name, "w+b") as f:
             f.write(wav_data.read())
 
     def process_speaker_data(self, data, temp_file_name):
-        with wave.open(temp_file_name, 'wb') as wf:
+        with wave.open(temp_file_name, "wb") as wf:
             wf.setnchannels(self.audio_sources["Speaker"]["channels"])
             p = pyaudio.PyAudio()
             wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
@@ -98,18 +103,23 @@ def update_transcript(self, who_spoke, text, time_spoken):
             transcript[0] = (f"{who_spoke}: [{text}]\n\n", time_spoken)
 
     def get_transcript(self):
-        combined_transcript = list(merge(
-            self.transcript_data["You"], self.transcript_data["Speaker"],
-            key=lambda x: x[1], reverse=True))
+        combined_transcript = list(
+            merge(
+                self.transcript_data["You"],
self.transcript_data["Speaker"], + key=lambda x: x[1], + reverse=True, + ) + ) combined_transcript = combined_transcript[:MAX_PHRASES] return "".join([t[0] for t in combined_transcript]) - + def clear_transcript_data(self): self.transcript_data["You"].clear() self.transcript_data["Speaker"].clear() - self.audio_sources["You"]["last_sample"] = bytes() - self.audio_sources["Speaker"]["last_sample"] = bytes() + self.audio_sources["You"]["last_sample"] = b"" + self.audio_sources["Speaker"]["last_sample"] = b"" self.audio_sources["You"]["new_phrase"] = True - self.audio_sources["Speaker"]["new_phrase"] = True \ No newline at end of file + self.audio_sources["Speaker"]["new_phrase"] = True diff --git a/GPTResponder.py b/GPTResponder.py index 2adab72..9527d2f 100644 --- a/GPTResponder.py +++ b/GPTResponder.py @@ -1,26 +1,30 @@ +import time + import openai from keys import OPENAI_API_KEY -from prompts import create_prompt, INITIAL_RESPONSE -import time + +from prompts import INITIAL_RESPONSE, create_prompt openai.api_key = OPENAI_API_KEY + def generate_response_from_transcript(transcript): try: response = openai.ChatCompletion.create( - model="gpt-3.5-turbo-0301", - messages=[{"role": "system", "content": create_prompt(transcript)}], - temperature = 0.0 + model="gpt-3.5-turbo-0301", + messages=[{"role": "system", "content": create_prompt(transcript)}], + temperature=0.0, ) except Exception as e: print(e) - return '' + return "" full_response = response.choices[0].message.content try: - return full_response.split('[')[1].split(']')[0] - except: - return '' - + return full_response.split("[")[1].split("]")[0] + except IndexError: + return "" + + class GPTResponder: def __init__(self): self.response = INITIAL_RESPONSE @@ -31,14 +35,16 @@ def respond_to_transcriber(self, transcriber): if transcriber.transcript_changed_event.is_set(): start_time = time.time() - transcriber.transcript_changed_event.clear() + transcriber.transcript_changed_event.clear() transcript_string = transcriber.get_transcript() response = generate_response_from_transcript(transcript_string) - + end_time = time.time() # Measure end time - execution_time = end_time - start_time # Calculate the time it took to execute the function - - if response != '': + + # Calculate the time it took to execute the function + execution_time = end_time - start_time + + if response != "": self.response = response remaining_time = self.response_interval - execution_time @@ -48,4 +54,4 @@ def respond_to_transcriber(self, transcriber): time.sleep(0.3) def update_response_interval(self, interval): - self.response_interval = interval \ No newline at end of file + self.response_interval = interval diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..091a8e5 --- /dev/null +++ b/Makefile @@ -0,0 +1,21 @@ +lint: + @echo + ruff . + @echo + black --check --diff --color . + @echo + pip-audit + +format: + ruff --silent --exit-zero --fix . + black . + +precommit: + make lint + make format + +venv: + python -m venv ecout_env + +install: + pip install -r requirements.txt \ No newline at end of file diff --git a/README.md b/README.md index e7eaa6c..e847af0 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Ecoute is a live transcription tool that provides real-time transcripts for both ## 📖 Demo -https://github.com/SevaSk/ecoute/assets/50382291/8ac48927-8a26-49fd-80e9-48f980986208 + Ecoute is designed to help users in their conversations by providing live transcriptions and generating contextually relevant responses. 
@@ -18,18 +18,22 @@ Follow these steps to set up and run Ecoute on your local machine.
 - Python >=3.8.0
 - An OpenAI API key that can access OpenAI API (set up a paid account OpenAI account)
 - Windows OS (Not tested on others)
-- FFmpeg 
+- FFmpeg
 
 If FFmpeg is not installed in your system, you can follow the steps below to install it.
 
 First, you need to install Chocolatey, a package manager for Windows. Open your PowerShell as Administrator and run the following command:
+
 ```
 Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
 ```
+
 Once Chocolatey is installed, you can install FFmpeg by running the following command in your PowerShell:
+
 ```
 choco install ffmpeg
 ```
+
 Please ensure that you run these commands in a PowerShell window with administrator privileges. If you face any issues during the installation, you can visit the official Chocolatey and FFmpeg websites for troubleshooting.
 
 ### 🔧 Installation
@@ -51,7 +55,7 @@ Please ensure that you run these commands in a PowerShell window with administra
    ```
   pip install -r requirements.txt
   ```
-   
+
 4. Create a `keys.py` file in the ecoute directory and add your OpenAI API key:
 
    - Option 1: You can utilize a command on your command prompt. Run the following command, ensuring to replace "API KEY" with your actual OpenAI API key:
@@ -61,10 +65,11 @@ Please ensure that you run these commands in a PowerShell window with administra
      ```
 
   - Option 2: You can create the keys.py file manually. Open up your text editor of choice and enter the following content:
-   
+
      ```
      OPENAI_API_KEY="API KEY"
      ```
+
      Replace "API KEY" with your actual OpenAI API key. Save this file as keys.py within the ecoute directory.
 
 ### 🎬 Running Ecoute
@@ -102,3 +107,60 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
 ## 🤝 Contributing
 
 Contributions are welcome! Feel free to open issues or submit pull requests to improve Ecoute.
+
+### Installation
+
+To set up the environment and install the necessary dependencies, follow these steps based on your operating system.
+
+#### Windows
+
+1. Install make on your Windows machine.
+
+   ```shell
+   choco install make
+   ```
+
+2. Create a virtual environment:
+
+   ```shell
+   make venv
+   ```
+
+3. Activate the virtual environment:
+
+   ```shell
+   .\ecout_env\Scripts\activate
+   ```
+
+4. Install the required packages:
+
+   ```shell
+   make install
+   ```
+
+#### Linux & macOS
+
+1. Create a virtual environment:
+
+   ```shell
+   make venv
+   ```
+
+2. Activate the virtual environment by running the command:
+
+   ```shell
+   source ecout_env/bin/activate
+   ```
+
+3. Install the required Python packages:
+
+   ```shell
+   make install
+   ```
+
+### Code Quality
+
+Before submitting a pull request, run `make precommit` and resolve any issues. Additionally, here are some useful commands:
+
+- `make lint`
+- `make format`
diff --git a/TranscriberModels.py b/TranscriberModels.py
index fe31108..ced2d12 100644
--- a/TranscriberModels.py
+++ b/TranscriberModels.py
@@ -1,7 +1,9 @@
-import openai
-import whisper
 import os
+
+import openai
 import torch
+import whisper
+
 
 def get_model(use_api):
     if use_api:
@@ -9,19 +11,23 @@ def get_model(use_api):
     else:
         return WhisperTranscriber()
 
+
 class WhisperTranscriber:
     def __init__(self):
-        self.audio_model = whisper.load_model(os.path.join(os.getcwd(), 'tiny.en.pt'))
-        print(f"[INFO] Whisper using GPU: " + str(torch.cuda.is_available()))
+        self.audio_model = whisper.load_model(os.path.join(os.getcwd(), "tiny.en.pt"))
+        print("[INFO] Whisper using GPU: " + str(torch.cuda.is_available()))
 
     def get_transcription(self, wav_file_path):
         try:
-            result = self.audio_model.transcribe(wav_file_path, fp16=torch.cuda.is_available())
+            result = self.audio_model.transcribe(
+                wav_file_path, fp16=torch.cuda.is_available()
+            )
         except Exception as e:
             print(e)
-            return ''
-        return result['text'].strip()
-
+            return ""
+        return result["text"].strip()
+
+
 class APIWhisperTranscriber:
     def get_transcription(self, wav_file_path):
         try:
@@ -29,5 +35,6 @@ def get_transcription(self, wav_file_path):
             result = openai.Audio.transcribe("whisper-1", audio_file)
         except Exception as e:
             print(e)
-            return ''
-        return result['text'].strip()
\ No newline at end of file
+            return ""
+
+        return result["text"].strip()
diff --git a/main.py b/main.py
index 75f1c21..08e1ae6 100644
--- a/main.py
+++ b/main.py
@@ -1,25 +1,34 @@
-import threading
-from AudioTranscriber import AudioTranscriber
-from GPTResponder import GPTResponder
-import customtkinter as ctk
-import AudioRecorder
 import queue
-import time
-import torch
 import sys
+import threading
+import time
+
+import customtkinter as ctk
+
+import AudioRecorder
 import TranscriberModels
-import subprocess
+from AudioTranscriber import AudioTranscriber
+from GPTResponder import GPTResponder
+
 
 def write_in_textbox(textbox, text):
     textbox.delete("0.0", "end")
     textbox.insert("0.0", text)
 
+
 def update_transcript_UI(transcriber, textbox):
     transcript_string = transcriber.get_transcript()
     write_in_textbox(textbox, transcript_string)
     textbox.after(300, update_transcript_UI, transcriber, textbox)
 
-def update_response_UI(responder, textbox, update_interval_slider_label, update_interval_slider, freeze_state):
+
+def update_response_UI(
+    responder,
+    textbox,
+    update_interval_slider_label,
+    update_interval_slider,
+    freeze_state,
+):
     if not freeze_state[0]:
         response = responder.response
 
@@ -29,41 +38,68 @@ def update_response_UI(responder, textbox, update_interval_slider_label, update_
         update_interval = int(update_interval_slider.get())
         responder.update_response_interval(update_interval)
 
-        update_interval_slider_label.configure(text=f"Update interval: {update_interval} seconds")
+        update_interval_slider_label.configure(
+            text=f"Update interval: {update_interval} seconds"
+        )
+
+    textbox.after(
+        300,
+        update_response_UI,
+        responder,
+        textbox,
+        update_interval_slider_label,
+        update_interval_slider,
+        freeze_state,
+    )
 
-    textbox.after(300, update_response_UI, responder, textbox, update_interval_slider_label, update_interval_slider, freeze_state)
 
 def clear_context(transcriber, audio_queue):
     transcriber.clear_transcript_data()
     with audio_queue.mutex:
         audio_queue.queue.clear()
 
+
 def create_ui_components(root):
     ctk.set_appearance_mode("dark")
     ctk.set_default_color_theme("dark-blue")
     root.title("Ecoute")
-    root.configure(bg='#252422')
+    root.configure(bg="#252422")
     root.geometry("1000x600")
 
     font_size = 20
 
-    transcript_textbox = ctk.CTkTextbox(root, width=300, font=("Arial", font_size), text_color='#FFFCF2', wrap="word")
+    transcript_textbox = ctk.CTkTextbox(
+        root, width=300, font=("Arial", font_size), text_color="#FFFCF2", wrap="word"
+    )
     transcript_textbox.grid(row=0, column=0, padx=10, pady=20, sticky="nsew")
 
-    response_textbox = ctk.CTkTextbox(root, width=300, font=("Arial", font_size), text_color='#639cdc', wrap="word")
+    response_textbox = ctk.CTkTextbox(
+        root, width=300, font=("Arial", font_size), text_color="#639cdc", wrap="word"
+    )
     response_textbox.grid(row=0, column=1, padx=10, pady=20, sticky="nsew")
 
     freeze_button = ctk.CTkButton(root, text="Freeze", command=None)
    freeze_button.grid(row=1, column=1, padx=10, pady=3, sticky="nsew")
 
-    update_interval_slider_label = ctk.CTkLabel(root, text=f"", font=("Arial", 12), text_color="#FFFCF2")
+    update_interval_slider_label = ctk.CTkLabel(
+        root, text="", font=("Arial", 12), text_color="#FFFCF2"
+    )
     update_interval_slider_label.grid(row=2, column=1, padx=10, pady=3, sticky="nsew")
 
-    update_interval_slider = ctk.CTkSlider(root, from_=1, to=10, width=300, height=20, number_of_steps=9)
+    update_interval_slider = ctk.CTkSlider(
+        root, from_=1, to=10, width=300, height=20, number_of_steps=9
+    )
     update_interval_slider.set(2)
     update_interval_slider.grid(row=3, column=1, padx=10, pady=10, sticky="nsew")
 
-    return transcript_textbox, response_textbox, update_interval_slider, update_interval_slider_label, freeze_button
+    return (
+        transcript_textbox,
+        response_textbox,
+        update_interval_slider,
+        update_interval_slider_label,
+        freeze_button,
+    )
+
 
 def main():
     try:
@@ -73,7 +109,13 @@ def main():
         return
 
     root = ctk.CTk()
-    transcript_textbox, response_textbox, update_interval_slider, update_interval_slider_label, freeze_button = create_ui_components(root)
+    (
+        transcript_textbox,
+        response_textbox,
+        update_interval_slider,
+        update_interval_slider_label,
+        freeze_button,
+    ) = create_ui_components(root)
 
     audio_queue = queue.Queue()
 
@@ -85,15 +127,21 @@ def main():
     speaker_audio_recorder = AudioRecorder.DefaultSpeakerRecorder()
     speaker_audio_recorder.record_into_queue(audio_queue)
 
-    model = TranscriberModels.get_model('--api' in sys.argv)
+    model = TranscriberModels.get_model("--api" in sys.argv)
 
-    transcriber = AudioTranscriber(user_audio_recorder.source, speaker_audio_recorder.source, model)
-    transcribe = threading.Thread(target=transcriber.transcribe_audio_queue, args=(audio_queue,))
+    transcriber = AudioTranscriber(
+        user_audio_recorder.source, speaker_audio_recorder.source, model
+    )
+    transcribe = threading.Thread(
+        target=transcriber.transcribe_audio_queue, args=(audio_queue,)
+    )
     transcribe.daemon = True
     transcribe.start()
 
     responder = GPTResponder()
-    respond = threading.Thread(target=responder.respond_to_transcriber, args=(transcriber,))
+    respond = threading.Thread(
+        target=responder.respond_to_transcriber, args=(transcriber,)
+    )
     respond.daemon = True
     respond.start()
 
@@ -106,23 +154,42 @@ def main():
     root.grid_columnconfigure(0, weight=2)
     root.grid_columnconfigure(1, weight=1)
 
-    # Add the clear transcript button to the UI
-    clear_transcript_button = ctk.CTkButton(root, text="Clear Transcript", command=lambda: clear_context(transcriber, audio_queue, ))
+    # Add the clear transcript button to the UI
+    clear_transcript_button = ctk.CTkButton(
+        root,
+        text="Clear Transcript",
+        command=lambda: clear_context(
+            transcriber,
+            audio_queue,
+        ),
+    )
     clear_transcript_button.grid(row=1, column=0, padx=10, pady=3, sticky="nsew")
 
-    freeze_state = [False] # Using list to be able to change its content inside inner functions
+    freeze_state = [
+        False
+    ]  # Using list to be able to change its content inside inner functions
+
     def freeze_unfreeze():
         freeze_state[0] = not freeze_state[0]  # Invert the freeze state
         freeze_button.configure(text="Unfreeze" if freeze_state[0] else "Freeze")
 
     freeze_button.configure(command=freeze_unfreeze)
 
-    update_interval_slider_label.configure(text=f"Update interval: {update_interval_slider.get()} seconds")
+    update_interval_slider_label.configure(
+        text=f"Update interval: {update_interval_slider.get()} seconds"
+    )
 
     update_transcript_UI(transcriber, transcript_textbox)
-    update_response_UI(responder, response_textbox, update_interval_slider_label, update_interval_slider, freeze_state)
-
+    update_response_UI(
+        responder,
+        response_textbox,
+        update_interval_slider_label,
+        update_interval_slider,
+        freeze_state,
+    )
+
     root.mainloop()
 
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/prompts.py b/prompts.py
index 5145b91..efcb0d6 100644
--- a/prompts.py
+++ b/prompts.py
@@ -1,7 +1,13 @@
 INITIAL_RESPONSE = "Welcome to Ecoute 👋"
+
+
 def create_prompt(transcript):
-    return f"""You are a casual pal, genuinely interested in the conversation at hand. A poor transcription of conversation is given below.
+    return f"""You are a casual pal, genuinely interested in the conversation at hand.
+A poor transcription of conversation is given below.
 
 {transcript}.
 
-Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly."""
\ No newline at end of file
+Please respond, in detail, to the conversation. Confidently give a straightforward
+response to the speaker, even if you don't understand them. Give your response in
+square brackets. DO NOT ask to repeat, and DO NOT ask for clarification.
+Just answer the speaker directly."""
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..3e0bd84
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,12 @@
+[tool.black]
+exclude = ["custom_speech_recognition"]
+
+[tool.ruff]
+select = [
+    # "E",  # pycodestyle
+    "F",   # pyflakes
+    "I",   # isort
+    "UP",  # pyupgrade
+]
+src = ["."]
+exclude = ["custom_speech_recognition"]
diff --git a/requirements.txt b/requirements.txt
index 78f1554..49c0397 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,6 @@
+black
+ruff
+pip-audit
 numpy==1.24.3
 openai-whisper==20230314
 Wave==0.0.2