Macos development #143

Status: Open

Wants to merge 31 commits into base branch: 29-add-option-to-use-speech-to-text-api-rather-than-transcribing-locally

Changes from all commits (31 commits):
0227cc7 Merge pull request #30 from SevaSk/29-add-option-to-use-speech-to-tex… (SevaSk, May 30, 2023)
d8a8f52 Update README.md (SevaSk, May 30, 2023)
acff327 return nothing on error in get_transcription (SevaSk, May 30, 2023)
66da757 Merge branch 'main' of https://github.com/SevaSk/ecoute (SevaSk, May 30, 2023)
143e1de Update README.md (SevaSk, May 30, 2023)
a0e163f Update README.md (SevaSk, May 30, 2023)
fdd761d Update README.md (SevaSk, May 30, 2023)
8abdb19 fixed potential temp disk memory leak (SevaSk, May 30, 2023)
46d37a3 Merge branch 'main' of https://github.com/SevaSk/ecoute (SevaSk, May 30, 2023)
1e96946 Update README.md (SevaSk, May 31, 2023)
09f7349 Update README.md (SevaSk, May 31, 2023)
a024716 Update README.md (SevaSk, May 31, 2023)
2166ae0 Update README.md (SevaSk, May 31, 2023)
418700a Update README.md (SevaSk, May 31, 2023)
10802c3 Update README.md (zarifpour, May 31, 2023)
cd441b4 Merge pull request #42 from zarifpour/patch-1 (SevaSk, May 31, 2023)
6a20d04 fixed files opening and closing improperly (SevaSk, May 31, 2023)
70f6675 Merge branch 'main' of https://github.com/SevaSk/ecoute (SevaSk, May 31, 2023)
b95b162 API now transcribes rather then translates. (SevaSk, Jun 1, 2023)
210e942 Update README.md (SevaSk, Jun 1, 2023)
39573cc Update README.md (SevaSk, Jun 1, 2023)
fb7c6e1 Update README.md (SevaSk, Jun 1, 2023)
92cf879 Update README.md (SevaSk, Jun 1, 2023)
811c111 catching exceptions when making temp_file (SevaSk, Jun 1, 2023)
1b958ad Merge branch 'main' of https://github.com/SevaSk/ecoute (SevaSk, Jun 1, 2023)
29389e9 catch source is none error (SevaSk, Jun 1, 2023)
54f98f8 file handling possible bug fix (SevaSk, Jun 2, 2023)
579fdc9 Catching no ffmpeg exception (SevaSk, Jun 3, 2023)
7264e73 Update README.md (SevaSk, Jun 7, 2023)
540bfb1 Used Pyaudion for Mac os (shosseini811, Aug 24, 2023)
162f3a6 Fixing The delay in the freeze and unfreeze operation (shosseini811, Aug 24, 2023)
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,4 +1,5 @@
__pycache__/
*.wav
keys.py
-.venv/
+.venv/
+.DS_Store
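(.DS_Store is the Finder metadata file that macOS drops into any directory browsed with Finder; ignoring it is a standard companion change to a macOS port.)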
38 changes: 20 additions & 18 deletions AudioRecorder.py
@@ -1,5 +1,5 @@
import custom_speech_recognition as sr
-import pyaudiowpatch as pyaudio
+import pyaudio
from datetime import datetime

RECORD_TIMEOUT = 3
@@ -11,6 +11,10 @@ def __init__(self, source, source_name):
        self.recorder = sr.Recognizer()
        self.recorder.energy_threshold = ENERGY_THRESHOLD
        self.recorder.dynamic_energy_threshold = DYNAMIC_ENERGY_THRESHOLD
+
+        if source is None:
+            raise ValueError("audio source can't be None")
+
        self.source = source
        self.source_name = source_name
@@ -34,22 +38,20 @@ def __init__(self):

class DefaultSpeakerRecorder(BaseRecorder):
    def __init__(self):
-        with pyaudio.PyAudio() as p:
-            wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI)
-            default_speakers = p.get_device_info_by_index(wasapi_info["defaultOutputDevice"])
-
-            if not default_speakers["isLoopbackDevice"]:
-                for loopback in p.get_loopback_device_info_generator():
-                    if default_speakers["name"] in loopback["name"]:
-                        default_speakers = loopback
-                        break
-                else:
-                    print("[ERROR] No loopback device found.")
+        p = pyaudio.PyAudio()
+
+        # Get default input device information
+        default_input_device_info = p.get_default_input_device_info()

-        source = sr.Microphone(speaker=True,
-                               device_index= default_speakers["index"],
-                               sample_rate=int(default_speakers["defaultSampleRate"]),
-                               chunk_size=pyaudio.get_sample_size(pyaudio.paInt16),
-                               channels=default_speakers["maxInputChannels"])
+        source = sr.Microphone(
+            device_index=default_input_device_info["index"],
+            sample_rate=int(default_input_device_info["defaultSampleRate"]),
+            chunk_size=pyaudio.get_sample_size(pyaudio.paInt16),
+            channels=default_input_device_info["maxInputChannels"]
+        )
        super().__init__(source=source, source_name="Speaker")
-        self.adjust_for_noise("Default Speaker", "Please make or play some noise from the Default Speaker...")
+        self.adjust_for_noise("Default Speaker", "Please make or play some noise from the Default Speaker...")
+
+        # Don't forget to close PyAudio object when you're done
+        p.terminate()

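Context for this rewrite: pyaudiowpatch exposes WASAPI loopback devices, which exist only on Windows, so the macOS port falls back to plain PyAudio and its default input device. A minimal sketch for checking which device that will be on a given machine (standard PyAudio calls; this helper script is illustrative, not part of the PR):

```python
import pyaudio

# Print the default input device and enumerate all devices PyAudio can see.
p = pyaudio.PyAudio()
try:
    default = p.get_default_input_device_info()
    print(f"default input: [{default['index']}] {default['name']}"
          f" @ {int(default['defaultSampleRate'])} Hz")
    for i in range(p.get_device_count()):
        info = p.get_device_info_by_index(i)
        print(f"[{i}] {info['name']} (input channels: {info['maxInputChannels']})")
finally:
    p.terminate()
```

Note the trade-off: the "Speaker" source now records from the default input device, so capturing actual speaker output on macOS still requires a loopback driver (for example BlackHole) set as that input; this diff does not configure one.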
29 changes: 17 additions & 12 deletions AudioTranscriber.py
@@ -3,11 +3,11 @@
import wave
import os
import threading
-from tempfile import NamedTemporaryFile
+import tempfile
import custom_speech_recognition as sr
import io
from datetime import timedelta
-import pyaudiowpatch as pyaudio
+import pyaudio
from heapq import merge

PHRASE_TIMEOUT = 3.05
@@ -45,8 +45,17 @@ def transcribe_audio_queue(self, audio_queue):
            who_spoke, data, time_spoken = audio_queue.get()
            self.update_last_sample_and_phrase_status(who_spoke, data, time_spoken)
            source_info = self.audio_sources[who_spoke]
-            temp_file = source_info["process_data_func"](source_info["last_sample"])
-            text = self.audio_model.get_transcription(temp_file)
+
+            text = ''
+            try:
+                fd, path = tempfile.mkstemp(suffix=".wav")
+                os.close(fd)
+                source_info["process_data_func"](source_info["last_sample"], path)
+                text = self.audio_model.get_transcription(path)
+            except Exception as e:
+                print(e)
+            finally:
+                os.unlink(path)

            if text != '' and text.lower() != 'you':
                self.update_transcript(who_spoke, text, time_spoken)
@@ -63,23 +72,19 @@ def update_last_sample_and_phrase_status(self, who_spoke, data, time_spoken):
            source_info["last_sample"] += data
        source_info["last_spoken"] = time_spoken

-    def process_mic_data(self, data):
-        temp_file = NamedTemporaryFile().name
+    def process_mic_data(self, data, temp_file_name):
        audio_data = sr.AudioData(data, self.audio_sources["You"]["sample_rate"], self.audio_sources["You"]["sample_width"])
        wav_data = io.BytesIO(audio_data.get_wav_data())
-        with open(temp_file, 'w+b') as f:
+        with open(temp_file_name, 'w+b') as f:
            f.write(wav_data.read())
-        return temp_file

-    def process_speaker_data(self, data):
-        temp_file = NamedTemporaryFile().name
-        with wave.open(temp_file, 'wb') as wf:
+    def process_speaker_data(self, data, temp_file_name):
+        with wave.open(temp_file_name, 'wb') as wf:
            wf.setnchannels(self.audio_sources["Speaker"]["channels"])
            p = pyaudio.PyAudio()
            wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
            wf.setframerate(self.audio_sources["Speaker"]["sample_rate"])
            wf.writeframes(data)
-        return temp_file

    def update_transcript(self, who_spoke, text, time_spoken):
        source_info = self.audio_sources[who_spoke]
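The switch from `NamedTemporaryFile().name` to `tempfile.mkstemp` underpins the leak fix from commit 8abdb19: `mkstemp` actually creates the file and returns an open descriptor, the descriptor is closed immediately so the path can be reopened by name, and the `finally` block guarantees removal even when transcription throws. A stdlib-only sketch of the same lifecycle (`write_audio` and `transcribe` are placeholder callables):

```python
import os
import tempfile

def with_temp_wav(write_audio, transcribe):
    # Create the file and close the OS-level handle right away so the
    # callables below can reopen the path by name (also works on Windows).
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        write_audio(path)        # e.g. process_mic_data(data, path)
        return transcribe(path)  # e.g. audio_model.get_transcription(path)
    finally:
        os.unlink(path)          # delete the file even if transcription fails
```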
27 changes: 17 additions & 10 deletions README.md
@@ -15,7 +15,7 @@ Follow these steps to set up and run Ecoute on your local machine.

### 📋 Prerequisites

-- Python 3.x
+- Python >=3.8.0
- An OpenAI API key
- Windows OS (Not tested on others)
- FFmpeg
@@ -28,7 +28,7 @@ Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManage
```
Once Chocolatey is installed, you can install FFmpeg by running the following command in your PowerShell:
```
-choco install ffmpeg-full
+choco install ffmpeg
```
Please ensure that you run these commands in a PowerShell window with administrator privileges. If you face any issues during the installation, you can visit the official Chocolatey and FFmpeg websites for troubleshooting.

@@ -54,11 +54,18 @@ Please ensure that you run these commands in a PowerShell window with administra

4. Create a `keys.py` file in the ecoute directory and add your OpenAI API key:

-```
-echo 'OPENAI_API_KEY = "API KEY"' > keys.py
-```
-
-Replace `API KEY` with your actual OpenAI API key.
+- Option 1: Use a command in your command prompt. Run the following command, replacing "API KEY" with your actual OpenAI API key:
+
+```
+python -c "with open('keys.py', 'w', encoding='utf-8') as f: f.write('OPENAI_API_KEY=\"API KEY\"')"
+```
+
+- Option 2: Create the keys.py file manually. Open your text editor of choice and enter the following content:
+
+```
+OPENAI_API_KEY="API KEY"
+```
+Replace "API KEY" with your actual OpenAI API key. Save this file as keys.py within the ecoute directory.
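Either option produces the same plain Python module. For reference, a minimal sketch of the consuming side, assuming ecoute imports the key in the usual way (the actual import site is not shown in this diff):

```python
import openai
from keys import OPENAI_API_KEY  # the file created in step 4

# openai==0.27.x reads the key from this module-level attribute.
openai.api_key = OPENAI_API_KEY
```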

### 🎬 Running Ecoute

@@ -68,25 +75,25 @@ Run the main script:
python main.py
```

-For a better and faster version, use:
+For a better and faster version that also works with most languages, use:

```
python main.py --api
```

Upon initiation, Ecoute will begin transcribing your microphone input and speaker output in real-time, generating a suggested response based on the conversation. Please note that it might take a few seconds for the system to warm up before the transcription becomes real-time.

-The --api flag significantly enhances transcription speed and accuracy, and it's expected to be the default option in future releases. However, keep in mind that using the Whisper API will consume more OpenAI credits than using the local model. This increased cost is attributed to the advanced features and capabilities that the Whisper API provides. Despite the additional cost, the considerable improvements in speed and transcription accuracy might make it a worthwhile investment for your use case.
+The --api flag uses the Whisper API for transcription. This significantly improves transcription speed and accuracy, and it works in most languages (rather than English only without the flag). It is expected to become the default option in future releases. However, keep in mind that the Whisper API consumes more OpenAI credits than the local model, a cost that reflects the advanced capabilities the API provides. Despite the additional expense, the substantial improvements in speed and accuracy may make it a worthwhile investment for your use case.
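For orientation, the flag effectively selects between the two transcriber classes in TranscriberModels.py below. A sketch of that dispatch (illustrative wiring only, since main.py's actual model construction is not part of this excerpt):

```python
import sys
import TranscriberModels

# Illustrative: choose the API-backed transcriber when --api is passed.
if "--api" in sys.argv:
    model = TranscriberModels.APIWhisperTranscriber()
else:
    model = TranscriberModels.WhisperTranscriber()  # local 'tiny' Whisper

text = model.get_transcription("sample.wav")  # hypothetical input file
```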

### ⚠️ Limitations

While Ecoute provides real-time transcription and response suggestions, there are several known limitations to its functionality that you should be aware of:

**Default Mic and Speaker:** Ecoute is currently configured to listen only to the default microphone and speaker set in your system. It will not detect sound from other devices or systems. If you wish to use a different mic or speaker, you will need to set it as your default device in your system settings.

-**Whisper Model**: We utilize the 'tiny' version of the Whisper ASR model, due to its low resource consumption and fast response times. However, this model may not be as accurate as the larger models in transcribing certain types of speech, including accents or uncommon words.
+**Whisper Model**: If the --api flag is not used, we utilize the 'tiny' version of the Whisper ASR model, due to its low resource consumption and fast response times. However, this model may not be as accurate as the larger models in transcribing certain types of speech, including accents or uncommon words.
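For scale, 'tiny' is the smallest of the openai-whisper checkpoints (tiny, base, small, medium, large), at roughly 39M parameters. A minimal local-transcription sketch mirroring what WhisperTranscriber does in this diff, assuming a sample.wav on disk:

```python
import torch
import whisper

# Downloads the checkpoint on first use, then caches it locally.
model = whisper.load_model("tiny")

# fp16 only applies on CUDA; on CPU, whisper falls back to fp32.
result = model.transcribe("sample.wav", fp16=torch.cuda.is_available())
print(result["text"].strip())
```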

-**Language**: The Whisper model used in Ecoute is set to English. As a result, it may not accurately transcribe non-English languages or dialects. We are actively working to add multi-language support to future versions of the program.
+**Language**: If you are not using the --api flag, the Whisper model used in Ecoute is set to English. As a result, it may not accurately transcribe non-English languages or dialects. We are actively working to add multi-language support to future versions of the program.

## 📖 License

9 changes: 4 additions & 5 deletions TranscriberModels.py
@@ -19,16 +19,15 @@ def get_transcription(self, wav_file_path):
            result = self.audio_model.transcribe(wav_file_path, fp16=torch.cuda.is_available())
        except Exception as e:
            print(e)
+            return ''
        return result['text'].strip()

class APIWhisperTranscriber:
    def get_transcription(self, wav_file_path):
-        new_file_path = wav_file_path + '.wav'
-        os.rename(wav_file_path, new_file_path)
-        audio_file= open(new_file_path, "rb")
        try:
-            result = openai.Audio.translate("whisper-1", audio_file)
+            with open(wav_file_path, "rb") as audio_file:
+                result = openai.Audio.transcribe("whisper-1", audio_file)
        except Exception as e:
            print(e)
            return ''
        return result['text'].strip()
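The behavioral core of commit b95b162 is the endpoint swap: `openai.Audio.translate` always returns English text, while `openai.Audio.transcribe` keeps the speaker's language; the rewrite also drops the `os.rename` step and lets a context manager close the file handle. A short sketch of the two endpoints under openai==0.27.6, with sample.wav as a stand-in input:

```python
import openai

# transcribe: text comes back in whatever language was spoken
with open("sample.wav", "rb") as f:
    transcription = openai.Audio.transcribe("whisper-1", f)

# translate: text always comes back in English
with open("sample.wav", "rb") as f:
    translation = openai.Audio.translate("whisper-1", f)

print(transcription["text"], translation["text"])
```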
2 changes: 1 addition & 1 deletion custom_speech_recognition/__init__.py
@@ -107,7 +107,7 @@ def get_pyaudio():
    Imports the pyaudio module and checks its version. Throws exceptions if pyaudio can't be found or a wrong version is installed
    """
    try:
-        import pyaudiowpatch as pyaudio
+        import pyaudio
    except ImportError:
        raise AttributeError("Could not find PyAudio; check installation")
    from distutils.version import LooseVersion
26 changes: 15 additions & 11 deletions main.py
@@ -8,6 +8,7 @@
import torch
import sys
import TranscriberModels
+import subprocess

def write_in_textbox(textbox, text):
    textbox.delete("0.0", "end")
@@ -16,21 +17,18 @@ def write_in_textbox(textbox, text):
def update_transcript_UI(transcriber, textbox):
    transcript_string = transcriber.get_transcript()
    write_in_textbox(textbox, transcript_string)
-    textbox.after(300, update_transcript_UI, transcriber, textbox)
+    textbox.after(500, update_transcript_UI, transcriber, textbox)

def update_response_UI(responder, textbox, update_interval_slider_label, update_interval_slider, freeze_state):
    if not freeze_state[0]:
        response = responder.response

        textbox.configure(state="normal")
        write_in_textbox(textbox, response)
        textbox.configure(state="disabled")

        update_interval = int(update_interval_slider.get())
        responder.update_response_interval(update_interval)
        update_interval_slider_label.configure(text=f"Update interval: {update_interval} seconds")

-    textbox.after(300, update_response_UI, responder, textbox, update_interval_slider_label, update_interval_slider, freeze_state)
+    textbox.after(500, update_response_UI, responder, textbox, update_interval_slider_label, update_interval_slider, freeze_state)

def clear_context(transcriber, audio_queue):
    transcriber.clear_transcript_data()
@@ -65,6 +63,12 @@ def create_ui_components(root):
    return transcript_textbox, response_textbox, update_interval_slider, update_interval_slider_label, freeze_button

def main():
+    try:
+        subprocess.run(["ffmpeg", "-version"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    except FileNotFoundError:
+        print("ERROR: The ffmpeg library is not installed. Please install ffmpeg and try again.")
+        return
+
    root = ctk.CTk()
    transcript_textbox, response_textbox, update_interval_slider, update_interval_slider_label, freeze_button = create_ui_components(root)

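One note on this guard: `subprocess.run(["ffmpeg", "-version"])` raises FileNotFoundError only when no ffmpeg binary is on PATH, which is exactly the condition being tested. An equivalent check without spawning a process, shown here purely as an alternative design, would be:

```python
import shutil

# shutil.which returns None when no executable named ffmpeg is on PATH.
if shutil.which("ffmpeg") is None:
    print("ERROR: The ffmpeg library is not installed. Please install ffmpeg and try again.")
    raise SystemExit(1)
```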
@@ -99,23 +103,23 @@ def main():
    root.grid_columnconfigure(0, weight=2)
    root.grid_columnconfigure(1, weight=1)

-    # Add the clear transcript button to the UI
-    clear_transcript_button = ctk.CTkButton(root, text="Clear Transcript", command=lambda: clear_context(transcriber, audio_queue, ))
+    clear_transcript_button = ctk.CTkButton(root, text="Clear Transcript", command=lambda: clear_context(transcriber, audio_queue))
    clear_transcript_button.grid(row=1, column=0, padx=10, pady=3, sticky="nsew")

-    freeze_state = [False] # Using list to be able to change its content inside inner functions
+    freeze_state = [False]
    def freeze_unfreeze():
-        freeze_state[0] = not freeze_state[0] # Invert the freeze state
+        freeze_state[0] = not freeze_state[0]
        freeze_button.configure(text="Unfreeze" if freeze_state[0] else "Freeze")
+        response_textbox.configure(state="normal" if freeze_state[0] else "disabled")

    freeze_button.configure(command=freeze_unfreeze)

    update_interval_slider_label.configure(text=f"Update interval: {update_interval_slider.get()} seconds")

    update_transcript_UI(transcriber, transcript_textbox)
    update_response_UI(responder, response_textbox, update_interval_slider_label, update_interval_slider, freeze_state)

    root.mainloop()

if __name__ == "__main__":
-    main()
+    main()
4 changes: 2 additions & 2 deletions requirements.txt
@@ -3,6 +3,6 @@ openai-whisper==20230314
Wave==0.0.2
openai==0.27.6
customtkinter==5.1.3
-PyAudioWPatch==0.2.12.5
---extra-index-url https://download.pytorch.org/whl/cu117
+PyAudio # for Mac
+# --extra-index-url https://download.pytorch.org/whl/cu117
torch
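Two practical notes on these pins: building PyAudio on macOS needs the PortAudio headers first (commonly installed with `brew install portaudio`), and with the cu117 extra index commented out, `torch` resolves to the default PyPI wheels, which on macOS means CPU execution (and, on Apple Silicon with recent torch versions, MPS) rather than CUDA.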