Macos development #143

Status: Open

Wants to merge 31 commits into base branch: 29-add-option-to-use-speech-to-text-api-rather-than-transcribing-locally

Changes from all commits (31 commits):
0227cc7 Merge pull request #30 from SevaSk/29-add-option-to-use-speech-to-tex… (SevaSk, May 30, 2023)
d8a8f52 Update README.md (SevaSk, May 30, 2023)
acff327 return nothing on error in get_transcription (SevaSk, May 30, 2023)
66da757 Merge branch 'main' of https://github.com/SevaSk/ecoute (SevaSk, May 30, 2023)
143e1de Update README.md (SevaSk, May 30, 2023)
a0e163f Update README.md (SevaSk, May 30, 2023)
fdd761d Update README.md (SevaSk, May 30, 2023)
8abdb19 fixed potential temp disk memory leak (SevaSk, May 30, 2023)
46d37a3 Merge branch 'main' of https://github.com/SevaSk/ecoute (SevaSk, May 30, 2023)
1e96946 Update README.md (SevaSk, May 31, 2023)
09f7349 Update README.md (SevaSk, May 31, 2023)
a024716 Update README.md (SevaSk, May 31, 2023)
2166ae0 Update README.md (SevaSk, May 31, 2023)
418700a Update README.md (SevaSk, May 31, 2023)
10802c3 Update README.md (zarifpour, May 31, 2023)
cd441b4 Merge pull request #42 from zarifpour/patch-1 (SevaSk, May 31, 2023)
6a20d04 fixed files opening and closing improperly (SevaSk, May 31, 2023)
70f6675 Merge branch 'main' of https://github.com/SevaSk/ecoute (SevaSk, May 31, 2023)
b95b162 API now transcribes rather then translates. (SevaSk, Jun 1, 2023)
210e942 Update README.md (SevaSk, Jun 1, 2023)
39573cc Update README.md (SevaSk, Jun 1, 2023)
fb7c6e1 Update README.md (SevaSk, Jun 1, 2023)
92cf879 Update README.md (SevaSk, Jun 1, 2023)
811c111 catching exceptions when making temp_file (SevaSk, Jun 1, 2023)
1b958ad Merge branch 'main' of https://github.com/SevaSk/ecoute (SevaSk, Jun 1, 2023)
29389e9 catch source is none error (SevaSk, Jun 1, 2023)
54f98f8 file handling possible bug fix (SevaSk, Jun 2, 2023)
579fdc9 Catching no ffmpeg exception (SevaSk, Jun 3, 2023)
7264e73 Update README.md (SevaSk, Jun 7, 2023)
540bfb1 Used Pyaudion for Mac os (shosseini811, Aug 24, 2023)
162f3a6 Fixing The delay in the freeze and unfreeze operation (shosseini811, Aug 24, 2023)
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,4 +1,5 @@
__pycache__/
*.wav
keys.py
-.venv/
+.venv/
+.DS_Store
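(.DS_Store is the Finder metadata file that macOS drops into any directory browsed with Finder; ignoring it is a standard companion change to a macOS port.)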
38 changes: 20 additions & 18 deletions AudioRecorder.py
@@ -1,5 +1,5 @@
import custom_speech_recognition as sr
-import pyaudiowpatch as pyaudio
+import pyaudio
from datetime import datetime

RECORD_TIMEOUT = 3
@@ -11,6 +11,10 @@ def __init__(self, source, source_name):
        self.recorder = sr.Recognizer()
        self.recorder.energy_threshold = ENERGY_THRESHOLD
        self.recorder.dynamic_energy_threshold = DYNAMIC_ENERGY_THRESHOLD
+
+        if source is None:
+            raise ValueError("audio source can't be None")
+
        self.source = source
        self.source_name = source_name
@@ -34,22 +38,20 @@ def __init__(self):

class DefaultSpeakerRecorder(BaseRecorder):
    def __init__(self):
-        with pyaudio.PyAudio() as p:
-            wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI)
-            default_speakers = p.get_device_info_by_index(wasapi_info["defaultOutputDevice"])
-
-            if not default_speakers["isLoopbackDevice"]:
-                for loopback in p.get_loopback_device_info_generator():
-                    if default_speakers["name"] in loopback["name"]:
-                        default_speakers = loopback
-                        break
-                else:
-                    print("[ERROR] No loopback device found.")
+        p = pyaudio.PyAudio()
+
+        # Get default input device information
+        default_input_device_info = p.get_default_input_device_info()

-        source = sr.Microphone(speaker=True,
-                               device_index= default_speakers["index"],
-                               sample_rate=int(default_speakers["defaultSampleRate"]),
-                               chunk_size=pyaudio.get_sample_size(pyaudio.paInt16),
-                               channels=default_speakers["maxInputChannels"])
+        source = sr.Microphone(
+            device_index=default_input_device_info["index"],
+            sample_rate=int(default_input_device_info["defaultSampleRate"]),
+            chunk_size=pyaudio.get_sample_size(pyaudio.paInt16),
+            channels=default_input_device_info["maxInputChannels"]
+        )
        super().__init__(source=source, source_name="Speaker")
-        self.adjust_for_noise("Default Speaker", "Please make or play some noise from the Default Speaker...")
+        self.adjust_for_noise("Default Speaker", "Please make or play some noise from the Default Speaker...")
+
+        # Don't forget to close PyAudio object when you're done
+        p.terminate()

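Context for this rewrite: pyaudiowpatch exposes WASAPI loopback devices, which exist only on Windows, so the macOS port falls back to plain PyAudio and its default input device. A minimal sketch for checking which device that will be on a given machine (standard PyAudio calls; this helper script is illustrative, not part of the PR):

```python
import pyaudio

# Print the default input device and enumerate all devices PyAudio can see.
p = pyaudio.PyAudio()
try:
    default = p.get_default_input_device_info()
    print(f"default input: [{default['index']}] {default['name']}"
          f" @ {int(default['defaultSampleRate'])} Hz")
    for i in range(p.get_device_count()):
        info = p.get_device_info_by_index(i)
        print(f"[{i}] {info['name']} (input channels: {info['maxInputChannels']})")
finally:
    p.terminate()
```

Note the trade-off: the "Speaker" source now records from the default input device, so capturing actual speaker output on macOS still requires a loopback driver (for example BlackHole) set as that input; this diff does not configure one.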
29 changes: 17 additions & 12 deletions AudioTranscriber.py
@@ -3,11 +3,11 @@
import wave
import os
import threading
-from tempfile import NamedTemporaryFile
+import tempfile
import custom_speech_recognition as sr
import io
from datetime import timedelta
-import pyaudiowpatch as pyaudio
+import pyaudio
from heapq import merge

PHRASE_TIMEOUT = 3.05
@@ -45,8 +45,17 @@ def transcribe_audio_queue(self, audio_queue):
            who_spoke, data, time_spoken = audio_queue.get()
            self.update_last_sample_and_phrase_status(who_spoke, data, time_spoken)
            source_info = self.audio_sources[who_spoke]
-            temp_file = source_info["process_data_func"](source_info["last_sample"])
-            text = self.audio_model.get_transcription(temp_file)
+
+            text = ''
+            try:
+                fd, path = tempfile.mkstemp(suffix=".wav")
+                os.close(fd)
+                source_info["process_data_func"](source_info["last_sample"], path)
+                text = self.audio_model.get_transcription(path)
+            except Exception as e:
+                print(e)
+            finally:
+                os.unlink(path)

            if text != '' and text.lower() != 'you':
                self.update_transcript(who_spoke, text, time_spoken)
@@ -63,23 +72,19 @@ def update_last_sample_and_phrase_status(self, who_spoke, data, time_spoken):
            source_info["last_sample"] += data
        source_info["last_spoken"] = time_spoken

-    def process_mic_data(self, data):
-        temp_file = NamedTemporaryFile().name
+    def process_mic_data(self, data, temp_file_name):
        audio_data = sr.AudioData(data, self.audio_sources["You"]["sample_rate"], self.audio_sources["You"]["sample_width"])
        wav_data = io.BytesIO(audio_data.get_wav_data())
-        with open(temp_file, 'w+b') as f:
+        with open(temp_file_name, 'w+b') as f:
            f.write(wav_data.read())
-        return temp_file

-    def process_speaker_data(self, data):
-        temp_file = NamedTemporaryFile().name
-        with wave.open(temp_file, 'wb') as wf:
+    def process_speaker_data(self, data, temp_file_name):
+        with wave.open(temp_file_name, 'wb') as wf:
            wf.setnchannels(self.audio_sources["Speaker"]["channels"])
            p = pyaudio.PyAudio()
            wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
            wf.setframerate(self.audio_sources["Speaker"]["sample_rate"])
            wf.writeframes(data)
-        return temp_file

    def update_transcript(self, who_spoke, text, time_spoken):
        source_info = self.audio_sources[who_spoke]
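The switch from `NamedTemporaryFile().name` to `tempfile.mkstemp` underpins the leak fix from commit 8abdb19: `mkstemp` actually creates the file and returns an open descriptor, the descriptor is closed immediately so the path can be reopened by name, and the `finally` block guarantees removal even when transcription throws. A stdlib-only sketch of the same lifecycle (`write_audio` and `transcribe` are placeholder callables):

```python
import os
import tempfile

def with_temp_wav(write_audio, transcribe):
    # Create the file and close the OS-level handle right away so the
    # callables below can reopen the path by name (also works on Windows).
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        write_audio(path)        # e.g. process_mic_data(data, path)
        return transcribe(path)  # e.g. audio_model.get_transcription(path)
    finally:
        os.unlink(path)          # delete the file even if transcription fails
```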
27 changes: 17 additions & 10 deletions README.md
@@ -15,7 +15,7 @@ Follow these steps to set up and run Ecoute on your local machine.

### 📋 Prerequisites

-- Python 3.x
+- Python >=3.8.0
- An OpenAI API key
- Windows OS (Not tested on others)
- FFmpeg
@@ -28,7 +28,7 @@ Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManage
```
Once Chocolatey is installed, you can install FFmpeg by running the following command in your PowerShell:
```
-choco install ffmpeg-full
+choco install ffmpeg
```
Please ensure that you run these commands in a PowerShell window with administrator privileges. If you face any issues during the installation, you can visit the official Chocolatey and FFmpeg websites for troubleshooting.

@@ -54,11 +54,18 @@ Please ensure that you run these commands in a PowerShell window with administra

4. Create a `keys.py` file in the ecoute directory and add your OpenAI API key:

-```
-echo 'OPENAI_API_KEY = "API KEY"' > keys.py
-```
-
-Replace `API KEY` with your actual OpenAI API key.
+- Option 1: Use a command in your command prompt. Run the following command, replacing "API KEY" with your actual OpenAI API key:
+
+```
+python -c "with open('keys.py', 'w', encoding='utf-8') as f: f.write('OPENAI_API_KEY=\"API KEY\"')"
+```
+
+- Option 2: Create the keys.py file manually. Open your text editor of choice and enter the following content:
+
+```
+OPENAI_API_KEY="API KEY"
+```
+Replace "API KEY" with your actual OpenAI API key. Save this file as keys.py within the ecoute directory.
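Either option produces the same plain Python module. For reference, a minimal sketch of the consuming side, assuming ecoute imports the key in the usual way (the actual import site is not shown in this diff):

```python
import openai
from keys import OPENAI_API_KEY  # the file created in step 4

# openai==0.27.x reads the key from this module-level attribute.
openai.api_key = OPENAI_API_KEY
```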

### 🎬 Running Ecoute

@@ -68,25 +75,25 @@ Run the main script:
python main.py
```

-For a better and faster version, use:
+For a better and faster version that also works with most languages, use:

```
python main.py --api
```

Upon initiation, Ecoute will begin transcribing your microphone input and speaker output in real-time, generating a suggested response based on the conversation. Please note that it might take a few seconds for the system to warm up before the transcription becomes real-time.

-The --api flag significantly enhances transcription speed and accuracy, and it's expected to be the default option in future releases. However, keep in mind that using the Whisper API will consume more OpenAI credits than using the local model. This increased cost is attributed to the advanced features and capabilities that the Whisper API provides. Despite the additional cost, the considerable improvements in speed and transcription accuracy might make it a worthwhile investment for your use case.
+The --api flag uses the Whisper API for transcription. This significantly improves transcription speed and accuracy, and it works in most languages (rather than English only without the flag). It is expected to become the default option in future releases. However, keep in mind that the Whisper API consumes more OpenAI credits than the local model, a cost that reflects the advanced capabilities the API provides. Despite the additional expense, the substantial improvements in speed and accuracy may make it a worthwhile investment for your use case.
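For orientation, the flag effectively selects between the two transcriber classes in TranscriberModels.py below. A sketch of that dispatch (illustrative wiring only, since main.py's actual model construction is not part of this excerpt):

```python
import sys
import TranscriberModels

# Illustrative: choose the API-backed transcriber when --api is passed.
if "--api" in sys.argv:
    model = TranscriberModels.APIWhisperTranscriber()
else:
    model = TranscriberModels.WhisperTranscriber()  # local 'tiny' Whisper

text = model.get_transcription("sample.wav")  # hypothetical input file
```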

### ⚠️ Limitations

While Ecoute provides real-time transcription and response suggestions, there are several known limitations to its functionality that you should be aware of:

**Default Mic and Speaker:** Ecoute is currently configured to listen only to the default microphone and speaker set in your system. It will not detect sound from other devices or systems. If you wish to use a different mic or speaker, you will need to set it as your default device in your system settings.

-**Whisper Model**: We utilize the 'tiny' version of the Whisper ASR model, due to its low resource consumption and fast response times. However, this model may not be as accurate as the larger models in transcribing certain types of speech, including accents or uncommon words.
+**Whisper Model**: If the --api flag is not used, we utilize the 'tiny' version of the Whisper ASR model, due to its low resource consumption and fast response times. However, this model may not be as accurate as the larger models in transcribing certain types of speech, including accents or uncommon words.
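For scale, 'tiny' is the smallest of the openai-whisper checkpoints (tiny, base, small, medium, large), at roughly 39M parameters. A minimal local-transcription sketch mirroring what WhisperTranscriber does in this diff, assuming a sample.wav on disk:

```python
import torch
import whisper

# Downloads the checkpoint on first use, then caches it locally.
model = whisper.load_model("tiny")

# fp16 only applies on CUDA; on CPU, whisper falls back to fp32.
result = model.transcribe("sample.wav", fp16=torch.cuda.is_available())
print(result["text"].strip())
```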

-**Language**: The Whisper model used in Ecoute is set to English. As a result, it may not accurately transcribe non-English languages or dialects. We are actively working to add multi-language support to future versions of the program.
+**Language**: If you are not using the --api flag, the Whisper model used in Ecoute is set to English. As a result, it may not accurately transcribe non-English languages or dialects. We are actively working to add multi-language support to future versions of the program.

## 📖 License

9 changes: 4 additions & 5 deletions TranscriberModels.py
@@ -19,16 +19,15 @@ def get_transcription(self, wav_file_path):
            result = self.audio_model.transcribe(wav_file_path, fp16=torch.cuda.is_available())
        except Exception as e:
            print(e)
+            return ''
        return result['text'].strip()

class APIWhisperTranscriber:
    def get_transcription(self, wav_file_path):
-        new_file_path = wav_file_path + '.wav'
-        os.rename(wav_file_path, new_file_path)
-        audio_file= open(new_file_path, "rb")
        try:
-            result = openai.Audio.translate("whisper-1", audio_file)
+            with open(wav_file_path, "rb") as audio_file:
+                result = openai.Audio.transcribe("whisper-1", audio_file)
        except Exception as e:
            print(e)
            return ''
        return result['text'].strip()
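The behavioral core of commit b95b162 is the endpoint swap: `openai.Audio.translate` always returns English text, while `openai.Audio.transcribe` keeps the speaker's language; the rewrite also drops the `os.rename` step and lets a context manager close the file handle. A short sketch of the two endpoints under openai==0.27.6, with sample.wav as a stand-in input:

```python
import openai

# transcribe: text comes back in whatever language was spoken
with open("sample.wav", "rb") as f:
    transcription = openai.Audio.transcribe("whisper-1", f)

# translate: text always comes back in English
with open("sample.wav", "rb") as f:
    translation = openai.Audio.translate("whisper-1", f)

print(transcription["text"], translation["text"])
```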
2 changes: 1 addition & 1 deletion custom_speech_recognition/__init__.py
@@ -107,7 +107,7 @@ def get_pyaudio():
    Imports the pyaudio module and checks its version. Throws exceptions if pyaudio can't be found or a wrong version is installed
    """
    try:
-        import pyaudiowpatch as pyaudio
+        import pyaudio
    except ImportError:
        raise AttributeError("Could not find PyAudio; check installation")
    from distutils.version import LooseVersion
26 changes: 15 additions & 11 deletions main.py
@@ -8,6 +8,7 @@
import torch
import sys
import TranscriberModels
+import subprocess

def write_in_textbox(textbox, text):
    textbox.delete("0.0", "end")
@@ -16,21 +17,18 @@ def write_in_textbox(textbox, text):
def update_transcript_UI(transcriber, textbox):
    transcript_string = transcriber.get_transcript()
    write_in_textbox(textbox, transcript_string)
-    textbox.after(300, update_transcript_UI, transcriber, textbox)
+    textbox.after(500, update_transcript_UI, transcriber, textbox)

def update_response_UI(responder, textbox, update_interval_slider_label, update_interval_slider, freeze_state):
    if not freeze_state[0]:
        response = responder.response

        textbox.configure(state="normal")
        write_in_textbox(textbox, response)
        textbox.configure(state="disabled")

        update_interval = int(update_interval_slider.get())
        responder.update_response_interval(update_interval)
        update_interval_slider_label.configure(text=f"Update interval: {update_interval} seconds")

-    textbox.after(300, update_response_UI, responder, textbox, update_interval_slider_label, update_interval_slider, freeze_state)
+    textbox.after(500, update_response_UI, responder, textbox, update_interval_slider_label, update_interval_slider, freeze_state)

def clear_context(transcriber, audio_queue):
    transcriber.clear_transcript_data()
@@ -65,6 +63,12 @@ def create_ui_components(root):
    return transcript_textbox, response_textbox, update_interval_slider, update_interval_slider_label, freeze_button

def main():
+    try:
+        subprocess.run(["ffmpeg", "-version"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    except FileNotFoundError:
+        print("ERROR: The ffmpeg library is not installed. Please install ffmpeg and try again.")
+        return
+
    root = ctk.CTk()
    transcript_textbox, response_textbox, update_interval_slider, update_interval_slider_label, freeze_button = create_ui_components(root)

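One note on this guard: `subprocess.run(["ffmpeg", "-version"])` raises FileNotFoundError only when no ffmpeg binary is on PATH, which is exactly the condition being tested. An equivalent check without spawning a process, shown here purely as an alternative design, would be:

```python
import shutil

# shutil.which returns None when no executable named ffmpeg is on PATH.
if shutil.which("ffmpeg") is None:
    print("ERROR: The ffmpeg library is not installed. Please install ffmpeg and try again.")
    raise SystemExit(1)
```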
@@ -99,23 +103,23 @@ def main():
    root.grid_columnconfigure(0, weight=2)
    root.grid_columnconfigure(1, weight=1)

-    # Add the clear transcript button to the UI
-    clear_transcript_button = ctk.CTkButton(root, text="Clear Transcript", command=lambda: clear_context(transcriber, audio_queue, ))
+    clear_transcript_button = ctk.CTkButton(root, text="Clear Transcript", command=lambda: clear_context(transcriber, audio_queue))
    clear_transcript_button.grid(row=1, column=0, padx=10, pady=3, sticky="nsew")

-    freeze_state = [False] # Using list to be able to change its content inside inner functions
+    freeze_state = [False]
    def freeze_unfreeze():
-        freeze_state[0] = not freeze_state[0] # Invert the freeze state
+        freeze_state[0] = not freeze_state[0]
        freeze_button.configure(text="Unfreeze" if freeze_state[0] else "Freeze")
+        response_textbox.configure(state="normal" if freeze_state[0] else "disabled")

    freeze_button.configure(command=freeze_unfreeze)

    update_interval_slider_label.configure(text=f"Update interval: {update_interval_slider.get()} seconds")

    update_transcript_UI(transcriber, transcript_textbox)
    update_response_UI(responder, response_textbox, update_interval_slider_label, update_interval_slider, freeze_state)

    root.mainloop()

if __name__ == "__main__":
-    main()
+    main()
4 changes: 2 additions & 2 deletions requirements.txt
@@ -3,6 +3,6 @@ openai-whisper==20230314
Wave==0.0.2
openai==0.27.6
customtkinter==5.1.3
-PyAudioWPatch==0.2.12.5
---extra-index-url https://download.pytorch.org/whl/cu117
+PyAudio # for Mac
+# --extra-index-url https://download.pytorch.org/whl/cu117
torch
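Two practical notes on these pins: building PyAudio on macOS needs the PortAudio headers first (commonly installed with `brew install portaudio`), and with the cu117 extra index commented out, `torch` resolves to the default PyPI wheels, which on macOS means CPU execution (and, on Apple Silicon with recent torch versions, MPS) rather than CUDA.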