diff --git a/.gitignore b/.gitignore
index ca57615..6a3394b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,6 @@ test.py
 VOICEVOX/*
 *.wav
 models--staka--fugumt-en-ja/
+*.exe
+*.wav
+!lore.txt
\ No newline at end of file
diff --git a/STTSLocal.py b/STTSLocal.py
index c09b364..d42e316 100644
--- a/STTSLocal.py
+++ b/STTSLocal.py
@@ -11,22 +11,35 @@ import dict
 import translator
 from voicevox import vboxclient
-
+from timer import Timer
+import whisper

 VOICE_VOX_URL_HIGH_SPEED = "https://api.su-shiki.com/v2/voicevox/audio/"
 VOICE_VOX_URL_LOW_SPEED = "https://api.tts.quest/v1/voicevox/"
 VOICE_VOX_URL_LOCAL = "127.0.0.1"
+VOICE_OUTPUT_FILENAME = "audioResponse.wav"
+
 use_cloud_voice_vox = False
 voice_vox_api_key = ''
 speakersResponse = None
 vboxapp = None
 speaker_id = 1
 mic_mode = 'open mic'
-PUSH_TO_TALK_OUTPUT_FILENAME = "PUSH_TO_TALK_OUTPUT_FILE.wav"
+MIC_OUTPUT_FILENAME = "PUSH_TO_TALK_OUTPUT_FILE.wav"
 PUSH_TO_RECORD_KEY = '5'
 whisper_filter_list = ['you', 'thank you.', 'thanks for watching.']
+pipeline_elapsed_time = 0
+TTS_pipeline_start_time = 0
+pipeline_timer = Timer()
+step_timer = Timer()
+model = None
+
+
+def initialize_model():
+    global model
+    model = whisper.load_model("base")


 def start_voicevox_server():
@@ -104,7 +117,8 @@ def stop_record_auto():
     log_message("Recording Stopped")


-def sendTextToSyntheizer(text, speaker_id, api_key=''):
+def cloud_synthesize(text, speaker_id, api_key=''):
+    global pipeline_elapsed_time
     url = ''
     if (api_key == ''):
         print('No api key detected, sending request to low speed server.')
@@ -116,9 +130,9 @@ def sendTextToSyntheizer(text, speaker_id, api_key=''):
     print(f"Sending POST request to: {url}")
     response = requests.request(
         "POST", url)
-    log_message("Speech synthesized for text [{}]".format(text))
     print(f'response: {response}')
     # print(f'response.content: {response.content}')
+    wav_bytes = None
     if (api_key == ''):
         response_json = response.json()
         # print(response_json)
@@ -132,35 +146,34 @@ def sendTextToSyntheizer(text, speaker_id, api_key=''):
             return
         print(f"Downloading wav response from {wav_url}")
         wav_bytes = requests.get(wav_url).content
-        try:
-            PlayAudio(wav_bytes)
-        except:
-            print("Failed to play wav.")
-            print(wav_bytes)
     else:
-        PlayAudio(response.content)
+        wav_bytes = response.content
+
+    with open(VOICE_OUTPUT_FILENAME, "wb") as file:
+        file.write(wav_bytes)


 def syntheize_audio(text, speaker_id):
     global use_cloud_voice_vox
     global voice_vox_api_key
     if (use_cloud_voice_vox):
-        sendTextToSyntheizer(text, speaker_id, api_key=voice_vox_api_key)
+        cloud_synthesize(text, speaker_id, api_key=voice_vox_api_key)
     else:
-        play_audio_from_local_syntheizer(text, speaker_id)
+        local_synthesize(text, speaker_id)


-def play_audio_from_local_syntheizer(text, speaker_id):
-    vboxapp.run(text=text, speaker=speaker_id,
-                filename="audioResponse.wav")  # text and filename can be changed as you like
-    voiceLine = AudioSegment.from_wav("audioResponse.wav")
-    play(voiceLine)
+def local_synthesize(text, speaker_id):
+    VoiceTextResponse = requests.request(
+        "POST", f"http://127.0.0.1:50021/audio_query?text={text}&speaker={speaker_id}")
+    AudioResponse = requests.request(
+        "POST", f"http://127.0.0.1:50021/synthesis?speaker={speaker_id}", data=VoiceTextResponse)
+    with open(VOICE_OUTPUT_FILENAME, "wb") as file:
+        file.write(AudioResponse.content)


-def PlayAudio(audioBytes):
-    with open("audioResponse.wav", "wb") as file:
-        file.write(audioBytes)
-    voiceLine = AudioSegment.from_wav("audioResponse.wav")
+
+def PlayAudio():
+    voiceLine = AudioSegment.from_wav(VOICE_OUTPUT_FILENAME)
     play(voiceLine)


@@ -202,7 +215,7 @@ def push_to_talk():
                 channels=CHANNELS
             )

-            audio_segment.export(PUSH_TO_TALK_OUTPUT_FILENAME, format="wav")
+            audio_segment.export(MIC_OUTPUT_FILENAME, format="wav")
             break

@@ -213,6 +226,9 @@ def start_STTS_loop():


 def start_STTS_pipeline():
+    global pipeline_elapsed_time
+    global step_timer
+    global pipeline_timer
     global mic_mode
     audio = None
     if (mic_mode == 'open mic'):
@@ -229,18 +245,30 @@ def start_STTS_pipeline():
         if not auto_recording:
             return

-        # send audio to whisper
-        global input_language_name
-        input_text = ''
+        with open(MIC_OUTPUT_FILENAME, "wb") as file:
+            file.write(audio.get_wav_data())
+        log_message("recording complete, sending to whisper")
     elif (mic_mode == 'push to talk'):
         push_to_talk()
-        r = sr.Recognizer()
-        with sr.AudioFile(PUSH_TO_TALK_OUTPUT_FILENAME) as source:
-            audio = r.record(source)
+
+    # send audio to whisper
+    pipeline_timer.start()
+    step_timer.start()
+    input_text = ''
     try:
-        input_text = r.recognize_whisper(
-            audio, language=input_language_name.lower())
+        global model
+        if (model == None):
+            initialize_model()
+        global input_language_name
+        print(input_language_name)
+        audio = whisper.load_audio(MIC_OUTPUT_FILENAME)
+        audio = whisper.pad_or_trim(audio)
+        mel = whisper.log_mel_spectrogram(audio).to(model.device)
+        options = whisper.DecodingOptions(
+            language=input_language_name.lower(), without_timestamps=True, fp16=False if model.device == 'cpu' else None)
+        result = whisper.decode(model, mel, options)
+        input_text = result.text
     except sr.UnknownValueError:
         log_message("Whisper could not understand audio")
     except sr.RequestError as e:
@@ -248,34 +276,46 @@ def start_STTS_pipeline():
     global whisper_filter_list
     if (input_text == ''):
         return
-    log_message(f'Input: {input_text}')
+    log_message(f'Input: {input_text} ({step_timer.end()}s)')
+    print(f'looking for {input_text.strip().lower()} in {whisper_filter_list}')
     if (input_text.strip().lower() in whisper_filter_list):
         log_message(f'Input {input_text} was filtered.')
         return
     with open("Input.txt", "w", encoding="utf-8") as file:
         file.write(input_text)
+    pipeline_elapsed_time += pipeline_timer.end()
     start_TTS_pipeline(input_text)


 def start_TTS_pipeline(input_text):
     global voice_name
     global speaker_id
+    global pipeline_elapsed_time
+    pipeline_timer.start()
     inputLanguage = language_dict[input_language_name][:2]
     outputLanguage = 'ja'
     # print(f"inputLanguage: {inputLanguage}, outputLanguage: {outputLanguage}")
     translate = inputLanguage != outputLanguage
     if (translate):
+        step_timer.start()
         input_processed_text = translator.translate(
             input_text, inputLanguage, outputLanguage)
-        log_message(f'Translation: {input_processed_text}')
+        log_message(
+            f'Translation: {input_processed_text} ({step_timer.end()}s)')
     else:
         input_processed_text = input_text
     with open("translation.txt", "w", encoding="utf-8") as file:
         file.write(input_processed_text)

+    step_timer.start()
     syntheize_audio(
         input_processed_text, speaker_id)
+    log_message(
+        f"Speech synthesized for text [{input_processed_text}] ({step_timer.end()}s)")
+    log_message(
+        f'Total time: ({round(pipeline_elapsed_time + pipeline_timer.end(), 2)}s)')
+    PlayAudio()

     global last_input_text
     last_input_text = input_text
@@ -283,6 +323,7 @@ def start_TTS_pipeline(input_text):
     last_input_language = inputLanguage
     global last_voice_param
     last_voice_param = speaker_id
+    pipeline_elapsed_time = 0


 def playOriginal():
diff --git a/UI.py b/UI.py
index 3b1f2c6..bb37e8b 100644
--- a/UI.py
+++ b/UI.py
@@ -11,6 +11,7 @@ import time
 import subLocal as SUB
 import translator
+import chatbot


 class Pages(Enum):
@@ -18,6 +19,7 @@ class Pages(Enum):
     TEXT_INPUT = 1
     SETTINGS = 2
     SUBTITLE = 3
+    CHAT = 4


 current_page = Pages.AUDIO_INPUT
@@ -66,6 +68,18 @@ def __init__(self, master, **kwargs):
                                            )
         subtitles_button.pack(anchor="s")

+        chat_button = customtkinter.CTkButton(master=self,
+                                              width=120,
+                                              height=32,
+                                              border_width=0,
+                                              corner_radius=0,
+                                              text="Chat",
+                                              command=lambda: self.change_page(
+                                                  Pages.CHAT),
+                                              fg_color='grey'
+                                              )
+        chat_button.pack(anchor="s")
+
         button = customtkinter.CTkButton(master=self,
                                          width=120,
                                          height=32,
@@ -163,6 +177,100 @@ def log_message_on_console(self, message_text):
         self.textbox.configure(state="disabled")


+class ChatFrame(customtkinter.CTkFrame):
+    def __init__(self, master, **kwargs):
+        super().__init__(master, **kwargs)
+        self.isRecording = False
+        self.thread = Thread(target=STTS.start_record_auto)
+        # add widgets onto the frame...
+        self.textbox = customtkinter.CTkTextbox(self, width=400, height=400)
+        self.textbox.grid(row=0, column=0, rowspan=4, columnspan=4)
+        # configure textbox to be read-only
+        self.textbox.configure(state="disabled")
+        chatbot.logging_eventhandlers.append(self.log_message_on_console)
+
+        self.user_input_var = customtkinter.StringVar(self, '')
+        self.voicevox_api_key_input = customtkinter.CTkEntry(
+            master=self, textvariable=self.user_input_var, width=300)
+        self.voicevox_api_key_input.grid(
+            row=4, column=0, padx=10, pady=10, sticky='W', columnspan=3)
+        self.send_button = customtkinter.CTkButton(master=self,
+                                                   width=32,
+                                                   height=32,
+                                                   border_width=0,
+                                                   corner_radius=8,
+                                                   text="send",
+                                                   command=self.send_user_input,
+                                                   fg_color='grey'
+                                                   )
+        self.send_button.grid(row=4, column=3, pady=10)
+        # self.recordButton = customtkinter.CTkButton(master=self,
+        #                                             width=120,
+        #                                             height=32,
+        #                                             border_width=0,
+        #                                             corner_radius=8,
+        #                                             text="Start Recording",
+        #                                             command=self.recordButton_callback,
+        #                                             fg_color='grey'
+        #                                             )
+        # self.recordButton.grid(row=3, column=0, pady=10)
+
+        # self.playOriginalButton = customtkinter.CTkButton(master=self,
+        #                                                   width=120,
+        #                                                   height=32,
+        #                                                   border_width=0,
+        #                                                   corner_radius=8,
+        #                                                   text="Play original",
+        #                                                   command=self.play_original_callback,
+        #                                                   fg_color='grey'
+        #                                                   )
+        # self.playOriginalButton.grid(row=3, column=1, pady=10)
+
+        # self.clearConsoleButton = customtkinter.CTkButton(master=self,
+        #                                                   width=32,
+        #                                                   height=32,
+        #                                                   border_width=0,
+        #                                                   corner_radius=8,
+        #                                                   text="X",
+        #                                                   command=self.clear_console,
+        #                                                   fg_color='grey'
+        #                                                   )
+        # self.clearConsoleButton.grid(row=3, column=2, padx=10, pady=10)
+
+    # def clear_console(self):
+    #     self.textbox.configure(state="normal")
+    #     self.textbox.delete('1.0', customtkinter.END)
+    #     self.textbox.configure(state="disabled")
+
+    # def recordButton_callback(self):
+    #     if (self.isRecording):
+    #         self.recordButton.configure(
+    #             text="Start Recording", fg_color='grey')
+    #         self.isRecording = False
+    #         STTS.stop_record_auto()
+    #     else:
+    #         self.recordButton.configure(
+    #             text="Stop Recording", fg_color='#fc7b5b')
+    #         self.isRecording = True
+    #         STTS.start_record_auto()
+    #     self.recordButton.grid(row=3, column=0, pady=10)
+
+    # def play_original_callback(self):
+    #     thread = Thread(target=STTS.playOriginal())
+    #     thread.start()
+    def send_user_input(self):
+        text = self.user_input_var.get()
+        self.user_input_var.set('')
+        thread = Thread(target=chatbot.send_user_input, args=[text,])
+        thread.start()
+
+    def log_message_on_console(self, message_text):
+        # insert at line 0 character 0
+        self.textbox.configure(state="normal")
+        self.textbox.insert(customtkinter.INSERT, message_text+'\n')
+        self.textbox.configure(state="disabled")
+
+
 class TextBoxFrame(customtkinter.CTkFrame):

     def __init__(self, master, **kwargs):
@@ -409,7 +517,7 @@ def __init__(self):


 class OptionsFrame(customtkinter.CTkFrame):
-    def __init__(self, master, **kwargs):
+    def __init__(self, master, enable_micmeter=True, **kwargs):
         super().__init__(master, **kwargs)
         self.speaker_names = STTS.get_speaker_names()
         self.default_speaker = self.speaker_names[0]
@@ -458,13 +566,15 @@ def __init__(self, master, **kwargs):
                                                        variable=self.style_combobox_var)
         self.style_combobox.pack(padx=20, pady=0)

-        label_mic = customtkinter.CTkLabel(
-            master=self, text='Mic activity: ')
-        label_mic.pack(padx=20, pady=10)
-        self.progressbar = customtkinter.CTkProgressBar(master=self, width=100)
-        self.progressbar.pack(padx=20, pady=0)
-        thread = Thread(target=self.update_mic_meter)
-        thread.start()
+        if (enable_micmeter):
+            label_mic = customtkinter.CTkLabel(
+                master=self, text='Mic activity: ')
+            label_mic.pack(padx=20, pady=10)
+            self.progressbar = customtkinter.CTkProgressBar(
+                master=self, width=100)
+            self.progressbar.pack(padx=20, pady=0)
+            thread = Thread(target=self.update_mic_meter)
+            thread.start()

     def update_mic_meter(self):
         global audio_level
@@ -529,6 +639,17 @@ def __init__(self, *args, **kwargs):
         subtitles_frame.pack(padx=0, pady=0)


+class ChatPage(Page):
+    def __init__(self, *args, **kwargs):
+        Page.__init__(self, *args, **kwargs)
+        chat_frame = ChatFrame(
+            master=self, width=500, corner_radius=8)
+        chat_frame.grid(row=0, column=1, padx=20, pady=20,
+                        sticky="nswe")
+        options = OptionsFrame(master=self, enable_micmeter=False)
+        options.grid(row=0, column=2, padx=20, pady=20, sticky="nswe")
+
+
 class SettingsPage(Page):
     def __init__(self, *args, **kwargs):
         Page.__init__(self, *args, **kwargs)
@@ -634,6 +755,7 @@ def __init__(self):
         textInputPage = TextInputPage(self)
         settingsPage = SettingsPage(self)
         subtitlesPage = SubtitlesPage(self)
+        chatPage = ChatPage(self)
         container = customtkinter.CTkFrame(
             self, width=700, height=700, bg_color='#fafafa')
         container.grid(row=0, column=1, padx=20, pady=20, sticky="nswe")
@@ -641,6 +763,7 @@ def __init__(self):
         audioInputPage.place(in_=container, x=0, y=0)
         textInputPage.place(in_=container, x=0, y=0)
         subtitlesPage.place(in_=container, x=0, y=0)
+        chatPage.place(in_=container, x=0, y=0)
         settingsPage.place(in_=container, x=0, y=0)

         audioInputPage.show()
@@ -660,6 +783,9 @@ def showPage():
             elif (current_page == Pages.SETTINGS):
                 container.lift()
                 settingsPage.show()
+            elif (current_page == Pages.CHAT):
+                container.lift()
+                chatPage.show()

         pageChange_eventhandlers.append(showPage)
@@ -681,8 +807,12 @@ def listen_to_mic():
 thread = Thread(target=listen_to_mic)
 thread.start()

+print("Starting voicevox server...")
 STTS.start_voicevox_server()
-
+print("Initializing speech-to-text model...")
+STTS.initialize_model()
+print("Initializing translator...")
+translator.initialize()
 app = App()
 app.configure(background='#fafafa')
 app.mainloop()
diff --git a/audioResponse.wav b/audioResponse.wav
index 0f19c4c..d410187 100644
Binary files a/audioResponse.wav and b/audioResponse.wav differ
diff --git a/chatbot.py b/chatbot.py
new file mode 100644
index 0000000..46e3222
--- /dev/null
+++ b/chatbot.py
@@ -0,0 +1,54 @@
+import os
+import traceback
+import openai
+import STTSLocal as STTS
+
+lore = ''
+try:
+    with open('./lore.txt', 'r', encoding='utf-8') as file:
+        lore = file.read()
+except Exception:
+    print("error when reading lore.txt")
+    print(traceback.format_exc())
+lore = lore.replace('\n', '')
+
+message_log = [
+    {"role": "system", "content": lore},
+    {"role": "user", "content": lore},
+]
+
+logging_eventhandlers = []
+
+
+def send_user_input(user_input):
+    log_message(f'user: {user_input}')
+    global message_log
+    api_key = os.getenv("OPENAI_API_KEY")
+
+    openai.api_key = api_key
+    print(f"Sending: {user_input} with api key :{api_key}")
+    print(message_log)
+    message_log.append({"role": "user", "content": user_input})
+    response = None
+    try:
+        response = openai.ChatCompletion.create(
+            model="gpt-3.5-turbo",
+            messages=message_log
+        )
+    except Exception:
+        log_message("Error when loading api key from environment variable")
+        log_message(
+            "You need an API key from https://platform.openai.com/ stored in an environment variable with name \"OPENAI_API_KEY\" to use the chat feature")
+        print(traceback.format_exc())
+        return
+    text_response = response['choices'][0]['message']['content']
+    message_log.append({"role": "assistant", "content": text_response})
+    log_message(f'AI: {text_response}')
+    STTS.start_TTS_pipeline(text_response)
+
+
+def log_message(message_text):
+    print(message_text)
+    global logging_eventhandlers
+    for eventhandler in logging_eventhandlers:
+        eventhandler(message_text)
diff --git a/lore.txt b/lore.txt
new file mode 100644
index 0000000..d9bdcf7
--- /dev/null
+++ b/lore.txt
@@ -0,0 +1,3 @@
+You are not an assistant, your task is to roleplay as a character.
+Can you talk to me as if you are this character? Please only provide short answers and try not to break out of character. Here's her description:
+A shrine maiden of a Shinto shrine.
\ No newline at end of file
diff --git a/sub.py b/sub.py
deleted file mode 100644
index b9f29a3..0000000
--- a/sub.py
+++ /dev/null
@@ -1,120 +0,0 @@
-import os
-from pydub.playback import play
-import azure.cognitiveservices.speech as speechsdk
-import dict
-
-SPEECH_KEY = os.environ.get('SPEECH_KEY_P')
-SPEECH_REGION = 'eastus'
-
-text_change_eventhandlers = []
-
-# Translation request
-translation_recognizer = None
-target_language = ''
-speech_translation_config = None
-audio_config = None
-input_language_name = "Japanese"
-output_language_name = "English"
-
-language_dict = dict.language_dict
-azure_language_dict = dict.azure_language_dict
-
-
-def initialize_speech_translator():
-    global speech_translation_config
-    global target_language
-    global audio_config
-    global translation_recognizer
-    global input_language_name
-    speech_translation_config = speechsdk.translation.SpeechTranslationConfig(
-        subscription=SPEECH_KEY, region=SPEECH_REGION)
-    speech_translation_config.speech_recognition_language = language_dict[input_language_name]
-
-    target_language = azure_language_dict[output_language_name]
-    speech_translation_config.add_target_language(target_language)
-
-    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
-    translation_recognizer = speechsdk.translation.TranslationRecognizer(
-        translation_config=speech_translation_config, audio_config=audio_config)
-
-
-def start_translator():
-    # print("Speak into your microphone.")
-    # translation_recognition_result = translation_recognizer.recognize_once_async().get()
-
-    # if translation_recognition_result.reason == speechsdk.ResultReason.TranslatedSpeech:
-    #     send_update_text_event(
-    #         translation_recognition_result.translations[target_language])
-    #     print("Recognized: {}".format(translation_recognition_result.text))
-    #     print("""Translated into '{}': {}""".format(
-    #         target_language,
-    #         translation_recognition_result.translations[target_language]))
-    # elif translation_recognition_result.reason == speechsdk.ResultReason.NoMatch:
-    #     print("No speech could be recognized: {}".format(
-    #         translation_recognition_result.no_match_details))
-    # elif translation_recognition_result.reason == speechsdk.ResultReason.Canceled:
-    #     cancellation_details = translation_recognition_result.cancellation_details
-    #     print("Speech Recognition canceled: {}".format(
-    #         cancellation_details.reason))
-    #     if cancellation_details.reason == speechsdk.CancellationReason.Error:
-    #         print("Error details: {}".format(
-    #             cancellation_details.error_details))
-    #         print("Did you set the speech resource key and region values?")
-
-    print("Speak into your microphone.")
-    translation_recognizer.start_continuous_recognition_async()
-    translation_recognizer.recognizing.connect(
-        lambda evt: set_translation_text(send_update_text_event(evt.result.translations[target_language])))
-    translation_recognizer.recognized.connect(
-        lambda evt: set_translation_text(send_update_text_event(evt.result.translations[target_language])))
-    translation_recognizer.canceled.connect(showReconitionErrors)
-
-    translation_recognizer.session_stopped.connect(showReconitionErrors)
-    translation_recognizer.canceled.connect(showReconitionErrors)
-
-
-def showReconitionErrors(translation_recognition_result):
-    translation_recognition_result = translation_recognition_result.result
-    if translation_recognition_result.reason == speechsdk.ResultReason.RecognizedSpeech:
-        log_message("Recognized: {}".format(
-            translation_recognition_result.text))
-    elif translation_recognition_result.reason == speechsdk.ResultReason.NoMatch:
-        log_message("No speech could be recognized: {}".format(
-            translation_recognition_result.no_match_details))
-    elif translation_recognition_result.reason == speechsdk.ResultReason.Canceled:
-        cancellation_details = translation_recognition_result.cancellation_details
-        log_message("Speech Recognition canceled: {}".format(
-            cancellation_details.reason))
-        if cancellation_details.reason == speechsdk.CancellationReason.Error:
-            log_message("Error details: {}".format(
-                cancellation_details.error_details))
-
-
-def set_translation_text(text):
-    print(text)
-
-
-def log_message(message_text):
-    print(message_text)
-
-
-def send_update_text_event(text):
-    print(text)
-    global text_change_eventhandlers
-    for eventhandler in text_change_eventhandlers:
-        eventhandler(text)
-
-
-def change_input_language(input_lang_name):
-    global input_language_name
-    input_language_name = input_lang_name
-    initialize_speech_translator()
-
-
-def change_output_language(output_lang_name):
-    global output_language_name
-    output_language_name = output_lang_name
-    initialize_speech_translator()
-
-
-initialize_speech_translator()
diff --git a/timer.py b/timer.py
new file mode 100644
index 0000000..e5f535e
--- /dev/null
+++ b/timer.py
@@ -0,0 +1,19 @@
+from time import time
+
+
+class Timer():
+    def __init__(self):
+        self.started = False
+        self.elapsed_time = 0
+        self.start_time = 0
+
+    def start(self):
+        self.started = True
+        self.start_time = time()
+
+    def end(self):
+        self.started = False
+        return round(time()-self.start_time, 2)
+
+    def is_started(self):
+        return self.started
diff --git a/translator.py b/translator.py
index 0554cb9..93c550a 100644
--- a/translator.py
+++ b/translator.py
@@ -4,6 +4,13 @@
 use_deepl = False
 deepl_api_key = ''
+fugu_translator = None
+
+
+def initialize():
+    global fugu_translator
+    fugu_translator = pipeline(
+        'translation', model='./models--staka--fugumt-en-ja/snapshots/2d6da1c7352386e12ddd46ce3d0bbb2310200fcc')


 def translate(text, from_code, to_code):
@@ -32,8 +39,10 @@ def translate(text, from_code, to_code):
     else:
         if (from_code == 'en' and to_code == 'ja'):
-            fugu_translator = pipeline(
-                'translation', model='./models--staka--fugumt-en-ja/snapshots/2d6da1c7352386e12ddd46ce3d0bbb2310200fcc')
+            global fugu_translator
+            if (fugu_translator == None):
+                fugu_translator = pipeline(
+                    'translation', model='./models--staka--fugumt-en-ja/snapshots/2d6da1c7352386e12ddd46ce3d0bbb2310200fcc')
             return fugu_translator(text)[0]['translation_text']
         else:
             print(