diff --git a/.github/workflows/python-demo.yml b/.github/workflows/python-demo.yml index ec04f1a1..576524a8 100644 --- a/.github/workflows/python-demo.yml +++ b/.github/workflows/python-demo.yml @@ -48,7 +48,10 @@ jobs: - name: Build dependencies run: | python -m pip install -U pip setuptools - pip install wheel && cd ../../binding/python && python3 setup.py sdist bdist_wheel && pip install dist/pvorca-0.1.4-py3-none-any.whl + pip install wheel + cd ../../binding/python + python3 setup.py sdist bdist_wheel + python3 -m pip install dist/pvorca-0.1.4-py3-none-any.whl - name: Install dependencies run: | @@ -81,7 +84,10 @@ jobs: # TODO: remove after release - name: Build dependencies run: | - pip install wheel && cd ../../binding/python && python3 setup.py sdist bdist_wheel && pip install --force-reinstall dist/pvorca-0.1.4-py3-none-any.whl + pip install wheel + cd ../../binding/python + python3 setup.py sdist bdist_wheel + python3 -m pip install --force-reinstall dist/pvorca-0.1.4-py3-none-any.whl - name: Install dependencies run: pip3 install -r requirements.txt diff --git a/README.md b/README.md index 82a8fa62..40bd4ff1 100644 --- a/README.md +++ b/README.md @@ -213,6 +213,9 @@ orca = pvorca.create(access_key='${ACCESS_KEY}') ``` Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/). 
+ +#### Streaming synthesis + To synthesize a text stream, create an Orca Stream object and add text to it one-by-one: ```python @@ -235,6 +238,8 @@ When done with streaming text synthesis, the stream object needs to be closed: stream.close() ``` +#### Single synthesis + Use single synthesis mode if the complete text is known in advance: ```python @@ -284,9 +289,9 @@ The header file [include/pv_orca.h](./include/pv_orca.h) contains relevant infor Build an instance of the object: ```c -pv_orca_t *handle = NULL; +pv_orca_t *orca = NULL; const char *model_path = "${MODEL_PATH}"; -pv_status_t status = pv_orca_init("${ACCESS_KEY}", model_path, &handle); +pv_status_t status = pv_orca_init("${ACCESS_KEY}", model_path, &orca); if (status != PV_STATUS_SUCCESS) { // error handling logic } @@ -303,7 +308,65 @@ status = pv_orca_synthesize_params_init(&synthesize_params); // change the default parameters of synthesize_params as desired ``` -Now, the `handle` and `synthesize_params` object can be used to synthesize speech: +#### Streaming synthesis + +To synthesize a text stream, create an `orca_stream` object using the `synthesize_params`: + +```c +pv_orca_stream_t *orca_stream = NULL; +status = pv_orca_stream_open(orca, synthesize_params, &orca_stream); +if (status != PV_STATUS_SUCCESS) { + // error handling logic +} +``` + +Add text to `orca_stream` one-by-one and handle the synthesized audio: + +```c +extern char *get_next_text_chunk(void); + +int32_t num_samples_chunk = 0; +int16_t *pcm_chunk = NULL; +status = pv_orca_stream_synthesize( + orca_stream, + get_next_text_chunk(), + &num_samples_chunk, + &pcm_chunk); +if (status != PV_STATUS_SUCCESS) { + // error handling logic +} +if (num_samples_chunk > 0) { + // handle pcm_chunk +} +``` + +Once the text stream is complete, call the flush method to synthesize the remaining text: + +```c +status = pv_orca_stream_flush(orca_stream, &num_samples_chunk, &pcm_chunk); +if (status != PV_STATUS_SUCCESS) { + // error handling logic +} 
+if (num_samples_chunk > 0) { + // handle pcm_chunk +} +``` + +Once the PCM chunks are handled, make sure to release the acquired resources for each chunk with: + +```c +pv_orca_pcm_delete(pcm_chunk); +``` + +Finally, when done make sure to close the stream: + +```c +pv_orca_stream_close(orca_stream); +``` + +#### Single synthesis + +If the text is known in advance, single synthesis mode can be used: ```c int32_t num_samples = 0; @@ -311,7 +374,7 @@ int16_t *synthesized_pcm = NULL; int32_t num_alignments = 0; pv_orca_word_alignment_t **alignments = NULL; status = pv_orca_synthesize( - handle, + orca, "${TEXT}", synthesize_params, &num_samples, @@ -322,13 +385,35 @@ status = pv_orca_synthesize( Replace `${TEXT}` with the text to be synthesized including potential [custom pronunciations](#custom-pronunciations). +Print the metadata of the synthesized audio: + +```c +for (int32_t i = 0; i < num_alignments; i++) { + fprintf( + stdout, + "[%s]\t.start_sec = %.2f .end_sec = %.2f\n", + alignments[i]->word, + alignments[i]->start_sec, + alignments[i]->end_sec); + for (int32_t j = 0; j < alignments[i]->num_phonemes; j++) { + fprintf( + stdout, + "\t[%s]\t.start_sec = %.2f .end_sec = %.2f\n", + alignments[i]->phonemes[j]->phoneme, + alignments[i]->phonemes[j]->start_sec, + alignments[i]->phonemes[j]->end_sec); + + } +} +``` + Finally, when done make sure to release the acquired resources: ```c pv_orca_word_alignments_delete(num_alignments, alignments); -pv_orca_delete_pcm(pcm); +pv_orca_pcm_delete(synthesized_pcm); pv_orca_synthesize_params_delete(synthesize_params); -pv_orca_delete(handle); +pv_orca_delete(orca); ``` ### Web diff --git a/binding/python/_orca.py b/binding/python/_orca.py index a8bc0ee8..3971c6f3 100644 --- a/binding/python/_orca.py +++ b/binding/python/_orca.py @@ -141,7 +141,7 @@ class COrcaStream(Structure): class Stream: """ - Orca Stream object that allows to convert a stream of text to a stream of audio. + Orca Stream object that converts a stream of text to a stream of audio. 
""" def __init__(self, handle: POINTER('Orca.COrcaStream'), orca: 'Orca') -> None: @@ -150,7 +150,7 @@ def __init__(self, handle: POINTER('Orca.COrcaStream'), orca: 'Orca') -> None: def synthesize(self, text: str) -> Optional[Sequence[int]]: """ - Adds a chunk of text to the OrcaStream object and generates audio if enough text has been added. + Adds a chunk of text to the Stream object and generates audio if enough text has been added. This function is expected to be called multiple times with consecutive chunks of text from a text stream. The incoming text is buffered as it arrives until the length is long enough to convert a chunk of the buffered text into audio. The caller needs to use `pv_orca_stream_flush()` to generate the audio chunk diff --git a/demo/c/README.md b/demo/c/README.md index 3117c789..715acb9a 100644 --- a/demo/c/README.md +++ b/demo/c/README.md @@ -18,10 +18,8 @@ Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get you # Speech Synthesis Demos Orca supports two modes of operation: streaming and single synthesis. - In the streaming synthesis mode, Orca processes an incoming text stream in real-time and generates audio in parallel. This is demonstrated in the Orca streaming demo. - In the single synthesis mode, the text is synthesized in a single call to the Orca engine. **Note**: the following commands are run from the root of the repo. 
diff --git a/demo/c/orca_demo.c b/demo/c/orca_demo.c index 722b34b7..35a0c62c 100644 --- a/demo/c/orca_demo.c +++ b/demo/c/orca_demo.c @@ -284,7 +284,7 @@ int picovoice_main(int argc, char **argv) { double proc_sec = 0.; gettimeofday(&before, NULL); - fprintf(stdout, "Synthesizing text `%s` ...\n", text); + fprintf(stdout, "\nSynthesizing text `%s`\n", text); int32_t num_alignments = 0; pv_orca_word_alignment_t **alignments = NULL; @@ -323,7 +323,33 @@ int picovoice_main(int argc, char **argv) { ((double) (after.tv_sec - before.tv_sec) + ((double) (after.tv_usec - before.tv_usec)) * 1e-6); - fprintf(stdout, "Synthesized text in %.1f sec\n", proc_sec); + if (num_alignments > 0) { + fprintf(stdout, "\nWord alignments"); + if (num_alignments > 3) { + fprintf(stdout, " (only showing first 3):\n"); + } else { + fprintf(stdout, ":\n"); + } + int32_t num_alignments_shown = num_alignments > 3 ? 3 : num_alignments; + for (int32_t i = 0; i < num_alignments_shown; i++) { + fprintf( + stdout, + "word=\"%s\", start_sec=%.2f, end_sec=%.2f\n", + alignments[i]->word, + alignments[i]->start_sec, + alignments[i]->end_sec); + for (int32_t j = 0; j < alignments[i]->num_phonemes; j++) { + fprintf( + stdout, + "\tphoneme=\"%s\", start_sec=%.2f, end_sec=%.2f\n", + alignments[i]->phonemes[j]->phoneme, + alignments[i]->phonemes[j]->start_sec, + alignments[i]->phonemes[j]->end_sec); + } + } + } + + fprintf(stdout, "\nSynthesized text in %.2f sec\n", proc_sec); fprintf(stdout, "Saved audio to `%s`\n", output_path); pv_status_t delete_status = pv_orca_word_alignments_delete_func(num_alignments, alignments); diff --git a/demo/llm/orca_voice_assistant_demo.py b/demo/llm/orca_voice_assistant_demo.py index 9cadb257..1691b13e 100644 --- a/demo/llm/orca_voice_assistant_demo.py +++ b/demo/llm/orca_voice_assistant_demo.py @@ -15,7 +15,18 @@ from pvrecorder import PvRecorder -from src import * +from src import ( + LLM, + LLMs, + Synthesizer, + Synthesizers, + TimingPrinter, + Timer, + UserInput, + 
UserInputs, + StreamingAudioDevice, + Transcribers, +) MAX_WAIT_TIME_FIRST_AUDIO = 10 @@ -25,17 +36,15 @@ def get_user_input_init_kwargs(args: argparse.Namespace) -> Dict[str, str]: user_input_type = UserInputs(args.user_input) if user_input_type is UserInputs.VOICE: - kwargs["audio_device_index"] = args.audio_device_index - - transcriber_type = Transcribers.PICOVOICE_CHEETAH - kwargs["transcriber"] = transcriber_type + kwargs["audio_device_index"] = args.input_audio_device_index + kwargs["transcriber"] = Transcribers.PICOVOICE_CHEETAH kwargs["transcriber_params"] = dict() - if transcriber_type is Transcribers.PICOVOICE_CHEETAH: - if not args.picovoice_access_key: - raise ValueError("Picovoice access key is required when using voice user input") - kwargs["transcriber_params"]["access_key"] = args.picovoice_access_key - kwargs["transcriber_params"]["endpoint_duration_sec"] = args.endpoint_duration_sec + if args.picovoice_access_key is None: + raise ValueError("Picovoice access key is required when using voice user input") + kwargs["transcriber_params"]["access_key"] = args.picovoice_access_key + if args.speech_endpoint_duration_sec is not None: + kwargs["transcriber_params"]["endpoint_duration_sec"] = args.speech_endpoint_duration_sec elif user_input_type is UserInputs.TEXT: kwargs["llm_type"] = LLMs(args.llm) @@ -48,14 +57,14 @@ def get_llm_init_kwargs(args: argparse.Namespace) -> Dict[str, str]: llm_type = LLMs(args.llm) if llm_type is LLMs.OPENAI: - if not args.openai_access_key: + if args.openai_access_key is None: raise ValueError( f"An OpenAI access key is required when using OpenAI models. 
Specify with `--openai-access-key`.") if args.tokens_per_second is not None: raise ValueError(f"Tokens per second is not supported for `{llm_type}`") kwargs["access_key"] = args.openai_access_key - if args.system_message: + if args.system_message is not None: kwargs["system_message"] = args.system_message elif llm_type is LLMs.DUMMY: @@ -70,30 +79,31 @@ def get_synthesizer_init_kwargs(args: argparse.Namespace) -> Dict[str, str]: synthesizer_type = Synthesizers(args.synthesizer) if synthesizer_type is Synthesizers.PICOVOICE_ORCA: - if not args.picovoice_access_key: + if args.picovoice_access_key is None: raise ValueError("Picovoice access key is required when using Picovoice TTS") kwargs["access_key"] = args.picovoice_access_key kwargs["model_path"] = args.orca_model_path kwargs["library_path"] = args.orca_library_path elif synthesizer_type is Synthesizers.OPENAI: - if not args.openai_access_key: + if args.openai_access_key is None: raise ValueError( f"An OpenAI access key is required when using OpenAI models. 
Specify with `--openai-access-key`.") kwargs["access_key"] = args.openai_access_key return kwargs -def main(args: argparse.Namespace) -> None: - llm_type = LLMs(args.llm) - timer = Timer() +def main(args: argparse.Namespace) -> None: + max_num_interactions = args.num_interactions user_input_init_kwargs = get_user_input_init_kwargs(args) user_input = UserInput.create(UserInputs(args.user_input), **user_input_init_kwargs) audio_output = StreamingAudioDevice.from_default_device() + timer = Timer() + synthesizer_init_kwargs = get_synthesizer_init_kwargs(args) synthesizer = Synthesizer.create( Synthesizers(args.synthesizer), @@ -102,31 +112,25 @@ def main(args: argparse.Namespace) -> None: **synthesizer_init_kwargs) llm_init_kwargs = get_llm_init_kwargs(args) - llm = LLM.create(llm_type, **llm_init_kwargs) + llm = LLM.create(LLMs(args.llm), **llm_init_kwargs) - max_length = len(f"{llm}") if len(f"{llm}") > len(f"{synthesizer}") else len(f"{synthesizer}") - llm_info_string = f"{llm}".ljust(max_length) - synthesizer_info_string = f"{synthesizer}".ljust(max_length) - progress_printer = ProgressPrinter( - timer_message_llm=f"Time to wait for {llm_info_string} : ", - timer_message_tts=f"Time to wait for {synthesizer_info_string} : ", - ) + timing_printer = TimingPrinter(llm_string=f"{llm}", synthesizer_string=f"{synthesizer}") try: - num_interactions = 0 + num_interactions_counter = 0 while True: timer.reset() audio_output.start(sample_rate=synthesizer.sample_rate) - text = user_input.get_user_prompt() + text = user_input.get_user_input() timer.log_time_llm_request() - generator = llm.chat(user_input=text) + text_generator = llm.chat(user_input=text) llm_message = "" printed_stats = False - for token in generator: + for token in text_generator: if token is None: continue @@ -135,11 +139,11 @@ def main(args: argparse.Namespace) -> None: llm_message += token - if synthesizer.input_streamable: + if synthesizer.text_streamable: synthesizer.synthesize(token) if not 
timer.before_first_audio and not printed_stats: - progress_printer.print_timing_stats( + timing_printer.print_timing_stats( num_seconds_first_llm_token=timer.num_seconds_to_first_token(), num_seconds_first_audio=timer.num_seconds_to_first_audio(), ) @@ -150,7 +154,7 @@ def main(args: argparse.Namespace) -> None: timer.log_time_last_llm_token() - if synthesizer.input_streamable: + if synthesizer.text_streamable: synthesizer.flush() else: synthesizer.synthesize(llm_message) @@ -163,16 +167,16 @@ def main(args: argparse.Namespace) -> None: break if not printed_stats: - progress_printer.print_timing_stats( + timing_printer.print_timing_stats( num_seconds_first_llm_token=timer.num_seconds_to_first_token(), num_seconds_first_audio=timer.num_seconds_to_first_audio()) print(f"Answering with {synthesizer} ...") - audio_output.wait_and_terminate() + audio_output.flush_and_terminate() - num_interactions += 1 + num_interactions_counter += 1 - if num_interactions == 2: + if 0 < max_num_interactions == num_interactions_counter: print("\nDemo complete!") break @@ -193,14 +197,14 @@ def main(args: argparse.Namespace) -> None: choices=[u.value for u in UserInputs], help="Choose type of input type") parser.add_argument( - "--audio-device-index", + "--input-audio-device-index", type=int, default=-1, help="Index of input audio device") parser.add_argument( - "--endpoint-duration-sec", + "--speech-endpoint-duration-sec", type=float, - default=1., + default=None, help="Duration in seconds for speechless audio to be considered an endpoint") parser.add_argument( "--show-audio-devices", @@ -210,20 +214,21 @@ def main(args: argparse.Namespace) -> None: parser.add_argument( "--llm", default=LLMs.DUMMY.value, - choices=[l.value for l in LLMs], + choices=[llm.value for llm in LLMs], help="Choose LLM to use") parser.add_argument( "--openai-access-key", + default=None, help="Open AI access key. 
Needed when using openai models") parser.add_argument( "--system-message", default=None, - help="The system message to use for the LLM") + help="The system message to use to prompt the LLM response") parser.add_argument( "--tokens-per-second", default=None, type=int, - help="Imitated tokens per second") + help="Imitated tokens per second to use for Dummy LLM") parser.add_argument( "--tts", @@ -231,9 +236,24 @@ def main(args: argparse.Namespace) -> None: default=Synthesizers.PICOVOICE_ORCA.value, choices=[s.value for s in Synthesizers], help="Choose voice synthesizer to use") - parser.add_argument("--picovoice-access-key", "-a", help="AccessKey obtained from Picovoice Console") - parser.add_argument("--orca-model-path", "-m", help="Path to the model parameters file") - parser.add_argument("--orca-library-path", "-l", help="Path to Orca's dynamic library") + parser.add_argument( + "--picovoice-access-key", + default=None, + help="AccessKey obtained from Picovoice Console") + parser.add_argument( + "--orca-model-path", + default=None, + help="Path to the model parameters file") + parser.add_argument( + "--orca-library-path", + default=None, + help="Path to Orca's dynamic library") + + parser.add_argument( + "--num-interactions", + type=int, + default=-1, + help="Number of interactions with LLM run before completing the demo. Default is -1 (run indefinitely)") arg = parser.parse_args() diff --git a/demo/llm/requirements.txt b/demo/llm/requirements.txt index adbdbf07..34ec74df 100644 --- a/demo/llm/requirements.txt +++ b/demo/llm/requirements.txt @@ -1,3 +1,14 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations under the License. +# + openai==1.17.0 pvcheetah==2.0.1 pvrecorder==1.2.2 diff --git a/demo/llm/src/__init__.py b/demo/llm/src/__init__.py index 58ee0079..d618d01b 100644 --- a/demo/llm/src/__init__.py +++ b/demo/llm/src/__init__.py @@ -1,4 +1,15 @@ -from .audio_player import * +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# + +from .audio_device import * from .llm import * from .synthesizer import * from .transcriber import * diff --git a/demo/llm/src/audio_device.py b/demo/llm/src/audio_device.py index e27c8892..2036776e 100644 --- a/demo/llm/src/audio_device.py +++ b/demo/llm/src/audio_device.py @@ -1,6 +1,22 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+# + import time from queue import Queue -from typing import Any, Optional, Sequence, Union +from typing import ( + Any, + Optional, + Sequence, + Union, +) import numpy as np from numpy.typing import NDArray @@ -8,8 +24,8 @@ class StreamingAudioDevice: - def __init__(self, device_info: dict) -> None: - self._device_info = device_info + def __init__(self, device_index: int) -> None: + self._device_index = device_index self._queue: Queue[NDArray] = Queue() self._buffer = None @@ -24,7 +40,7 @@ def start(self, sample_rate: int) -> None: channels=1, samplerate=self._sample_rate, dtype=np.int16, - device=int(self._device_info["index"]), + device=self._device_index, callback=self._callback, blocksize=self._blocksize) self._stream.start() @@ -46,10 +62,7 @@ def play(self, pcm_chunk: Optional[Union[Sequence[int], NDArray]] = None) -> Non pcm_chunk = np.array(pcm_chunk, dtype=np.int16) if self._buffer is not None: - if pcm_chunk is not None: - pcm_chunk = np.concatenate([self._buffer, pcm_chunk]) - else: - pcm_chunk = self._buffer + pcm_chunk = self._buffer if pcm_chunk is None else np.concatenate([self._buffer, pcm_chunk]) self._buffer = None if pcm_chunk is None: @@ -62,11 +75,11 @@ def play(self, pcm_chunk: Optional[Union[Sequence[int], NDArray]] = None) -> Non else: self._queue.put_nowait(pcm_chunk[index_block: index_block + self._blocksize]) - def wait_and_terminate(self) -> None: - self.wait() + def flush_and_terminate(self) -> None: + self.flush() self.terminate() - def wait(self) -> None: + def flush(self) -> None: if self._buffer is not None: chunk = np.zeros(self._blocksize, dtype=np.int16) chunk[:self._buffer.shape[0]] = self._buffer @@ -84,7 +97,9 @@ def terminate(self) -> None: @classmethod def from_default_device(cls) -> 'StreamingAudioDevice': - return cls(device_info=query_devices(kind="output")) + device_info = query_devices(kind="output") + device_index = int(device_info["index"]) + return cls(device_index=device_index) __all__ = [ diff --git 
a/demo/llm/src/llm.py b/demo/llm/src/llm.py index 5f59af1b..542c9b09 100644 --- a/demo/llm/src/llm.py +++ b/demo/llm/src/llm.py @@ -1,3 +1,14 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# + import json import os import random @@ -22,10 +33,7 @@ class LLM: You are a friendly voice assistant in customer service of an e-commerce platform. Use natural, conversational language that are clear and easy to follow (short sentences, simple words). Only use english letters and punctuation, no special characters. - Be verbose. Keep the conversation flowing naturally. - Don't use lists. - If the customer was successful, say "Great!" and ask if they need help with anything else. """ def __init__(self, system_message: str = SYSTEM_MESSAGE) -> None: @@ -39,9 +47,6 @@ def chat(self, user_input: str) -> Generator[str, None, None]: for token in self._chat(user_input=user_input): yield token - def reset_history(self) -> None: - pass - @classmethod def create(cls, llm_type: LLMs, **kwargs) -> 'LLM': classes = { @@ -74,31 +79,19 @@ def __init__( self._model_name = model_name self._client = OpenAI(api_key=access_key) - self._message_history = [{"role": "system", "content": self._system_message}] - - # Dummy request to avoid long wait times. - # The first request takes significantly longer than subsequent ones. 
- stream = self._client.chat.completions.create( - model=self._model_name, - messages=[{"role": "system", "content": "dummy"}, {"role": "user", "content": "dummy"}], - seed=self.RANDOM_SEED, - temperature=0, - top_p=0.05, - stream=True) - for chunk in stream: - pass + self._history = [{"role": "system", "content": self._system_message}] def _append_user_message(self, message: str) -> None: - self._message_history.append({"role": "user", "content": message}) + self._history.append({"role": "user", "content": message}) def _append_assistant_message(self, message: str) -> None: - self._message_history.append({"role": "assistant", "content": message}) + self._history.append({"role": "assistant", "content": message}) def _chat(self, user_input: str) -> Generator[str, None, None]: self._append_user_message(user_input) stream = self._client.chat.completions.create( model=self._model_name, - messages=self._message_history, + messages=self._history, seed=self.RANDOM_SEED, temperature=0, top_p=0.05, @@ -111,15 +104,11 @@ def _chat(self, user_input: str) -> Generator[str, None, None]: assistant_message += token self._append_assistant_message(assistant_message) - def reset_history(self) -> None: - self._message_history = [{"role": "system", "content": self._system_message}] - def __str__(self) -> str: return f"ChatGPT ({self._model_name})" class DummyLLM(LLM): - USER_PROMPT = "Press ENTER to generate a demo LLM response " TOKENS_PER_SECOND = 25 def __init__(self, tokens_per_second: int = TOKENS_PER_SECOND) -> None: diff --git a/demo/llm/src/synthesizer.py b/demo/llm/src/synthesizer.py index 402fbb84..e8bcb69c 100644 --- a/demo/llm/src/synthesizer.py +++ b/demo/llm/src/synthesizer.py @@ -1,3 +1,14 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. 
+# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# + import threading import time from dataclasses import dataclass @@ -17,7 +28,7 @@ import pvorca from numpy.typing import NDArray from openai import OpenAI -from pvorca import OrcaInvalidArgumentError +from pvorca import OrcaActivationLimitError from .util import Timer @@ -33,10 +44,10 @@ def __init__( sample_rate: int, play_audio_callback: Callable[[Union[Sequence[int], NDArray]], None], timer: Timer, - input_streamable: bool = False, + text_streamable: bool = False, ) -> None: self.sample_rate = sample_rate - self.input_streamable = input_streamable + self.text_streamable = text_streamable self._play_audio_callback = play_audio_callback self._timer = timer @@ -45,6 +56,7 @@ def synthesize(self, text: str) -> None: raise NotImplementedError( f"Method `synthesize` must be implemented in a subclass of {self.__class__.__name__}") + @property def info(self) -> str: raise NotImplementedError( f"Method `info` must be implemented in a subclass of {self.__class__.__name__}") @@ -56,20 +68,19 @@ def terminate(self) -> None: pass @classmethod - def create(cls, engine: Union[str, Synthesizers], **kwargs: Any) -> 'Synthesizer': - classes = { + def create(cls, engine: Synthesizers, **kwargs: Any) -> 'Synthesizer': + subclasses = { Synthesizers.PICOVOICE_ORCA: PicovoiceOrcaSynthesizer, Synthesizers.OPENAI: OpenAISynthesizer, } - if engine not in classes: + if engine not in subclasses: raise NotImplementedError(f"Cannot create {cls.__name__} of type `{engine.value}`") - return classes[engine](**kwargs) + return subclasses[engine](**kwargs) def __str__(self) -> str: - raise NotImplementedError( - f"Method `__str__` must be implemented in a subclass of 
{self.__class__.__name__}") + raise NotImplementedError() class OpenAISynthesizer(Synthesizer): @@ -121,7 +132,7 @@ def __str__(self) -> str: class PicovoiceOrcaSynthesizer(Synthesizer): - NUM_TOKENS_PER_PCM_CHUNK = 8 + NUM_TOKENS_PER_PCM_CHUNK = 4 @dataclass class OrcaTextInput: @@ -141,7 +152,7 @@ def __init__( sample_rate=self._orca.sample_rate, play_audio_callback=play_audio_callback, timer=timer, - input_streamable=True) + text_streamable=True) self._orca_stream = self._orca.open_stream() @@ -166,11 +177,10 @@ def _reset_state(self) -> None: def _compute_first_audio_delay(self, pcm: Sequence[int], processing_time: float) -> float: seconds_audio = len(pcm) / self.sample_rate tokens_per_sec = self._num_tokens / (time.time() - self._timer.time_first_synthesis_request) - time_delay = \ - max(((self.NUM_TOKENS_PER_PCM_CHUNK / ( - tokens_per_sec + 1e-4)) - seconds_audio) + processing_time, - 0) - return time_delay + llm_delay_seconds = (self.NUM_TOKENS_PER_PCM_CHUNK / (tokens_per_sec + 1e-4)) + orca_delay_seconds = 3 * processing_time + delay_seconds = max(llm_delay_seconds + orca_delay_seconds - seconds_audio, 0) + return delay_seconds def _run(self) -> None: while True: @@ -188,9 +198,8 @@ def _run(self) -> None: pcm = self._orca_stream.synthesize(orca_input.text) else: pcm = self._orca_stream.flush() - except OrcaInvalidArgumentError as e: - print(f"Orca could not synthesize text input `{orca_input.text}`: `{e}`") - continue + except OrcaActivationLimitError: + raise ValueError("Orca activation limit reached.") processing_time = time.time() - start if pcm is not None: @@ -218,6 +227,7 @@ def terminate(self): self._orca_stream.close() self._orca.delete() + @property def info(self) -> str: return f"Picovoice Orca v{self._orca.version}" diff --git a/demo/llm/src/transcriber.py b/demo/llm/src/transcriber.py index e49ccbbe..f03ae432 100644 --- a/demo/llm/src/transcriber.py +++ b/demo/llm/src/transcriber.py @@ -1,10 +1,20 @@ +# +# Copyright 2024 Picovoice Inc. 
+# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# + from enum import Enum from typing import ( Any, Optional, Sequence, Tuple, - Union, ) from pvcheetah import CheetahActivationLimitError, create @@ -26,16 +36,15 @@ def frame_length(self) -> int: raise NotImplementedError() @classmethod - def create(cls, x: Union[str, Transcribers], **kwargs: Any) -> 'Transcriber': - try: - x = Transcribers(x) - subclass = { - Transcribers.PICOVOICE_CHEETAH: PicovoiceCheetahTranscriber, - }[x] - except KeyError: - raise ValueError(f"Invalid transcriber type `{x}`") - - return subclass(**kwargs) + def create(cls, x: Transcribers, **kwargs: Any) -> 'Transcriber': + subclasses = { + Transcribers.PICOVOICE_CHEETAH: PicovoiceCheetahTranscriber, + } + + if x not in subclasses: + raise NotImplementedError(f"Cannot create {cls.__name__} of type `{x.value}`") + + return subclasses[x](**kwargs) class PicovoiceCheetahTranscriber(Transcriber): diff --git a/demo/llm/src/user_input.py b/demo/llm/src/user_input.py index f4286742..791943e8 100644 --- a/demo/llm/src/user_input.py +++ b/demo/llm/src/user_input.py @@ -1,4 +1,14 @@ -import time +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations under the License. +# + from enum import Enum from typing import ( Any, @@ -18,20 +28,20 @@ class UserInputs(Enum): class UserInput: - def get_user_prompt(self) -> str: + def get_user_input(self) -> str: raise NotImplementedError() @classmethod def create(cls, x: UserInputs, **kwargs: Any) -> 'UserInput': - try: - subclass = { - UserInputs.VOICE: VoiceUserInput, - UserInputs.TEXT: TextUserInput, - }[x] - except KeyError: - raise ValueError(f"Invalid input type `{x}`") + subclasses = { + UserInputs.VOICE: VoiceUserInput, + UserInputs.TEXT: TextUserInput, + } + + if x not in subclasses: + raise NotImplementedError(f"Cannot create {cls.__name__} of type `{x.value}`") - return subclass(**kwargs) + return subclasses[x](**kwargs) class VoiceUserInput(UserInput): @@ -44,23 +54,20 @@ def __init__( self._transcriber = Transcriber.create(transcriber, **transcriber_params) self._recorder = PvRecorder(frame_length=self._transcriber.frame_length, device_index=audio_device_index) - def get_user_prompt(self) -> str: + def get_user_input(self) -> str: print("Listening ...") if not self._recorder.is_recording: self._recorder.start() transcript = "" - #start = time.time() try: while True: partial_transcript, is_endpoint = self._transcriber.process(self._recorder.read()) transcript += partial_transcript - if is_endpoint: # or time.time() - start > 2: + if is_endpoint: final_transcript = self._transcriber.flush() transcript += final_transcript self._recorder.stop() - if transcript == "": - transcript = "Hi, I'm trying to place an order on your webpage but I'm having trouble with the checkout process. Can you help me?" 
return transcript except Exception as e: self._recorder.stop() @@ -77,7 +84,7 @@ def __init__(self, llm_type: LLMs, prompt: Optional[str] = None) -> None: else: self._prompt = self.USER_PROMPT_DUMMY_LLM if llm_type is LLMs.DUMMY else self.USER_PROMPT - def get_user_prompt(self) -> str: + def get_user_input(self) -> str: return input(self._prompt) diff --git a/demo/llm/src/util.py b/demo/llm/src/util.py index 2652ba78..79fb7966 100644 --- a/demo/llm/src/util.py +++ b/demo/llm/src/util.py @@ -1,3 +1,14 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+# + import time from dataclasses import dataclass from typing import Tuple @@ -56,6 +67,12 @@ def is_first_token(self) -> bool: def set_initial_audio_delay(self, delay: float) -> None: self.initial_audio_delay = delay + def num_seconds_to_first_audio(self) -> float: + return self.time_first_audio - self.time_first_llm_token + + def num_seconds_to_first_token(self) -> float: + return self.time_first_llm_token - self.time_llm_request + def reset(self) -> None: self.time_llm_request = -1.0 self.time_first_llm_token = -1.0 @@ -69,51 +86,43 @@ def reset(self) -> None: self._num_tokens = 0 - @staticmethod - def _to_rounded_string(t: float) -> str: - return f"{round(t, 1):.1f}s" - - def num_seconds_to_first_audio(self) -> float: - return self.time_first_audio - self.time_first_llm_token - - def num_seconds_to_first_token(self) -> float: - return self.time_first_llm_token - self.time_llm_request +class TimingPrinter: + TIMER_MESSAGE = "Time to wait for" -class ProgressPrinter: TIMER_BAR_MAX_RED_SECONDS = 2.0 TIMER_BAR_SYMBOLS_PER_SECONDS = 40 TIMER_BAR_SYMBOL = ">" - TIMER_MESSAGE_LLM = "Time to wait for LLM: " - TIMER_MESSAGE_TTS = "Time to wait for TTS: " - MAX_GREEN_VALUE = 0.6 MAX_RED_VALUE = 0.75 def __init__( self, - timer_message_llm: str = TIMER_MESSAGE_LLM, - timer_message_tts: str = TIMER_MESSAGE_TTS, + llm_string: str, + synthesizer_string: str, timer_bar_max_red_seconds: float = TIMER_BAR_MAX_RED_SECONDS, timer_bar_symbols_per_second: float = TIMER_BAR_SYMBOLS_PER_SECONDS, timer_bar_symbol: str = TIMER_BAR_SYMBOL, ) -> None: - self._progress_bar_symbols_per_second = timer_bar_symbols_per_second + max_length = len(llm_string) if len(llm_string) > len(synthesizer_string) else len(synthesizer_string) + llm_info_string = llm_string.ljust(max_length) + synthesizer_info_string = synthesizer_string.ljust(max_length) + + self._timer_message_llm = f"{self.TIMER_MESSAGE} {llm_info_string} : " + self._timer_message_tts = f"{self.TIMER_MESSAGE} {synthesizer_info_string} : 
" + self._progress_bar_color_max = timer_bar_max_red_seconds * timer_bar_symbols_per_second + self._progress_bar_symbols_per_second = timer_bar_symbols_per_second self._progress_bar_symbol = timer_bar_symbol - self._timer_message_llm = timer_message_llm - self._timer_message_tts = timer_message_tts - @staticmethod def _colored_string(text: str, red: float, green: float, blue: float, bold: bool = False) -> str: s = Colors.BOLD if bold else "" - s = f"{s}\033[38;2;{int(red * 255)};{int(green * 255)};{int(blue * 255)}m{text}\033[0m" + s = f"{s}\033[38;2;{int(red * 255)};{int(green * 255)};{int(blue * 255)}m{text}{Colors.RESET}" return s def _print_colored_progress_bar(self, num_seconds: float, bold: bool = False) -> Tuple[float, float, float]: - red = 0 green = self.MAX_GREEN_VALUE blue = 0 @@ -122,7 +131,6 @@ def _print_colored_progress_bar(self, num_seconds: float, bold: bool = False) -> length = int(num_seconds * self._progress_bar_symbols_per_second) for i in range(length): - if i < half_max_length: red = min(i / (half_max_length - 1), self.MAX_RED_VALUE) else: @@ -156,6 +164,6 @@ def print_timing_stats(self, num_seconds_first_llm_token: float, num_seconds_fir __all__ = [ "Colors", - "ProgressPrinter", + "TimingPrinter", "Timer", ] diff --git a/demo/python/_audio_device.py b/demo/python/_audio_device.py index faabf7d8..ec0d2a18 100644 --- a/demo/python/_audio_device.py +++ b/demo/python/_audio_device.py @@ -22,8 +22,8 @@ class StreamingAudioDevice: - def __init__(self, device_info: dict) -> None: - self._device_info = device_info + def __init__(self, device_index: int) -> None: + self._device_index = device_index self._queue: Queue[Sequence[int]] = Queue() self._buffer = None @@ -38,7 +38,7 @@ def start(self, sample_rate: int) -> None: channels=1, samplerate=self._sample_rate, dtype=np.int16, - device=int(self._device_info["index"]), + device=self._device_index, callback=self._callback, blocksize=self._blocksize) self._stream.start() @@ -95,7 +95,9 @@ def 
terminate(self) -> None: @classmethod def from_default_device(cls) -> 'StreamingAudioDevice': - return cls(device_info=query_devices(kind="output")) + device_info = query_devices(kind="output") + device_index = int(device_info["index"]) + return cls(device_index=device_index) __all__ = [ diff --git a/demo/python/orca_demo.py b/demo/python/orca_demo.py index c3232d28..b21bebe9 100644 --- a/demo/python/orca_demo.py +++ b/demo/python/orca_demo.py @@ -34,7 +34,7 @@ def main(args: argparse.Namespace) -> None: start = time.time() - pcm, alignment = orca.synthesize(text) + pcm, alignments = orca.synthesize(text) processing_time = time.time() - start length_sec = len(pcm) / orca.sample_rate diff --git a/demo/python/orca_demo_streaming.py b/demo/python/orca_demo_streaming.py index 11ccbb39..b7358097 100644 --- a/demo/python/orca_demo_streaming.py +++ b/demo/python/orca_demo_streaming.py @@ -71,7 +71,7 @@ def main(args: argparse.Namespace) -> None: print(traceback.format_exc()) print( "WARNING: Failed to initialize audio device, see details above. Falling back to running " - "the demo without audio\n") + "the demo without audio playback.\n") def play_audio_callback(pcm: Sequence[int]): pass