diff --git a/.github/workflows/python-demo.yml b/.github/workflows/python-demo.yml index 0cf761a9..d165055f 100644 --- a/.github/workflows/python-demo.yml +++ b/.github/workflows/python-demo.yml @@ -67,7 +67,7 @@ jobs: strategy: matrix: - machine: [ rpi3-32, rpi3-64, rpi4-32, rpi4-64, rpi5-64, jetson ] + machine: [ rpi3-32, rpi3-64, rpi4-32, rpi4-64, rpi5-64 ] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/python-perf.yml b/.github/workflows/python-perf.yml index 64c8a6cd..e8f149c2 100644 --- a/.github/workflows/python-perf.yml +++ b/.github/workflows/python-perf.yml @@ -11,7 +11,6 @@ on: - 'binding/python/_util.py' - 'binding/python/test_orca_perf.py' - 'lib/common/**' - - 'lib/jetson/**' - 'lib/linux/**' - 'lib/mac/**' - 'lib/raspberry-pi/**' @@ -25,7 +24,6 @@ on: - 'binding/python/_util.py' - 'binding/python/test_orca_perf.py' - 'lib/common/**' - - 'lib/jetson/**' - 'lib/linux/**' - 'lib/mac/**' - 'lib/raspberry-pi/**' @@ -42,7 +40,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest, windows-latest, macos-latest] + os: [ ubuntu-latest, windows-latest, macos-latest ] include: - os: ubuntu-latest proc_performance_threshold_rtf: 5.0 @@ -78,7 +76,7 @@ jobs: strategy: fail-fast: false matrix: - machine: [rpi3-32, rpi3-64, rpi4-32, rpi4-64, rpi5-64, jetson] + machine: [ rpi3-32, rpi3-64, rpi4-32, rpi4-64, rpi5-64 ] include: - machine: rpi3-32 proc_performance_threshold_rtf: 1.0 @@ -90,8 +88,6 @@ jobs: proc_performance_threshold_rtf: 2.0 - machine: rpi5-64 proc_performance_threshold_rtf: 2.0 - - machine: jetson - proc_performance_threshold_rtf: 2.0 steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 83a2fdf7..f7073116 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -9,7 +9,6 @@ on: - 'binding/python/**' - '!binding/python/README.md' - 'lib/common/**' - - 'lib/jetson/**' - 'lib/linux/**' - 'lib/mac/**' - 'lib/raspberry-pi/**' @@ -22,7 +21,6 @@ on: - 'binding/python/**' - '!binding/python/README.md' - 'lib/common/**' - - 'lib/jetson/**' - 'lib/linux/**' - 'lib/mac/**' - 'lib/raspberry-pi/**' @@ -64,7 +62,7 @@ jobs: strategy: matrix: - machine: [ rpi3-32, rpi3-64, rpi4-32, rpi4-64, rpi5-64, jetson ] + machine: [ rpi3-32, rpi3-64, rpi4-32, rpi4-64, rpi5-64 ] steps: - uses: actions/checkout@v3 diff --git a/demo/c/README.md b/demo/c/README.md index 715acb9a..3ed50721 100644 --- a/demo/c/README.md +++ b/demo/c/README.md @@ -17,6 +17,7 @@ Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get you - **For Windows Only**: [MinGW](https://www.mingw-w64.org/) is required to build the demo. # Speech Synthesis Demos + Orca supports two modes of operation: streaming and single synthesis. In the streaming synthesis mode, Orca processes an incoming text stream in real-time and generates audio in parallel. This is demonstrated in the Orca streaming demo. @@ -72,7 +73,7 @@ To run the Orca demo: ./demo/c/build/orca_demo -l ${LIBRARY_PATH} -m ${MODEL_PATH} -a ${ACCESS_KEY} -t ${TEXT} -o ${OUTPUT_PATH} ``` -Replace `${LIBRARY_PATH}` with the path to appropriate library available under [lib](../../lib), `${MODEL_PATH}` with +Replace `${LIBRARY_PATH}` with the path to appropriate library available under [lib](../../lib), `${MODEL_PATH}` with a path to any of the model files available under [lib/common](../../lib/common), `${ACCESS_KEY}` with AccessKey obtained from [Picovoice Console](https://console.picovoice.ai/), `${TEXT}` with the text to be synthesized, and `${WAV_OUTPUT_PATH}` with a path to a output audio file. diff --git a/demo/python/orca_demo_streaming.py b/demo/python/orca_demo_streaming.py index 05b0d92e..5791f9f3 100644 --- a/demo/python/orca_demo_streaming.py +++ b/demo/python/orca_demo_streaming.py @@ -15,113 +15,25 @@ import subprocess import threading import time -import traceback from dataclasses import dataclass from queue import Queue +from collections import deque +from itertools import chain from typing import ( - Any, Callable, - Dict, Optional, Sequence, ) -import numpy as np import pvorca import tiktoken -from numpy.typing import NDArray -from pvorca import OrcaActivationLimitError, OrcaInvalidArgumentError -from sounddevice import ( - OutputStream, - query_devices, - PortAudioError, -) +from pvorca import Orca, OrcaActivationLimitError, OrcaInvalidArgumentError +from pvspeaker import PvSpeaker CUSTOM_PRON_PATTERN = r"\{(.*?\|.*?)\}" CUSTOM_PRON_PATTERN_NO_WHITESPACE = r"\{(.*?\|.*?)\}(?!\s)" -class StreamingAudioDevice: - def __init__(self, device_index: Optional[int] = None) -> None: - if device_index is None: - device_info = query_devices(kind="output") - device_index = int(device_info["index"]) - - self._device_index = device_index - self._queue: Queue[Sequence[int]] = Queue() - - self._buffer = None - self._stream = None - self._sample_rate = None - self._blocksize = None - - def start(self, sample_rate: int) -> None: - self._sample_rate = sample_rate - self._blocksize = self._sample_rate // 20 - self._stream = OutputStream( - channels=1, - samplerate=self._sample_rate, - dtype=np.int16, - device=self._device_index, - callback=self._callback, - blocksize=self._blocksize) - self._stream.start() - - # noinspection PyShadowingNames - # noinspection PyUnusedLocal - def _callback(self, outdata: NDArray, frames: int, time: Any, status: Any) -> None: - if self._queue.empty(): - outdata[:] = 0 - return - - pcm = self._queue.get() - outdata[:, 0] = pcm - - def play(self, pcm_chunk: Sequence[int]) -> None: - if self._stream is None: - raise ValueError("Stream is not started. Call `start` method first.") - - pcm_chunk = np.array(pcm_chunk, dtype=np.int16) - - if self._buffer is not None: - if pcm_chunk is not None: - pcm_chunk = np.concatenate([self._buffer, pcm_chunk]) - else: - pcm_chunk = self._buffer - self._buffer = None - - length = pcm_chunk.shape[0] - for index_block in range(0, length, self._blocksize): - if (length - index_block) < self._blocksize: - self._buffer = pcm_chunk[index_block: index_block + (length - index_block)] - else: - self._queue.put_nowait(pcm_chunk[index_block: index_block + self._blocksize]) - - def flush_and_terminate(self) -> None: - self.flush() - self.terminate() - - def flush(self) -> None: - if self._buffer is not None: - chunk = np.zeros(self._blocksize, dtype=np.int16) - chunk[:self._buffer.shape[0]] = self._buffer - self._queue.put_nowait(chunk) - - time_interval = self._blocksize / self._sample_rate - while not self._queue.empty(): - time.sleep(time_interval) - - time.sleep(time_interval) - - def terminate(self) -> None: - self._stream.stop() - self._stream.close() - - @staticmethod - def list_output_devices() -> Dict[str, Any]: - return query_devices(kind="output") - - def linux_machine() -> str: machine = platform.machine() if machine == "x86_64": @@ -159,19 +71,18 @@ class OrcaInput: def __init__( self, - play_audio_callback: Callable[[Sequence[int]], None], - access_key: str, + orca: Orca, + flush_audio_callback: Callable[[Sequence[int]], None], + play_audio_callback: Callable[[Sequence[int]], int], num_tokens_per_second: int, - model_path: Optional[str] = None, - library_path: Optional[str] = None, audio_wait_chunks: Optional[int] = None, ) -> None: - self._orca = pvorca.create(access_key=access_key, model_path=model_path, library_path=library_path) + self._orca = orca self._orca_stream = self._orca.stream_open() - self._sample_rate = self._orca.sample_rate self._play_audio_callback = play_audio_callback + self._flush_audio_callback = flush_audio_callback self._num_tokens_per_second = num_tokens_per_second assert self._num_tokens_per_second > 0 @@ -179,7 +90,7 @@ def __init__( self._thread = None self._time_first_audio_available = -1 - self._pcm_buffer: Queue[Sequence[int]] = Queue() + self._pcm_buffer = deque() self._wait_chunks = audio_wait_chunks or self._get_first_audio_wait_chunks() self._num_pcm_chunks_processed = 0 @@ -197,8 +108,6 @@ def _run(self) -> None: while True: orca_input = self._queue.get() if orca_input is None: - while not self._pcm_buffer.empty(): - self._play_audio_callback(self._pcm_buffer.get()) break try: @@ -210,18 +119,19 @@ def _run(self) -> None: raise ValueError(f"Orca could not synthesize text input `{orca_input.text}`: `{e}`") if pcm is not None: - if self._num_pcm_chunks_processed < self._wait_chunks: - self._pcm_buffer.put_nowait(pcm) - else: - while not self._pcm_buffer.empty(): - self._play_audio_callback(self._pcm_buffer.get()) - self._play_audio_callback(pcm) - if self._num_pcm_chunks_processed == 0: self._time_first_audio_available = time.time() - self._num_pcm_chunks_processed += 1 + self._pcm_buffer.append(pcm) + + if self._num_pcm_chunks_processed > self._wait_chunks: + if len(self._pcm_buffer) > 0: + pcm = self._pcm_buffer.popleft() + written = self._play_audio_callback(pcm) + if written < len(pcm): + self._pcm_buffer.appendleft(pcm[written:]) + def _close_thread_blocking(self): self._queue.put_nowait(None) self._thread.join() @@ -236,7 +146,12 @@ def synthesize(self, text: str) -> None: def flush(self) -> None: self._queue.put_nowait(self.OrcaInput(text="", flush=True)) self._close_thread_blocking() - self.start() + + def flush_audio(self) -> None: + remaining_pcm = list(chain.from_iterable(self._pcm_buffer)) + self._thread = threading.Thread(target=self._flush_audio_callback, args=(remaining_pcm,)) + self._thread.start() + self._thread.join() def delete(self) -> None: self._close_thread_blocking() @@ -246,14 +161,6 @@ def delete(self) -> None: def get_time_first_audio_available(self) -> float: return self._time_first_audio_available - @property - def sample_rate(self) -> int: - return self._sample_rate - - @property - def version(self) -> str: - return self._orca.version - def tokenize_text(text: str) -> Sequence[str]: text = re.sub(CUSTOM_PRON_PATTERN_NO_WHITESPACE, r'{\1} ', text) @@ -314,15 +221,26 @@ def main() -> None: type=int, default=None, help="Number of PCM chunks to wait before starting to play audio. Default: system-dependent.") + parser.add_argument( + "--buffer_size_secs", + type=int, + default=20, + help="The size in seconds of the internal buffer used by pvspeaker to play audio.") parser.add_argument( "--show_audio_devices", action="store_true", help="Only list available audio output devices and exit") - parser.add_argument('--audio-device-index', type=int, default=None, help='Index of input audio device') + parser.add_argument( + '--audio-device-index', + type=int, + default=-1, + help='Index of input audio device') args = parser.parse_args() if args.show_audio_devices: - print(StreamingAudioDevice.list_output_devices()) + devices = PvSpeaker.get_available_devices() + for i in range(len(devices)): + print("index: %d, device name: %s" % (i, devices[i])) exit(0) access_key = args.access_key @@ -331,38 +249,49 @@ def main() -> None: text = args.text_to_stream tokens_per_second = args.tokens_per_second audio_wait_chunks = args.audio_wait_chunks + buffer_size_secs = args.buffer_size_secs audio_device_index = args.audio_device_index + orca = pvorca.create(access_key=access_key, model_path=model_path, library_path=library_path) + + speaker = None try: - audio_device = StreamingAudioDevice(device_index=audio_device_index) - # Some systems may have issues with PortAudio only when starting the audio device. Test it here. - audio_device.start(sample_rate=16000) - audio_device.terminate() - play_audio_callback = audio_device.play - except PortAudioError: - print(traceback.format_exc()) + speaker = PvSpeaker( + sample_rate=orca.sample_rate, + bits_per_sample=16, + buffer_size_secs=buffer_size_secs, + device_index=audio_device_index) + speaker.start() + except RuntimeError or ValueError: print( - "WARNING: Failed to initialize audio device, see details above. Falling back to running " - "the demo without audio playback.\n") - audio_device = None + "\nWarning: Failed to initialize PvSpeaker. Orca will still generate PCM data, " + "but it will not be played.\n") + + def play_audio_callback(pcm: Sequence[int]) -> int: + try: + if speaker is not None: + return speaker.write(pcm) + return len(pcm) + except ValueError: + pass + return len(pcm) - # noinspection PyUnusedLocal - def play_audio_callback(pcm: Sequence[int]): + def flush_audio_callback(pcm: Sequence[int]) -> None: + try: + if speaker is not None: + speaker.flush(pcm) + except MemoryError: pass - orca = OrcaThread( + orca_thread = OrcaThread( + orca=orca, play_audio_callback=play_audio_callback, + flush_audio_callback=flush_audio_callback, num_tokens_per_second=tokens_per_second, - access_key=access_key, - model_path=model_path, - library_path=library_path, audio_wait_chunks=audio_wait_chunks, ) - orca.start() - if audio_device is not None: - audio_device.start(sample_rate=orca.sample_rate) - + orca_thread.start() try: print(f"Orca version: {orca.version}\n") @@ -373,26 +302,31 @@ def play_audio_callback(pcm: Sequence[int]): for token in tokens: print(f"{token}", end="", flush=True) - orca.synthesize(text=token) + orca_thread.synthesize(text=token) time.sleep(1 / tokens_per_second) text_stream_duration_seconds = time.time() - time_start_text_stream - orca.flush() - - first_audio_available_seconds = orca.get_time_first_audio_available() - time_start_text_stream + orca_thread.flush() + first_audio_available_seconds = orca_thread.get_time_first_audio_available() - time_start_text_stream print(f"\n\nTime to finish text stream: {text_stream_duration_seconds:.2f} seconds") print(f"Time to receive first audio: {first_audio_available_seconds:.2f} seconds after text stream started\n") - if audio_device is not None: + if speaker is not None: print("Waiting for audio to finish ...") - audio_device.flush_and_terminate() - + orca_thread.flush_audio() + + if speaker is not None: + speaker.delete() + except KeyboardInterrupt: + print("\nStopped...") + if speaker is not None: + speaker.stop() except OrcaActivationLimitError: - print("AccessKey has reached its processing limit") + print("\nAccessKey has reached its processing limit") finally: - orca.delete() + orca_thread.delete() if __name__ == "__main__": diff --git a/demo/python/requirements.txt b/demo/python/requirements.txt index a9a52a0c..695a008a 100644 --- a/demo/python/requirements.txt +++ b/demo/python/requirements.txt @@ -1,4 +1,4 @@ numpy>=1.24.0 pvorca==0.2.3 -sounddevice==0.4.6 +pvspeaker==1.0.2 tiktoken==0.6.0 diff --git a/demo/python/setup.py b/demo/python/setup.py index 7ce5120e..e310fa26 100644 --- a/demo/python/setup.py +++ b/demo/python/setup.py @@ -26,7 +26,7 @@ setuptools.setup( name="pvorcademo", - version="0.2.3", + version="0.2.4", author="Picovoice", author_email="hello@picovoice.ai", description="Orca Streaming Text-to-Speech Engine demos", @@ -34,7 +34,7 @@ long_description_content_type="text/markdown", url="https://github.com/Picovoice/orca", packages=["pvorcademo"], - install_requires=["numpy>=1.24.0", "pvorca==0.2.3", "sounddevice==0.4.6", "tiktoken==0.6.0"], + install_requires=["numpy>=1.24.0", "pvorca==0.2.3", "pvspeaker==1.0.2", "tiktoken==0.6.0"], include_package_data=True, classifiers=[ "Development Status :: 4 - Beta", diff --git a/resources/.lint/spell-check/dict.txt b/resources/.lint/spell-check/dict.txt index 7e77faff..4649d037 100644 --- a/resources/.lint/spell-check/dict.txt +++ b/resources/.lint/spell-check/dict.txt @@ -61,3 +61,6 @@ btns Btns pltf usleep +pvspeaker +popleft +appendleft \ No newline at end of file