From ed8b9f72e22dae6f1330d0a8656459e1352562f3 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Thu, 2 May 2024 12:37:17 -0700 Subject: [PATCH] tweaks --- .github/workflows/python-demo.yml | 2 +- binding/python/README.md | 25 +++++++++++++++---------- binding/python/setup.py | 2 +- demo/python/orca_demo_streaming.py | 2 +- 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/.github/workflows/python-demo.yml b/.github/workflows/python-demo.yml index 0ccdb460..bdbe0963 100644 --- a/.github/workflows/python-demo.yml +++ b/.github/workflows/python-demo.yml @@ -32,7 +32,7 @@ jobs: install_dep: sudo apt install libportaudio2 - os: windows-latest - os: macos-latest - install_dep: brew install portaudio + install_dep: brew update && brew install portaudio --HEAD steps: - uses: actions/checkout@v3 diff --git a/binding/python/README.md b/binding/python/README.md index 5cbbe20a..895de915 100644 --- a/binding/python/README.md +++ b/binding/python/README.md @@ -35,7 +35,7 @@ Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get you Orca supports two modes of operation: streaming and single synthesis. In the streaming synthesis mode, Orca processes an incoming text stream in real-time and generates audio in parallel. -In the single synthesis mode, the complete text needs to be known in advance and is synthesized in a single call to the Orca engine. +In the single synthesis mode, a complete text is synthesized in a single call to the Orca engine. Create an instance of the Orca engine: @@ -55,16 +55,18 @@ stream = orca.open_stream() for text_chunk in text_generator(): pcm = stream.synthesize(text_chunk) if pcm is not None: - # handle pcm +# handle pcm pcm = stream.flush() if pcm is not None: - # handle pcm +# handle pcm ``` The `text_generator()` function can be any stream generating text, for example an LLM response. -Orca produces audio chunks in parallel to the LLM, and returns the raw PCM whenever enough context has been added via `stream.synthesize()`. -The `stream.synthesize()` function returns an audio chunk that only includes the audio for a portion of the text that has been added. +Orca produces audio chunks in parallel to the incoming text stream, and returns the raw PCM whenever enough context has +been added via `stream.synthesize()`. +To ensure smooth transitions between chunks, the `stream.synthesize()` function returns an audio chunk that only +includes the audio for a portion of the text that has been added. To generate the audio for the remaining text, `stream.flush()` needs to be invoked. When done with streaming text synthesis, the `Orca.Stream` object needs to be closed: @@ -72,7 +74,8 @@ When done with streaming text synthesis, the `Orca.Stream` object needs to be cl stream.close() ``` -If the complete text is known before synthesis, single synthesis mode can be used to generate speech in a single call to Orca: +If the complete text is known before synthesis, single synthesis mode can be used to generate speech in a single call to +Orca: ```python # Return raw PCM @@ -84,8 +87,9 @@ alignments = orca.synthesize_to_file(text='${TEXT}', path='${OUTPUT_PATH}') Replace `${TEXT}` with the text to be synthesized and `${OUTPUT_PATH}` with the path to save the generated audio as a single-channel 16-bit PCM WAV file. -In single synthesis mode, Orca returns metadata of the synthesized audio in the form of a list of `Orca.WordAlignment` objects. -To print the metadata run: +In single synthesis mode, Orca returns metadata of the synthesized audio in the form of a list of `Orca.WordAlignment` +objects. +You can print the metadata with: ```python for word in alignments: @@ -94,7 +98,7 @@ for word in alignments: print(f"\tphoneme=\"{phoneme.phoneme}\", start_sec={phoneme.start_sec:.2f}, end_sec={phoneme.end_sec:.2f}") ``` -When done make sure to explicitly release the resources with: +When done make sure to explicitly release the resources using: ```python orca.delete() @@ -131,7 +135,8 @@ and replace `${MODEL_PATH}` with the path to the model file with the desired voi ### Speech control -Orca allows for keyword arguments to be provided to the `open_stream` method or the single `synthesize` methods to control the synthesized speech: +Orca allows for keyword arguments to be provided to the `open_stream` method or the single `synthesize` methods to +control the synthesized speech: - `speech_rate`: Controls the speed of the generated speech. Valid values are within [0.7, 1.3]. A higher (lower) value produces speech that is faster (slower). The default is `1.0`. diff --git a/binding/python/setup.py b/binding/python/setup.py index 92c73b84..8ecc8d7b 100644 --- a/binding/python/setup.py +++ b/binding/python/setup.py @@ -66,6 +66,6 @@ "Programming Language :: Python :: 3", "Topic :: Multimedia :: Sound/Audio :: Speech", ], - python_requires='>=3.7', + python_requires='>=3.8', keywords="Text-to-Speech, TTS, Speech Synthesis, Voice Generation, Speech Engine", ) diff --git a/demo/python/orca_demo_streaming.py b/demo/python/orca_demo_streaming.py index b7358097..faede129 100644 --- a/demo/python/orca_demo_streaming.py +++ b/demo/python/orca_demo_streaming.py @@ -145,7 +145,7 @@ def play_audio_callback(pcm: Sequence[int]): "--tokens-per-second", type=int, default=15, - help="Number of tokens to be streamed per second to Orca, simulating an LLM response.") + help="Number of tokens per second to be streamed to Orca, simulating an LLM response.") parser.add_argument( "--audio-wait-chunks", type=int,