
Commit

update
albho committed May 7, 2024
1 parent b4c91e0 commit 289154d
Showing 3 changed files with 82 additions and 68 deletions.
43 changes: 27 additions & 16 deletions binding/android/Orca/orca/src/main/java/ai/picovoice/orca/Orca.java
@@ -35,6 +35,9 @@ public class Orca {

private long handle;

/**
* OrcaStream object that converts a stream of text to a stream of audio.
*/
public class OrcaStream {
private long stream;

@@ -43,14 +46,19 @@ public OrcaStream(long stream) {
}

/**
* Generates audio from a live stream of text. The returned audio contains the speech representation of the text.
* Adds a chunk of text to the Stream object and generates audio if enough text has been added.
* This function is expected to be called multiple times with consecutive chunks of text from a text stream.
* The incoming text is buffered as it arrives until there is enough context to convert a chunk of the
* buffered text into audio. The caller needs to use `OrcaStream.flush()` to generate the audio chunk
* for the remaining text that has not yet been synthesized.
*
* @param text Text to be converted to audio. The maximum length can be attained by calling
* `getMaxCharacterLimit()`. Allowed characters can be retrieved by calling
* `getValidCharacters()`. Custom pronunciations can be embedded in the text via the
* syntax `{word|pronunciation}`. The pronunciation is expressed in ARPAbet format,
* e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`.
* @return The output audio. If none is available, null is returned.
* @param text A chunk of text from a text input stream, consisting of valid characters.
* Valid characters can be retrieved by calling `.getValidCharacters()`.
* Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`.
* They need to be added in a single call to this function.
* The pronunciation is expressed in ARPAbet format, e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`.
* @return The generated audio as a sequence of 16-bit linearly-encoded integers, or `null` if no
* audio chunk has been produced.
* @throws OrcaException if there is an error while synthesizing audio.
*/
public short[] synthesize(String text) throws OrcaException {
@@ -59,6 +67,7 @@ public short[] synthesize(String text) throws OrcaException {
"Attempted to call OrcaStream synthesize after delete."
);
}

if (stream == 0) {
throw new OrcaInvalidStateException(
"Attempted to call OrcaStream synthesize without an open stream."
@@ -71,17 +80,20 @@ public short[] synthesize(String text) throws OrcaException {
}

/**
* Flushes remaining text. The returned audio contains the speech representation of the text.
* Generates audio for all the buffered text that was added to the OrcaStream object
* via `OrcaStream.synthesize()`.
*
* @return Any remaining output audio. If none is available, null is returned.
* @throws OrcaException if there is an error while synthesizing audio.
* @return The generated audio as a sequence of 16-bit linearly-encoded integers, or `null` if no
* audio chunk has been produced.
* @throws OrcaException if there is an error while flushing audio.
*/
public short[] flush() throws OrcaException {
if (handle == 0) {
throw new OrcaInvalidStateException(
"Attempted to call OrcaStream flush after delete."
);
}

if (stream == 0) {
throw new OrcaInvalidStateException(
"Attempted to call OrcaStream flush without an open stream."
@@ -94,7 +106,7 @@ public short[] flush() throws OrcaException {
}

/**
* Deletes OrcaStream.
* Releases the resources acquired by the OrcaStream object.
*/
public void close() {
if (handle != 0 && stream != 0) {
@@ -162,7 +174,8 @@ public void delete() {
* syntax `{word|pronunciation}`. The pronunciation is expressed in ARPAbet format,
* e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`.
* @param params Global parameters for synthesized text. See 'OrcaSynthesizeParams' for details.
* @return The output audio and alignments data.
* @return An object containing the generated audio as a sequence of 16-bit linearly-encoded integers
* and an array of OrcaWord objects representing the word alignments.
* @throws OrcaException if there is an error while synthesizing audio.
*/
public OrcaAudio synthesize(String text, OrcaSynthesizeParams params) throws OrcaException {
@@ -191,7 +204,7 @@ public OrcaAudio synthesize(String text, OrcaSynthesizeParams params) throws OrcaException {
* @param outputPath Absolute path to the output audio file. The output file is saved as
* `WAV (.wav)` and consists of a single mono channel.
* @param params Global parameters for synthesized text. See 'OrcaSynthesizeParams' for details.
* @return The alignments data.
* @return An array of OrcaWord objects representing the word alignments.
* @throws OrcaException if there is an error while synthesizing audio to file.
*/
public OrcaWord[] synthesizeToFile(
@@ -215,11 +228,9 @@ public OrcaWord[] synthesizeToFile(
}

/**
* Opens an OrcaStream object for streaming synthesis of text.
*
* @param params Global parameters for synthesized text. See 'OrcaSynthesizeParams' for details.
* @return OrcaStream object.
* @throws OrcaException if there is an error while synthesizing audio.
* @throws OrcaException if there is an error while opening OrcaStream.
*/
public OrcaStream streamOpen(OrcaSynthesizeParams params) throws OrcaException {
if (handle == 0) {
90 changes: 46 additions & 44 deletions binding/android/README.md
@@ -46,6 +46,10 @@ To enable AccessKey validation, you must add the following line to your `Android

## Usage

Orca supports two modes of operation: streaming and single synthesis. In the streaming synthesis mode, Orca processes an
incoming text stream in real-time and generates audio in parallel. In the single synthesis mode, a complete text is
synthesized in a single call to the Orca engine.

Create an instance of the engine with the Orca Builder class by passing in the accessKey, modelPath and Android app
context:

@@ -62,77 +66,72 @@ try {
} catch (OrcaException ex) { }
```

### Streaming vs. Single Synthesis

Orca supports two modes of operation: streaming and single synthesis.
In the streaming synthesis mode, Orca processes an incoming text stream in real-time and generates audio in parallel.
In the single synthesis mode, the complete text needs to be known in advance and is synthesized in a single call to the
Orca engine.

#### Streaming Synthesis

To use streaming synthesis, call `streamOpen` to create an `OrcaStream` object.
To synthesize a text stream, create an `OrcaStream` object and add text to it one-by-one:

```java
Orca.OrcaStream orcaStream = orca.streamOpen(new OrcaSynthesizeParams.Builder().build());
```

Then, call `synthesize` on `orcaStream` to generate speech for a live stream of text:

```java
String textStream = "${TEXT}";
String[] words = textStream.split(" ");

for (String word : words) {
short[] pcm = orcaStream.synthesize(word + " ");
for (String textChunk : textGenerator()) {
short[] pcm = orcaStream.synthesize(textChunk);
if (pcm != null) {
// handle pcm
}
}
```

`OrcaStream` buffers input text until there is enough to generate audio. If there is not enough text to generate
audio, `null` is returned.

When done, call `flush` to synthesize any remaining text, and `close` to delete the `OrcaStream` object.

```java
short[] flushedPcm = orcaStream.flush();
if (flushedPcm != null) {
// handle pcm
}
```

The `textGenerator()` function can be any source of streaming text, for example an LLM response.
Orca produces audio chunks in parallel with the incoming text stream and returns the raw PCM whenever enough context
has been added via `orcaStream.synthesize()`.
To ensure smooth transitions between chunks, `orcaStream.synthesize()` returns audio only for the portion of the added
text that can already be synthesized.
To generate the audio for the remaining text, invoke `orcaStream.flush()`.
When done with streaming text synthesis, close the `OrcaStream` object (an end-to-end sketch follows the snippet below):

```java
orcaStream.close();
```
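
Putting the pieces together, a minimal end-to-end streaming sketch might look as follows. The `textGenerator()` helper
and the surrounding class are illustrative stand-ins for a real text source (e.g. chunks arriving from an LLM response):

```java
import ai.picovoice.orca.*;

import java.util.Arrays;
import java.util.List;

class StreamingSketch {
    // Stand-in for a real text stream, e.g. chunks arriving from an LLM response.
    static List<String> textGenerator() {
        return Arrays.asList("Hello, ", "this is ", "streaming ", "synthesis.");
    }

    static void run(Orca orca) throws OrcaException {
        Orca.OrcaStream orcaStream = orca.streamOpen(new OrcaSynthesizeParams.Builder().build());
        try {
            for (String textChunk : textGenerator()) {
                short[] pcm = orcaStream.synthesize(textChunk);
                if (pcm != null) {
                    // enough context was buffered: forward this PCM chunk to an audio sink
                }
            }
            short[] flushedPcm = orcaStream.flush();
            if (flushedPcm != null) {
                // audio for the remaining buffered text
            }
        } finally {
            orcaStream.close();
        }
    }
}
```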

#### Single Synthesis

To use single synthesis, simply call one of the available `synthesize` methods directly on the `Orca` instance.
The `synthesize` method will send
the text to the engine and return the speech audio as a `short[]`. The `synthesizeToFile` method will write the `pcm`
data directly to a specified wav file.
If the complete text is known before synthesis, single synthesis mode can be used to generate speech in a single call to
Orca:

```java
OrcaSynthesizeParams params = new OrcaSynthesizeParams.Builder().build();

// Return raw PCM
short[] pcm = orca.synthesize("${TEXT}", params);
// Return raw PCM and alignments
OrcaAudio audio = orca.synthesize("${TEXT}", params);

// Save the generated audio to a WAV file directly
orca.synthesizeToFile("${TEXT}", "${OUTPUT_PATH}", params);
OrcaWord[] orcaWords = orca.synthesizeToFile("${TEXT}", "${OUTPUT_PATH}", params);
```

Replace `${TEXT}` with the text to be synthesized (must be fewer characters than `.getMaxCharacterLimit()`). When
using `synthesize`, the generated pcm has a sample rate equal to the one returned by `.getSampleRate()`. When
using `synthesizeToFile`, replace `${OUTPUT_PATH}` with the path to save the generated audio as a single-channel 16-bit
PCM WAV file. When done make sure to explicitly release the resources with `.delete()`.
Replace `${TEXT}` with the text to be synthesized and `${OUTPUT_PATH}` with the path to save the generated audio as a
single-channel 16-bit PCM WAV file.
In single synthesis mode, Orca returns metadata of the synthesized audio in the form of an array of `OrcaWord`
objects.
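
As a rough sketch of handling the returned data, the snippet below checks the character limit, synthesizes, and reads
back the audio and alignments. The accessor names `getPcm()` and `getWordArray()` on `OrcaAudio` are assumptions here;
consult the binding for the exact API:

```java
OrcaSynthesizeParams params = new OrcaSynthesizeParams.Builder().build();

String text = "Hello from Orca.";
if (text.length() >= orca.getMaxCharacterLimit()) {
    throw new IllegalArgumentException("Text exceeds Orca's character limit.");
}

OrcaAudio audio = orca.synthesize(text, params);

int sampleRate = orca.getSampleRate();        // rate of the generated PCM
short[] pcm = audio.getPcm();                 // assumed accessor
OrcaWord[] alignments = audio.getWordArray(); // assumed accessor

// e.g. play `pcm` through an audio sink configured with `sampleRate`,
// and inspect `alignments` for word-level timing metadata.
```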

When done, make sure to explicitly release the resources using:

```java
orca.delete();
```

### Text input

Orca accepts the 26 lowercase (a-z) and 26 uppercase (A-Z) letters of the English alphabet, numbers,
basic symbols, as well as common punctuation marks. You can get a list of all supported characters by calling the
`getValidCharacters()` method provided in the Orca SDK you are using.
Pronunciations of characters or words not supported by this list can be achieved with
[custom pronunciations](#custom-pronunciations).
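
For example, a rough pre-flight check could compare the input against the supported character set. This sketch assumes
`getValidCharacters()` returns an array of single-character strings, which may differ from the actual binding:

```java
java.util.Set<String> valid = new java.util.HashSet<>(
        java.util.Arrays.asList(orca.getValidCharacters()));

String text = "Hello, world!";
for (char c : text.toCharArray()) {
    if (!valid.contains(String.valueOf(c))) {
        // character not in the supported set; consider a custom pronunciation
    }
}
```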

### Text Input
### Custom pronunciations

Orca accepts any character found in the list returned by the `getValidCharacters()` method.
Pronunciations of characters or words not supported by this list can be achieved by embedding custom pronunciations in
the text via the syntax: `{word|pronunciation}`. The pronunciation is expressed
in [ARPAbet](https://en.wikipedia.org/wiki/ARPABET) phonemes, for example:
Orca allows embedding custom pronunciations in the text via the syntax: `{word|pronunciation}`.\
The pronunciation is expressed in [ARPAbet](https://en.wikipedia.org/wiki/ARPABET) phonemes, for example:

- "This is a {custom|K AH S T AH M} pronunciation"
- "{read|R IY D} this as {read|R EH D}, please."
@@ -159,11 +158,14 @@ import ai.picovoice.orca.*;

OrcaSynthesizeParams params = new OrcaSynthesizeParams.Builder()
.setSpeechRate(1.2f)
.setRandomState(1)
.build();
```

- `setSpeechRate()`: Controls the speed of the generated speech. Valid values are within [0.7, 1.3]. A higher value
produces speech that is faster. The default is `1.0`.
- `setRandomState()`: Sets the random state for sampling during synthesis. This can be used to ensure that the
synthesized speech is deterministic across different runs.
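
As a small illustration of this deterministic behavior (assuming `getPcm()` as the accessor for the raw audio on
`OrcaAudio`), synthesizing the same text twice with the same random state is expected to yield identical PCM:

```java
OrcaSynthesizeParams fixedSeed = new OrcaSynthesizeParams.Builder()
        .setRandomState(1)
        .build();

OrcaAudio first = orca.synthesize("Same text, same seed.", fixedSeed);
OrcaAudio second = orca.synthesize("Same text, same seed.", fixedSeed);

// With an identical random state, the two results should match sample-for-sample.
boolean identical = java.util.Arrays.equals(first.getPcm(), second.getPcm());
```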

### Alignment Metadata

@@ -1,12 +1,13 @@
<?xml version="1.0" encoding="utf-8"?>
<androidx.constraintlayout.widget.ConstraintLayout xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:app="http://schemas.android.com/apk/res-auto"
xmlns:tools="http://schemas.android.com/tools"
android:id="@+id/layout"
android:layout_width="match_parent"
android:layout_height="match_parent"
android:foregroundTint="#002A1F65"
tools:context=".MainActivity">
<androidx.constraintlayout.widget.ConstraintLayout
xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:app="http://schemas.android.com/apk/res-auto"
xmlns:tools="http://schemas.android.com/tools"
android:id="@+id/layout"
android:layout_width="match_parent"
android:layout_height="match_parent"
android:foregroundTint="#002A1F65"
tools:context=".MainActivity">

<ToggleButton
android:id="@+id/synthesizeButton"
