
Commit

update
albho committed May 7, 2024
1 parent b4c91e0 commit 289154d
Showing 3 changed files with 82 additions and 68 deletions.
43 changes: 27 additions & 16 deletions binding/android/Orca/orca/src/main/java/ai/picovoice/orca/Orca.java
@@ -35,6 +35,9 @@ public class Orca {

private long handle;

/**
* OrcaStream object that converts a stream of text to a stream of audio.
*/
public class OrcaStream {
private long stream;

@@ -43,14 +46,19 @@ public OrcaStream(long stream) {
}

/**
* Generates audio from a live stream of text. The returned audio contains the speech representation of the text.
* Adds a chunk of text to the Stream object and generates audio if enough text has been added.
* This function is expected to be called multiple times with consecutive chunks of text from a text stream.
* The incoming text is buffered as it arrives until there is enough context to convert a chunk of the
* buffered text into audio. The caller needs to use `OrcaStream.flush()` to generate the audio chunk
* for the remaining text that has not yet been synthesized.
*
* @param text Text to be converted to audio. The maximum length can be attained by calling
* `getMaxCharacterLimit()`. Allowed characters can be retrieved by calling
* `getValidCharacters()`. Custom pronunciations can be embedded in the text via the
* syntax `{word|pronunciation}`. The pronunciation is expressed in ARPAbet format,
* e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`.
* @return The output audio. If none is available, null is returned.
* @param text A chunk of text from a text input stream, consisting of valid characters.
* Valid characters can be retrieved by calling `.getValidCharacters()`.
* Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`.
* They need to be added in a single call to this function.
* The pronunciation is expressed in ARPAbet format, e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`.
* @return The generated audio as a sequence of 16-bit linearly-encoded integers, or `null` if no
* audio chunk has been produced.
* @throws OrcaException if there is an error while synthesizing audio.
*/
public short[] synthesize(String text) throws OrcaException {
@@ -59,6 +67,7 @@ public short[] synthesize(String text) throws OrcaException {
"Attempted to call OrcaStream synthesize after delete."
);
}

if (stream == 0) {
throw new OrcaInvalidStateException(
"Attempted to call OrcaStream synthesize without an open stream."
@@ -71,17 +80,20 @@ public short[] synthesize(String text) throws OrcaException {
}

/**
* Flushes remaining text. The returned audio contains the speech representation of the text.
* Generates audio for all the buffered text that was added to the OrcaStream object
* via `OrcaStream.synthesize()`.
*
* @return Any remaining output audio. If none is available, null is returned.
* @throws OrcaException if there is an error while synthesizing audio.
* @return The generated audio as a sequence of 16-bit linearly-encoded integers, or `null` if no
* audio chunk has been produced.
* @throws OrcaException if there is an error while flushing audio.
*/
public short[] flush() throws OrcaException {
if (handle == 0) {
throw new OrcaInvalidStateException(
"Attempted to call OrcaStream flush after delete."
);
}

if (stream == 0) {
throw new OrcaInvalidStateException(
"Attempted to call OrcaStream flush without an open stream."
@@ -94,7 +106,7 @@ public short[] flush() throws OrcaException {
}

/**
* Deletes OrcaStream.
* Releases the resources acquired by the OrcaStream object.
*/
public void close() {
if (handle != 0 && stream != 0) {
@@ -162,7 +174,8 @@ public void delete() {
* syntax `{word|pronunciation}`. The pronunciation is expressed in ARPAbet format,
* e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`.
* @param params Global parameters for synthesized text. See 'OrcaSynthesizeParams' for details.
* @return The output audio and alignments data.
* @return An object containing the generated audio as a sequence of 16-bit linearly-encoded integers
* and an array of OrcaWord objects representing the word alignments.
* @throws OrcaException if there is an error while synthesizing audio.
*/
public OrcaAudio synthesize(String text, OrcaSynthesizeParams params) throws OrcaException {
@@ -191,7 +204,7 @@ public OrcaAudio synthesize(String text, OrcaSynthesizeParams params) throws OrcaException {
* @param outputPath Absolute path to the output audio file. The output file is saved as
* `WAV (.wav)` and consists of a single mono channel.
* @param params Global parameters for synthesized text. See 'OrcaSynthesizeParams' for details.
* @return The alignments data.
* @return An array of OrcaWord objects representing the word alignments.
* @throws OrcaException if there is an error while synthesizing audio to file.
*/
public OrcaWord[] synthesizeToFile(
@@ -215,11 +228,9 @@ public OrcaWord[] synthesizeToFile(
}

/**
* Opens an OrcaStream object for streaming synthesis of text.
*
* @param params Global parameters for synthesized text. See 'OrcaSynthesizeParams' for details.
* @return OrcaStream object.
* @throws OrcaException if there is an error while synthesizing audio.
* @throws OrcaException if there is an error while opening OrcaStream.
*/
public OrcaStream streamOpen(OrcaSynthesizeParams params) throws OrcaException {
if (handle == 0) {
90 changes: 46 additions & 44 deletions binding/android/README.md
@@ -46,6 +46,10 @@ To enable AccessKey validation, you must add the following line to your `Android

## Usage

Orca supports two modes of operation: streaming and single synthesis. In the streaming synthesis mode, Orca processes an
incoming text stream in real-time and generates audio in parallel. In the single synthesis mode, a complete text is
synthesized in a single call to the Orca engine.

Create an instance of the engine with the Orca Builder class by passing in the accessKey, modelPath and Android app
context:

@@ -62,77 +66,72 @@ try {
} catch (OrcaException ex) { }
```

### Streaming vs. Single Synthesis

Orca supports two modes of operation: streaming and single synthesis.
In the streaming synthesis mode, Orca processes an incoming text stream in real-time and generates audio in parallel.
In the single synthesis mode, the complete text needs to be known in advance and is synthesized in a single call to the
Orca engine.

#### Streaming Synthesis

To use streaming synthesis, call `streamOpen` to create an `OrcaStream` object.
To synthesize a text stream, create an `OrcaStream` object and add text to it one-by-one:

```java
Orca.OrcaStream orcaStream = orca.streamOpen(new OrcaSynthesizeParams.Builder().build());
```

Then, call `synthesize` on `orcaStream` to generate speech for a live stream of text:

```java
String textStream = "${TEXT}";
String[] words = textStream.split(" ");

for (String word : words) {
short[] pcm = orcaStream.synthesize(word + " ");
for (String textChunk : textGenerator()) {
short[] pcm = orcaStream.synthesize(textChunk);
if (pcm != null) {
// handle pcm
}
}
```

`OrcaStream` buffers input text until there is enough to generate audio. If there is not enough text to generate
audio, `null` is returned.

When done, call `flush` to synthesize any remaining text, and `close` to delete the `OrcaStream` object.

```java
short[] flushedPcm = orcaStream.flush();
if (flushedPcm != null) {
// handle pcm
}
```

The `textGenerator()` function can be any source of streaming text, for example an LLM response.
Orca produces audio chunks in parallel with the incoming text stream and returns the raw PCM whenever enough context
has been added via `orcaStream.synthesize()`.
To ensure smooth transitions between chunks, `orcaStream.synthesize()` returns audio only for the portion of the added
text that can already be synthesized.
To generate the audio for the remaining text, invoke `orcaStream.flush()`.
When done with streaming text synthesis, close the `OrcaStream` object (an end-to-end sketch follows the snippet below):

```java
orcaStream.close();
```
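
Putting the pieces together, a minimal end-to-end streaming sketch might look as follows. The `textGenerator()` helper
and the surrounding class are illustrative stand-ins for a real text source (e.g. chunks arriving from an LLM response):

```java
import ai.picovoice.orca.*;

import java.util.Arrays;
import java.util.List;

class StreamingSketch {
    // Stand-in for a real text stream, e.g. chunks arriving from an LLM response.
    static List<String> textGenerator() {
        return Arrays.asList("Hello, ", "this is ", "streaming ", "synthesis.");
    }

    static void run(Orca orca) throws OrcaException {
        Orca.OrcaStream orcaStream = orca.streamOpen(new OrcaSynthesizeParams.Builder().build());
        try {
            for (String textChunk : textGenerator()) {
                short[] pcm = orcaStream.synthesize(textChunk);
                if (pcm != null) {
                    // enough context was buffered: forward this PCM chunk to an audio sink
                }
            }
            short[] flushedPcm = orcaStream.flush();
            if (flushedPcm != null) {
                // audio for the remaining buffered text
            }
        } finally {
            orcaStream.close();
        }
    }
}
```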

#### Single Synthesis

To use single synthesis, simply call one of the available `synthesize` methods directly on the `Orca` instance.
The `synthesize` method will send
the text to the engine and return the speech audio as a `short[]`. The `synthesizeToFile` method will write the `pcm`
data directly to a specified wav file.
If the complete text is known before synthesis, single synthesis mode can be used to generate speech in a single call to
Orca:

```java
OrcaSynthesizeParams params = new OrcaSynthesizeParams.Builder().build();

// Return raw PCM
short[] pcm = orca.synthesize("${TEXT}", params);
// Return raw PCM and alignments
OrcaAudio audio = orca.synthesize("${TEXT}", params);

// Save the generated audio to a WAV file directly
orca.synthesizeToFile("${TEXT}", "${OUTPUT_PATH}", params);
OrcaWord[] orcaWords = orca.synthesizeToFile("${TEXT}", "${OUTPUT_PATH}", params);
```

Replace `${TEXT}` with the text to be synthesized (must be fewer characters than `.getMaxCharacterLimit()`). When
using `synthesize`, the generated pcm has a sample rate equal to the one returned by `.getSampleRate()`. When
using `synthesizeToFile`, replace `${OUTPUT_PATH}` with the path to save the generated audio as a single-channel 16-bit
PCM WAV file. When done make sure to explicitly release the resources with `.delete()`.
Replace `${TEXT}` with the text to be synthesized and `${OUTPUT_PATH}` with the path to save the generated audio as a
single-channel 16-bit PCM WAV file.
In single synthesis mode, Orca returns metadata of the synthesized audio in the form of an array of `OrcaWord`
objects.
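
As a rough sketch of handling the returned data, the snippet below checks the character limit, synthesizes, and reads
back the audio and alignments. The accessor names `getPcm()` and `getWordArray()` on `OrcaAudio` are assumptions here;
consult the binding for the exact API:

```java
OrcaSynthesizeParams params = new OrcaSynthesizeParams.Builder().build();

String text = "Hello from Orca.";
if (text.length() >= orca.getMaxCharacterLimit()) {
    throw new IllegalArgumentException("Text exceeds Orca's character limit.");
}

OrcaAudio audio = orca.synthesize(text, params);

int sampleRate = orca.getSampleRate();        // rate of the generated PCM
short[] pcm = audio.getPcm();                 // assumed accessor
OrcaWord[] alignments = audio.getWordArray(); // assumed accessor

// e.g. play `pcm` through an audio sink configured with `sampleRate`,
// and inspect `alignments` for word-level timing metadata.
```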

When done, make sure to explicitly release the resources using:

```java
orca.delete();
```

### Text input

Orca accepts the 26 lowercase (a-z) and 26 uppercase (A-Z) letters of the English alphabet, numbers,
basic symbols, as well as common punctuation marks. You can get a list of all supported characters by calling the
`getValidCharacters()` method provided in the Orca SDK you are using.
Pronunciations of characters or words not supported by this list can be achieved with
[custom pronunciations](#custom-pronunciations).
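
For example, a rough pre-flight check could compare the input against the supported character set. This sketch assumes
`getValidCharacters()` returns an array of single-character strings, which may differ from the actual binding:

```java
java.util.Set<String> valid = new java.util.HashSet<>(
        java.util.Arrays.asList(orca.getValidCharacters()));

String text = "Hello, world!";
for (char c : text.toCharArray()) {
    if (!valid.contains(String.valueOf(c))) {
        // character not in the supported set; consider a custom pronunciation
    }
}
```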

### Text Input
### Custom pronunciations

Orca accepts any character found in the list returned by the `getValidCharacters()` method.
Pronunciations of characters or words not supported by this list can be achieved by embedding custom pronunciations in
the text via the syntax: `{word|pronunciation}`. The pronunciation is expressed
in [ARPAbet](https://en.wikipedia.org/wiki/ARPABET) phonemes, for example:
Orca allows embedding custom pronunciations in the text via the syntax: `{word|pronunciation}`.\
The pronunciation is expressed in [ARPAbet](https://en.wikipedia.org/wiki/ARPABET) phonemes, for example:

- "This is a {custom|K AH S T AH M} pronunciation"
- "{read|R IY D} this as {read|R EH D}, please."
@@ -159,11 +158,14 @@ import ai.picovoice.orca.*;

OrcaSynthesizeParams params = new OrcaSynthesizeParams.Builder()
.setSpeechRate(1.2f)
.setRandomState(1)
.build();
```

- `setSpeechRate()`: Controls the speed of the generated speech. Valid values are within [0.7, 1.3]. A higher value
produces speech that is faster. The default is `1.0`.
- `setRandomState()`: Sets the random state for sampling during synthesis. This can be used to ensure that the
synthesized speech is deterministic across different runs.
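
As a small illustration of this deterministic behavior (assuming `getPcm()` as the accessor for the raw audio on
`OrcaAudio`), synthesizing the same text twice with the same random state is expected to yield identical PCM:

```java
OrcaSynthesizeParams fixedSeed = new OrcaSynthesizeParams.Builder()
        .setRandomState(1)
        .build();

OrcaAudio first = orca.synthesize("Same text, same seed.", fixedSeed);
OrcaAudio second = orca.synthesize("Same text, same seed.", fixedSeed);

// With an identical random state, the two results should match sample-for-sample.
boolean identical = java.util.Arrays.equals(first.getPcm(), second.getPcm());
```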

### Alignment Metadata

@@ -1,12 +1,13 @@
<?xml version="1.0" encoding="utf-8"?>
<androidx.constraintlayout.widget.ConstraintLayout xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:app="http://schemas.android.com/apk/res-auto"
xmlns:tools="http://schemas.android.com/tools"
android:id="@+id/layout"
android:layout_width="match_parent"
android:layout_height="match_parent"
android:foregroundTint="#002A1F65"
tools:context=".MainActivity">
<androidx.constraintlayout.widget.ConstraintLayout
xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:app="http://schemas.android.com/apk/res-auto"
xmlns:tools="http://schemas.android.com/tools"
android:id="@+id/layout"
android:layout_width="match_parent"
android:layout_height="match_parent"
android:foregroundTint="#002A1F65"
tools:context=".MainActivity">

<ToggleButton
android:id="@+id/synthesizeButton"
