update demos

Picovoice · May 2, 2024 · b97651b · b97651b
1 parent 2347cc2
commit b97651b
Show file tree

Hide file tree

Showing 17 changed files with 370 additions and 173 deletions.
diff --git a/.github/workflows/python-demo.yml b/.github/workflows/python-demo.yml
@@ -48,7 +48,10 @@ jobs:
       - name: Build dependencies
         run: |
           python -m pip install -U pip setuptools
-          pip install wheel && cd ../../binding/python && python3 setup.py sdist bdist_wheel && pip install dist/pvorca-0.1.4-py3-none-any.whl
+          pip install wheel
+          cd ../../binding/python
+          python3 setup.py sdist bdist_wheel
+          python3 -m pip install dist/pvorca-0.1.4-py3-none-any.whl
 
       - name: Install dependencies
         run: |
@@ -81,7 +84,10 @@ jobs:
       # TODO: remove after release
       - name: Build dependencies
         run: |
-          pip install wheel && cd ../../binding/python && python3 setup.py sdist bdist_wheel && pip install --force-reinstall dist/pvorca-0.1.4-py3-none-any.whl
+          pip install wheel 
+          cd ../../binding/python
+          python3 setup.py sdist bdist_wheel
+          python3 -m pip install --force-reinstall dist/pvorca-0.1.4-py3-none-any.whl
 
       - name: Install dependencies
         run: pip3 install -r requirements.txt

diff --git a/README.md b/README.md
@@ -213,6 +213,9 @@ orca = pvorca.create(access_key='${ACCESS_KEY}')
 ```
 
 Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/).
+
+#### Streaming synthesis
+
 To synthesize a text stream, create an Orca Stream object and add text to it one-by-one:
 
 ```python
@@ -235,6 +238,8 @@ When done with streaming text synthesis, the stream object needs to be closed:
 stream.close()
 ```
 
+#### Single synthesis
+
 Use single synthesis mode if the complete text is known in advance:
 
 ```python
@@ -284,9 +289,9 @@ The header file [include/pv_orca.h](./include/pv_orca.h) contains relevant infor
 Build an instance of the object:
 
 ```c
-pv_orca_t *handle = NULL;
+pv_orca_t *orca = NULL;
 const char *model_path = "${MODEL_PATH}";
-pv_status_t status = pv_orca_init("${ACCESS_KEY}", model_path, &handle);
+pv_status_t status = pv_orca_init("${ACCESS_KEY}", model_path, &orca);
 if (status != PV_STATUS_SUCCESS) {
     // error handling logic
 }
@@ -303,15 +308,73 @@ status = pv_orca_synthesize_params_init(&synthesize_params);
 // change the default parameters of synthesize_params as desired
 ```
 
-Now, the `handle` and `synthesize_params` object can be used to synthesize speech:
+#### Streaming synthesis
+
+To synthesize a text stream, create an `orca_stream` object using the `synthesize_params`:
+
+```c
+pv_orca_stream_t *orca_stream = NULL;
+status = pv_orca_stream_open(orca, synthesize_params, &orca_stream);
+if (status != PV_STATUS_SUCCESS) {
+    // error handling logic
+}
+```
+
+Add text to `orca_stream` one-by-one and handle the synthesized audio:
+
+```c
+extern char *get_next_text_chunk(void);
+
+int32_t num_samples_chunk = 0;
+int16_t *pcm_chunk = NULL;
+status = pv_orca_stream_synthesize(
+    orca_stream, 
+    get_next_text_chunk(), 
+    &num_samples_chunk, 
+    &pcm_chunk);
+if (status != PV_STATUS_SUCCESS) {
+    // error handling logic
+}
+if (num_samples_chunk > 0) {
+    // handle pcm_chunk
+}
+```
+
+Once the text stream is complete, call the flush method to synthesize the remaining text: 
+
+```c
+status = pv_orca_stream_flush(orca_stream, &num_samples_chunk, &pcm_chunk);
+if (status != PV_STATUS_SUCCESS) {
+    // error handling logic
+}
+if (num_samples_chunk > 0) {
+    // handle pcm_chunk
+}
+```
+
+Once the PCM chunks are handled, make sure to release the acquired resources for each chunk with:
+
+```c
+pv_orca_pcm_delete(pcm_chunk);
+```
+
+Finally, when done make sure to close the stream:
+    
+```c
+pv_orca_stream_close(orca_stream);
+```
+
+#### Single synthesis
+
+If the text is known in advance, single synthesis mode can be used:
 
 ```c
 int32_t num_samples = 0;
 int16_t *synthesized_pcm = NULL;
 int32_t num_alignments = 0;
 pv_orca_word_alignment_t **alignments = NULL;
 status = pv_orca_synthesize(
-    handle,
+    orca,
     "${TEXT}",
     synthesize_params,
     &num_samples,
@@ -322,13 +385,35 @@ status = pv_orca_synthesize(
 
 Replace `${TEXT}` with the text to be synthesized including potential [custom pronunciations](#custom-pronunciations).
 
+Print the metadata of the synthesized audio:
+
+```c
+for (int32_t i = 0; i < num_alignments; i++) {
+    fprintf(
+            stdout,
+            "[%s]\t.start_sec = %.2f .end_sec = %.2f\n",
+            alignments[i]->word,
+            alignments[i]->start_sec,
+            alignments[i]->end_sec);
+    for (int32_t j = 0; j < alignments[i]->num_phonemes; j++) {
+        fprintf(
+                stdout,
+                "\t[%s]\t.start_sec = %.2f .end_sec = %.2f\n",
+                alignments[i]->phonemes[j]->phoneme,
+                alignments[i]->phonemes[j]->start_sec,
+                alignments[i]->phonemes[j]->end_sec);
+
+    }
+}
+```
+
 Finally, when done make sure to release the acquired resources:
 
 ```c
 pv_orca_word_alignments_delete(num_alignments, alignments);
-pv_orca_delete_pcm(pcm);
+pv_orca_pcm_delete(synthesized_pcm);
 pv_orca_synthesize_params_delete(synthesize_params);
-pv_orca_delete(handle);
+pv_orca_delete(orca);
 ```
 
 ### Web

diff --git a/binding/python/_orca.py b/binding/python/_orca.py
@@ -141,7 +141,7 @@ class COrcaStream(Structure):
 
     class Stream:
         """
-        Orca Stream object that allows to convert a stream of text to a stream of audio.
+        Orca Stream object that converts a stream of text to a stream of audio.
         """
 
         def __init__(self, handle: POINTER('Orca.COrcaStream'), orca: 'Orca') -> None:
@@ -150,7 +150,7 @@ def __init__(self, handle: POINTER('Orca.COrcaStream'), orca: 'Orca') -> None:
 
         def synthesize(self, text: str) -> Optional[Sequence[int]]:
             """
-            Adds a chunk of text to the OrcaStream object and generates audio if enough text has been added.
+            Adds a chunk of text to the Stream object and generates audio if enough text has been added.
             This function is expected to be called multiple times with consecutive chunks of text from a text stream.
             The incoming text is buffered as it arrives until the length is long enough to convert a chunk of the
             buffered text into audio. The caller needs to use `pv_orca_stream_flush()` to generate the audio chunk

diff --git a/demo/c/README.md b/demo/c/README.md
@@ -18,10 +18,8 @@ Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get you
 
 # Speech Synthesis Demos
 Orca supports two modes of operation: streaming and single synthesis.
-
 In the streaming synthesis mode, Orca processes an incoming text stream in real-time and generates audio in parallel.
 This is demonstrated in the Orca streaming demo.
-
 In the single synthesis mode, the text is synthesized in a single call to the Orca engine.
 
 **Note**: the following commands are run from the root of the repo.

diff --git a/demo/c/orca_demo.c b/demo/c/orca_demo.c
@@ -284,7 +284,7 @@ int picovoice_main(int argc, char **argv) {
     double proc_sec = 0.;
     gettimeofday(&before, NULL);
 
-    fprintf(stdout, "Synthesizing text `%s` ...\n", text);
+    fprintf(stdout, "\nSynthesizing text `%s`\n", text);
 
     int32_t num_alignments = 0;
     pv_orca_word_alignment_t **alignments = NULL;
@@ -323,7 +323,33 @@ int picovoice_main(int argc, char **argv) {
             ((double) (after.tv_sec - before.tv_sec) +
              ((double) (after.tv_usec - before.tv_usec)) * 1e-6);
 
-    fprintf(stdout, "Synthesized text in %.1f sec\n", proc_sec);
+    if (num_alignments > 0) {
+        fprintf(stdout, "\nWord alignments");
+        if (num_alignments > 3) {
+            fprintf(stdout, " (only showing first 3):\n");
+        } else {
+            fprintf(stdout, ":\n");
+        }
+        int32_t num_alignments_shown = num_alignments > 3 ? 3 : num_alignments;
+        for (int32_t i = 0; i < num_alignments_shown; i++) {
+            fprintf(
+                    stdout,
+                    "word=\"%s\", start_sec=%.2f, end_sec=%.2f\n",
+                    alignments[i]->word,
+                    alignments[i]->start_sec,
+                    alignments[i]->end_sec);
+            for (int32_t j = 0; j < alignments[i]->num_phonemes; j++) {
+                fprintf(
+                        stdout,
+                        "\tphoneme=\"%s\", start_sec=%.2f, end_sec=%.2f\n",
+                        alignments[i]->phonemes[j]->phoneme,
+                        alignments[i]->phonemes[j]->start_sec,
+                        alignments[i]->phonemes[j]->end_sec);
+            }
+        }
+    }
+
+    fprintf(stdout, "\nSynthesized text in %.2f sec\n", proc_sec);
     fprintf(stdout, "Saved audio to `%s`\n", output_path);
 
     pv_status_t delete_status = pv_orca_word_alignments_delete_func(num_alignments, alignments);