diff --git a/.github/workflows/android-appcenter.yml b/.github/workflows/android-appcenter.yml index aff5c496..d0cbc773 100644 --- a/.github/workflows/android-appcenter.yml +++ b/.github/workflows/android-appcenter.yml @@ -25,109 +25,109 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v3 - - name: Set up Node.js LTS - uses: actions/setup-node@v3 - with: - node-version: lts/* + - name: Set up Node.js LTS + uses: actions/setup-node@v3 + with: + node-version: lts/* - - name: Install AppCenter CLI - run: npm install -g appcenter-cli + - name: Install AppCenter CLI + run: npm install -g appcenter-cli - - name: set up JDK 11 - uses: actions/setup-java@v3 - with: - java-version: '11' - distribution: 'temurin' + - name: set up JDK 11 + uses: actions/setup-java@v3 + with: + java-version: '11' + distribution: 'temurin' - - name: Copy test_resources - run: ./copy_test_resources.sh + - name: Copy test_resources + run: ./copy_test_resources.sh - - name: Inject AccessKey - run: echo pvTestingAccessKey="${{secrets.PV_VALID_ACCESS_KEY}}" >> local.properties + - name: Inject AccessKey + run: echo pvTestingAccessKey="${{secrets.PV_VALID_ACCESS_KEY}}" >> local.properties - - name: Inject Android keystore variables - run: | - echo storePassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties - echo keyPassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties - echo keyAlias=picovoice >> local.properties - echo storeFile=../picovoice.jks >> local.properties + - name: Inject Android keystore variables + run: | + echo storePassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties + echo keyPassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties + echo keyAlias=picovoice >> local.properties + echo storeFile=../picovoice.jks >> local.properties - - name: Setup Android keystore file - run: echo "${{secrets.ANDROID_RELEASE_KEYSTORE_FILE_B64}}" | base64 -d > picovoice.jks + - name: Setup Android keystore file + run: echo "${{secrets.ANDROID_RELEASE_KEYSTORE_FILE_B64}}" | base64 -d > picovoice.jks - - name: Grant execute permission for gradlew - run: chmod +x gradlew + - name: Grant execute permission for gradlew + run: chmod +x gradlew - - name: Build app - run: ./gradlew assembleDebug + - name: Build app + run: ./gradlew assembleDebug - - name: Build androidTest - run: ./gradlew assembleAndroidTest + - name: Build androidTest + run: ./gradlew assembleAndroidTest - - name: Run tests on AppCenter - run: appcenter test run espresso - --token ${{secrets.APPCENTERAPITOKEN}} - --app "Picovoice/Orca-Android" - --devices "Picovoice/android-min-max" - --app-path orca-test-app/build/outputs/apk/debug/orca-test-app-debug.apk - --test-series "orca-android" - --locale "en_US" - --build-dir orca-test-app/build/outputs/apk/androidTest/debug + - name: Run tests on AppCenter + run: appcenter test run espresso + --token ${{secrets.APPCENTERAPITOKEN}} + --app "Picovoice/Orca-Android" + --devices "Picovoice/android-min-max" + --app-path orca-test-app/build/outputs/apk/debug/orca-test-app-debug.apk + --test-series "orca-android" + --locale "en_US" + --build-dir orca-test-app/build/outputs/apk/androidTest/debug build-integ: name: Run Android Integration Tests on AppCenter runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - - name: Set up Node.js LTS - uses: actions/setup-node@v3 - with: - node-version: lts/* - - - name: Install AppCenter CLI - run: npm install -g appcenter-cli - - - name: set up 
JDK 11 - uses: actions/setup-java@v3 - with: - java-version: '11' - distribution: 'temurin' - - - name: Copy test_resources - run: ./copy_test_resources.sh - - - name: Inject AccessKey - run: echo pvTestingAccessKey="${{secrets.PV_VALID_ACCESS_KEY}}" >> local.properties - - - name: Inject Android keystore variables - run: | - echo storePassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties - echo keyPassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties - echo keyAlias=picovoice >> local.properties - echo storeFile=../picovoice.jks >> local.properties - - - name: Setup Android keystore file - run: echo "${{secrets.ANDROID_RELEASE_KEYSTORE_FILE_B64}}" | base64 -d > picovoice.jks - - - name: Grant execute permission for gradlew - run: chmod +x gradlew - - - name: Build app - run: ./gradlew assembleRelease - - - name: Build androidTest - run: ./gradlew assembleReleaseAndroidTest -DtestBuildType=integ - - - name: Run tests on AppCenter - run: appcenter test run espresso - --token ${{secrets.APPCENTERAPITOKEN}} - --app "Picovoice/Orca-Android" - --devices "Picovoice/android-min-max" - --app-path orca-test-app/build/outputs/apk/release/orca-test-app-release.apk - --test-series "orca-android" - --locale "en_US" - --build-dir orca-test-app/build/outputs/apk/androidTest/release \ No newline at end of file + - uses: actions/checkout@v3 + + - name: Set up Node.js LTS + uses: actions/setup-node@v3 + with: + node-version: lts/* + + - name: Install AppCenter CLI + run: npm install -g appcenter-cli + + - name: set up JDK 11 + uses: actions/setup-java@v3 + with: + java-version: '11' + distribution: 'temurin' + + - name: Copy test_resources + run: ./copy_test_resources.sh + + - name: Inject AccessKey + run: echo pvTestingAccessKey="${{secrets.PV_VALID_ACCESS_KEY}}" >> local.properties + + - name: Inject Android keystore variables + run: | + echo storePassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties + echo keyPassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties + echo keyAlias=picovoice >> local.properties + echo storeFile=../picovoice.jks >> local.properties + + - name: Setup Android keystore file + run: echo "${{secrets.ANDROID_RELEASE_KEYSTORE_FILE_B64}}" | base64 -d > picovoice.jks + + - name: Grant execute permission for gradlew + run: chmod +x gradlew + + - name: Build app + run: ./gradlew assembleRelease + + - name: Build androidTest + run: ./gradlew assembleReleaseAndroidTest -DtestBuildType=integ + + - name: Run tests on AppCenter + run: appcenter test run espresso + --token ${{secrets.APPCENTERAPITOKEN}} + --app "Picovoice/Orca-Android" + --devices "Picovoice/android-min-max" + --app-path orca-test-app/build/outputs/apk/release/orca-test-app-release.apk + --test-series "orca-android" + --locale "en_US" + --build-dir orca-test-app/build/outputs/apk/androidTest/release diff --git a/.github/workflows/android-demos.yml b/.github/workflows/android-demos.yml index 4c1d6708..b9d02ded 100644 --- a/.github/workflows/android-demos.yml +++ b/.github/workflows/android-demos.yml @@ -23,13 +23,13 @@ jobs: working-directory: demo/android/OrcaDemo steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v3 - - name: set up JDK 11 - uses: actions/setup-java@v3 - with: - java-version: '11' - distribution: 'temurin' + - name: set up JDK 11 + uses: actions/setup-java@v3 + with: + java-version: '11' + distribution: 'temurin' - - name: Build - run: ./gradlew assembleDebug \ No newline at end of file + - name: 
Build + run: ./gradlew assembleDebug diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index c4a90226..8fae2bcd 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -26,70 +26,70 @@ jobs: strategy: matrix: - device: [single-android, 32bit-android] + device: [ single-android, 32bit-android ] include: - - device: single-android - procPerformanceThresholdSec: 3.0 - - device: 32bit-android - procPerformanceThresholdSec: 19.0 + - device: single-android + procPerformanceThresholdSec: 3.0 + - device: 32bit-android + procPerformanceThresholdSec: 19.0 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v3 - - name: Set up Node.js LTS - uses: actions/setup-node@v3 - with: - node-version: lts/* + - name: Set up Node.js LTS + uses: actions/setup-node@v3 + with: + node-version: lts/* - - name: Install AppCenter CLI - run: npm install -g appcenter-cli + - name: Install AppCenter CLI + run: npm install -g appcenter-cli - - name: set up JDK 11 - uses: actions/setup-java@v3 - with: - java-version: '11' - distribution: 'temurin' + - name: set up JDK 11 + uses: actions/setup-java@v3 + with: + java-version: '11' + distribution: 'temurin' - - name: Copy test_resources - run: ./copy_test_resources.sh + - name: Copy test_resources + run: ./copy_test_resources.sh - - name: Inject AccessKey - run: echo pvTestingAccessKey="${{secrets.PV_VALID_ACCESS_KEY}}" >> local.properties + - name: Inject AccessKey + run: echo pvTestingAccessKey="${{secrets.PV_VALID_ACCESS_KEY}}" >> local.properties - - name: Inject Number of Iterations - run: echo numTestIterations="30" >> local.properties + - name: Inject Number of Iterations + run: echo numTestIterations="30" >> local.properties - - name: Inject Android keystore variables - run: | - echo storePassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties - echo keyPassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties - echo keyAlias=picovoice >> local.properties - echo storeFile=../picovoice.jks >> local.properties + - name: Inject Android keystore variables + run: | + echo storePassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties + echo keyPassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties + echo keyAlias=picovoice >> local.properties + echo storeFile=../picovoice.jks >> local.properties - - name: Setup Android keystore file - run: echo "${{secrets.ANDROID_RELEASE_KEYSTORE_FILE_B64}}" | base64 -d > picovoice.jks + - name: Setup Android keystore file + run: echo "${{secrets.ANDROID_RELEASE_KEYSTORE_FILE_B64}}" | base64 -d > picovoice.jks - - name: Inject Init Performance Threshold - run: echo initPerformanceThresholdSec="${{ matrix.initPerformanceThresholdSec }}" >> local.properties + - name: Inject Init Performance Threshold + run: echo initPerformanceThresholdSec="${{ matrix.initPerformanceThresholdSec }}" >> local.properties - - name: Inject Proc Performance Threshold - run: echo procPerformanceThresholdSec="${{ matrix.procPerformanceThresholdSec }}" >> local.properties + - name: Inject Proc Performance Threshold + run: echo procPerformanceThresholdSec="${{ matrix.procPerformanceThresholdSec }}" >> local.properties - - name: Grant execute permission for gradlew - run: chmod +x gradlew + - name: Grant execute permission for gradlew + run: chmod +x gradlew - - name: Build app - run: ./gradlew assembleDebug + - name: Build app + run: ./gradlew assembleDebug - - name: Build androidTest - run: ./gradlew 
assembleAndroidTest -DtestBuildType=perf + - name: Build androidTest + run: ./gradlew assembleAndroidTest -DtestBuildType=perf - - name: Run tests on AppCenter - run: appcenter test run espresso - --token ${{secrets.APPCENTERAPITOKEN}} - --app "Picovoice/Orca-Android" - --devices "Picovoice/${{ matrix.device }}" - --app-path orca-test-app/build/outputs/apk/debug/orca-test-app-debug.apk - --test-series "orca-android" - --locale "en_US" - --build-dir orca-test-app/build/outputs/apk/androidTest/debug \ No newline at end of file + - name: Run tests on AppCenter + run: appcenter test run espresso + --token ${{secrets.APPCENTERAPITOKEN}} + --app "Picovoice/Orca-Android" + --devices "Picovoice/${{ matrix.device }}" + --app-path orca-test-app/build/outputs/apk/debug/orca-test-app-debug.apk + --test-series "orca-android" + --locale "en_US" + --build-dir orca-test-app/build/outputs/apk/androidTest/debug diff --git a/.github/workflows/c-demos.yml b/.github/workflows/c-demos.yml index 119d6bc3..9b7f54b7 100644 --- a/.github/workflows/c-demos.yml +++ b/.github/workflows/c-demos.yml @@ -49,7 +49,7 @@ jobs: make_file: "MinGW Makefiles" - os: macos-latest platform: mac - arch: x86_64 + arch: undetermined make_file: "Unix Makefiles" steps: @@ -66,10 +66,12 @@ jobs: run: cmake -G "${{ matrix.make_file }}" -B ./build - name: Build demo - run: cmake --build ./build --target orca_demo + run: | + cmake --build ./build --target orca_demo + cmake --build ./build --target orca_demo_streaming - name: Test - run: python test/test_orca_c.py ${{secrets.PV_VALID_ACCESS_KEY}} ${{ matrix.platform }} ${{ matrix.arch }} + run: python3 test/test_orca_c.py ${{secrets.PV_VALID_ACCESS_KEY}} ${{ matrix.platform }} ${{ matrix.arch }} build-demo-self-hosted: runs-on: ${{ matrix.machine }} @@ -106,7 +108,9 @@ jobs: run: cmake -B ./build - name: Build demo - run: cmake --build ./build --target orca_demo + run: | + cmake --build ./build --target orca_demo + cmake --build ./build --target orca_demo_streaming - name: Test run: python3 test/test_orca_c.py ${{secrets.PV_VALID_ACCESS_KEY}} ${{ matrix.platform }} ${{ matrix.arch }} diff --git a/.github/workflows/ios-appcenter.yml b/.github/workflows/ios-appcenter.yml index c1d361c0..7a3318ee 100644 --- a/.github/workflows/ios-appcenter.yml +++ b/.github/workflows/ios-appcenter.yml @@ -73,4 +73,4 @@ jobs: --devices "Picovoice/ios-min-max" --test-series "orca-ios" --locale "en_US" - --build-dir ddp/Build/Products/Debug-iphoneos \ No newline at end of file + --build-dir ddp/Build/Products/Debug-iphoneos diff --git a/.github/workflows/ios-perf.yml b/.github/workflows/ios-perf.yml index 3f6848da..98c3df9a 100644 --- a/.github/workflows/ios-perf.yml +++ b/.github/workflows/ios-perf.yml @@ -26,10 +26,10 @@ jobs: strategy: matrix: - device: [ios-perf] + device: [ ios-perf ] include: - - device: ios-perf - performanceThresholdSec: 0.5 + - device: ios-perf + performanceThresholdSec: 0.5 steps: - name: Checkout @@ -88,4 +88,4 @@ jobs: --devices "Picovoice/${{ matrix.device }}" --test-series "orca-ios" --locale "en_US" - --build-dir ddp/Build/Products/Debug-iphoneos \ No newline at end of file + --build-dir ddp/Build/Products/Debug-iphoneos diff --git a/.github/workflows/python-demo.yml b/.github/workflows/python-demo.yml index 47630e87..0cf761a9 100644 --- a/.github/workflows/python-demo.yml +++ b/.github/workflows/python-demo.yml @@ -25,8 +25,13 @@ jobs: strategy: matrix: - os: [ubuntu-latest, windows-latest, macos-latest] - python-version: ['3.7', '3.8', '3.9', '3.10'] + os: [ ubuntu-latest, 
windows-latest, macos-latest ] + python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12' ] + include: + - os: ubuntu-latest + install_dep: sudo apt install libportaudio2 + - os: windows-latest + - os: macos-latest steps: - uses: actions/checkout@v3 @@ -37,14 +42,22 @@ jobs: python-version: ${{ matrix.python-version }} - name: Pre-build dependencies - run: python -m pip install --upgrade pip + run: python3 -m pip install --upgrade pip - name: Install dependencies - run: pip install -r requirements.txt + run: | + ${{matrix.install_dep}} + pip install -r requirements.txt - - name: Test + - name: Test streaming run: > - python orca_demo.py + python3 orca_demo_streaming.py + --access_key ${{secrets.PV_VALID_ACCESS_KEY}} + --text_to_stream "Hello, I am Orca!" + + - name: Test single + run: > + python3 orca_demo.py --access_key ${{secrets.PV_VALID_ACCESS_KEY}} --text "Hello, I am Orca!" --output_path ./tmp.wav @@ -54,7 +67,7 @@ jobs: strategy: matrix: - machine: [rpi3-32, rpi3-64, rpi4-32, rpi4-64, rpi5-64, jetson] + machine: [ rpi3-32, rpi3-64, rpi4-32, rpi4-64, rpi5-64, jetson ] steps: - uses: actions/checkout@v3 @@ -62,9 +75,15 @@ jobs: - name: Install dependencies run: pip3 install -r requirements.txt - - name: Test + - name: Test streaming + run: > + python3 orca_demo_streaming.py + --access_key ${{secrets.PV_VALID_ACCESS_KEY}} + --text_to_stream "Hello, I am Orca!" + + - name: Test single run: > python3 orca_demo.py --access_key ${{secrets.PV_VALID_ACCESS_KEY}} --text "Hello, I am Orca!" - --output_path ./tmp.wav \ No newline at end of file + --output_path ./tmp.wav diff --git a/.github/workflows/python-perf.yml b/.github/workflows/python-perf.yml index 43b5f826..64c8a6cd 100644 --- a/.github/workflows/python-perf.yml +++ b/.github/workflows/python-perf.yml @@ -45,11 +45,11 @@ jobs: os: [ubuntu-latest, windows-latest, macos-latest] include: - os: ubuntu-latest - proc_performance_threshold_sec: 1.5 + proc_performance_threshold_rtf: 5.0 - os: windows-latest - proc_performance_threshold_sec: 1.5 + proc_performance_threshold_rtf: 3.0 - os: macos-latest - proc_performance_threshold_sec: 2.5 + proc_performance_threshold_rtf: 3.0 steps: - uses: actions/checkout@v3 @@ -60,7 +60,7 @@ jobs: python-version: '3.10' - name: Pre-build dependencies - run: python -m pip install --upgrade pip + run: python3 -m pip install --upgrade pip - name: Install dependencies run: pip install -r requirements.txt @@ -70,7 +70,7 @@ jobs: python3 test_orca_perf.py --access-key ${{secrets.PV_VALID_ACCESS_KEY}} --num-test-iterations 10 - --proc-performance-threshold-sec ${{matrix.proc_performance_threshold_sec}} + --proc-performance-threshold-rtf ${{matrix.proc_performance_threshold_rtf}} perf-self-hosted: runs-on: ${{ matrix.machine }} @@ -81,17 +81,17 @@ jobs: machine: [rpi3-32, rpi3-64, rpi4-32, rpi4-64, rpi5-64, jetson] include: - machine: rpi3-32 - proc_performance_threshold_sec: 10.0 + proc_performance_threshold_rtf: 1.0 - machine: rpi3-64 - proc_performance_threshold_sec: 6.0 + proc_performance_threshold_rtf: 1.0 - machine: rpi4-32 - proc_performance_threshold_sec: 5.0 + proc_performance_threshold_rtf: 2.0 - machine: rpi4-64 - proc_performance_threshold_sec: 4.0 + proc_performance_threshold_rtf: 2.0 - machine: rpi5-64 - proc_performance_threshold_sec: 2.0 + proc_performance_threshold_rtf: 2.0 - machine: jetson - proc_performance_threshold_sec: 4.0 + proc_performance_threshold_rtf: 2.0 steps: - uses: actions/checkout@v3 @@ -108,8 +108,8 @@ jobs: python3 test_orca_perf.py --access-key ${{secrets.PV_VALID_ACCESS_KEY}} 
--num-test-iterations 10 - --proc-performance-threshold-sec ${{matrix.proc_performance_threshold_sec}} + --proc-performance-threshold-rtf ${{matrix.proc_performance_threshold_rtf}} - name: Machine state after working-directory: resources/.scripts - run: bash machine-state.sh \ No newline at end of file + run: bash machine-state.sh diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index fab6801d..d52b714b 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -37,8 +37,8 @@ jobs: strategy: matrix: - os: [ubuntu-latest, windows-latest, macos-latest] - python-version: ['3.7', '3.8', '3.9', '3.10'] + os: [ ubuntu-latest, windows-latest, macos-latest ] + python-version: [ '3.8' , '3.9', '3.10', '3.11', '3.12' ] steps: - uses: actions/checkout@v3 @@ -49,20 +49,20 @@ jobs: python-version: ${{ matrix.python-version }} - name: Pre-build dependencies - run: python -m pip install --upgrade pip + run: python3 -m pip install --upgrade pip - name: Install dependencies run: pip install -r requirements.txt - name: Test - run: python test_orca.py --access-key ${{secrets.PV_VALID_ACCESS_KEY}} + run: python3 test_orca.py --access-key ${{secrets.PV_VALID_ACCESS_KEY}} build-self-hosted: runs-on: ${{ matrix.machine }} strategy: matrix: - machine: [rpi3-32, rpi3-64, rpi4-32, rpi4-64, rpi5-64, jetson] + machine: [ rpi3-32, rpi3-64, rpi4-32, rpi4-64, rpi5-64, jetson ] steps: - uses: actions/checkout@v3 diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..41039694 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "demo/c/dr_libs"] + path = demo/c/dr_libs + url = ../../mackron/dr_libs.git \ No newline at end of file diff --git a/README.md b/README.md index 63d3c35d..b732e60d 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,8 @@ Made in Vancouver, Canada by [Picovoice](https://picovoice.ai) [![Twitter URL](https://img.shields.io/twitter/url?label=%40AiPicovoice&style=social&url=https%3A%2F%2Ftwitter.com%2FAiPicovoice)](https://twitter.com/AiPicovoice) [![YouTube Channel Views](https://img.shields.io/youtube/channel/views/UCAdi9sTCXLosG1XeqDwLx7w?label=YouTube&style=social)](https://www.youtube.com/channel/UCAdi9sTCXLosG1XeqDwLx7w) -Orca is an on-device text-to-speech engine producing high-quality, realistic, spoken audio with zero latency. Orca is: +Orca is an on-device streaming text-to-speech engine that is designed for use with LLMs, enabling zero-latency +voice assistants. Orca is: - Private; All voice processing runs locally. 
- Cross-Platform: @@ -28,25 +29,26 @@ Orca may undergo changes as we continually enhance and refine the engine to prov - [Orca](#orca) - [Table of Contents](#table-of-contents) - - [Language Support](#language-support) - [Overview](#overview) + - [Orca streaming text synthesis](#orca-input-and-output-streaming-synthesis) + - [Text input](#text-input) - [Custom pronunciations](#custom-pronunciations) - [Voices](#voices) - [Speech control](#speech-control) - [Audio output](#audio-output) + - [AccessKey](#accesskey) - [Demos](#demos) - - [Python](#python-demos) - - [iOS](#ios-demo) - - [C](#c-demos) - - [Web](#web-demos) - - [Android](#android-demo) + - [Python Demos](#python-demos) + - [iOS Demo](#ios-demo) + - [C Demos](#c-demos) + - [Web Demos](#web-demos) + - [Android Demo](#android-demo) - [SDKs](#sdks) - [Python](#python) - [iOS](#ios) - [C](#c) - [Web](#web) - [Android](#android) - - [AccessKey](#accesskey) - [Releases](#releases) - [FAQ](#faq) @@ -58,10 +60,21 @@ Orca may undergo changes as we continually enhance and refine the engine to prov ## Overview +### Orca input and output streaming synthesis + +Orca is a text-to-speech engine designed specifically for LLMs. It can process +incoming text streams in real-time, generating audio continuously, i.e., as the LLM produces tokens, +Orca generates speech in parallel. +This enables seamless conversations with voice assistants, eliminating any audio delays. + +![](https://github.com/Picovoice/orca/blob/orca-prepare-v0.2/resources/assets/orca_streaming_animation.gif) + +Orca also supports single synthesis mode, where a complete text is synthesized in a single call to the Orca engine. + ### Text input -Orca accepts the 26 lowercase (a-z) and 26 uppercase (A-Z) letters of the English alphabet, as well as -common punctuation marks. You can get a list of all supported characters by calling the +Orca accepts the 26 lowercase (a-z) and 26 uppercase (A-Z) letters of the English alphabet, numbers, +basic symbols, as well as common punctuation marks. You can get a list of all supported characters by calling the `valid_characters()` method provided in the Orca SDK you are using. Pronunciations of characters or words not supported by this list can be achieved with [custom pronunciations](#custom-pronunciations). @@ -96,6 +109,7 @@ Orca provides a set of parameters to control the synthesized speech. The followi | Parameter | Default | Description | |:-----------:|:-------:|:--------------------------------------------------------------------------------------------------------------------------:| | speech rate | 1.0 | Speed of generated speech. Valid values are within [0.7, 1.3].
Higher (lower) values generate faster (slower) speech. |
+| random state | random | Sets the random state for sampling during synthesis.
Valid values are all non-negative integers.
If not provided, a random seed will be chosen. | ### Audio output @@ -117,12 +131,20 @@ AccessKey also verifies that your usage is within the limits of your account. Ev ### Python Demos -To run the Python demo, run the following in the console: +Install the demo package: ```console pip3 install pvorcademo ``` +Run the streaming demo: + +```console +orca_demo_streaming --access_key ${ACCESS_KEY} --text_to_stream ${TEXT} +``` + +Run the single synthesis demo: + ```console orca_demo --access_key ${ACCESS_KEY} --text ${TEXT} --output_path ${WAV_OUTPUT_PATH} ``` @@ -130,6 +152,8 @@ orca_demo --access_key ${ACCESS_KEY} --text ${TEXT} --output_path ${WAV_OUTPUT_P Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console, `${TEXT}` with the text to be synthesized, and `${WAV_OUTPUT_PATH}` with a path to an output WAV file. +For more information about Python demos go to [demo/python](demo/python). + ### iOS Demo Run the following from [demo/ios](demo/ios) to install the Orca-iOS CocoaPod: @@ -147,16 +171,16 @@ For more information about iOS demos go to [demo/ios](demo/ios). ### C Demos -Build the demo: +Build the streaming demo: ```console -cmake -S demo/c/ -B demo/c/build && cmake --build demo/c/build --target orca_demo +cmake -S demo/c/ -B demo/c/build && cmake --build demo/c/build --target orca_demo_streaming ``` Run the demo: ```console -./demo/c/build/orca_demo -l ${LIBRARY_PATH} -m ${MODEL_PATH} -a ${ACCESS_KEY} -t ${TEXT} -o ${OUTPUT_PATH} +./demo/c/build/orca_demo_streaming -l ${LIBRARY_PATH} -m ${MODEL_PATH} -a ${ACCESS_KEY} -t ${TEXT} -o ${OUTPUT_PATH} ``` ### Web Demos @@ -196,17 +220,49 @@ Install the Python SDK: pip3 install pvorca ``` -Create an instance of the engine and generate speech: +Create an instance of the engine: ```python import pvorca orca = pvorca.create(access_key='${ACCESS_KEY}') -pcm = orca.synthesize('${TEXT}') ``` -Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/) and `${TEXT}` with -the text to be synthesized including potential [custom pronunciations](#custom-pronunciations). +Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/). + +#### Streaming synthesis + +To synthesize a text stream, create an Orca Stream object and add text to it one-by-one: + +```python +stream = orca.stream_open() + +for text_chunk in text_generator(): + pcm = stream.synthesize(text_chunk) + if pcm is not None: + # handle pcm + +pcm = stream.flush() +if pcm is not None: + # handle pcm +``` + +The `text_generator()` function can be any stream generating text, for example an LLM response. +When done with streaming text synthesis, the stream object needs to be closed: + +```python +stream.close() +``` + +#### Single synthesis + +Use single synthesis mode if the complete text is known in advance: + +```python +pcm, alignments = orca.synthesize('${TEXT}') +``` + +Replace `${TEXT}` with the text to be synthesized including potential [custom pronunciations](#custom-pronunciations). Finally, when done make sure to explicitly release the resources: @@ -218,7 +274,7 @@ For more details see [Python SDK](./binding/python/README.md). 
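End-to-end, the chunks returned by `stream.synthesize()` and `stream.flush()` can be collected and written out as a standard WAV file. The following is a minimal sketch using only Python's built-in `wave` and `struct` modules; the `orca.sample_rate` property and the hard-coded text chunks are illustrative assumptions (a real application would feed in LLM output):

```python
import struct
import wave

import pvorca

orca = pvorca.create(access_key='${ACCESS_KEY}')
stream = orca.stream_open()

chunks = []
for text_chunk in ['Hello, ', 'I am ', 'Orca!']:  # stand-in for an LLM token stream
    pcm = stream.synthesize(text_chunk)
    if pcm is not None:
        chunks.append(pcm)

pcm = stream.flush()
if pcm is not None:
    chunks.append(pcm)
stream.close()

# Write the 16-bit mono samples to a WAV file.
with wave.open('orca_stream.wav', 'wb') as f:
    f.setnchannels(1)
    f.setsampwidth(2)  # 16-bit linearly-encoded samples
    f.setframerate(orca.sample_rate)  # assumed property name; check the binding docs
    for chunk in chunks:
        f.writeframes(struct.pack('%dh' % len(chunk), *chunk))

orca.delete()
```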
### iOS -Create an instance of the engine and synthesize: +Create an instance of the engine: ```swift import Orca @@ -230,18 +286,52 @@ let modelPath = Bundle(for: type(of: self)).path( do { let orca = try Orca(accessKey: "${ACCESS_KEY}", modelPath: modelPath) } catch {} +``` + +Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/) and `${MODEL_FILE}` +with the model file name for Orca. + +#### Streaming synthesis + +To synthesize a text stream, create an `OrcaStream` object and add text to it one-by-one via the `synthesize` method. +Call `flush` to synthesize any remaining text, and `close` to delete the `OrcaStream` object: + +```swift +let orcaStream = try orca.streamOpen() + +for textChunk in textGenerator() { + let pcm = orcaStream.synthesize(textChunk) + if pcm != nil { + // handle pcm + } +} +let pcm = orcaStream.flush() +if pcm != nil { + // handle pcm +} + +orcaStream.close() +``` + +`textGenerator()` can be any stream generating text, for example an LLM response. + +#### Single synthesis + +```swift do { - let pcm = try orca.synthesize(text: "${TEXT}") + let (pcm, wordArray) = try orca.synthesize(text: "${TEXT}") } catch {} ``` -Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/), `${MODEL_FILE}` -with the model file name for Orca and `${TEXT}` with -the text to be synthesized including potential [custom pronunciations](#custom-pronunciations). +Replace `${TEXT}` with the text to be synthesized including potential [custom pronunciations](#custom-pronunciations). + +#### Release resources When done be sure to explicitly release the resources using `orca.delete()`. +For more details, see the [iOS SDK](./binding/ios/). + ### C The header file [include/pv_orca.h](./include/pv_orca.h) contains relevant information on Orca's C SDK. 
@@ -249,9 +339,9 @@ The header file [include/pv_orca.h](./include/pv_orca.h) contains relevant infor Build an instance of the object: ```c -pv_orca_t *handle = NULL; +pv_orca_t *orca = NULL; const char *model_path = "${MODEL_PATH}"; -pv_status_t status = pv_orca_init("${ACCESS_KEY}", model_path, &handle); +pv_status_t status = pv_orca_init("${ACCESS_KEY}", model_path, &orca); if (status != PV_STATUS_SUCCESS) { // error handling logic } @@ -268,27 +358,112 @@ status = pv_orca_synthesize_params_init(&synthesize_params); // change the default parameters of synthesize_params as desired ``` -Now, the `handle` and `synthesize_params` object can be used to synthesize speech: +#### Streaming synthesis + +To synthesize a text stream, create an `orca_stream` object using `synthesize_params`: + +```c +pv_orca_stream_t *orca_stream = NULL; +status = pv_orca_stream_open(orca, synthesize_params, &orca_stream); +if (status != PV_STATUS_SUCCESS) { + // error handling logic +} +``` + +Add text to `orca_stream` one-by-one and handle the synthesized audio: + +```c +extern char *get_next_text_chunk(void); + +int32_t num_samples_chunk = 0; +int16_t *pcm_chunk = NULL; +status = pv_orca_stream_synthesize( + orca_stream, + get_next_text_chunk(), + &num_samples_chunk, + &pcm_chunk); +if (status != PV_STATUS_SUCCESS) { + // error handling logic +} +if (num_samples_chunk > 0) { + // handle pcm_chunk +} +``` + +Once the text stream is complete, call the flush method to synthesize the remaining text: + +```c +status = pv_orca_stream_flush(orca_stream, &num_samples_chunk, &pcm_chunk); +if (status != PV_STATUS_SUCCESS) { + // error handling logic +} +if (num_samples_chunk > 0) { + // handle pcm_chunk +} +``` + +Once the PCM chunks are handled, make sure to release the acquired resources for each chunk with: + +```c +pv_orca_pcm_delete(pcm_chunk); +``` + +Finally, when done make sure to close the stream: + +```c +pv_orca_stream_close(orca_stream); +``` + +#### Single synthesis + +If the text is known in advance, single synthesis mode can be used: ```c int32_t num_samples = 0; int16_t *synthesized_pcm = NULL; +int32_t num_alignments = 0; +pv_orca_word_alignment_t **alignments = NULL; status = pv_orca_synthesize( - handle, + orca, "${TEXT}", synthesize_params, &num_samples, - &synthesized_pcm); + &synthesized_pcm, + &num_alignments, + &alignments); ``` Replace `${TEXT}` with the text to be synthesized including potential [custom pronunciations](#custom-pronunciations). +Print the metadata of the synthesized audio: + +```c +for (int32_t i = 0; i < num_alignments; i++) { + fprintf( + stdout, + "[%s]\t.start_sec = %.2f .end_sec = %.2f\n", + alignments[i].word, + alignments[i].start_sec, + alignments[i].end_sec); + for (int32_t j = 0; j < alignments[i].num_phonemes; j++) { + fprintf( + stdout, + "\t[%s]\t.start_sec = %.2f .end_sec = %.2f\n", + alignments[i].phonemes[j].phoneme, + alignments[i].phonemes[j].start_sec, + alignments[i].phonemes[j].end_sec); + + } +} +``` + Finally, when done make sure to release the acquired resources: ```c -pv_orca_delete_pcm(pcm); +pv_orca_word_alignments_delete(num_alignments, alignments); +pv_orca_pcm_delete(pcm); pv_orca_synthesize_params_delete(synthesize_params); -pv_orca_delete(handle); +pv_orca_delete(orca); ``` ### Web @@ -315,12 +490,48 @@ const orca = await OrcaWorker.create( "${ACCESS_KEY}", { base64: orcaParams } ); +``` + +Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/). 
+ +#### Streaming synthesis + +To synthesize a text stream, create an `OrcaStream` object and add text to it one-by-one via the `synthesize` method. +Call `flush` to synthesize any remaining text, and `close` to delete the `OrcaStream` object: + +```typescript +const orcaStream = await orca.streamOpen(); -const speechPcm = await orca.synthesize("${TEXT}") +function* textStream(): IterableIterator { + ... // yield text chunks e.g. from an LLM response +} + +for (const textChunk of textStream()) { + const pcm = await orcaStream.synthesize(textChunk); + if (pcm !== null) { + // handle pcm + } +} + +const flushedPcm = orcaStream.flush(); +if (flushedPcm !== null) { + // handle pcm +} + +orcaStream.close(); ``` -Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/). Finally, when done -release the resources using `orca.release()`. +#### Single synthesis + +```typescript +const { speechPcm, alignments } = await orca.synthesize("${TEXT}") +``` + +#### Release resources + +Finally, when done release the resources using `orca.release()`. + +For more details, see the [Web SDK](./binding/web/). ### Android @@ -345,11 +556,6 @@ try { .setAccessKey(accessKey) .setModelPath(modelPath) .build(appContext); - - short[] pcm = orca.synthesize( - "${TEXT}", - new OrcaSynthesizeParams.Builder().build()); - } catch (OrcaException ex) { } ``` @@ -357,6 +563,35 @@ Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console, `${MODEL_FIL Orca [voice model file](./lib/common) and `${TEXT}` with the text to be synthesized including potential [custom pronunciations](#custom-pronunciations). +#### Streaming synthesis + +To synthesize a text stream, create an `OrcaStream` object and add text to it one-by-one via the `synthesize`. +Call `flush` to synthesize any remaining text, and `close` to delete the `OrcaStream` object: + +```java +Orca.OrcaStream orcaStream = orca.streamOpen(new OrcaSynthesizeParams.Builder().build()); + +for (String textChunk : textGenerator()) { + short[] pcm = orcaStream.synthesize(textChunk); + if (pcm != null) { + // handle pcm + } +} + +short[] flushedPcm = orcaStream.flush(); +if (flushedPcm != null) { + // handle pcm +} +``` + +#### Single synthesis + +```java +OrcaAudio audio = orca.synthesize( + "${TEXT}", + new OrcaSynthesizeParams.Builder().build()); +``` + Finally, when done make sure to explicitly release the resources: ```java @@ -367,6 +602,14 @@ For more details, see the [Android SDK](./binding/android/README.md). 
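The `short[]` buffers returned by the Android API above are raw 16-bit mono PCM. One way to play a fully synthesized buffer is the platform's `android.media.AudioTrack`; this is a hedged sketch under that assumption, with the sample rate expected to come from `orca.getSampleRate()`:

```java
import android.media.AudioAttributes;
import android.media.AudioFormat;
import android.media.AudioTrack;

void playPcm(short[] pcm, int sampleRate) {
    AudioTrack track = new AudioTrack.Builder()
            .setAudioAttributes(new AudioAttributes.Builder()
                    .setUsage(AudioAttributes.USAGE_MEDIA)
                    .build())
            .setAudioFormat(new AudioFormat.Builder()
                    .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
                    .setSampleRate(sampleRate)
                    .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
                    .build())
            .setTransferMode(AudioTrack.MODE_STATIC)
            .setBufferSizeInBytes(pcm.length * 2)  // two bytes per 16-bit sample
            .build();
    track.write(pcm, 0, pcm.length);
    track.play();
}
```

For example, `playPcm(audio.getPcm(), orca.getSampleRate())` would play the single-synthesis result above.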
## Releases +### v0.2.0 - May 3rd, 2024 + +- Support for streaming synthesis +- Reduced model size +- Improved performance +- Support for word alignments +- Improved naturalness and pronunciations + ### v0.1.0 - January 24th, 2024 - Beta release diff --git a/binding/android/Orca/orca/build.gradle b/binding/android/Orca/orca/build.gradle index 150f441a..de554471 100644 --- a/binding/android/Orca/orca/build.gradle +++ b/binding/android/Orca/orca/build.gradle @@ -2,7 +2,7 @@ apply plugin: 'com.android.library' ext { PUBLISH_GROUP_ID = 'ai.picovoice' - PUBLISH_VERSION = '0.1.0' + PUBLISH_VERSION = '0.2.0' PUBLISH_ARTIFACT_ID = 'orca-android' } @@ -14,8 +14,8 @@ android { defaultConfig { minSdkVersion 21 targetSdkVersion defaultTargetSdkVersion - versionCode 1 - versionName "0.1" + versionCode 2 + versionName "0.2" consumerProguardFiles "consumer-rules.pro" } diff --git a/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/Orca.java b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/Orca.java index ba3c687e..2493f1e1 100644 --- a/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/Orca.java +++ b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/Orca.java @@ -35,6 +35,91 @@ public class Orca { private long handle; + private int maxCharacterLimit; + private int sampleRate; + private String[] validCharacters; + + /** + * OrcaStream object that converts a stream of text to a stream of audio. + */ + public class OrcaStream { + private long stream; + + public OrcaStream(long stream) { + this.stream = stream; + } + + /** + * Adds a chunk of text to the Stream object and generates audio if enough text has been added. + * This function is expected to be called multiple times with consecutive chunks of text from a text stream. + * The incoming text is buffered as it arrives until there is enough context to convert a chunk of the + * buffered text into audio. The caller needs to use `OrcaStream.flush()` to generate the audio chunk + * for the remaining text that has not yet been synthesized. + * + * @param text A chunk of text from a text input stream, comprised of valid characters. + * Valid characters can be retrieved by calling `.getValidCharacters()`. + * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. + * They need to be added in a single call to this function. + * The pronunciation is expressed in ARPAbet format, e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`. + * @return The generated audio as a sequence of 16-bit linearly-encoded integers, `null` if no + * audio chunk has been produced. + * @throws OrcaException if there is an error while synthesizing audio. + */ + public short[] synthesize(String text) throws OrcaException { + if (handle == 0) { + throw new OrcaInvalidStateException( + "Attempted to call OrcaStream synthesize after delete." + ); + } + + if (stream == 0) { + throw new OrcaInvalidStateException( + "Attempted to call OrcaStream synthesize without an open stream." + ); + } + + short[] pcm = OrcaNative.streamSynthesize(stream, text); + + return pcm.length == 0 ? null : pcm; + } + + /** + * Generates audio for all the buffered text that was added to the OrcaStream object + * via `OrcaStream.synthesize()`. + * + * @return The generated audio as a sequence of 16-bit linearly-encoded integers, `null` if no + * audio chunk has been produced. + * @throws OrcaException if there is an error while flushing audio. 
+ */ + public short[] flush() throws OrcaException { + if (handle == 0) { + throw new OrcaInvalidStateException( + "Attempted to call OrcaStream flush after delete." + ); + } + + if (stream == 0) { + throw new OrcaInvalidStateException( + "Attempted to call OrcaStream flush without an open stream." + ); + } + + short[] pcm = OrcaNative.streamFlush(stream); + + return pcm.length == 0 ? null : pcm; + } + + /** + * Releases the resources acquired by the OrcaStream object. + */ + public void close() { + if (handle != 0 && stream != 0) { + OrcaNative.streamClose(stream); + stream = 0; + } + } + } + /** * Constructor. * @@ -47,6 +132,9 @@ private Orca(String accessKey, String modelPath) throws OrcaException { handle = OrcaNative.init( accessKey, modelPath); + maxCharacterLimit = OrcaNative.getMaxCharacterLimit(handle); + sampleRate = OrcaNative.getSampleRate(handle); + validCharacters = OrcaNative.getValidCharacters(handle); } public static void setSdk(String sdk) { @@ -93,10 +181,11 @@ public void delete() { * syntax `{word|pronunciation}`. The pronunciation is expressed in ARPAbet format, * e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`. * @param params Global parameters for synthesized text. See 'OrcaSynthesizeParams' for details. - * @return The output audio. + * @return An object containing the generated audio as a sequence of 16-bit linearly-encoded integers + * and an array of OrcaWord objects representing the word alignments. * @throws OrcaException if there is an error while synthesizing audio. */ - public short[] synthesize(String text, OrcaSynthesizeParams params) throws OrcaException { + public OrcaAudio synthesize(String text, OrcaSynthesizeParams params) throws OrcaException { if (handle == 0) { throw new OrcaInvalidStateException( "Attempted to call Orca synthesize after delete." @@ -106,7 +195,8 @@ public short[] synthesize(String text, OrcaSynthesizeParams params) throws OrcaE return OrcaNative.synthesize( handle, text, - params.getSpeechRate()); + params.getSpeechRate(), + params.getRandomState()); } /** @@ -121,9 +211,10 @@ public short[] synthesize(String text, OrcaSynthesizeParams params) throws OrcaE * @param outputPath Absolute path to the output audio file. The output file is saved as * `WAV (.wav)` and consists of a single mono channel. * @param params Global parameters for synthesized text. See 'OrcaSynthesizeParams' for details. + * @return An array of OrcaWord objects representing the word alignments. * @throws OrcaException if there is an error while synthesizing audio to file. */ - public void synthesizeToFile( + public OrcaWord[] synthesizeToFile( String text, String outputPath, OrcaSynthesizeParams params) throws OrcaException { @@ -133,11 +224,34 @@ public void synthesizeToFile( ); } - OrcaNative.synthesizeToFile( + OrcaAudio result = OrcaNative.synthesizeToFile( handle, text, outputPath, - params.getSpeechRate()); + params.getSpeechRate(), + params.getRandomState()); + + return result.getWordArray(); + } + + /** + * @param params Global parameters for synthesized text. See 'OrcaSynthesizeParams' for details. + * @return OrcaStream object. + * @throws OrcaException if there is an error while opening OrcaStream. + */ + public OrcaStream streamOpen(OrcaSynthesizeParams params) throws OrcaException { + if (handle == 0) { + throw new OrcaInvalidStateException( + "Attempted to call Orca streamOpen after delete." 
+ ); + } + + long stream = OrcaNative.streamOpen( + handle, + params.getSpeechRate(), + params.getRandomState()); + + return new OrcaStream(stream); } /** @@ -155,7 +269,7 @@ public String getVersion() { * @return The maximum number of characters that can be synthesized at once. */ public int getMaxCharacterLimit() { - return OrcaNative.getMaxCharacterLimit(); + return maxCharacterLimit; } /** @@ -163,14 +277,8 @@ public int getMaxCharacterLimit() { * * @return Audio sampling rate of the audio produced by Orca. */ - public int getSampleRate() throws OrcaException { - if (handle == 0) { - throw new OrcaInvalidStateException( - "Attempted to call Orca getSampleRate after delete." - ); - } - - return OrcaNative.getSampleRate(handle); + public int getSampleRate() { + return sampleRate; } /** @@ -178,14 +286,8 @@ public int getSampleRate() throws OrcaException { * * @return Array of characters that are accepted as input to Orca synthesize functions. */ - public String[] getValidCharacters() throws OrcaException { - if (handle == 0) { - throw new OrcaInvalidStateException( - "Attempted to call Orca getValidCharacters after delete." - ); - } - - return OrcaNative.getValidCharacters(handle); + public String[] getValidCharacters() { + return validCharacters; } /** diff --git a/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaAudio.java b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaAudio.java new file mode 100644 index 00000000..6f0d3311 --- /dev/null +++ b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaAudio.java @@ -0,0 +1,48 @@ +/* + Copyright 2024 Picovoice Inc. + + You may not use this file except in compliance with the license. A copy of the license is + located in the "LICENSE" file accompanying this source. + + Unless required by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + express or implied. See the License for the specific language governing permissions and + limitations under the License. +*/ + +package ai.picovoice.orca; + +public class OrcaAudio { + + private final short[] pcm; + private final OrcaWord[] wordArray; + + /** + * Constructor. + * + * @param pcm Synthesized audio. + * @param wordArray Synthesized words and their associated metadata. + */ + public OrcaAudio(short[] pcm, OrcaWord[] wordArray) { + this.pcm = pcm; + this.wordArray = wordArray; + } + + /** + * Getter for the synthesized audio. + * + * @return Synthesized audio. + */ + public short[] getPcm() { + return pcm; + } + + /** + * Getter for synthesized words and their associated metadata. + * + * @return Synthesized words and their associated metadata. 
+ */ + public OrcaWord[] getWordArray() { + return wordArray; + } +} diff --git a/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaNative.java b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaNative.java index 50307ea3..6a57cbdc 100644 --- a/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaNative.java +++ b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaNative.java @@ -26,16 +26,31 @@ class OrcaNative { static native String[] getValidCharacters(long object) throws OrcaException; - static native int getMaxCharacterLimit(); + static native int getMaxCharacterLimit(long object) throws OrcaException; - static native short[] synthesize( + static native OrcaAudio synthesize( long object, String text, - float speechRate) throws OrcaException; + float speechRate, + long randomState) throws OrcaException; - static native void synthesizeToFile( + static native OrcaAudio synthesizeToFile( long object, String text, String outputPath, - float speechRate) throws OrcaException; + float speechRate, + long randomState) throws OrcaException; + + static native long streamOpen( + long object, + float speechRate, + long randomState) throws OrcaException; + + static native short[] streamSynthesize( + long object, + String text) throws OrcaException; + + static native short[] streamFlush(long object) throws OrcaException; + + static native void streamClose(long object); } diff --git a/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaPhoneme.java b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaPhoneme.java new file mode 100644 index 00000000..d38b56a4 --- /dev/null +++ b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaPhoneme.java @@ -0,0 +1,60 @@ +/* + Copyright 2024 Picovoice Inc. + + You may not use this file except in compliance with the license. A copy of the license is + located in the "LICENSE" file accompanying this source. + + Unless required by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + express or implied. See the License for the specific language governing permissions and + limitations under the License. +*/ + +package ai.picovoice.orca; + +public class OrcaPhoneme { + + private final String phoneme; + private final float startSec; + private final float endSec; + + /** + * Constructor. + * + * @param phoneme Synthesized phoneme. + * @param startSec Start time of the phoneme in seconds. + * @param endSec End time of the phoneme in seconds. + */ + public OrcaPhoneme(String phoneme, float startSec, float endSec) { + this.phoneme = phoneme; + this.startSec = startSec; + this.endSec = endSec; + } + + /** + * Getter for the synthesized phoneme. + * + * @return Synthesized phoneme. + */ + public String getPhoneme() { + return phoneme; + } + + /** + * Getter for the start time of the phoneme in seconds. + * + * @return Start time of the phoneme in seconds. + */ + public float getStartSec() { + return startSec; + } + + /** + * Getter for the end time of the phoneme in seconds. + * + * @return End time of the phoneme in seconds. 
+ */ + public float getEndSec() { + return endSec; + } +} diff --git a/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaSynthesizeParams.java b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaSynthesizeParams.java index df787e2a..d749d7d9 100644 --- a/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaSynthesizeParams.java +++ b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaSynthesizeParams.java @@ -18,12 +18,14 @@ public class OrcaSynthesizeParams { private final float speechRate; + private final long randomState; /** * Constructor. */ - private OrcaSynthesizeParams(float speechRate) { + private OrcaSynthesizeParams(float speechRate, long randomState) { this.speechRate = speechRate; + this.randomState = randomState; } /** @@ -35,12 +37,22 @@ public float getSpeechRate() { return this.speechRate; } + /** + * Getter for the random state (i.e. the random state for the synthesized speech). + * + * @return Random State. + */ + public long getRandomState() { + return this.randomState; + } + /** * Builder for creating instance of OrcaSynthesizeParams. */ public static class Builder { private float speechRate = 1.0f; + private long randomState = -1; /** * Sets the speech rate. @@ -53,6 +65,17 @@ public Builder setSpeechRate(float speechRate) { return this; } + /** + * Sets the random state. + * + * @param randomState The random state for the synthesized speech. + * @return Modified builder object. + */ + public Builder setRandomState(long randomState) { + this.randomState = randomState; + return this; + } + /** * Validates properties and creates an instance of OrcaSynthesizeParams. * @@ -66,7 +89,7 @@ public OrcaSynthesizeParams build() throws OrcaInvalidArgumentException { ); } - return new OrcaSynthesizeParams(speechRate); + return new OrcaSynthesizeParams(speechRate, randomState); } } } diff --git a/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaWord.java b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaWord.java new file mode 100644 index 00000000..cb9b9868 --- /dev/null +++ b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaWord.java @@ -0,0 +1,72 @@ +/* + Copyright 2024 Picovoice Inc. + + You may not use this file except in compliance with the license. A copy of the license is + located in the "LICENSE" file accompanying this source. + + Unless required by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + express or implied. See the License for the specific language governing permissions and + limitations under the License. +*/ + +package ai.picovoice.orca; + +public class OrcaWord { + + private final String word; + private final float startSec; + private final float endSec; + private final OrcaPhoneme[] phonemeArray; + + /** + * Constructor. + * + * @param word Synthesized word. + * @param startSec Start time of the word in seconds. + * @param endSec End time of the word in seconds. + * @param phonemeArray Synthesized phonemes and their associated metadata. + */ + public OrcaWord(String word, float startSec, float endSec, OrcaPhoneme[] phonemeArray) { + this.word = word; + this.startSec = startSec; + this.endSec = endSec; + this.phonemeArray = phonemeArray; + } + + /** + * Getter for the synthesized word. + * + * @return Synthesized word. + */ + public String getWord() { + return word; + } + + /** + * Getter for the start time of the word in seconds. 
+ * + * @return Start time of the word in seconds. + */ + public float getStartSec() { + return startSec; + } + + /** + * Getter for the end time of the word in seconds. + * + * @return End time of the word in seconds. + */ + public float getEndSec() { + return endSec; + } + + /** + * Getter for synthesized phonemes and their associated metadata. + * + * @return Synthesized phonemes and their associated metadata. + */ + public OrcaPhoneme[] getPhonemeArray() { + return phonemeArray; + } +} diff --git a/binding/android/OrcaTestApp/copy_test_resources.sh b/binding/android/OrcaTestApp/copy_test_resources.sh index 5b967362..511e0eed 100755 --- a/binding/android/OrcaTestApp/copy_test_resources.sh +++ b/binding/android/OrcaTestApp/copy_test_resources.sh @@ -10,5 +10,11 @@ cp ../../../lib/common/*.pv ./orca-test-app/src/androidTest/assets/test_resource echo "Copying test data file..." cp ../../../resources/.test/test_data.json ./orca-test-app/src/androidTest/assets/test_resources -echo "Copying test model files ..." -cp ../../../resources/.test/models/*.pv ./orca-test-app/src/androidTest/assets/test_resources/model_files +if [ ! -d "./orca-test-app/src/androidTest/assets/test_resources/wav" ] +then + echo "Creating test model files directory..." + mkdir -p ./orca-test-app/src/androidTest/assets/test_resources/wav +fi + +echo "Copying test wav files..." +cp ../../../resources/.test/wav/*.wav ./orca-test-app/src/androidTest/assets/test_resources/wav diff --git a/binding/android/OrcaTestApp/orca-test-app/build.gradle b/binding/android/OrcaTestApp/orca-test-app/build.gradle index 5225afa2..b2fa669b 100644 --- a/binding/android/OrcaTestApp/orca-test-app/build.gradle +++ b/binding/android/OrcaTestApp/orca-test-app/build.gradle @@ -106,7 +106,7 @@ dependencies { implementation 'androidx.appcompat:appcompat:1.6.1' implementation 'com.google.android.material:material:1.8.0' implementation 'androidx.constraintlayout:constraintlayout:2.1.4' - implementation 'ai.picovoice:orca-android:0.1.0' + implementation 'ai.picovoice:orca-android:0.2.0' // Espresso UI Testing androidTestImplementation 'androidx.test.ext:junit:1.1.5' diff --git a/binding/android/OrcaTestApp/orca-test-app/src/androidTest/java/ai/picovoice/orca/testapp/BaseTest.java b/binding/android/OrcaTestApp/orca-test-app/src/androidTest/java/ai/picovoice/orca/testapp/BaseTest.java index 35f6c177..23bac17e 100644 --- a/binding/android/OrcaTestApp/orca-test-app/src/androidTest/java/ai/picovoice/orca/testapp/BaseTest.java +++ b/binding/android/OrcaTestApp/orca-test-app/src/androidTest/java/ai/picovoice/orca/testapp/BaseTest.java @@ -28,13 +28,23 @@ import java.io.BufferedInputStream; import java.io.BufferedOutputStream; +import java.io.ByteArrayOutputStream; import java.io.File; +import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import static org.junit.Assert.assertEquals; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; + +import ai.picovoice.orca.OrcaWord; +import ai.picovoice.orca.OrcaPhoneme; public class BaseTest { @@ -46,7 +56,6 @@ public class BaseTest { AssetManager assetManager; String testResourcesPath; JsonObject testJson; - String leopardModelPath; String accessKey; @Before @@ -89,35 +98,64 @@ public static String[] getModelFiles() { }; } - protected static float getWordErrorRate( - String transcript, - String expectedTranscript, - boolean useCER) { - String splitter = (useCER) ? 
"" : " "; - return (float) levenshteinDistance( - transcript.split(splitter), - expectedTranscript.split(splitter)) / transcript.length(); + protected static boolean compareArrays(short[] arr1, short[] arr2, int step) { + for (int i = 0; i < arr1.length - step; i += step) { + if (!(Math.abs(arr1[i] - arr2[i]) <= 500)) { + return false; + } + } + return true; } - private static int levenshteinDistance(String[] words1, String[] words2) { - int[][] res = new int[words1.length + 1][words2.length + 1]; - for (int i = 0; i <= words1.length; i++) { - res[i][0] = i; - } - for (int j = 0; j <= words2.length; j++) { - res[0][j] = j; + protected static short[] concatArrays(short[] existingArray, short[] arrayToAdd) { + short[] result = new short[existingArray.length + arrayToAdd.length]; + + System.arraycopy(existingArray, 0, result, 0, existingArray.length); + System.arraycopy(arrayToAdd, 0, result, existingArray.length, arrayToAdd.length); + + return result; + } + + protected static short[] readAudioFile(String audioFile) throws Exception { + FileInputStream audioInputStream = new FileInputStream(audioFile); + ByteArrayOutputStream audioByteBuffer = new ByteArrayOutputStream(); + byte[] buffer = new byte[1024]; + for (int length; (length = audioInputStream.read(buffer)) != -1; ) { + audioByteBuffer.write(buffer, 0, length); } - for (int i = 1; i <= words1.length; i++) { - for (int j = 1; j <= words2.length; j++) { - res[i][j] = Math.min( - Math.min( - res[i - 1][j] + 1, - res[i][j - 1] + 1), - res[i - 1][j - 1] + (words1[i - 1].equalsIgnoreCase(words2[j - 1]) ? 0 : 1) - ); + byte[] rawData = audioByteBuffer.toByteArray(); + + short[] pcm = new short[rawData.length / 2]; + ByteBuffer pcmBuff = ByteBuffer.wrap(rawData).order(ByteOrder.LITTLE_ENDIAN); + pcmBuff.asShortBuffer().get(pcm); + pcm = Arrays.copyOfRange(pcm, 22, pcm.length); + + return pcm; + } + + protected void validateMetadata( + OrcaWord[] words, + OrcaWord[] expectedWords, + boolean isExpectExact + ) { + assertEquals(words.length, expectedWords.length); + for (int i = 0; i < words.length; i++) { + assertEquals(words[i].getWord(), expectedWords[i].getWord()); + if (isExpectExact) { + assertEquals(words[i].getStartSec(), expectedWords[i].getStartSec(), 0.1); + assertEquals(words[i].getEndSec(), expectedWords[i].getEndSec(), 0.1); + } + OrcaPhoneme[] phonemes = words[i].getPhonemeArray(); + OrcaPhoneme[] expectedPhonemes = expectedWords[i].getPhonemeArray(); + assertEquals(phonemes.length, expectedPhonemes.length); + for (int j = 0; j < phonemes.length; j++) { + assertEquals(phonemes[j].getPhoneme(), expectedPhonemes[j].getPhoneme()); + if (isExpectExact) { + assertEquals(phonemes[j].getStartSec(), expectedPhonemes[j].getStartSec(), 0.1); + assertEquals(phonemes[j].getEndSec(), expectedPhonemes[j].getEndSec(), 0.1); + } } } - return res[words1.length][words2.length]; } private void extractAssetsRecursively(String path) throws IOException { diff --git a/binding/android/OrcaTestApp/orca-test-app/src/androidTest/java/ai/picovoice/orca/testapp/OrcaTest.java b/binding/android/OrcaTestApp/orca-test-app/src/androidTest/java/ai/picovoice/orca/testapp/OrcaTest.java index 45ab09c5..0f9bda25 100644 --- a/binding/android/OrcaTestApp/orca-test-app/src/androidTest/java/ai/picovoice/orca/testapp/OrcaTest.java +++ b/binding/android/OrcaTestApp/orca-test-app/src/androidTest/java/ai/picovoice/orca/testapp/OrcaTest.java @@ -12,12 +12,14 @@ package ai.picovoice.orca.testapp; +import static org.junit.Assert.assertArrayEquals; import static 
org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; +import com.google.gson.JsonArray; import com.google.gson.JsonObject; import org.junit.After; @@ -28,19 +30,20 @@ import org.junit.runners.Parameterized; import java.io.File; + import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.List; +import java.util.Objects; -import ai.picovoice.leopard.Leopard; - -import ai.picovoice.leopard.LeopardTranscript; import ai.picovoice.orca.Orca; +import ai.picovoice.orca.OrcaAudio; import ai.picovoice.orca.OrcaException; import ai.picovoice.orca.OrcaInvalidArgumentException; import ai.picovoice.orca.OrcaSynthesizeParams; - +import ai.picovoice.orca.OrcaWord; +import ai.picovoice.orca.OrcaPhoneme; @RunWith(Enclosed.class) public class OrcaTest { @@ -101,9 +104,14 @@ public static Collection initParameters() { String text; String textNoPunctuation; String textCustomPronunciation; + String textAlignment; + static JsonArray textInvalid; - float werThreshold; - String leopardModelPath; + long randomState; + static JsonArray alignments; + + String modelFileUsed; + String EXACT_ALIGNMENT_TEST_MODEL_IDENTIFIER = "female"; Orca orca; @@ -115,10 +123,13 @@ public void Setup() throws Exception { text = testSentences.get("text").getAsString(); textNoPunctuation = testSentences.get("text_no_punctuation").getAsString(); textCustomPronunciation = testSentences.get("text_custom_pronunciation").getAsString(); - werThreshold = testJson.get("wer_threshold").getAsFloat(); - leopardModelPath = new File( - testResourcesPath, - "model_files/leopard_params.pv").getAbsolutePath(); + textAlignment = testSentences.get("text_alignment").getAsString(); + textInvalid = testSentences.get("text_invalid").getAsJsonArray(); + + randomState = testJson.get("random_state").getAsLong(); + alignments = testJson.getAsJsonArray("alignments"); + + modelFileUsed = modelFile.contains("female") ? 
"female" : "male"; orca = new Orca.Builder() .setAccessKey(accessKey) @@ -133,11 +144,6 @@ public void TearDown() { } } - @Test - public void testMaxCharacterLimit() { - assertTrue(orca.getMaxCharacterLimit() > 0); - } - @Test public void testVersion() { final String version = orca.getVersion(); @@ -146,97 +152,244 @@ public void testVersion() { } @Test - public void testSampleRate() throws OrcaException { + public void testSampleRate() { assertTrue(orca.getSampleRate() > 0); } @Test - public void testValidCharacters() throws OrcaException { + public void testMaxCharacterLimit() { + assertTrue(orca.getMaxCharacterLimit() > 0); + } + + @Test + public void testValidCharacters() { String[] characters = orca.getValidCharacters(); assertTrue(characters.length > 0); assertTrue(Arrays.asList(characters).contains(",")); } @Test - public void testSynthesize() throws Exception { - Leopard leopard = new Leopard.Builder() - .setAccessKey(accessKey) - .setModelPath(leopardModelPath) - .build(appContext); + public void testStreaming() throws Exception { + Orca.OrcaStream orcaStream = orca.streamOpen( + new OrcaSynthesizeParams.Builder() + .setRandomState(randomState) + .build()); + + short[] fullPcm = new short[0]; + for (char c : text.toCharArray()) { + short[] pcm = orcaStream.synthesize(String.valueOf(c)); + if (pcm != null && pcm.length > 0) { + fullPcm = concatArrays(fullPcm, pcm); + } + } + + short[] flushedPcm = orcaStream.flush(); + if (flushedPcm != null && flushedPcm.length > 0) { + fullPcm = concatArrays(fullPcm, flushedPcm); + } + + orcaStream.close(); + short[] testFilePcm = readAudioFile(String.format( + "%s/wav/orca_params_%s_stream.wav", testResourcesPath, modelFileUsed)); - final short[] pcm = orca.synthesize( + compareArrays(fullPcm, testFilePcm, 1); + } + + @Test + public void testSynthesize() throws Exception { + final OrcaAudio pcm = orca.synthesize( text, - new OrcaSynthesizeParams.Builder().build()); + new OrcaSynthesizeParams.Builder() + .setRandomState(randomState) + .build()); - LeopardTranscript leopardTranscript = leopard.process(pcm); - leopard.delete(); - final float wer = getWordErrorRate( - leopardTranscript.getTranscriptString(), - textNoPunctuation, - false); - assertTrue(wer < werThreshold); + short[] testFilePcm = readAudioFile(String.format( + "%s/wav/orca_params_%s_single.wav", testResourcesPath, modelFileUsed)); + + compareArrays(pcm.getPcm(), testFilePcm, 1); } @Test public void testSynthesizeToFile() throws Exception { - Leopard leopard = new Leopard.Builder() - .setAccessKey(accessKey) - .setModelPath(leopardModelPath) - .build(appContext); - final File outputFile = new File( appContext.getFilesDir(), "text.wav"); orca.synthesizeToFile( text, outputFile.getAbsolutePath(), - new OrcaSynthesizeParams.Builder().build()); + new OrcaSynthesizeParams.Builder() + .setRandomState(randomState) + .build()); + + short[] outputFilePcm = readAudioFile(outputFile.getAbsolutePath()); + short[] testFilePcm = readAudioFile(String.format( + "%s/wav/orca_params_%s_single.wav", testResourcesPath, modelFileUsed)); - LeopardTranscript leopardTranscript = leopard.processFile(outputFile.getAbsolutePath()); + compareArrays(outputFilePcm, testFilePcm, 1); outputFile.delete(); - leopard.delete(); - final float wer = getWordErrorRate( - leopardTranscript.getTranscriptString(), + } + + @Test + public void testSynthesizeNoPronunciation() throws OrcaException { + final OrcaAudio result = orca.synthesize( textNoPunctuation, - false); - assertTrue(wer < werThreshold); + new 
OrcaSynthesizeParams.Builder() + .setRandomState(randomState) + .build()); + assertTrue(result.getPcm().length > 0); } @Test public void testSynthesizeCustomPronunciation() throws OrcaException { - final short[] pcm = orca.synthesize( + final OrcaAudio result = orca.synthesize( textCustomPronunciation, - new OrcaSynthesizeParams.Builder().build()); - assertTrue(pcm.length > 0); + new OrcaSynthesizeParams.Builder() + .setRandomState(randomState) + .build()); + assertTrue(result.getPcm().length > 0); + } + + @Test + public void testSynthesizeAlignment() throws OrcaException { + final OrcaAudio result = orca.synthesize( + textAlignment, + new OrcaSynthesizeParams.Builder() + .setRandomState(randomState) + .build()); + final OrcaWord[] synthesizeTestData = new OrcaWord[alignments.size()]; + for (int i = 0; i < alignments.size(); i++) { + final JsonObject testData = alignments.get(i).getAsJsonObject(); + final String word = testData.get("word").getAsString(); + final float startSec = testData.get("start_sec").getAsFloat(); + final float endSec = testData.get("end_sec").getAsFloat(); + final JsonArray phonemesJson = testData.getAsJsonArray("phonemes"); + final OrcaPhoneme[] phonemes = new OrcaPhoneme[phonemesJson.size()]; + for (int j = 0; j < phonemesJson.size(); j++) { + final JsonObject phonemeJson = phonemesJson.get(j).getAsJsonObject(); + phonemes[j] = new OrcaPhoneme( + phonemeJson.get("phoneme").getAsString(), + phonemeJson.get("start_sec").getAsFloat(), + phonemeJson.get("end_sec").getAsFloat()); + } + synthesizeTestData[i] = new OrcaWord( + word, + startSec, + endSec, + phonemes); + } + validateMetadata( + result.getWordArray(), + synthesizeTestData, + Objects.equals(modelFileUsed, EXACT_ALIGNMENT_TEST_MODEL_IDENTIFIER)); + } + + @Test + public void testSynthesizeToFileAlignment() throws OrcaException { + final File outputFile = new File( + appContext.getFilesDir(), + "text.wav"); + OrcaWord[] result = orca.synthesizeToFile( + textAlignment, + outputFile.getAbsolutePath(), + new OrcaSynthesizeParams.Builder() + .setRandomState(randomState) + .build()); + outputFile.delete(); + + final OrcaWord[] synthesizeTestData = new OrcaWord[alignments.size()]; + for (int i = 0; i < alignments.size(); i++) { + final JsonObject testData = alignments.get(i).getAsJsonObject(); + final String word = testData.get("word").getAsString(); + final float startSec = testData.get("start_sec").getAsFloat(); + final float endSec = testData.get("end_sec").getAsFloat(); + final JsonArray phonemesJson = testData.getAsJsonArray("phonemes"); + final OrcaPhoneme[] phonemes = new OrcaPhoneme[phonemesJson.size()]; + for (int j = 0; j < phonemesJson.size(); j++) { + final JsonObject phonemeJson = phonemesJson.get(j).getAsJsonObject(); + phonemes[j] = new OrcaPhoneme( + phonemeJson.get("phoneme").getAsString(), + phonemeJson.get("start_sec").getAsFloat(), + phonemeJson.get("end_sec").getAsFloat()); + } + synthesizeTestData[i] = new OrcaWord( + word, + startSec, + endSec, + phonemes); + } + validateMetadata( + result, + synthesizeTestData, + Objects.equals(modelFileUsed, EXACT_ALIGNMENT_TEST_MODEL_IDENTIFIER)); } @Test public void testSynthesizeSpeechRate() throws OrcaException { - final short[] pcmSlow = orca.synthesize( + final OrcaAudio slow = orca.synthesize( textCustomPronunciation, new OrcaSynthesizeParams.Builder() .setSpeechRate(0.7f) + .setRandomState(randomState) .build()); - assertTrue(pcmSlow.length > 0); + assertTrue(slow.getPcm().length > 0); - final short[] pcmFast = orca.synthesize( + final OrcaAudio fast = 
orca.synthesize( textCustomPronunciation, new OrcaSynthesizeParams.Builder() .setSpeechRate(1.3f) + .setRandomState(randomState) .build()); - assertTrue(pcmFast.length > 0); - assertTrue(pcmFast.length < pcmSlow.length); + assertTrue(slow.getPcm().length > 0); + assertTrue(fast.getPcm().length < slow.getPcm().length); try { orca.synthesize( textCustomPronunciation, new OrcaSynthesizeParams.Builder() .setSpeechRate(9999f) + .setRandomState(randomState) .build()); fail(); } catch (OrcaInvalidArgumentException e) { assertNotNull(e); } } + + @Test + public void testSynthesizeRandomState() throws OrcaException { + final OrcaAudio randomState1 = orca.synthesize( + text, + new OrcaSynthesizeParams.Builder() + .setRandomState(1) + .build()); + assertTrue(randomState1.getPcm().length > 0); + assertTrue(randomState1.getWordArray().length > 0); + + final OrcaAudio randomState2 = orca.synthesize( + text, + new OrcaSynthesizeParams.Builder() + .setRandomState(2) + .build()); + assertTrue(randomState2.getPcm().length > 0); + assertTrue(randomState2.getWordArray().length > 0); + + assertNotEquals(randomState1, randomState2); + assertNotEquals(randomState1.getWordArray(), randomState2.getWordArray()); + + final OrcaAudio randomStateNull = orca.synthesize( + text, + new OrcaSynthesizeParams.Builder() + .build()); + assertTrue(randomStateNull.getPcm().length > 0); + assertTrue(randomStateNull.getWordArray().length > 0); + + final OrcaAudio randomStateMaxValue = orca.synthesize( + text, + new OrcaSynthesizeParams.Builder() + .setRandomState(Long.MAX_VALUE) + .build()); + assertTrue(randomStateMaxValue.getPcm().length > 0); + assertTrue(randomStateMaxValue.getWordArray().length > 0); + } } } diff --git a/binding/android/OrcaTestApp/orca-test-app/src/main/java/ai/picovoice/orca/testapp/MainActivity.java b/binding/android/OrcaTestApp/orca-test-app/src/main/java/ai/picovoice/orca/testapp/MainActivity.java index 25ae1c27..d9349f5f 100644 --- a/binding/android/OrcaTestApp/orca-test-app/src/main/java/ai/picovoice/orca/testapp/MainActivity.java +++ b/binding/android/OrcaTestApp/orca-test-app/src/main/java/ai/picovoice/orca/testapp/MainActivity.java @@ -27,6 +27,7 @@ import java.util.HashMap; import ai.picovoice.orca.Orca; +import ai.picovoice.orca.OrcaAudio; import ai.picovoice.orca.OrcaException; import ai.picovoice.orca.OrcaSynthesizeParams; @@ -102,8 +103,8 @@ public void runTest() { result = new TestResult(); result.testName = "Test Synthesize"; try { - short[] pcm = orca.synthesize("Hello", new OrcaSynthesizeParams.Builder().build()); - if (pcm.length > 0) { + OrcaAudio orcaAudio = orca.synthesize("Hello", new OrcaSynthesizeParams.Builder().build()); + if (orcaAudio.getPcm().length > 0 && orcaAudio.getWordArray() != null) { result.success = true; } else { result.success = false; @@ -138,6 +139,30 @@ public void runTest() { results.add(result); } + result = new TestResult(); + result.testName = "Test Streaming"; + try { + Orca.OrcaStream orcaStream = orca.streamOpen(new OrcaSynthesizeParams.Builder().build()); + short[] pcm = orcaStream.synthesize("Hello"); + short[] flushedPcm = orcaStream.flush(); + orcaStream.close(); + + short[] pcm1 = pcm == null ? new short[0] : pcm; + short[] pcm2 = flushedPcm == null ? 
new short[0] : flushedPcm;
+                short[] streamPcm = new short[pcm1.length + pcm2.length];
+                if (streamPcm.length > 0) {
+                    result.success = true;
+                } else {
+                    result.success = false;
+                    result.errorMessage = "Stream returned invalid result.";
+                }
+            } catch (Exception e) {
+                result.success = false;
+                result.errorMessage = String.format("Failed to stream with '%s'", e);
+            } finally {
+                results.add(result);
+            }
+
         result = new TestResult();
         result.testName = "Test Exception";
         try {
diff --git a/binding/android/README.md b/binding/android/README.md
index 2b7b1690..48ceeb51 100644
--- a/binding/android/README.md
+++ b/binding/android/README.md
@@ -1,10 +1,11 @@
 # Orca Binding for Android

-## Orca Text-to-Speech Engine
+## Orca Streaming Text-to-Speech Engine

 Made in Vancouver, Canada by [Picovoice](https://picovoice.ai)

-Orca is an on-device text-to-speech engine producing high-quality, realistic, spoken audio with zero latency. Orca is:
+Orca is an on-device streaming text-to-speech engine that is designed for use with LLMs, enabling zero-latency
+voice assistants. Orca is:

 - Private; All voice processing runs locally.
 - Cross-Platform:
@@ -19,7 +20,8 @@ Orca is an on-device text-to-speech engine producing high-quality, realistic, sp

 ## Installation

-Orca can be found on Maven Central. To include the package in your Android project, ensure you have included `mavenCentral()` in your top-level `build.gradle` file and then add the following to your app's `build.gradle`:
+Orca can be found on Maven Central. To include the package in your Android project, ensure you have
+included `mavenCentral()` in your top-level `build.gradle` file and then add the following to your app's `build.gradle`:

 ```groovy
 dependencies {
@@ -37,13 +39,20 @@ Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get you

 ## Permissions

 To enable AccessKey validation, you must add the following line to your `AndroidManifest.xml` file:
+
 ```xml
-<uses-permission android:name="android.permission.INTERNET" />
+<uses-permission android:name="android.permission.INTERNET" />
+
 ```

 ## Usage

-Create an instance of the engine with the Orca Builder class by passing in the accessKey, modelPath and Android app context:
+Orca supports two modes of operation: streaming and single synthesis. In the streaming synthesis mode, Orca processes an
+incoming text stream in real-time and generates audio in parallel. In the single synthesis mode, a complete text is
+synthesized in a single call to the Orca engine.
+
+Create an instance of the engine with the Orca Builder class by passing in the accessKey, modelPath and Android app
+context:

 ```java
 import ai.picovoice.orca.*;
@@ -58,24 +67,72 @@ try {
 } catch (OrcaException ex) { }
 ```

-You can synthesize speech by calling one of the available `synthesize` methods:
+To synthesize a text stream, create an `OrcaStream` object and add text to it one-by-one:
+
+```java
+Orca.OrcaStream orcaStream = orca.streamOpen(new OrcaSynthesizeParams.Builder().build());
+
+for (String textChunk : textGenerator()) {
+    short[] pcm = orcaStream.synthesize(textChunk);
+    if (pcm != null) {
+        // handle pcm
+    }
+}
+
+short[] flushedPcm = orcaStream.flush();
+if (flushedPcm != null) {
+    // handle pcm
+}
+```
+
+The `textGenerator()` function can be any stream generating text, for example an LLM response.
+Orca produces audio chunks in parallel to the incoming text stream, and returns the raw PCM whenever enough context has
+been added via `orcaStream.synthesize()`.
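For illustration, here is a minimal sketch (not part of this diff) of such a generator-driven loop. The hypothetical `simulatedLlmChunks()` stands in for a real LLM token stream, and the `orca` instance is assumed to have been built with the Builder as shown above:

```java
import java.util.Arrays;
import java.util.List;

import ai.picovoice.orca.*;

class StreamingSketch {
    // Hypothetical stand-in for an LLM response: any source of text chunks works.
    static List<String> simulatedLlmChunks() {
        return Arrays.asList("Streaming synthesis ", "starts speaking ", "before the full ", "text is known.");
    }

    static void speak(Orca orca) throws OrcaException {
        Orca.OrcaStream orcaStream = orca.streamOpen(new OrcaSynthesizeParams.Builder().build());

        for (String chunk : simulatedLlmChunks()) {
            short[] pcm = orcaStream.synthesize(chunk);
            if (pcm != null) {
                // Enough context was buffered: hand this chunk to an audio sink (e.g. AudioTrack).
            }
        }

        short[] remainingPcm = orcaStream.flush();
        if (remainingPcm != null) {
            // Play the audio generated for the text still buffered at the end of the stream.
        }

        orcaStream.close();
    }
}
```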
+To ensure smooth transitions between chunks, the `orcaStream.synthesize()` function returns an audio chunk that only
+includes the audio for a portion of the text that has been added.
+To generate the audio for the remaining text, `orcaStream.flush()` needs to be invoked.
+When done with streaming text synthesis, the `OrcaStream` object needs to be closed:
+
+```java
+orcaStream.close();
+```
+
+If the complete text is known before synthesis, single synthesis mode can be used to generate speech in a single call to
+Orca:

 ```java
 OrcaSynthesizeParams params = new OrcaSynthesizeParams.Builder().build();

-// Return raw PCM
-short[] pcm = orca.synthesize("${TEXT}", params);
+// Return raw PCM and alignments
+OrcaAudio audio = orca.synthesize("${TEXT}", params);

 // Save the generated audio to a WAV file directly
-orca.synthesizeToFile("${TEXT}", "${OUTPUT_PATH}", params);
+OrcaWord[] orcaWords = orca.synthesizeToFile("${TEXT}", "${OUTPUT_PATH}", params);
+```
+
+Replace `${TEXT}` with the text to be synthesized and `${OUTPUT_PATH}` with the path to save the generated audio as a
+single-channel 16-bit PCM WAV file.
+In single synthesis mode, Orca returns metadata of the synthesized audio in the form of an array of `OrcaWord`
+objects.
+
+When done, make sure to explicitly release the resources using:
+
+```java
+orca.delete();
 ```

-Replace `${TEXT}` with the text to be synthesized (must be fewer characters than `.getMaxCharacterLimit()`). When using `synthesize`, the generated pcm has a sample rate equal to the one returned by `getSampleRate()`. When using `synthesizeToFile`, replace `${OUTPUT_PATH}` with the path to save the generated audio as a single-channel 16-bit PCM WAV file. When done make sure to explicitly release the resources with `orca.delete()`.
+### Text input

-### Text Input
+Orca accepts the 26 lowercase (a-z) and 26 uppercase (A-Z) letters of the English alphabet, numbers,
+basic symbols, as well as common punctuation marks. You can get a list of all supported characters by calling the
+`getValidCharacters()` method provided in the Orca SDK you are using.
+Pronunciations of characters or words not supported by this list can be achieved with
+[custom pronunciations](#custom-pronunciations).

-Orca accepts any character found in the list returned by the `getValidCharacters()` method.
-Pronunciations of characters or words not supported by this list can be achieved by embedding custom pronunciations in the text via the syntax: `{word|pronunciation}`. The pronunciation is expressed in [ARPAbet](https://en.wikipedia.org/wiki/ARPABET) phonemes, for example:
+### Custom pronunciations
+
+Orca allows embedding custom pronunciations in the text via the syntax: `{word|pronunciation}`.\
+The pronunciation is expressed in [ARPAbet](https://en.wikipedia.org/wiki/ARPABET) phonemes, for example:

 - "This is a {custom|K AH S T AH M} pronunciation"
 - "{read|R IY D} this as {read|R EH D}, please."
@@ -89,23 +146,44 @@ in [lib/common](../../lib/common).

 To add the Orca model file to your Android application:

 - Download the desired voice model from the [Orca GitHub repository](../../lib/common).
-- Add the model file as a bundled resource by placing it under the assets directory of your Android project (`src/main/assets/`).
+- Add the model file as a bundled resource by placing it under the assets directory of your Android
+  project (`src/main/assets/`).

 ### Additional Synthesis Controls

-Orca allows you to control the synthesized speech via the `OrcaSynthesizeParams` class.
You can pass in additional settings by using the nested Builder class: +Orca allows you to control the synthesized speech via the `OrcaSynthesizeParams` class. You can pass in additional +settings by using the nested Builder class: ```java import ai.picovoice.orca.*; OrcaSynthesizeParams params = new OrcaSynthesizeParams.Builder() .setSpeechRate(1.2f) + .setRandomState(1) .build(); ``` - `setSpeechRate()`: Controls the speed of the generated speech. Valid values are within [0.7, 1.3]. A higher value produces speech that is faster. The default is `1.0`. +- `setRandomState()`: Sets the random state for sampling during synthesis. This can be used to ensure that the + synthesized speech is deterministic across different runs. + +### Alignment Metadata + +Along with the raw PCM or saved audio file, Orca returns metadata for the synthesized audio in single synthesis mode. +The `OrcaWord` object has the following properties: + +- **Word:** String representation of the word. +- **Start Time:** Indicates when the word started in the synthesized audio. Value is in seconds. +- **End Time:** Indicates when the word ended in the synthesized audio. Value is in seconds. +- **Phonemes:** An array of `OrcaPhoneme` objects. + +The `OrcaPhoneme` object has the following properties: + +- **Phoneme:** String representation of the phoneme. +- **Start Time:** Indicates when the phoneme started in the synthesized audio. Value is in seconds. +- **End Time:** Indicates when the phoneme ended in the synthesized audio. Value is in seconds. ## Demos -To see Orca used in an app, refer to our [Android demo app](../../demo/android/OrcaDemo). \ No newline at end of file +To see Orca used in an app, refer to our [Android demo app](../../demo/android/OrcaDemo). diff --git a/binding/ios/Orca-iOS.podspec b/binding/ios/Orca-iOS.podspec index dcd7efaf..f7127f37 100644 --- a/binding/ios/Orca-iOS.podspec +++ b/binding/ios/Orca-iOS.podspec @@ -1,7 +1,7 @@ Pod::Spec.new do |s| s.name = 'Orca-iOS' s.module_name = 'Orca' - s.version = '0.1.0' + s.version = '0.2.0' s.license = {:type => 'Apache 2.0'} s.summary = 'iOS binding for Picovoice\'s Orca Text-to-Speech Engine.' s.description = @@ -18,7 +18,7 @@ Pod::Spec.new do |s| DESC s.homepage = 'https://github.com/Picovoice/orca/tree/main/binding/ios' s.author = { 'Picovoice' => 'hello@picovoice.ai' } - s.source = { :git => "https://github.com/Picovoice/orca.git", :tag => "Orca-iOS-v0.1.0" } + s.source = { :git => "https://github.com/Picovoice/orca.git", :tag => "Orca-iOS-v0.2.0" } s.ios.deployment_target = '13.0' s.swift_version = '5.0' s.vendored_frameworks = 'lib/ios/PvOrca.xcframework' diff --git a/binding/ios/Orca.swift b/binding/ios/Orca.swift index 1599990a..04cf5ea2 100644 --- a/binding/ios/Orca.swift +++ b/binding/ios/Orca.swift @@ -9,51 +9,193 @@ import PvOrca +public struct OrcaPhoneme { + + /// Synthesized phoneme. + public let phoneme: String + + /// Start of phoneme in seconds. + public let startSec: Float + + /// End of phoneme in seconds. + public let endSec: Float + + /// Constructor. + /// + /// - Parameters: + /// - phoneme: Synthesized phoneme. + /// - startSec: Start of phoneme in seconds. + /// - endSec: End of phoneme in seconds. + public init( + phoneme: String, + startSec: Float, + endSec: Float) { + self.phoneme = phoneme + self.startSec = startSec + self.endSec = endSec + } +} + +public struct OrcaWord { + + /// Synthesized word. + public let word: String + + /// Start of word in seconds. + public let startSec: Float + + /// End of word in seconds. 
+ public let endSec: Float + + /// Array of phonemes. + public let phonemeArray: [OrcaPhoneme] + + /// Constructor. + /// + /// - Parameters: + /// - word: Synthesized word. + /// - startSec: Start of word in seconds. + /// - endSec: End of word in seconds. + /// - phonemeArray: Array of phonemes. + public init( + word: String, + startSec: Float, + endSec: Float, + phonemeArray: [OrcaPhoneme]) { + self.word = word + self.startSec = startSec + self.endSec = endSec + self.phonemeArray = phonemeArray + } +} + /// iOS (Swift) binding for Orca Text-to-Speech engine. Provides a Swift interface to the Orca library. public class Orca { private var handle: OpaquePointer? + + private var stream: OpaquePointer? /// Orca valid symbols private var _validCharacters: Set? /// Orca sample rate private var _sampleRate: Int32? /// Maximum number of characters allowed in a single synthesis request. - public static let maxCharacterLimit = Int32(pv_orca_max_character_limit()) + private var _maxCharacterLimit: Int32? /// Orca version string public static let version = String(cString: pv_orca_version()) private static var sdk = "ios" + /// OrcaStream object that converts a stream of text to a stream of audio. + public class OrcaStream { + + private var orca: Orca + + private var stream: OpaquePointer? + + /// Adds a chunk of text to the OrcaStream object and generates audio if enough text has been added. + /// This function is expected to be called multiple times with consecutive chunks of text from a text stream. + /// The incoming text is buffered as it arrives until the length is long enough to convert a chunk of the + /// buffered text into audio. The caller needs to use `OrcaStream.flush()` to generate the audio chunk + /// for the remaining text that has not yet been synthesized. + /// + /// - Parameters: + /// - text: A chunk of text from a text input stream, comprised of valid characters. + /// Valid characters can be retrieved by calling `.validCharacters`. + /// Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. + /// They need to be added in a single call to this function. + /// The pronunciation is expressed in ARPAbet format, e.g.: "I {live|L IH V} in {Sevilla|S EH V IY Y AH}". + /// - Returns: The generated audio as a sequence of 16-bit linearly-encoded integers, `nil` if no + /// audio chunk has been produced. + /// - Throws: OrcaError + public func synthesize(text: String) throws -> [Int16]? { + if stream == nil { + throw OrcaInvalidStateError("Unable to synthesize - stream not open") + } + + var cNumSamples: Int32 = 0 + var cPcm: UnsafeMutablePointer? + + let status = pv_orca_stream_synthesize( + stream, + text, + &cNumSamples, + &cPcm) + if status != PV_STATUS_SUCCESS { + let messageStack = try orca.getMessageStack() + throw orca.pvStatusToOrcaError(status, "Unable to synthesize streaming speech", messageStack) + } + + let buffer = UnsafeBufferPointer(start: cPcm, count: Int(cNumSamples)) + let pcm = Array(buffer) + + pv_orca_pcm_delete(cPcm) + + return pcm.isEmpty ? nil : pcm + } + + /// Generates audio for all the buffered text that was added to the OrcaStream object + /// via `OrcaStream.synthesize()`. + /// + /// - Returns: The generated audio as a sequence of 16-bit linearly-encoded integers, `nil` if no + /// audio chunk has been produced. + /// - Throws: OrcaError + public func flush() throws -> [Int16]? 
{ + if stream == nil { + throw OrcaInvalidStateError("Unable to flush - stream not open") + } + + var cNumSamples: Int32 = 0 + var cPcm: UnsafeMutablePointer? + + let status = pv_orca_stream_flush( + stream, + &cNumSamples, + &cPcm) + if status != PV_STATUS_SUCCESS { + let messageStack = try orca.getMessageStack() + throw orca.pvStatusToOrcaError(status, "Unable to flush streaming speech", messageStack) + } + + let buffer = UnsafeBufferPointer(start: cPcm, count: Int(cNumSamples)) + let pcm = Array(buffer) + + pv_orca_pcm_delete(cPcm) + + return pcm.isEmpty ? nil : pcm + } + + /// Releases the resources acquired by the OrcaStream object. + public func close() { + if stream != nil { + pv_orca_stream_close(stream) + stream = nil + } + } + + public init(orca: Orca, stream: OpaquePointer) { + self.orca = orca + self.stream = stream + } + } + public static func setSdk(sdk: String) { self.sdk = sdk } /// Set of characters supported by Orca. - public var validCharacters: Set { - get throws { - if _validCharacters == nil { - _validCharacters = try getValidCharacters() - } - return _validCharacters! - } + public var validCharacters: Set? { + return self._validCharacters } /// Audio sample rate of generated audio. - public var sampleRate: Int32 { - get throws { - if _sampleRate == nil { - var cSampleRate: Int32 = 0 - let status = pv_orca_sample_rate(handle, &cSampleRate) - if status != PV_STATUS_SUCCESS { - let messageStack = try getMessageStack() - throw pvStatusToOrcaError(status, "Orca failed to get sample rate", messageStack) - } - - _sampleRate = cSampleRate - } + public var sampleRate: Int32? { + return self._sampleRate + } - return _sampleRate! - } + /// Maximum number of characters allowed per call to `synthesize()`. + public var maxCharacterLimit: Int32? { + return self._maxCharacterLimit } /// Constructor. @@ -73,11 +215,44 @@ public class Orca { pv_set_sdk(Orca.sdk) - let status = pv_orca_init(accessKey, modelPathArg, &handle) - if status != PV_STATUS_SUCCESS { + let initStatus = pv_orca_init(accessKey, modelPathArg, &handle) + if initStatus != PV_STATUS_SUCCESS { let messageStack = try getMessageStack() - throw pvStatusToOrcaError(status, "Orca init failed", messageStack) + throw pvStatusToOrcaError(initStatus, "Orca init failed", messageStack) } + + var cNumCharacters: Int32 = 0 + var cCharacters: UnsafeMutablePointer?>? + let validCharactersStatus = pv_orca_valid_characters(handle, &cNumCharacters, &cCharacters) + if validCharactersStatus != PV_STATUS_SUCCESS { + let messageStack = try getMessageStack() + throw pvStatusToOrcaError(validCharactersStatus, "Unable to get Orca valid characters", messageStack) + } + var validCharacters: Set = [] + for i in 0.. [Int16] { + public func synthesize( + text: String, + speechRate: Double? = nil, + randomState: Int64? = nil + ) throws -> (pcm: [Int16], wordArray: [OrcaWord]) { if handle == nil { throw OrcaInvalidStateError("Unable to synthesize - resources have been released") } - if text.count > Orca.maxCharacterLimit { + if text.count > self._maxCharacterLimit! { throw OrcaInvalidArgumentError( - "Text length (\(text.count)) must be smaller than \(Orca.maxCharacterLimit)") + "Text length (\(text.count)) must be smaller than \(self._maxCharacterLimit!)") } - let cSynthesizeParams = try getCSynthesizeParams(speechRate: speechRate) + let cSynthesizeParams = try getCSynthesizeParams(speechRate: speechRate, randomState: randomState) var cNumSamples: Int32 = 0 var cPcm: UnsafeMutablePointer? 
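        // The added lines below extend the 0.1.0 call pv_orca_synthesize(handle, text, cSynthesizeParams,
        // &cNumSamples, &cPcm) with two extra out-parameters, cNumAlignments and cAlignments, which the
        // C library fills with per-word timing and phoneme metadata; the binding unpacks them into the
        // returned [OrcaWord] array.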
- let status = pv_orca_synthesize(handle, text, cSynthesizeParams, &cNumSamples, &cPcm) + + var cNumAlignments: Int32 = 0 + var cAlignments: UnsafeMutablePointer?>? + + let status = pv_orca_synthesize( + handle, + text, + cSynthesizeParams, + &cNumSamples, + &cPcm, + &cNumAlignments, + &cAlignments) if status != PV_STATUS_SUCCESS { let messageStack = try getMessageStack() throw pvStatusToOrcaError(status, "Unable to synthesize speech", messageStack) @@ -140,10 +332,40 @@ public class Orca { let buffer = UnsafeBufferPointer(start: cPcm, count: Int(cNumSamples)) let pcm = Array(buffer) - pv_orca_delete_pcm(cPcm) + var wordArray = [OrcaWord]() + if let cAlignments = cAlignments { + for alignmentIndex in 0.. [OrcaWord] { if handle == nil { throw OrcaInvalidStateError("Unable to synthesize - resources have been released") } - if text.count > Orca.maxCharacterLimit { + if text.count > self._maxCharacterLimit! { throw OrcaInvalidArgumentError( - "Text length (\(text.count)) must be smaller than \(Orca.maxCharacterLimit)") + "Text length (\(text.count)) must be smaller than \(self._maxCharacterLimit!)") } - let cSynthesizeParams = try getCSynthesizeParams(speechRate: speechRate) + let cSynthesizeParams = try getCSynthesizeParams(speechRate: speechRate, randomState: randomState) - let status = pv_orca_synthesize_to_file(handle, text, cSynthesizeParams, outputPath) + var cNumAlignments: Int32 = 0 + var cAlignments: UnsafeMutablePointer?>? + + let status = pv_orca_synthesize_to_file( + handle, + text, + cSynthesizeParams, + outputPath, + &cNumAlignments, + &cAlignments) if status != PV_STATUS_SUCCESS { let messageStack = try getMessageStack() throw pvStatusToOrcaError(status, "Unable to synthesize speech to file", messageStack) } + var wordArray = [OrcaWord]() + if let cAlignments = cAlignments { + for alignmentIndex in 0.. [OrcaWord] { + try synthesizeToFile(text: text, outputPath: outputURL.path, speechRate: speechRate, randomState: randomState) } - private func getCSynthesizeParams(speechRate: Double? = nil) throws -> OpaquePointer? { + private func getCSynthesizeParams(speechRate: Double? = nil, randomState: Int64? = nil) throws -> OpaquePointer? { var cParams: OpaquePointer? var status = pv_orca_synthesize_params_init(&cParams) @@ -210,49 +487,60 @@ public class Orca { } } + if randomState != nil { + status = pv_orca_synthesize_params_set_random_state(cParams, randomState!) + if status != PV_STATUS_SUCCESS { + let messageStack = try getMessageStack() + throw pvStatusToOrcaError(status, "Unable to set Orca random state", messageStack) + } + } + return cParams } - private func getValidCharacters() throws -> Set { + /// Opens a stream for streaming text synthesis. + /// + /// - Parameters: + /// - speechRate: Rate of speech of the generated audio. Valid values are within [0.7, 1.3]. + /// - randomState: Random seed for the synthesis process. + /// - Returns: An instance of the OrcaStream class. + /// - Throws: OrcaError + public func streamOpen(speechRate: Double? = nil, randomState: Int64? = nil) throws -> OrcaStream { if handle == nil { - throw OrcaInvalidStateError("Unable to get valid characters - resources have been released") + throw OrcaInvalidStateError("Unable to synthesize - resources have been released") } - var cNumCharacters: Int32 = 0 - var cCharacters: UnsafePointer?>? 
- let status = pv_orca_valid_characters(handle, &cNumCharacters, &cCharacters) + let cSynthesizeParams = try getCSynthesizeParams(speechRate: speechRate, randomState: randomState) + + let status = pv_orca_stream_open( + handle, + cSynthesizeParams, + &stream) if status != PV_STATUS_SUCCESS { let messageStack = try getMessageStack() - throw pvStatusToOrcaError(status, "Unable to get Orca valid characters", messageStack) - } - - var characters: Set = [] - for i in 0.. String { - if let resourcePath = Bundle(for: type(of: self)).resourceURL?.appendingPathComponent(filePath).path { - if FileManager.default.fileExists(atPath: resourcePath) { - return resourcePath - } + /// + /// - Parameters: + /// - filePath: relative path of a file in the bundle. + /// - Throws: OrcaIOError + /// - Returns: The full path of the resource. + private func getResourcePath(_ filePath: String) throws -> String { + if let resourcePath = Bundle(for: type(of: self)).resourceURL?.appendingPathComponent(filePath).path { + if FileManager.default.fileExists(atPath: resourcePath) { + return resourcePath } - - throw OrcaIOError("Could not find file at path '\(filePath)'. " + - "If this is a packaged asset, ensure you have added it to your xcode project.") } + throw OrcaIOError("Could not find file at path '\(filePath)'. " + + "If this is a packaged asset, ensure you have added it to your xcode project.") + } + private func pvStatusToOrcaError( _ status: pv_status_t, _ message: String, diff --git a/binding/ios/OrcaAppTest/OrcaAppTest.xcodeproj/project.pbxproj b/binding/ios/OrcaAppTest/OrcaAppTest.xcodeproj/project.pbxproj index 215b4006..045c2669 100644 --- a/binding/ios/OrcaAppTest/OrcaAppTest.xcodeproj/project.pbxproj +++ b/binding/ios/OrcaAppTest/OrcaAppTest.xcodeproj/project.pbxproj @@ -18,10 +18,10 @@ 1EAEDDE12B745E6A003B8C18 /* BaseTest.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1EAEDDDF2B745E6A003B8C18 /* BaseTest.swift */; }; 1EAEDDE32B76A9DB003B8C18 /* test_resources in Resources */ = {isa = PBXBuildFile; fileRef = 1EAEDDE22B76A9DB003B8C18 /* test_resources */; }; 1EAEDDE42B76A9DB003B8C18 /* test_resources in Resources */ = {isa = PBXBuildFile; fileRef = 1EAEDDE22B76A9DB003B8C18 /* test_resources */; }; - 392C5FC2C59B4299F5FB7D3B /* libPods-OrcaAppTestUITests.a in Frameworks */ = {isa = PBXBuildFile; fileRef = B340446EF573C72BC1349E8E /* libPods-OrcaAppTestUITests.a */; }; - 50CC58C08AAD59C8922AC105 /* libPods-PerformanceTest.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C6572CB2CB09D2183B5C6617 /* libPods-PerformanceTest.a */; }; 6A9164E4B0B1626D27DBA0A1 /* BuildFile in Frameworks */ = {isa = PBXBuildFile; }; - 837665FCC740E76ED8323395 /* libPods-OrcaAppTest.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 8FFAEF3E81D2B0623D6C31C0 /* libPods-OrcaAppTest.a */; }; + 6E98C462BF64583E878F8D23 /* libPods-OrcaAppTest.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 687BAF06D6520BA1D344AE33 /* libPods-OrcaAppTest.a */; }; + A24BE54871C74F1054CEE31C /* libPods-PerformanceTest.a in Frameworks */ = {isa = PBXBuildFile; fileRef = B3584F517C501F073B3A2F40 /* libPods-PerformanceTest.a */; }; + C81664FA3D1463F091F643C7 /* libPods-OrcaAppTestUITests.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 73E1CBF84AA60FD3ED122CAC /* libPods-OrcaAppTestUITests.a */; }; /* End PBXBuildFile section */ /* Begin PBXContainerItemProxy section */ @@ -42,7 +42,6 @@ /* End PBXContainerItemProxy section */ /* Begin PBXFileReference section */ - 0402F655F9B10F18A7CE7EE2 /* Pods-PerformanceTest.debug.xcconfig */ = {isa = 
PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-PerformanceTest.debug.xcconfig"; path = "Target Support Files/Pods-PerformanceTest/Pods-PerformanceTest.debug.xcconfig"; sourceTree = ""; }; 1E00644827CEDF9B006FF6E9 /* OrcaAppTest.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = OrcaAppTest.app; sourceTree = BUILT_PRODUCTS_DIR; }; 1E00644B27CEDF9B006FF6E9 /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = ""; }; 1E00644F27CEDF9B006FF6E9 /* ViewController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ViewController.swift; sourceTree = ""; }; @@ -58,14 +57,15 @@ 1E5B7AEF2800B2E300F8BDDB /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; 1EAEDDDF2B745E6A003B8C18 /* BaseTest.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BaseTest.swift; sourceTree = ""; }; 1EAEDDE22B76A9DB003B8C18 /* test_resources */ = {isa = PBXFileReference; lastKnownFileType = folder; path = test_resources; sourceTree = ""; }; - 239D8F7DFD3E66DB68DD74C7 /* Pods-OrcaAppTestUITests.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaAppTestUITests.release.xcconfig"; path = "Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests.release.xcconfig"; sourceTree = ""; }; - 2C1B5037B36B1A9C62AE0816 /* Pods-OrcaAppTest.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaAppTest.debug.xcconfig"; path = "Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest.debug.xcconfig"; sourceTree = ""; }; - 7F000751C879143D6999BEC4 /* Pods-OrcaAppTestUITests.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaAppTestUITests.debug.xcconfig"; path = "Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests.debug.xcconfig"; sourceTree = ""; }; - 8FFAEF3E81D2B0623D6C31C0 /* libPods-OrcaAppTest.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-OrcaAppTest.a"; sourceTree = BUILT_PRODUCTS_DIR; }; - B340446EF573C72BC1349E8E /* libPods-OrcaAppTestUITests.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-OrcaAppTestUITests.a"; sourceTree = BUILT_PRODUCTS_DIR; }; - B9DD6D6C983DB5CCF60FA7F1 /* Pods-PerformanceTest.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-PerformanceTest.release.xcconfig"; path = "Target Support Files/Pods-PerformanceTest/Pods-PerformanceTest.release.xcconfig"; sourceTree = ""; }; - C6572CB2CB09D2183B5C6617 /* libPods-PerformanceTest.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-PerformanceTest.a"; sourceTree = BUILT_PRODUCTS_DIR; }; - FBAD61DE59260B66EB60F57A /* Pods-OrcaAppTest.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaAppTest.release.xcconfig"; path = "Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest.release.xcconfig"; sourceTree = ""; }; + 25F6359237FFDE5259713A21 /* Pods-PerformanceTest.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = 
"Pods-PerformanceTest.release.xcconfig"; path = "Target Support Files/Pods-PerformanceTest/Pods-PerformanceTest.release.xcconfig"; sourceTree = ""; }; + 687BAF06D6520BA1D344AE33 /* libPods-OrcaAppTest.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-OrcaAppTest.a"; sourceTree = BUILT_PRODUCTS_DIR; }; + 73E1CBF84AA60FD3ED122CAC /* libPods-OrcaAppTestUITests.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-OrcaAppTestUITests.a"; sourceTree = BUILT_PRODUCTS_DIR; }; + 7682693483C5AA5FF0787ED0 /* Pods-OrcaAppTestUITests.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaAppTestUITests.release.xcconfig"; path = "Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests.release.xcconfig"; sourceTree = ""; }; + B3584F517C501F073B3A2F40 /* libPods-PerformanceTest.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-PerformanceTest.a"; sourceTree = BUILT_PRODUCTS_DIR; }; + B4088BB1D8C7D5A5FF83708E /* Pods-OrcaAppTest.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaAppTest.debug.xcconfig"; path = "Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest.debug.xcconfig"; sourceTree = ""; }; + C155C82B6A142F4C976635FF /* Pods-OrcaAppTestUITests.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaAppTestUITests.debug.xcconfig"; path = "Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests.debug.xcconfig"; sourceTree = ""; }; + D1F64E4744AD0659F95DE5B5 /* Pods-OrcaAppTest.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaAppTest.release.xcconfig"; path = "Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest.release.xcconfig"; sourceTree = ""; }; + D541FCBB3EA63023F5190039 /* Pods-PerformanceTest.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-PerformanceTest.debug.xcconfig"; path = "Target Support Files/Pods-PerformanceTest/Pods-PerformanceTest.debug.xcconfig"; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -73,7 +73,7 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( - 837665FCC740E76ED8323395 /* libPods-OrcaAppTest.a in Frameworks */, + 6E98C462BF64583E878F8D23 /* libPods-OrcaAppTest.a in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -82,7 +82,7 @@ buildActionMask = 2147483647; files = ( 6A9164E4B0B1626D27DBA0A1 /* BuildFile in Frameworks */, - 392C5FC2C59B4299F5FB7D3B /* libPods-OrcaAppTestUITests.a in Frameworks */, + C81664FA3D1463F091F643C7 /* libPods-OrcaAppTestUITests.a in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -90,7 +90,7 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( - 50CC58C08AAD59C8922AC105 /* libPods-PerformanceTest.a in Frameworks */, + A24BE54871C74F1054CEE31C /* libPods-PerformanceTest.a in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -106,7 +106,7 @@ 1E00646B27CEDF9C006FF6E9 /* OrcaAppTestUITests */, 1E00644927CEDF9B006FF6E9 /* Products */, FA7D97C92E04F06D3273CCF3 /* Pods */, - D7256B2AE1CB33B60277231D /* Frameworks */, + F38640B51414B75BCC2F786D /* Frameworks */, ); sourceTree = ""; }; @@ -152,12 +152,12 @@ path = 
PerformanceTest; sourceTree = ""; }; - D7256B2AE1CB33B60277231D /* Frameworks */ = { + F38640B51414B75BCC2F786D /* Frameworks */ = { isa = PBXGroup; children = ( - 8FFAEF3E81D2B0623D6C31C0 /* libPods-OrcaAppTest.a */, - C6572CB2CB09D2183B5C6617 /* libPods-PerformanceTest.a */, - B340446EF573C72BC1349E8E /* libPods-OrcaAppTestUITests.a */, + 687BAF06D6520BA1D344AE33 /* libPods-OrcaAppTest.a */, + 73E1CBF84AA60FD3ED122CAC /* libPods-OrcaAppTestUITests.a */, + B3584F517C501F073B3A2F40 /* libPods-PerformanceTest.a */, ); name = Frameworks; sourceTree = ""; @@ -165,12 +165,12 @@ FA7D97C92E04F06D3273CCF3 /* Pods */ = { isa = PBXGroup; children = ( - 2C1B5037B36B1A9C62AE0816 /* Pods-OrcaAppTest.debug.xcconfig */, - FBAD61DE59260B66EB60F57A /* Pods-OrcaAppTest.release.xcconfig */, - 7F000751C879143D6999BEC4 /* Pods-OrcaAppTestUITests.debug.xcconfig */, - 239D8F7DFD3E66DB68DD74C7 /* Pods-OrcaAppTestUITests.release.xcconfig */, - 0402F655F9B10F18A7CE7EE2 /* Pods-PerformanceTest.debug.xcconfig */, - B9DD6D6C983DB5CCF60FA7F1 /* Pods-PerformanceTest.release.xcconfig */, + B4088BB1D8C7D5A5FF83708E /* Pods-OrcaAppTest.debug.xcconfig */, + D1F64E4744AD0659F95DE5B5 /* Pods-OrcaAppTest.release.xcconfig */, + C155C82B6A142F4C976635FF /* Pods-OrcaAppTestUITests.debug.xcconfig */, + 7682693483C5AA5FF0787ED0 /* Pods-OrcaAppTestUITests.release.xcconfig */, + D541FCBB3EA63023F5190039 /* Pods-PerformanceTest.debug.xcconfig */, + 25F6359237FFDE5259713A21 /* Pods-PerformanceTest.release.xcconfig */, ); path = Pods; sourceTree = ""; @@ -182,11 +182,11 @@ isa = PBXNativeTarget; buildConfigurationList = 1E00647227CEDF9C006FF6E9 /* Build configuration list for PBXNativeTarget "OrcaAppTest" */; buildPhases = ( - EC36C0D4EA212BA0832C79AF /* [CP] Check Pods Manifest.lock */, + 8E1BD0B920D35BE249B07D2C /* [CP] Check Pods Manifest.lock */, 1E00644427CEDF9B006FF6E9 /* Sources */, 1E00644527CEDF9B006FF6E9 /* Frameworks */, 1E00644627CEDF9B006FF6E9 /* Resources */, - 676B5C752F03BA675A1EB67A /* [CP] Embed Pods Frameworks */, + 21A9820ABBFC1F00206743E6 /* [CP] Embed Pods Frameworks */, ); buildRules = ( ); @@ -201,11 +201,11 @@ isa = PBXNativeTarget; buildConfigurationList = 1E00647827CEDF9C006FF6E9 /* Build configuration list for PBXNativeTarget "OrcaAppTestUITests" */; buildPhases = ( - 111DBCFB7BC184413D3B26CE /* [CP] Check Pods Manifest.lock */, + 0709E7DDD8FF7E7E665392C0 /* [CP] Check Pods Manifest.lock */, 1E00646427CEDF9C006FF6E9 /* Sources */, 1E00646527CEDF9C006FF6E9 /* Frameworks */, 1E00646627CEDF9C006FF6E9 /* Resources */, - 29E6F72AA55D35729B883608 /* [CP] Embed Pods Frameworks */, + 0F4644102E1332551FF466EA /* [CP] Embed Pods Frameworks */, ); buildRules = ( ); @@ -221,11 +221,11 @@ isa = PBXNativeTarget; buildConfigurationList = 1E5B7AE92800B29F00F8BDDB /* Build configuration list for PBXNativeTarget "PerformanceTest" */; buildPhases = ( - 1E5B7AE12800B29F00F8BDDB /* [CP] Check Pods Manifest.lock */, + D99C6A3E9E6311DFD1CA9C59 /* [CP] Check Pods Manifest.lock */, 1E5B7AE22800B29F00F8BDDB /* Sources */, 1E5B7AE42800B29F00F8BDDB /* Frameworks */, 1E5B7AE62800B29F00F8BDDB /* Resources */, - 1E5B7AE82800B29F00F8BDDB /* [CP] Embed Pods Frameworks */, + 7C76F0EC41B925B2353F2779 /* [CP] Embed Pods Frameworks */, ); buildRules = ( ); @@ -310,7 +310,7 @@ /* End PBXResourcesBuildPhase section */ /* Begin PBXShellScriptBuildPhase section */ - 111DBCFB7BC184413D3B26CE /* [CP] Check Pods Manifest.lock */ = { + 0709E7DDD8FF7E7E665392C0 /* [CP] Check Pods Manifest.lock */ = { isa = PBXShellScriptBuildPhase; buildActionMask = 
2147483647; files = ( @@ -332,80 +332,80 @@ shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n # print error to STDERR\n echo \"error: The sandbox is not in sync with the Podfile.lock. Run 'pod install' or update your CocoaPods installation.\" >&2\n exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n"; showEnvVarsInLog = 0; }; - 1E5B7AE12800B29F00F8BDDB /* [CP] Check Pods Manifest.lock */ = { + 0F4644102E1332551FF466EA /* [CP] Embed Pods Frameworks */ = { isa = PBXShellScriptBuildPhase; buildActionMask = 2147483647; files = ( ); inputFileListPaths = ( + "${PODS_ROOT}/Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests-frameworks-${CONFIGURATION}-input-files.xcfilelist", ); - inputPaths = ( - "${PODS_PODFILE_DIR_PATH}/Podfile.lock", - "${PODS_ROOT}/Manifest.lock", - ); - name = "[CP] Check Pods Manifest.lock"; + name = "[CP] Embed Pods Frameworks"; outputFileListPaths = ( - ); - outputPaths = ( - "$(DERIVED_FILE_DIR)/Pods-PerformanceTest-checkManifestLockResult.txt", + "${PODS_ROOT}/Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests-frameworks-${CONFIGURATION}-output-files.xcfilelist", ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n # print error to STDERR\n echo \"error: The sandbox is not in sync with the Podfile.lock. Run 'pod install' or update your CocoaPods installation.\" >&2\n exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n"; + shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests-frameworks.sh\"\n"; showEnvVarsInLog = 0; }; - 1E5B7AE82800B29F00F8BDDB /* [CP] Embed Pods Frameworks */ = { + 21A9820ABBFC1F00206743E6 /* [CP] Embed Pods Frameworks */ = { isa = PBXShellScriptBuildPhase; buildActionMask = 2147483647; files = ( ); inputFileListPaths = ( - "${PODS_ROOT}/Target Support Files/Pods-PerformanceTest/Pods-PerformanceTest-frameworks-${CONFIGURATION}-input-files.xcfilelist", + "${PODS_ROOT}/Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest-frameworks-${CONFIGURATION}-input-files.xcfilelist", ); name = "[CP] Embed Pods Frameworks"; outputFileListPaths = ( - "${PODS_ROOT}/Target Support Files/Pods-PerformanceTest/Pods-PerformanceTest-frameworks-${CONFIGURATION}-output-files.xcfilelist", + "${PODS_ROOT}/Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest-frameworks-${CONFIGURATION}-output-files.xcfilelist", ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-PerformanceTest/Pods-PerformanceTest-frameworks.sh\"\n"; + shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest-frameworks.sh\"\n"; showEnvVarsInLog = 0; }; - 29E6F72AA55D35729B883608 /* [CP] Embed Pods Frameworks */ = { + 7C76F0EC41B925B2353F2779 /* [CP] Embed Pods Frameworks */ = { isa = PBXShellScriptBuildPhase; buildActionMask = 2147483647; files = ( ); inputFileListPaths = ( - "${PODS_ROOT}/Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests-frameworks-${CONFIGURATION}-input-files.xcfilelist", + "${PODS_ROOT}/Target Support 
Files/Pods-PerformanceTest/Pods-PerformanceTest-frameworks-${CONFIGURATION}-input-files.xcfilelist", ); name = "[CP] Embed Pods Frameworks"; outputFileListPaths = ( - "${PODS_ROOT}/Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests-frameworks-${CONFIGURATION}-output-files.xcfilelist", + "${PODS_ROOT}/Target Support Files/Pods-PerformanceTest/Pods-PerformanceTest-frameworks-${CONFIGURATION}-output-files.xcfilelist", ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests-frameworks.sh\"\n"; + shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-PerformanceTest/Pods-PerformanceTest-frameworks.sh\"\n"; showEnvVarsInLog = 0; }; - 676B5C752F03BA675A1EB67A /* [CP] Embed Pods Frameworks */ = { + 8E1BD0B920D35BE249B07D2C /* [CP] Check Pods Manifest.lock */ = { isa = PBXShellScriptBuildPhase; buildActionMask = 2147483647; files = ( ); inputFileListPaths = ( - "${PODS_ROOT}/Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest-frameworks-${CONFIGURATION}-input-files.xcfilelist", ); - name = "[CP] Embed Pods Frameworks"; + inputPaths = ( + "${PODS_PODFILE_DIR_PATH}/Podfile.lock", + "${PODS_ROOT}/Manifest.lock", + ); + name = "[CP] Check Pods Manifest.lock"; outputFileListPaths = ( - "${PODS_ROOT}/Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest-frameworks-${CONFIGURATION}-output-files.xcfilelist", + ); + outputPaths = ( + "$(DERIVED_FILE_DIR)/Pods-OrcaAppTest-checkManifestLockResult.txt", ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest-frameworks.sh\"\n"; + shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n # print error to STDERR\n echo \"error: The sandbox is not in sync with the Podfile.lock. 
Run 'pod install' or update your CocoaPods installation.\" >&2\n exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n"; showEnvVarsInLog = 0; }; - EC36C0D4EA212BA0832C79AF /* [CP] Check Pods Manifest.lock */ = { + D99C6A3E9E6311DFD1CA9C59 /* [CP] Check Pods Manifest.lock */ = { isa = PBXShellScriptBuildPhase; buildActionMask = 2147483647; files = ( @@ -420,7 +420,7 @@ outputFileListPaths = ( ); outputPaths = ( - "$(DERIVED_FILE_DIR)/Pods-OrcaAppTest-checkManifestLockResult.txt", + "$(DERIVED_FILE_DIR)/Pods-PerformanceTest-checkManifestLockResult.txt", ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; @@ -608,18 +608,20 @@ }; 1E00647327CEDF9C006FF6E9 /* Debug */ = { isa = XCBuildConfiguration; - baseConfigurationReference = 2C1B5037B36B1A9C62AE0816 /* Pods-OrcaAppTest.debug.xcconfig */; + baseConfigurationReference = B4088BB1D8C7D5A5FF83708E /* Pods-OrcaAppTest.debug.xcconfig */; buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 65723695GD; + DEVELOPMENT_TEAM = ""; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_FILE = OrcaAppTest/Info.plist; + INFOPLIST_KEY_CFBundleDisplayName = OrcaDemoApp; INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen; INFOPLIST_KEY_UIMainStoryboardFile = Main; + INFOPLIST_KEY_UIRequiredDeviceCapabilities = armv7; INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; LD_RUNPATH_SEARCH_PATHS = ( @@ -637,18 +639,20 @@ }; 1E00647427CEDF9C006FF6E9 /* Release */ = { isa = XCBuildConfiguration; - baseConfigurationReference = FBAD61DE59260B66EB60F57A /* Pods-OrcaAppTest.release.xcconfig */; + baseConfigurationReference = D1F64E4744AD0659F95DE5B5 /* Pods-OrcaAppTest.release.xcconfig */; buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 65723695GD; + DEVELOPMENT_TEAM = ""; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_FILE = OrcaAppTest/Info.plist; + INFOPLIST_KEY_CFBundleDisplayName = OrcaDemoApp; INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen; INFOPLIST_KEY_UIMainStoryboardFile = Main; + INFOPLIST_KEY_UIRequiredDeviceCapabilities = armv7; INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; LD_RUNPATH_SEARCH_PATHS = ( @@ -666,11 +670,11 @@ }; 1E00647927CEDF9C006FF6E9 /* Debug */ = { isa = XCBuildConfiguration; - baseConfigurationReference = 7F000751C879143D6999BEC4 /* Pods-OrcaAppTestUITests.debug.xcconfig */; + baseConfigurationReference = C155C82B6A142F4C976635FF /* Pods-OrcaAppTestUITests.debug.xcconfig */; buildSettings = { 
CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 65723695GD; + DEVELOPMENT_TEAM = ""; GENERATE_INFOPLIST_FILE = YES; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", @@ -689,11 +693,11 @@ }; 1E00647A27CEDF9C006FF6E9 /* Release */ = { isa = XCBuildConfiguration; - baseConfigurationReference = 239D8F7DFD3E66DB68DD74C7 /* Pods-OrcaAppTestUITests.release.xcconfig */; + baseConfigurationReference = 7682693483C5AA5FF0787ED0 /* Pods-OrcaAppTestUITests.release.xcconfig */; buildSettings = { CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 65723695GD; + DEVELOPMENT_TEAM = ""; GENERATE_INFOPLIST_FILE = YES; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", @@ -712,12 +716,12 @@ }; 1E5B7AEA2800B29F00F8BDDB /* Debug */ = { isa = XCBuildConfiguration; - baseConfigurationReference = 0402F655F9B10F18A7CE7EE2 /* Pods-PerformanceTest.debug.xcconfig */; + baseConfigurationReference = D541FCBB3EA63023F5190039 /* Pods-PerformanceTest.debug.xcconfig */; buildSettings = { CLANG_ENABLE_MODULES = YES; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 65723695GD; + DEVELOPMENT_TEAM = ""; GENERATE_INFOPLIST_FILE = YES; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", @@ -737,12 +741,12 @@ }; 1E5B7AEB2800B29F00F8BDDB /* Release */ = { isa = XCBuildConfiguration; - baseConfigurationReference = B9DD6D6C983DB5CCF60FA7F1 /* Pods-PerformanceTest.release.xcconfig */; + baseConfigurationReference = 25F6359237FFDE5259713A21 /* Pods-PerformanceTest.release.xcconfig */; buildSettings = { CLANG_ENABLE_MODULES = YES; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 65723695GD; + DEVELOPMENT_TEAM = ""; GENERATE_INFOPLIST_FILE = YES; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", diff --git a/binding/ios/OrcaAppTest/OrcaAppTest.xcodeproj/xcshareddata/xcschemes/OrcaAppTest.xcscheme b/binding/ios/OrcaAppTest/OrcaAppTest.xcodeproj/xcshareddata/xcschemes/OrcaAppTest.xcscheme index c351e8e5..5aadb9b4 100644 --- a/binding/ios/OrcaAppTest/OrcaAppTest.xcodeproj/xcshareddata/xcschemes/OrcaAppTest.xcscheme +++ b/binding/ios/OrcaAppTest/OrcaAppTest.xcodeproj/xcshareddata/xcschemes/OrcaAppTest.xcscheme @@ -60,6 +60,13 @@ ReferencedContainer = "container:OrcaAppTest.xcodeproj"> + + + + CFBundleDevelopmentRegion $(DEVELOPMENT_LANGUAGE) CFBundleDisplayName - PorcupineDemoApp + OrcaDemoApp CFBundleExecutable $(EXECUTABLE_NAME) CFBundleIdentifier diff --git a/binding/ios/OrcaAppTest/OrcaAppTestUITests/BaseTest.swift b/binding/ios/OrcaAppTest/OrcaAppTestUITests/BaseTest.swift index 5269e9fa..4228c9fc 100644 --- a/binding/ios/OrcaAppTest/OrcaAppTestUITests/BaseTest.swift +++ b/binding/ios/OrcaAppTest/OrcaAppTestUITests/BaseTest.swift @@ -14,60 +14,34 @@ import Orca struct TestData: Decodable { var test_sentences: TestSentences - var wer_threshold: Float32 + var random_state: Int64 + var alignments: [TestAlignments] } struct TestSentences: Decodable { var text: String var text_no_punctuation: String var text_custom_pronunciation: String + var text_alignment: String var text_invalid: [String] } -extension String { - subscript(index: Int) -> Character { - return self[self.index(self.startIndex, offsetBy: index)] - } +struct TestAlignments: Decodable { + var word: String + var start_sec: Float + var end_sec: Float + var phonemes: [TestPhonemes] } -extension String { - public func levenshtein(_ other: String) -> Int { - let sCount = self.count - let oCount = other.count - - guard sCount != 0 else { - return oCount - } - - guard oCount != 0 else { - return 
sCount - } - - let line: [Int] = Array(repeating: 0, count: oCount + 1) - var mat: [[Int]] = Array(repeating: line, count: sCount + 1) - - for i in 0...sCount { - mat[i][0] = i - } - - for j in 0...oCount { - mat[0][j] = j - } - - for j in 1...oCount { - for i in 1...sCount { - if self[i - 1] == other[j - 1] { - mat[i][j] = mat[i - 1][j - 1] // no operation - } else { - let del = mat[i - 1][j] + 1 // deletion - let ins = mat[i][j - 1] + 1 // insertion - let sub = mat[i - 1][j - 1] + 1 // substitution - mat[i][j] = min(min(del, ins), sub) - } - } - } +struct TestPhonemes: Decodable { + var phoneme: String + var start_sec: Float + var end_sec: Float +} - return mat[sCount][oCount] +extension String { + subscript(index: Int) -> Character { + return self[self.index(self.startIndex, offsetBy: index)] } } @@ -81,6 +55,15 @@ class BaseTest: XCTestCase { var orcas: [Orca] = [] var testData: TestData? + let testAudioMaleSingle = Bundle(for: BaseTest.self) + .url(forResource: "test_resources/wav/orca_params_male_single", withExtension: "wav")! + let testAudioMaleStream = Bundle(for: BaseTest.self) + .url(forResource: "test_resources/wav/orca_params_male_stream", withExtension: "wav")! + let testAudioFemaleSingle = Bundle(for: BaseTest.self) + .url(forResource: "test_resources/wav/orca_params_female_single", withExtension: "wav")! + let testAudioFemaleStream = Bundle(for: BaseTest.self) + .url(forResource: "test_resources/wav/orca_params_female_stream", withExtension: "wav")! + override func setUp() async throws { try await super.setUp() @@ -118,7 +101,45 @@ class BaseTest: XCTestCase { return testData } - func characterErrorRate(transcript: String, expectedTranscript: String) -> Float { - return Float(transcript.levenshtein(expectedTranscript)) / Float(expectedTranscript.count) + func compareArrays(arr1: [Int16], arr2: [Int16], step: Int) -> Bool { + for i in stride(from: 0, to: arr1.count - step, by: step) where !(abs(arr1[i] - arr2[i]) <= 500) { + return false + } + return true + } + + func getPcm(fileUrl: URL) throws -> [Int16] { + let data = try Data(contentsOf: fileUrl) + let pcmData = data.withUnsafeBytes { (ptr: UnsafePointer) -> [Int16] in + let count = data.count / MemoryLayout.size + return Array(UnsafeBufferPointer(start: ptr.advanced(by: 22), count: count - 22)) + } + return pcmData + } + + func validateMetadata(words: [OrcaWord], expectedWords: [OrcaWord], isExpectExact: Bool) { + XCTAssertEqual(words.count, expectedWords.count) + + for i in 0.. 
0 { results.append(totalNSec) } - orca?.delete() + orca.delete() } let avgNSec = results.reduce(0.0, +) / Double(numTestIterations) diff --git a/binding/ios/OrcaAppTest/Podfile b/binding/ios/OrcaAppTest/Podfile index 0cafd440..f7606386 100644 --- a/binding/ios/OrcaAppTest/Podfile +++ b/binding/ios/OrcaAppTest/Podfile @@ -2,16 +2,13 @@ source 'https://cdn.cocoapods.org/' platform :ios, '13.0' target 'OrcaAppTest' do - pod 'Orca-iOS', '~> 0.1.0' - pod 'Leopard-iOS', '~> 2.0.1' + pod 'Orca-iOS', '~> 0.2.0' end target 'OrcaAppTestUITests' do - pod 'Orca-iOS', '~> 0.1.0' - pod 'Leopard-iOS', '~> 2.0.1' + pod 'Orca-iOS', '~> 0.2.0' end target 'PerformanceTest' do - pod 'Orca-iOS', '~> 0.1.0' - pod 'Leopard-iOS', '~> 2.0.1' + pod 'Orca-iOS', '~> 0.2.0' end diff --git a/binding/ios/OrcaAppTest/Podfile.lock b/binding/ios/OrcaAppTest/Podfile.lock index 9776fe52..41e5ed08 100644 --- a/binding/ios/OrcaAppTest/Podfile.lock +++ b/binding/ios/OrcaAppTest/Podfile.lock @@ -1,20 +1,16 @@ PODS: - - Leopard-iOS (2.0.1) - - Orca-iOS (0.1.0) + - Orca-iOS (0.2.0) DEPENDENCIES: - - Leopard-iOS (~> 2.0.1) - - Orca-iOS (~> 0.1.0) + - Orca-iOS (~> 0.2.0) SPEC REPOS: trunk: - - Leopard-iOS - Orca-iOS SPEC CHECKSUMS: - Leopard-iOS: 8c94dcf886800b4ed361c1c6af763780ef16f722 - Orca-iOS: 808b4c77678454905ea0a0c1408eff8f9255e3ac + Orca-iOS: 01bbf44ba52a102104fc09aded6bfda7beb4865e -PODFILE CHECKSUM: 1ab9a668595c361f16dadb12876e074b4092d531 +PODFILE CHECKSUM: 01d0a4d9f05893e5371be0f1775f4f59ed59da27 -COCOAPODS: 1.11.3 +COCOAPODS: 1.15.2 diff --git a/binding/ios/OrcaAppTest/copy_test_resources.sh b/binding/ios/OrcaAppTest/copy_test_resources.sh index deee665b..2c850b78 100755 --- a/binding/ios/OrcaAppTest/copy_test_resources.sh +++ b/binding/ios/OrcaAppTest/copy_test_resources.sh @@ -6,9 +6,9 @@ echo "Copying test model files..." mkdir -p ${ASSETS_DIR}/model_files cp ${LIB_DIR}/common/*.pv ${ASSETS_DIR}/model_files -echo "Copying Leopard model files..." -mkdir -p ${ASSETS_DIR}/model_files -cp ${RESOURCE_DIR}/.test/models/*.pv ${ASSETS_DIR}/model_files +echo "Copying wav files..." +mkdir -p ${ASSETS_DIR}/wav +cp ${RESOURCE_DIR}/.test/wav/*.wav ${ASSETS_DIR}/wav echo "Copying test data file..." -cp ${RESOURCE_DIR}/.test/test_data.json ${ASSETS_DIR} \ No newline at end of file +cp ${RESOURCE_DIR}/.test/test_data.json ${ASSETS_DIR} diff --git a/binding/ios/README.md b/binding/ios/README.md index 7bd8dea2..a232528f 100644 --- a/binding/ios/README.md +++ b/binding/ios/README.md @@ -1,8 +1,9 @@ -# Orca Text-to-Speech Engine +# Orca Streaming Text-to-Speech Engine Made in Vancouver, Canada by [Picovoice](https://picovoice.ai) -Orca is an on-device text-to-speech engine producing high-quality, realistic, spoken audio with zero latency. Orca is: +Orca is an on-device streaming text-to-speech engine that is designed for use with LLMs, enabling zero-latency voice +assistants. Orca is: - Private; All voice processing runs locally. - Cross-Platform: @@ -18,7 +19,8 @@ Orca is an on-device text-to-speech engine producing high-quality, realistic, sp ## Installation -The Orca iOS binding is available via [Cocoapods](https://cocoapods.org/pods/Orca-iOS). To import it into your iOS project, add the following line to your Podfile and run `pod install`: +The Orca iOS binding is available via [Cocoapods](https://cocoapods.org/pods/Orca-iOS). 
To import it into your iOS +project, add the following line to your Podfile and run `pod install`: ```ruby @@ -27,13 +29,18 @@ pod 'Orca-iOS' ## AccessKey -Orca requires a valid Picovoice `AccessKey` at initialization. `AccessKey` acts as your credentials when using Orca SDKs. +Orca requires a valid Picovoice `AccessKey` at initialization. `AccessKey` acts as your credentials when using Orca +SDKs. You can get your `AccessKey` for free. Make sure to keep your `AccessKey` secret. Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get your `AccessKey`. ## Usage -Create an instance of the engine: +Orca supports two modes of operation: streaming and single synthesis. +In the streaming synthesis mode, Orca processes an incoming text stream in real-time and generates audio in parallel. +In the single synthesis mode, a complete text is synthesized in a single call to the Orca engine. + +Create an instance of the Orca engine: ```swift import Orca @@ -51,25 +58,65 @@ do { Alternatively, you can provide `modelPath` as an absolute path to the model file on device. -You can synthesize speech by calling one of the `synthesize` methods: +To synthesize a text stream, create an `Orca.OrcaStream` object and add text to it one-by-one: + +```swift +let orcaStream = try orca.streamOpen() + +for textChunk in textGenerator() { + let pcm = orcaStream.synthesize(textChunk) + if pcm != nil { + // handle pcm + } +} + +let pcm = orcaStream.flush() +if pcm != nil { + // handle pcm +} +``` + +The `textGenerator()` function can be any stream generating text, for example an LLM response. +Orca produces audio chunks in parallel to the incoming text stream, and returns the raw PCM whenever enough context has +been added via `orcaStream.synthesize()`. +To ensure smooth transitions between chunks, the `orcaStream.synthesize()` function returns an audio chunk that only +includes the audio for a portion of the text that has been added. +To generate the audio for the remaining text, `orcaStream.flush()` needs to be invoked. +When done with streaming text synthesis, the `Orca.OrcaStream` object needs to be closed: + +```swift +orcaStream.close() +``` + +If the complete text is known before synthesis, single synthesis mode can be used to generate speech in a single call to +Orca: ```swift -// return raw pcm -let pcm = try orca.synthesize(text: "${TEXT}") +// Return raw PCM and alignments +let (pcm, wordArray) = try orca.synthesize(text: "${TEXT}") -// save to a file -try orca.synthesizeToFile(text: "${TEXT}", outputPath: "${OUTPUT_PATH}") +// Save the generated audio to a WAV file directly +let wordArray = try orca.synthesizeToFile(text: "${TEXT}", outputPath: "${OUTPUT_PATH}") ``` Replace `${TEXT}` with the text to be synthesized and `${OUTPUT_PATH}` with the path to save the generated audio as a single-channel 16-bit PCM WAV file. - +In single synthesis mode, Orca returns metadata of the synthesized audio in the form of an array of `OrcaWord` +objects. When done, resources have to be released explicitly: ```swift orca.delete() ``` +### Text input + +Orca accepts the 26 lowercase (a-z) and 26 uppercase (A-Z) letters of the English alphabet, numbers, +basic symbols, as well as common punctuation marks. You can get a list of all supported characters by calling the +`validCharacters()` method provided in the Orca SDK you are using. +Pronunciations of characters or words not supported by this list can be achieved with +[custom pronunciations](#custom-pronunciations). 
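The valid-character rule above can be checked programmatically before text reaches the engine. Below is a minimal sketch, not part of this patch, written against the Python binding (`pvorca`) that this same changeset updates; the `sanitize` helper name is hypothetical:

```python
import pvorca

# Create an engine instance (AccessKey from Picovoice Console).
orca = pvorca.create(access_key='${ACCESS_KEY}')

def sanitize(text: str) -> str:
    # valid_characters is exposed as a property in this patch; drop
    # anything Orca cannot synthesize instead of letting it raise.
    valid = orca.valid_characters
    return ''.join(c for c in text if c in valid)

pcm, alignments = orca.synthesize(text=sanitize('Hello, world!'))
orca.delete()
```

Silently dropping characters is only one policy; rejecting the input with an error may be preferable when the text comes from a user rather than an LLM.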
+ ### Custom pronunciations Orca allows to embed custom pronunciations in the text via the syntax: `{word|pronunciation}`.\ @@ -99,26 +146,48 @@ and replace `${MODEL_FILE_PATH}` or `${MODEL_FILE_URL}` with the path to the mod ### Speech control -Orca allows for keyword arguments to be provided to the `synthesize` methods to control the synthesized speech: +Orca allows for keyword arguments to control the synthesized speech. They can be provided to the `streamOpen` +method or the single synthesis methods `synthesize` and `synthesizeToFile`: - `speechRate`: Controls the speed of the generated speech. Valid values are within [0.7, 1.3]. A higher (lower) value produces speech that is faster (slower). The default is `1.0`. +- `randomState`: Sets the random state for sampling during synthesis. This can be used to ensure that the synthesized + speech is deterministic across different runs. ```swift let pcm = orca.synthesize( text: "${TEXT}", - speechRate: 1.0) + speechRate: 1.0, + randomState: 1) ``` ### Orca properties -To obtain the set of valid punctuation symbols, call `Orca.validPunctuationSymbols`. +To obtain the set of valid characters, call `Orca.validCharacters`. To retrieve the maximum number of characters allowed, call `Orca.maxCharacterLimit`. The sample rate of Orca is `Orca.sampleRate`. +### Alignment Metadata + +Along with the raw PCM or saved audio file, Orca returns metadata for the synthesized audio in single synthesis mode. +The `OrcaWord` object has the following properties: + +- **Word:** String representation of the word. +- **Start Time:** Indicates when the word started in the synthesized audio. Value is in seconds. +- **End Time:** Indicates when the word ended in the synthesized audio. Value is in seconds. +- **Phonemes:** An array of `OrcaPhoneme` objects. + +The `OrcaPhoneme` object has the following properties: + +- **Phoneme:** String representation of the phoneme. +- **Start Time:** Indicates when the phoneme started in the synthesized audio. Value is in seconds. +- **End Time:** Indicates when the phoneme ended in the synthesized audio. Value is in seconds. + ## Running Unit Tests -Copy your `AccessKey` into the `accessKey` variable in [`OrcaAppTestUITests.swift`](OrcaAppTest/OrcaAppTestUITests/OrcaAppTestUITests.swift). Open `OrcaAppTest.xcworkspace` with XCode and run the tests with `Product > Test`. +Copy your `AccessKey` into the `accessKey` variable +in [`OrcaAppTestUITests.swift`](OrcaAppTest/OrcaAppTestUITests/OrcaAppTestUITests.swift). Open `OrcaAppTest.xcworkspace` +with XCode and run the tests with `Product > Test`. ## Demo App diff --git a/binding/python/README.md b/binding/python/README.md index 6dcb13de..39b598a1 100644 --- a/binding/python/README.md +++ b/binding/python/README.md @@ -1,10 +1,11 @@ # Orca Binding for Python -## Orca Text-to-Speech Engine +## Orca Streaming Text-to-Speech Engine Made in Vancouver, Canada by [Picovoice](https://picovoice.ai) -Orca is an on-device text-to-speech engine producing high-quality, realistic, spoken audio with zero latency. Orca is: +Orca is an on-device streaming text-to-speech engine that is designed for use with LLMs, enabling zero-latency +voice assistants. Orca is: - Private; All voice processing runs locally. - Cross-Platform: @@ -15,7 +16,7 @@ Orca is an on-device text-to-speech engine producing high-quality, realistic, sp ## Compatibility -- Python 3.7+ +- Python 3.8+ - Runs on Linux (x86_64), macOS (x86_64, arm64), Windows (x86_64), Raspberry Pi (5, 4, 3), and NVIDIA Jetson Nano. 
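The alignment metadata introduced above maps each word and phoneme to start and end timestamps, which is exactly what captioning or lip-sync pipelines consume. As a rough sketch (not part of this patch, names hypothetical), the `WordAlignment` namedtuples added to the Python binding below can be rendered as caption-style lines:

```python
from typing import Sequence

def to_caption_lines(alignments: Sequence) -> str:
    # Each item is an Orca.WordAlignment namedtuple carrying word,
    # start_sec, end_sec, and a list of phoneme alignments.
    lines = []
    for word in alignments:
        lines.append(f"[{word.start_sec:6.2f}s - {word.end_sec:6.2f}s] {word.word}")
    return "\n".join(lines)
```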
## Installation @@ -32,6 +33,10 @@ Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get you ## Usage +Orca supports two modes of operation: streaming and single synthesis. +In the streaming synthesis mode, Orca processes an incoming text stream in real-time and generates audio in parallel. +In the single synthesis mode, a complete text is synthesized in a single call to the Orca engine. + Create an instance of the Orca engine: ```python @@ -42,24 +47,67 @@ orca = pvorca.create(access_key='${ACCESS_KEY}') Replace the `${ACCESS_KEY}` with your AccessKey obtained from [Picovoice Console](https://console.picovoice.ai/). -You can synthesize speech by calling one of the `synthesize` methods: +To synthesize a text stream, create an `Orca.OrcaStream` object and add text to it one-by-one: + +```python +stream = orca.stream_open() + +for text_chunk in text_generator(): + pcm = stream.synthesize(text_chunk) + if pcm is not None: + # handle pcm + +pcm = stream.flush() +if pcm is not None: + # handle pcm +``` + +The `text_generator()` function can be any stream generating text, for example an LLM response. +Orca produces audio chunks in parallel to the incoming text stream, and returns the raw PCM whenever enough context has +been added via `stream.synthesize()`. +To ensure smooth transitions between chunks, the `stream.synthesize()` function returns an audio chunk that only +includes the audio for a portion of the text that has been added. +To generate the audio for the remaining text, `stream.flush()` needs to be invoked. +When done with streaming text synthesis, the `Orca.OrcaStream` object needs to be closed: + +```python +stream.close() +``` + +If the complete text is known before synthesis, single synthesis mode can be used to generate speech in a single call to +Orca: ```python # Return raw PCM -pcm = orca.synthesize(text='${TEXT}') +pcm, alignments = orca.synthesize(text='${TEXT}') # Save the generated audio to a WAV file directly -orca.synthesize_to_file(text='${TEXT}', path='${OUTPUT_PATH}') +alignments = orca.synthesize_to_file(text='${TEXT}', path='${OUTPUT_PATH}') ``` Replace `${TEXT}` with the text to be synthesized and `${OUTPUT_PATH}` with the path to save the generated audio as a -single-channel 16-bit PCM WAV file.\ -When done make sure to explicitly release the resources with `orca.delete()`. +single-channel 16-bit PCM WAV file. +In single synthesis mode, Orca returns metadata of the synthesized audio in the form of a list of `Orca.WordAlignment` +objects. +You can print the metadata with: + +```python +for token in alignments: + print(f"word=\"{token.word}\", start_sec={token.start_sec:.2f}, end_sec={token.end_sec:.2f}") + for phoneme in token.phonemes: + print(f"\tphoneme=\"{phoneme.phoneme}\", start_sec={phoneme.start_sec:.2f}, end_sec={phoneme.end_sec:.2f}") +``` + +When done make sure to explicitly release the resources using: + +```python +orca.delete() +``` ### Text input -Orca accepts the 26 lowercase (a-z) and 26 uppercase (A-Z) letters of the English alphabet, as well as -common punctuation marks. You can get a list of all supported characters by calling the +Orca accepts the 26 lowercase (a-z) and 26 uppercase (A-Z) letters of the English alphabet, numbers, +basic symbols, as well as common punctuation marks. You can get a list of all supported characters by calling the `valid_characters()` method provided in the Orca SDK you are using. 
Pronunciations of characters or words not supported by this list can be achieved with [custom pronunciations](#custom-pronunciations). @@ -87,10 +135,14 @@ and replace `${MODEL_PATH}` with the path to the model file with the desired voi ### Speech control -Orca allows for keyword arguments to be provided to the `synthesize` methods to control the synthesized speech: +Orca allows for keyword arguments to control the synthesized speech. They can be provided to the `stream_open` +method or the single synthesis methods `synthesize` and `synthesize_to_file`: - `speech_rate`: Controls the speed of the generated speech. Valid values are within [0.7, 1.3]. A higher (lower) value produces speech that is faster (slower). The default is `1.0`. +- `random_state`: Sets the random state for sampling during synthesis. This can be used to ensure that the synthesized + speech is deterministic across different runs. Valid values are all non-negative integers. If not provided, a random + seed will be chosen and the synthesis process will be non-deterministic. ### Orca properties @@ -98,8 +150,23 @@ To obtain the set of valid characters, call `orca.valid_characters`.\ To retrieve the maximum number of characters allowed, call `orca.max_character_limit`.\ The sample rate of Orca is `orca.sample_rate`. +### Alignment Metadata + +Along with the raw PCM or saved audio file, Orca returns metadata for the synthesized audio in single synthesis mode. +The `Orca.WordAlignment` object has the following properties: + +- **Word:** String representation of the word. +- **Start Time:** Indicates when the word started in the synthesized audio. Value is in seconds. +- **End Time:** Indicates when the word ended in the synthesized audio. Value is in seconds. +- **Phonemes:** A list of `Orca.PhonemeAlignment` objects. + +The `Orca.PhonemeAlignment` object has the following properties: + +- **Phoneme:** String representation of the phoneme. +- **Start Time:** Indicates when the phoneme started in the synthesized audio. Value is in seconds. +- **End Time:** Indicates when the phoneme ended in the synthesized audio. Value is in seconds. + ## Demos [pvorcademo](https://pypi.org/project/pvorcademo/) provides command-line utilities for synthesizing audio using Orca. 
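Putting the pieces above together, streaming synthesis and the determinism control combine naturally. A brief sketch, assuming the `pvorca` 0.2 API from this patch; `text_chunks` is a hypothetical stand-in for any text stream, such as an LLM response:

```python
import pvorca

orca = pvorca.create(access_key='${ACCESS_KEY}')

# A fixed random_state makes repeated runs produce identical audio.
stream = orca.stream_open(speech_rate=1.0, random_state=42)

pcm = []
text_chunks = ["Streaming ", "synthesis ", "demo."]  # hypothetical input
for chunk in text_chunks:
    samples = stream.synthesize(chunk)
    if samples is not None:
        pcm.extend(samples)

# Synthesize whatever text is still buffered, then release resources.
remaining = stream.flush()
if remaining is not None:
    pcm.extend(remaining)

stream.close()
orca.delete()
```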
- diff --git a/binding/python/_orca.py b/binding/python/_orca.py index 53a5030d..15c37aa1 100644 --- a/binding/python/_orca.py +++ b/binding/python/_orca.py @@ -1,10 +1,12 @@ import os +from collections import namedtuple from ctypes import * from enum import Enum from typing import ( Optional, Sequence, - Set) + Set, + Tuple) class OrcaError(Exception): @@ -75,45 +77,152 @@ class OrcaActivationRefusedError(OrcaError): pass -class Orca(object): +class PicovoiceStatuses(Enum): + SUCCESS = 0 + OUT_OF_MEMORY = 1 + IO_ERROR = 2 + INVALID_ARGUMENT = 3 + STOP_ITERATION = 4 + KEY_ERROR = 5 + INVALID_STATE = 6 + RUNTIME_ERROR = 7 + ACTIVATION_ERROR = 8 + ACTIVATION_LIMIT_REACHED = 9 + ACTIVATION_THROTTLED = 10 + ACTIVATION_REFUSED = 11 + + +_PICOVOICE_STATUS_TO_EXCEPTION = { + PicovoiceStatuses.OUT_OF_MEMORY: OrcaMemoryError, + PicovoiceStatuses.IO_ERROR: OrcaIOError, + PicovoiceStatuses.INVALID_ARGUMENT: OrcaInvalidArgumentError, + PicovoiceStatuses.STOP_ITERATION: OrcaStopIterationError, + PicovoiceStatuses.KEY_ERROR: OrcaKeyError, + PicovoiceStatuses.INVALID_STATE: OrcaInvalidStateError, + PicovoiceStatuses.RUNTIME_ERROR: OrcaRuntimeError, + PicovoiceStatuses.ACTIVATION_ERROR: OrcaActivationError, + PicovoiceStatuses.ACTIVATION_LIMIT_REACHED: OrcaActivationLimitError, + PicovoiceStatuses.ACTIVATION_THROTTLED: OrcaActivationThrottledError, + PicovoiceStatuses.ACTIVATION_REFUSED: OrcaActivationRefusedError, +} + + +class COrcaPhonemeAlignment(Structure): + _fields_ = [ + ("phoneme", c_char_p), + ("start_sec", c_float), + ("end_sec", c_float), + ] + + +class COrcaWordAlignment(Structure): + _fields_ = [ + ("word", c_char_p), + ("start_sec", c_float), + ("end_sec", c_float), + ("num_phonemes", c_int32), + ("phonemes", POINTER(POINTER(COrcaPhonemeAlignment))), + ] + + +class Orca: """ Python binding for Orca Text-to-Speech engine. """ - class PicovoiceStatuses(Enum): - SUCCESS = 0 - OUT_OF_MEMORY = 1 - IO_ERROR = 2 - INVALID_ARGUMENT = 3 - STOP_ITERATION = 4 - KEY_ERROR = 5 - INVALID_STATE = 6 - RUNTIME_ERROR = 7 - ACTIVATION_ERROR = 8 - ACTIVATION_LIMIT_REACHED = 9 - ACTIVATION_THROTTLED = 10 - ACTIVATION_REFUSED = 11 - - _PICOVOICE_STATUS_TO_EXCEPTION = { - PicovoiceStatuses.OUT_OF_MEMORY: OrcaMemoryError, - PicovoiceStatuses.IO_ERROR: OrcaIOError, - PicovoiceStatuses.INVALID_ARGUMENT: OrcaInvalidArgumentError, - PicovoiceStatuses.STOP_ITERATION: OrcaStopIterationError, - PicovoiceStatuses.KEY_ERROR: OrcaKeyError, - PicovoiceStatuses.INVALID_STATE: OrcaInvalidStateError, - PicovoiceStatuses.RUNTIME_ERROR: OrcaRuntimeError, - PicovoiceStatuses.ACTIVATION_ERROR: OrcaActivationError, - PicovoiceStatuses.ACTIVATION_LIMIT_REACHED: OrcaActivationLimitError, - PicovoiceStatuses.ACTIVATION_THROTTLED: OrcaActivationThrottledError, - PicovoiceStatuses.ACTIVATION_REFUSED: OrcaActivationRefusedError, - } - class COrca(Structure): pass class COrcaSynthesizeParams(Structure): pass + class COrcaStream(Structure): + pass + + class OrcaStream: + """ + Orca Stream object that converts a stream of text to a stream of audio. + """ + + def __init__(self, handle: POINTER('Orca.COrcaStream'), orca: 'Orca') -> None: + self._handle = handle + self._orca = orca + + def synthesize(self, text: str) -> Optional[Sequence[int]]: + """ + Adds a chunk of text to the Stream object and generates audio if enough text has been added. + This function is expected to be called multiple times with consecutive chunks of text from a text stream. 
+ The incoming text is buffered as it arrives until there is enough context to convert a chunk of the + buffered text into audio. The caller needs to use `pv_orca_stream_flush()` to generate the audio chunk + for the remaining text that has not yet been synthesized. + The caller is responsible for deleting the generated audio with `pv_orca_pcm_delete()`. + + :param text: A chunk of text from a text input stream, comprised of valid characters. + Valid characters can be retrieved by calling `pv_orca_valid_characters()`. + Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. + They need to be added in a single call to this function. + The pronunciation is expressed in ARPAbet format, e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`. + :return: The generated audio as a sequence of 16-bit linearly-encoded integers, `None` if no + audio chunk has been produced. + """ + + c_num_samples = c_int32() + c_pcm = POINTER(c_int16)() + + status = self._orca._stream_synthesize_func( + self._handle, + text.encode("utf-8"), + byref(c_num_samples), + byref(c_pcm) + ) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( + message="Unable to synthesize text in Orca stream", + message_stack=self._orca._get_error_stack()) + + pcm = None + if c_num_samples.value > 0: + pcm = [c_pcm[i] for i in range(c_num_samples.value)] + + self._orca._pcm_delete_func(c_pcm) + + return pcm + + def flush(self) -> Optional[Sequence[int]]: + """ + Generates audio for all the buffered text that was added to the OrcaStream object + via `pv_orca_stream_synthesize()`. + The caller is responsible for deleting the generated audio with `pv_orca_pcm_delete()`. + + :return: The generated audio as a sequence of 16-bit linearly-encoded integers, `None` if no + audio chunk has been produced. + """ + + c_num_samples = c_int32() + c_pcm = POINTER(c_int16)() + + status = self._orca._stream_flush_func( + self._handle, + byref(c_num_samples), + byref(c_pcm) + ) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( + message="Unable to flush Orca stream", + message_stack=self._orca._get_error_stack()) + + pcm = [c_pcm[i] for i in range(c_num_samples.value)] + self._orca._pcm_delete_func(c_pcm) + + return pcm + + def close(self) -> None: + """ + Releases the resources acquired by the OrcaStream object. + """ + + self._orca._stream_close_func(self._handle) + def __init__(self, access_key: str, model_path: str, library_path: str) -> None: """ Constructor. 
@@ -142,7 +251,7 @@ def __init__(self, access_key: str, model_path: str, library_path: str) -> None: self._get_error_stack_func = library.pv_get_error_stack self._get_error_stack_func.argtypes = [POINTER(POINTER(c_char_p)), POINTER(c_int)] - self._get_error_stack_func.restype = self.PicovoiceStatuses + self._get_error_stack_func.restype = PicovoiceStatuses self._free_error_stack_func = library.pv_free_error_stack self._free_error_stack_func.argtypes = [POINTER(c_char_p)] @@ -150,12 +259,12 @@ def __init__(self, access_key: str, model_path: str, library_path: str) -> None: init_func = library.pv_orca_init init_func.argtypes = [c_char_p, c_char_p, POINTER(POINTER(self.COrca))] - init_func.restype = self.PicovoiceStatuses + init_func.restype = PicovoiceStatuses self._handle = POINTER(self.COrca)() status = init_func(access_key.encode(), model_path.encode(), byref(self._handle)) - if status is not self.PicovoiceStatuses.SUCCESS: - raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]( + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( message='Initialization failed', message_stack=self._get_error_stack()) @@ -163,27 +272,58 @@ def __init__(self, access_key: str, model_path: str, library_path: str) -> None: self._delete_func.argtypes = [POINTER(self.COrca)] self._delete_func.restype = None - self._valid_characters_func = library.pv_orca_valid_characters - self._valid_characters_func.argtypes = [ + valid_characters_func = library.pv_orca_valid_characters + valid_characters_func.argtypes = [ POINTER(self.COrca), POINTER(c_int32), POINTER(POINTER(POINTER(c_char_p))), ] - self._valid_characters_func.restype = self.PicovoiceStatuses + valid_characters_func.restype = PicovoiceStatuses + + valid_characters_delete_func = library.pv_orca_valid_characters_delete + valid_characters_delete_func.argtypes = [POINTER(POINTER(c_char_p))] + valid_characters_delete_func.restype = None + + c_num_characters = c_int32() + c_characters = POINTER(POINTER(c_char_p))() + status = valid_characters_func(self._handle, byref(c_num_characters), byref(c_characters)) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( + message="Unable to get Orca valid characters", + message_stack=self._get_error_stack()) + + num_characters = c_num_characters.value + characters_array_pointer = cast(c_characters, POINTER(c_char_p * num_characters)) + self._valid_characters = set([symbol.decode('utf-8') for symbol in list(characters_array_pointer.contents)]) + valid_characters_delete_func(c_characters) - self._valid_characters_delete_func = library.pv_orca_valid_characters_delete - self._valid_characters_delete_func.argtypes = [POINTER(POINTER(c_char_p))] - self._valid_characters_delete_func.restype = None + sample_rate_func = library.pv_orca_sample_rate + sample_rate_func.argtypes = [POINTER(self.COrca), POINTER(c_int32)] + sample_rate_func.restype = PicovoiceStatuses - self._sample_rate_func = library.pv_orca_sample_rate - self._sample_rate_func.argtypes = [POINTER(self.COrca), POINTER(c_int32)] - self._sample_rate_func.restype = self.PicovoiceStatuses + c_sample_rate = c_int32() + status = sample_rate_func(self._handle, byref(c_sample_rate)) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( + message="Unable to get Orca sample rate", + message_stack=self._get_error_stack()) + self._sample_rate = c_sample_rate.value - self._max_character_limit = library.pv_orca_max_character_limit() + max_character_limit_func = 
library.pv_orca_max_character_limit + max_character_limit_func.argtypes = [POINTER(self.COrca), POINTER(c_int32)] + max_character_limit_func.restype = PicovoiceStatuses + + c_max_character_limit = c_int32() + status = max_character_limit_func(self._handle, byref(c_max_character_limit)) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( + message="Unable to get Orca maximum character limit", + message_stack=self._get_error_stack()) + self._max_character_limit = c_max_character_limit.value self._synthesize_params_init_func = library.pv_orca_synthesize_params_init self._synthesize_params_init_func.argtypes = [POINTER(POINTER(self.COrcaSynthesizeParams))] - self._synthesize_params_init_func.restype = self.PicovoiceStatuses + self._synthesize_params_init_func.restype = PicovoiceStatuses self._synthesize_params_delete_func = library.pv_orca_synthesize_params_delete self._synthesize_params_delete_func.argtypes = [POINTER(self.COrcaSynthesizeParams)] @@ -191,7 +331,11 @@ def __init__(self, access_key: str, model_path: str, library_path: str) -> None: self._synthesize_params_set_speech_rate_func = library.pv_orca_synthesize_params_set_speech_rate self._synthesize_params_set_speech_rate_func.argtypes = [POINTER(self.COrcaSynthesizeParams), c_float] - self._synthesize_params_set_speech_rate_func.restype = self.PicovoiceStatuses + self._synthesize_params_set_speech_rate_func.restype = PicovoiceStatuses + + self._synthesize_params_set_random_state_func = library.pv_orca_synthesize_params_set_random_state + self._synthesize_params_set_random_state_func.argtypes = [POINTER(self.COrcaSynthesizeParams), c_int64] + self._synthesize_params_set_random_state_func.restype = PicovoiceStatuses self._synthesize_func = library.pv_orca_synthesize self._synthesize_func.argtypes = [ @@ -200,8 +344,10 @@ def __init__(self, access_key: str, model_path: str, library_path: str) -> None: POINTER(self.COrcaSynthesizeParams), POINTER(c_int32), POINTER(POINTER(c_int16)), + POINTER(c_int32), + POINTER(POINTER(POINTER(COrcaWordAlignment))), ] - self._synthesize_func.restype = self.PicovoiceStatuses + self._synthesize_func.restype = PicovoiceStatuses self._synthesize_to_file_func = library.pv_orca_synthesize_to_file self._synthesize_to_file_func.argtypes = [ @@ -209,18 +355,56 @@ def __init__(self, access_key: str, model_path: str, library_path: str) -> None: c_char_p, POINTER(self.COrcaSynthesizeParams), c_char_p, + POINTER(c_int32), + POINTER(POINTER(POINTER(COrcaWordAlignment))), ] - self._synthesize_to_file_func.restype = self.PicovoiceStatuses + self._synthesize_to_file_func.restype = PicovoiceStatuses + + self._word_alignments_delete_func = library.pv_orca_word_alignments_delete + self._word_alignments_delete_func.argtypes = [c_int32, POINTER(POINTER(COrcaWordAlignment))] + self._word_alignments_delete_func.restype = PicovoiceStatuses - self._delete_pcm_func = library.pv_orca_delete_pcm - self._delete_pcm_func.argtypes = [POINTER(c_int16)] - self._delete_pcm_func.restype = None + self._pcm_delete_func = library.pv_orca_pcm_delete + self._pcm_delete_func.argtypes = [POINTER(c_int16)] + self._pcm_delete_func.restype = None + + self._stream_open_func = library.pv_orca_stream_open + self._stream_open_func.argtypes = [ + POINTER(self.COrca), + POINTER(self.COrcaSynthesizeParams), + POINTER(POINTER(self.COrcaStream)) + ] + self._stream_open_func.restype = PicovoiceStatuses + + self._stream_synthesize_func = library.pv_orca_stream_synthesize + self._stream_synthesize_func.argtypes = [ + 
POINTER(self.COrcaStream), + c_char_p, + POINTER(c_int32), + POINTER(POINTER(c_int16)) + ] + self._stream_synthesize_func.restype = PicovoiceStatuses + + self._stream_flush_func = library.pv_orca_stream_flush + self._stream_flush_func.argtypes = [ + POINTER(self.COrcaStream), + POINTER(c_int32), + POINTER(POINTER(c_int16)) + ] + self._stream_flush_func.restype = PicovoiceStatuses + + self._stream_close_func = library.pv_orca_stream_close + self._stream_close_func.argtypes = [POINTER(self.COrcaStream)] + self._stream_close_func.restype = None version_func = library.pv_orca_version version_func.argtypes = [] version_func.restype = c_char_p self._version = version_func().decode("utf-8") + PhonemeAlignment = namedtuple('Phoneme', ['phoneme', 'start_sec', 'end_sec']) + WordAlignment = namedtuple('Word', ['word', 'start_sec', 'end_sec', 'phonemes']) + def delete(self) -> None: """Releases resources acquired by Orca.""" @@ -230,36 +414,13 @@ def delete(self) -> None: def valid_characters(self) -> Set[str]: """Set of characters supported by Orca.""" - c_num_characters = c_int32() - c_characters = POINTER(POINTER(c_char_p))() - - status = self._valid_characters_func(self._handle, byref(c_num_characters), byref(c_characters)) - if status is not self.PicovoiceStatuses.SUCCESS: - raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]( - message="Unable to get Orca valid characters", - message_stack=self._get_error_stack()) - - num_characters = c_num_characters.value - characters_array_pointer = cast(c_characters, POINTER(c_char_p * num_characters)) - characters = set([symbol.decode('utf-8') for symbol in list(characters_array_pointer.contents)]) - - self._valid_characters_delete_func(c_characters) - - return characters + return self._valid_characters @property def sample_rate(self) -> int: """Audio sample rate of generated audio.""" - c_sample_rate = c_int32() - - status = self._sample_rate_func(self._handle, byref(c_sample_rate)) - if status is not self.PicovoiceStatuses.SUCCESS: - raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]( - message="Unable to get Orca sample rate", - message_stack=self._get_error_stack()) - - return c_sample_rate.value + return self._sample_rate @property def max_character_limit(self) -> int: @@ -270,7 +431,8 @@ def max_character_limit(self) -> int: def synthesize( self, text: str, - speech_rate: Optional[float] = None) -> Sequence[int]: + speech_rate: Optional[float] = None, + random_state: Optional[int] = None) -> Tuple[Sequence[int], Sequence[WordAlignment]]: """ Generates audio from text. The returned audio contains the speech representation of the text. @@ -278,37 +440,49 @@ def synthesize( `self.max_character_limit`. Allowed characters can be retrieved by calling `self.pv_orca_valid_characters`. Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. The pronunciation is expressed in ARPAbet format, e.g.: "I {live|L IH V} in {Sevilla|S EH V IY Y AH}". - :param speech_rate: Rate of speech of the synthesized audio. - :return: The generated audio, stored as a sequence of 16-bit linearly-encoded integers. + :param speech_rate: Rate of speech of the synthesized audio. Higher numbers correspond to faster speech. + Valid values are within [0.7, 1.3]. + :param random_state: Random seed for the synthesis process. Valid values are all non-negative integers. If not + provided, a random seed will be chosen.
+ :return: A tuple containing the generated audio as a sequence of 16-bit linearly-encoded integers + and a sequence of Orca.WordAlignment objects representing the word alignments. """ - c_synthesize_params = self._get_c_synthesize_params(speech_rate=speech_rate) + c_synthesize_params = self._get_c_synthesize_params(speech_rate=speech_rate, random_state=random_state) c_num_samples = c_int32() c_pcm = POINTER(c_int16)() + c_num_alignments = c_int32() + c_alignments = POINTER(POINTER(COrcaWordAlignment))() + status = self._synthesize_func( self._handle, text.encode("utf-8"), c_synthesize_params, byref(c_num_samples), - byref(c_pcm)) - if status is not self.PicovoiceStatuses.SUCCESS: - raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]( + byref(c_pcm), + byref(c_num_alignments), + byref(c_alignments)) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( message="Unable to synthesize speech", message_stack=self._get_error_stack()) pcm = [c_pcm[i] for i in range(c_num_samples.value)] + self._pcm_delete_func(c_pcm) + + alignments = self._get_alignments(c_num_alignments=c_num_alignments, c_alignments=c_alignments) - self._delete_pcm_func(c_pcm) self._synthesize_params_delete_func(c_synthesize_params) - return pcm + return pcm, alignments def synthesize_to_file( self, text: str, output_path: str, - speech_rate: Optional[float] = None) -> None: + speech_rate: Optional[float] = None, + random_state: Optional[int] = None) -> Sequence[WordAlignment]: """ Generates audio from text. The returned audio contains the speech representation of the text. @@ -319,44 +493,127 @@ def synthesize_to_file( :param output_path: Absolute path to the output audio file. The output file is saved as `WAV (.wav)` and consists of a single mono channel. :param speech_rate: Rate of speech of the generated audio. + :param random_state: Random seed for the synthesis process. + :return: A sequence of Orca.WordAlignment objects representing the word alignments. """ - c_synthesize_params = self._get_c_synthesize_params(speech_rate=speech_rate) + c_synthesize_params = self._get_c_synthesize_params(speech_rate=speech_rate, random_state=random_state) + + c_num_alignments = c_int32() + c_alignments = POINTER(POINTER(COrcaWordAlignment))() status = self._synthesize_to_file_func( self._handle, text.encode("utf-8"), c_synthesize_params, - output_path.encode("utf-8")) - if status is not self.PicovoiceStatuses.SUCCESS: - raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]( + output_path.encode("utf-8"), + byref(c_num_alignments), + byref(c_alignments)) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( message="Unable to synthesize speech", message_stack=self._get_error_stack()) + alignments = self._get_alignments(c_num_alignments=c_num_alignments, c_alignments=c_alignments) + self._synthesize_params_delete_func(c_synthesize_params) + return alignments + + def stream_open(self, speech_rate: Optional[float] = None, random_state: Optional[int] = None) -> 'Orca.OrcaStream': + """ + Opens a stream for streaming text synthesis. + + :param speech_rate: Rate of speech of the generated audio. + :param random_state: Random seed for the synthesis process. + :return: An instance of Orca.OrcaStream.
+ """ + + c_synthesize_params = self._get_c_synthesize_params(speech_rate=speech_rate, random_state=random_state) + + stream_handle = POINTER(Orca.COrcaStream)() + status = self._stream_open_func( + self._handle, + c_synthesize_params, + byref(stream_handle)) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( + message="Unable to open Orca stream", + message_stack=self._get_error_stack()) + + self._synthesize_params_delete_func(c_synthesize_params) + + return self.OrcaStream(stream_handle, self) + @property def version(self) -> str: """Version.""" return self._version - def _get_c_synthesize_params(self, speech_rate: Optional[float] = None) -> POINTER(COrcaSynthesizeParams): + def _get_alignments( + self, + c_num_alignments: c_int32, + c_alignments: POINTER(POINTER(COrcaWordAlignment))) -> Sequence[WordAlignment]: + alignments = [] + for i in range(c_num_alignments.value): + word_alignment = c_alignments[i].contents + word = word_alignment.word.decode("utf-8") + start_sec = word_alignment.start_sec + end_sec = word_alignment.end_sec + num_phonemes = word_alignment.num_phonemes + phoneme_alignments = [] + for j in range(num_phonemes): + phoneme_alignment = word_alignment.phonemes[j].contents + phoneme = phoneme_alignment.phoneme.decode("utf-8") + phoneme_start_sec = phoneme_alignment.start_sec + phoneme_end_sec = phoneme_alignment.end_sec + phoneme_alignment = self.PhonemeAlignment( + phoneme=phoneme, + start_sec=phoneme_start_sec, + end_sec=phoneme_end_sec) + phoneme_alignments.append(phoneme_alignment) + word_alignment = self.WordAlignment( + word=word, + start_sec=start_sec, + end_sec=end_sec, + phonemes=phoneme_alignments) + alignments.append(word_alignment) + + status = self._word_alignments_delete_func(c_num_alignments.value, c_alignments) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( + message="Unable to delete Orca word alignments", + message_stack=self._get_error_stack()) + + return alignments + + def _get_c_synthesize_params( + self, + speech_rate: Optional[float] = None, + random_state: Optional[int] = None) -> POINTER(COrcaSynthesizeParams): c_params = POINTER(self.COrcaSynthesizeParams)() status = self._synthesize_params_init_func(byref(c_params)) - if status is not self.PicovoiceStatuses.SUCCESS: - raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]( + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( message="Unable to create Orca synthesize params object", message_stack=self._get_error_stack()) if speech_rate is not None: status = self._synthesize_params_set_speech_rate_func(c_params, c_float(speech_rate)) - if status is not self.PicovoiceStatuses.SUCCESS: - raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]( + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( message="Unable to set Orca speech rate", message_stack=self._get_error_stack()) + if random_state is not None: + status = self._synthesize_params_set_random_state_func(c_params, c_int64(random_state)) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( + message="Unable to set Orca random state", + message_stack=self._get_error_stack()) + return c_params def _get_error_stack(self) -> Sequence[str]: @@ -364,8 +621,8 @@ def _get_error_stack(self) -> Sequence[str]: message_stack_depth = c_int() status = self._get_error_stack_func(byref(message_stack_ref), byref(message_stack_depth)) - if status is not 
self.PicovoiceStatuses.SUCCESS: - raise self._PICOVOICE_STATUS_TO_EXCEPTION[status](message="Unable to get Orca error state") + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status](message="Unable to get Orca error state") message_stack = list() for i in range(message_stack_depth.value): diff --git a/binding/python/requirements.txt b/binding/python/requirements.txt index 01726e58..e69de29b 100644 --- a/binding/python/requirements.txt +++ b/binding/python/requirements.txt @@ -1,2 +0,0 @@ -editdistance>=0.6.1 -pvleopard==2.0.1 \ No newline at end of file diff --git a/binding/python/setup.py b/binding/python/setup.py index 92c73b84..f0bd08cb 100644 --- a/binding/python/setup.py +++ b/binding/python/setup.py @@ -49,10 +49,10 @@ setuptools.setup( name="pvorca", - version="0.1.4", + version="0.2.1", author="Picovoice", author_email="hello@picovoice.ai", - description="Orca Text-to-Speech Engine.", + description="Orca Streaming Text-to-Speech Engine", long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/Picovoice/orca", @@ -66,6 +66,6 @@ "Programming Language :: Python :: 3", "Topic :: Multimedia :: Sound/Audio :: Speech", ], - python_requires='>=3.7', - keywords="Text-to-Speech, TTS, Speech Synthesis, Voice Generation, Speech Engine", + python_requires='>=3.8', + keywords="Streaming Text-to-Speech, TTS, Speech Synthesis, Voice Generation, Speech Engine", ) diff --git a/binding/python/test_orca.py b/binding/python/test_orca.py index c7f0fef5..a5f8f784 100644 --- a/binding/python/test_orca.py +++ b/binding/python/test_orca.py @@ -13,21 +13,21 @@ import os import sys import unittest -from typing import List - -import editdistance -import pvleopard +from typing import List, Sequence from _orca import Orca, OrcaError, OrcaInvalidArgumentError from _util import default_library_path, default_model_path -from test_util import get_model_paths, get_test_data +from test_util import get_model_paths, get_test_data, read_wav_file -test_sentences, wer_threshold = get_test_data() +test_data = get_test_data() class OrcaTestCase(unittest.TestCase): + EXACT_ALIGNMENT_TEST_MODEL_IDENTIFIER = "female" + access_key: str orcas: List[Orca] + model_paths: List[str] @classmethod def setUpClass(cls): @@ -37,12 +37,44 @@ def setUpClass(cls): model_path=model_path, library_path=default_library_path('../..')) for model_path in get_model_paths()] + cls.model_paths = get_model_paths() @classmethod def tearDownClass(cls): for orca in cls.orcas: orca.delete() + def _test_audio(self, pcm: Sequence[int], ground_truth: Sequence[int]) -> None: + pcm = pcm[:len(ground_truth)] # compensate for discrepancies due to wav header + self.assertEqual(len(pcm), len(ground_truth)) + for i in range(len(pcm)): + self.assertAlmostEqual(pcm[i], ground_truth[i], delta=500) + + def _test_equal_timestamp(self, timestamp: float, timestamp_truth: float) -> None: + self.assertAlmostEqual(timestamp, timestamp_truth, places=2) + + def _test_phoneme_equal(self, phoneme: Orca.PhonemeAlignment, phoneme_truth: Orca.PhonemeAlignment) -> None: + self.assertEqual(phoneme.phoneme, phoneme_truth.phoneme) + self._test_equal_timestamp(phoneme.start_sec, phoneme_truth.start_sec) + self._test_equal_timestamp(phoneme.end_sec, phoneme_truth.end_sec) + + def _test_word_equal(self, word: Orca.WordAlignment, word_truth: Orca.WordAlignment) -> None: + self.assertEqual(word.word, word_truth.word) + self._test_equal_timestamp(word.start_sec, word_truth.start_sec) + 
self._test_equal_timestamp(word.end_sec, word_truth.end_sec) + + self.assertEqual(len(word.phonemes), len(word_truth.phonemes)) + for phoneme, phoneme_truth in zip(word.phonemes, word_truth.phonemes): + self._test_phoneme_equal(phoneme, phoneme_truth) + + @staticmethod + def _get_pcm(model_path: str, audio_data_folder: str, synthesis_type: str = "single") -> Sequence[int]: + test_wav_folder = os.path.join(os.path.dirname(__file__), "../../", audio_data_folder) + model_name = os.path.basename(model_path) + test_wav_path = \ + os.path.join(f"{test_wav_folder}", model_name.replace(".pv", f"_{synthesis_type}.wav")) + return read_wav_file(test_wav_path) + def test_valid_characters(self) -> None: for orca in self.orcas: characters = orca.valid_characters @@ -59,53 +91,89 @@ def test_sample_rate(self) -> None: self.assertGreater(orca.sample_rate, 0) def test_synthesize(self) -> None: - leopard = None - try: - leopard = pvleopard.create(access_key=self.access_key) - except NotImplementedError as e: - pass - - for orca in self.orcas: - pcm = orca.synthesize(test_sentences.text) + for i, orca in enumerate(self.orcas): + pcm, alignment = orca.synthesize(test_data.text, random_state=test_data.random_state) self.assertGreater(len(pcm), 0) - if leopard is None: - continue + ground_truth = self._get_pcm( + model_path=self.model_paths[i], + audio_data_folder=test_data.audio_data_folder, + synthesis_type="single") - ground_truth = test_sentences.text_no_punctuation.split() - predicted, _ = leopard.process(pcm) + self._test_audio(pcm=pcm, ground_truth=ground_truth) - wer = editdistance.eval(predicted.split(), ground_truth) / len(ground_truth) + def test_synthesize_alignment_exact(self) -> None: + orca = [ + orca for i, orca in enumerate(self.orcas) if + self.EXACT_ALIGNMENT_TEST_MODEL_IDENTIFIER in self.model_paths[i]].pop() + pcm, alignments = orca.synthesize(test_data.text_alignment, random_state=test_data.random_state) + self.assertGreater(len(pcm), 0) - if wer > wer_threshold: - print("Ground truth transcript: `%s`" % " ".join(ground_truth)) - print("Predicted transcript from synthesized audio: `%s`" % predicted) - print("=> WER: %.2f" % wer) - self.assertTrue(wer <= wer_threshold) + self.assertTrue(len(alignments) == len(test_data.alignments)) + for word, word_truth in zip(alignments, test_data.alignments): + self._test_word_equal(word, word_truth) + + def test_synthesize_alignment(self) -> None: + for i, orca in enumerate(self.orcas): + if self.EXACT_ALIGNMENT_TEST_MODEL_IDENTIFIER in self.model_paths[i]: + continue + + pcm, alignments = orca.synthesize(test_data.text_alignment, random_state=test_data.random_state) + self.assertGreater(len(pcm), 0) + + previous_word_end_sec = 0 + previous_phoneme_end_sec = 0 + for word in alignments: + self.assertTrue(word.start_sec == previous_word_end_sec) + self.assertTrue(word.end_sec > word.start_sec) + previous_word_end_sec = word.end_sec + + for phoneme in word.phonemes: + self.assertTrue(phoneme.start_sec == previous_phoneme_end_sec) + self.assertTrue(phoneme.start_sec >= word.start_sec) + self.assertTrue(phoneme.end_sec <= word.end_sec) + self.assertTrue(phoneme.end_sec > phoneme.start_sec) + previous_phoneme_end_sec = phoneme.end_sec + + def test_streaming_synthesis(self) -> None: + for i, orca in enumerate(self.orcas): + stream = orca.stream_open(random_state=test_data.random_state) + pcm = [] + for c in test_data.text: + pcm_chunk = stream.synthesize(c) + if pcm_chunk is not None: + pcm.extend(pcm_chunk) + pcm_chunk = stream.flush() + if pcm_chunk is not 
None: + pcm.extend(pcm_chunk) + stream.close() + + ground_truth = self._get_pcm( + model_path=self.model_paths[i], + audio_data_folder=test_data.audio_data_folder, + synthesis_type="stream") + + self._test_audio(pcm=pcm, ground_truth=ground_truth) def test_synthesize_custom_pron(self) -> None: for orca in self.orcas: - pcm_custom = orca.synthesize(test_sentences.text_custom_pronunciation) - self.assertGreater(len(pcm_custom), 0) + pcm, _ = orca.synthesize(test_data.text_custom_pronunciation) + self.assertGreater(len(pcm), 0) def test_synthesize_speech_rate(self) -> None: for orca in self.orcas: - pcm_fast = orca.synthesize(test_sentences.text, speech_rate=1.3) - pcm_slow = orca.synthesize(test_sentences.text, speech_rate=0.7) + pcm_fast, _ = orca.synthesize(test_data.text, speech_rate=1.3) + pcm_slow, _ = orca.synthesize(test_data.text, speech_rate=0.7) self.assertLess(len(pcm_fast), len(pcm_slow)) - try: - _ = orca.synthesize(test_sentences.text, speech_rate=9999) - except OrcaError: - pass - else: - self.fail("Expected OrcaError") + with self.assertRaises(OrcaError): + _ = orca.synthesize(test_data.text, speech_rate=9999) def test_synthesize_to_file(self) -> None: for orca in self.orcas: output_path = os.path.join(os.path.dirname(__file__), "output.wav") - orca.synthesize_to_file(test_sentences.text, output_path=output_path) + orca.synthesize_to_file(test_data.text, output_path=output_path) self.assertTrue(os.path.isfile(output_path)) os.remove(output_path) @@ -117,7 +185,7 @@ def test_version(self) -> None: def test_invalid_input(self) -> None: for orca in self.orcas: - for sentence in test_sentences.text_invalid: + for sentence in test_data.text_invalid: with self.assertRaises(OrcaInvalidArgumentError): orca.synthesize(sentence) @@ -159,7 +227,7 @@ def test_process_message_stack(self): orca._handle = None try: - res = orca.synthesize(test_sentences.text) + res = orca.synthesize(test_data.text) self.assertEqual(len(res), 0) except OrcaError as e: self.assertGreater(len(e.message_stack), 0) diff --git a/binding/python/test_orca_perf.py b/binding/python/test_orca_perf.py index f997efad..e5a3838b 100644 --- a/binding/python/test_orca_perf.py +++ b/binding/python/test_orca_perf.py @@ -19,13 +19,13 @@ from _util import default_library_path from test_util import get_model_paths, get_test_data -test_sentences, _ = get_test_data() +test_data = get_test_data() class OrcaPerformanceTestCase(unittest.TestCase): access_key: str num_test_iterations: int - proc_performance_threshold_sec: float + proc_performance_threshold_rtf: float def test_performance_proc(self) -> None: for model_path in get_model_paths(): @@ -34,29 +34,31 @@ def test_performance_proc(self) -> None: library_path=default_library_path('../..'), model_path=model_path) - perf_results = list() + num_audio_seconds = 0 + num_proc_seconds = 0 for i in range(self.num_test_iterations): start = perf_counter() - _ = orca.synthesize(test_sentences.text) + pcm, _ = orca.synthesize(test_data.text) if i > 0: - perf_results.append(perf_counter() - start) + num_audio_seconds += len(pcm) / orca.sample_rate + num_proc_seconds += perf_counter() - start orca.delete() - avg_perf = sum(perf_results) / self.num_test_iterations - print("Average proc performance [model=%s]: %s seconds" % (os.path.basename(model_path), avg_perf)) - self.assertLess(avg_perf, self.proc_performance_threshold_sec) + real_time_factor = num_audio_seconds / num_proc_seconds + print("Average proc performance[model=%s]: RTF = %s " % (os.path.basename(model_path), real_time_factor)) + 
self.assertGreater(real_time_factor, self.proc_performance_threshold_rtf) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--access-key', required=True) parser.add_argument('--num-test-iterations', type=int, required=True) - parser.add_argument('--proc-performance-threshold-sec', type=float, required=True) + parser.add_argument('--proc-performance-threshold-rtf', type=float, required=True) args = parser.parse_args() OrcaPerformanceTestCase.access_key = args.access_key OrcaPerformanceTestCase.num_test_iterations = args.num_test_iterations - OrcaPerformanceTestCase.proc_performance_threshold_sec = args.proc_performance_threshold_sec + OrcaPerformanceTestCase.proc_performance_threshold_rtf = args.proc_performance_threshold_rtf unittest.main(argv=sys.argv[:1]) diff --git a/binding/python/test_util.py b/binding/python/test_util.py index 9dd06468..210ccc10 100644 --- a/binding/python/test_util.py +++ b/binding/python/test_util.py @@ -11,25 +11,31 @@ import json import os +import struct +import wave from dataclasses import dataclass -from typing import Sequence, Tuple +from typing import List, Sequence -from typing import List +from _orca import Orca @dataclass -class TestSentences: +class TestData: text: str text_no_punctuation: str text_custom_pronunciation: str + text_alignment: str text_invalid: Sequence[str] + alignments: Sequence[Orca.WordAlignment] + random_state: int + audio_data_folder: str -def get_test_data() -> Tuple[TestSentences, float]: - data_file_path = os.path.join(os.path.dirname(__file__), "../../resources/.test/test_data.json") - with open(data_file_path, encoding="utf8") as data_file: - test_data = json.loads(data_file.read()) - return TestSentences(**test_data["test_sentences"]), test_data["wer_threshold"] +def read_wav_file(path: str) -> Sequence[int]: + with wave.open(path, 'rb') as f: + buffer = f.readframes(f.getnframes()) + # minus 4 because of the header + return struct.unpack(f"{f.getnframes() - 4}h", buffer) def get_model_paths() -> List[str]: @@ -37,7 +43,39 @@ def get_model_paths() -> List[str]: return [os.path.join(model_folder, model_name) for model_name in os.listdir(model_folder)] +def get_test_data() -> TestData: + data_file_path = os.path.join(os.path.dirname(__file__), "../../resources/.test/test_data.json") + with open(data_file_path, encoding="utf8") as data_file: + test_data = json.loads(data_file.read()) + + alignments = [] + for word_data in test_data["alignments"]: + phonemes = [] + for phoneme_data in word_data["phonemes"]: + phoneme = Orca.PhonemeAlignment( + phoneme=phoneme_data["phoneme"], + start_sec=phoneme_data["start_sec"], + end_sec=phoneme_data["end_sec"]) + phonemes.append(phoneme) + + word = Orca.WordAlignment( + word=word_data["word"], + start_sec=word_data["start_sec"], + end_sec=word_data["end_sec"], + phonemes=phonemes) + alignments.append(word) + + test_data = TestData( + alignments=alignments, + random_state=test_data["random_state"], + audio_data_folder=test_data["audio_data_folder"], + **test_data["test_sentences"]) + + return test_data + + __all__ = [ "get_test_data", "get_model_paths", + "read_wav_file", ] diff --git a/binding/web/.gitignore b/binding/web/.gitignore index cf2f85e9..4c610367 100644 --- a/binding/web/.gitignore +++ b/binding/web/.gitignore @@ -1,7 +1,6 @@ node_modules dist lib/pv_orca*.wasm -cypress/fixtures/.test/* +cypress/fixtures/resources/* test/orca_params*.js test/orca_params*.pv -test/leopard_params.pv diff --git a/binding/web/README.md b/binding/web/README.md index 
4d002fcb..ad97cc5f 100644 --- a/binding/web/README.md +++ b/binding/web/README.md @@ -1,14 +1,16 @@ # Orca Binding for Web -## Orca Text-to-Speech Engine +## Orca Streaming Text-to-Speech Engine Made in Vancouver, Canada by [Picovoice](https://picovoice.ai) -Orca is an on-device text-to-speech engine producing high-quality, realistic, spoken audio with zero latency. Orca is: +Orca is an on-device streaming text-to-speech engine that is designed for use with LLMs, enabling zero-latency +voice assistants. Orca is: - Private; All voice processing runs locally. - Cross-Platform: - Linux (x86_64), macOS (x86_64, arm64), and Windows (x86_64) + - Android and iOS - Chrome, Safari, Firefox, and Edge - Raspberry Pi (3, 4, 5) and NVIDIA Jetson Nano @@ -122,6 +124,13 @@ const orca = await OrcaWorker.create( ); ``` +### Streaming vs. Single Synthesis + +Orca supports two modes of operation: streaming and single synthesis. +In the streaming synthesis mode, Orca processes an incoming text stream in real-time and generates audio in parallel. +In the single synthesis mode, the complete text needs to be known in advance and is synthesized in a single call to +the Orca engine. + ### Custom Pronunciations Orca allows the embedding of custom pronunciations in the text via the syntax: `{word|pronunciation}`. The pronunciation @@ -131,33 +140,96 @@ is expressed in [ARPAbet](https://en.wikipedia.org/wiki/ARPABET) phonemes, for e - "{read|R IY D} this as {read|R EH D}, please." - "I {live|L IH V} in {Sevilla|S EH V IY Y AH}. We have great {live|L AY V} sports!" -### Synthesize Speech +### Orca Properties + +To obtain the complete set of valid characters, call `.validCharacters`. To retrieve the maximum number of +characters allowed, call `.maxCharacterLimit`. The sample rate of the generated `Int16Array` is `.sampleRate`. -The `synthesize` function will send the text to the engine and return the speech audio as an `Int16Array`. +### Usage + +#### Streaming Synthesis + +To use streaming synthesis, call `streamOpen` to create an `OrcaStream` object. ```typescript -const speechPcm = await orca.synthesize("${TEXT}"); +const orcaStream = await orca.streamOpen(); +``` + +Then, call `synthesize` on `orcaStream` to generate speech from a stream of text: + +```typescript +function* textStream(): IterableIterator<string> { + ... // yield text chunks e.g. from an LLM response +} + +for (const textChunk of textStream()) { + const pcm = await orcaStream.synthesize(textChunk); + if (pcm !== null) { + // handle pcm + } +} +``` + +The `OrcaStream` object buffers input text until there is enough to generate audio. If there is not enough text to generate +audio, `null` is returned. + +When done, call `flush` to synthesize any remaining text, and `close` to delete the `orcaStream` object. + +```typescript +const flushedPcm = orcaStream.flush(); +if (flushedPcm !== null) { + // handle pcm +} + +orcaStream.close(); +``` + +#### Single Synthesis + +To use single synthesis, simply call `synthesize` directly on the `Orca` instance. The `synthesize` function will send +the text to the engine and return the speech audio data as an `Int16Array` as well as +the [alignments metadata](#alignments-metadata). + +```typescript +const { pcm, alignments } = await orca.synthesize("${TEXT}"); ``` ### Speech Control -Orca allows for an additional argument to be provided to the `synthesize` method to control the synthesized speech: +Orca allows for additional arguments to control the synthesized speech.
+These can be provided to `streamOpen` or one of the single mode `synthesize` methods: - `speechRate`: Controls the speed of the generated speech. Valid values are within [0.7, 1.3]. A higher value produces speech that is faster, and a lower value produces speech that is slower. The default value is `1.0`. ```typescript const synthesizeParams = { - speechRate: 1.3 + speechRate: 1.3, }; -const speechPcm = await orca.synthesize("${TEXT}", synthesizeParams); +// Streaming synthesis +const OrcaStream = await orca.streamOpen(synthesizeParams); + +// Single synthesis +const result = await orca.synthesize("${TEXT}", synthesizeParams); + ``` -### Orca Properties +### Alignments Metadata -To obtain the complete set of valid characters, call `.validCharacters`. To retrieve the maximum number of -characters allowed, call `.maxCharacterLimit`. The sample rate of Orca is `.sampleRate`. +Along with the raw PCM or saved audio file, Orca returns metadata for the synthesized audio in single synthesis mode. +The `OrcaAlignment` object has the following properties: + +- **Word:** String representation of the word. +- **Start Time:** Indicates when the word started in the synthesized audio. Value is in seconds. +- **End Time:** Indicates when the word ended in the synthesized audio. Value is in seconds. +- **Phonemes:** An array of `OrcaPhoneme` objects. + +The `OrcaPhoneme` object has the following properties: + +- **Phoneme:** String representation of the phoneme. +- **Start Time:** Indicates when the phoneme started in the synthesized audio. Value is in seconds. +- **End Time:** Indicates when the phoneme ended in the synthesized audio. Value is in seconds. ### Clean Up diff --git a/binding/web/cypress.config.ts b/binding/web/cypress.config.ts index 95dea01a..ff2969e8 100644 --- a/binding/web/cypress.config.ts +++ b/binding/web/cypress.config.ts @@ -6,8 +6,8 @@ export default defineConfig({ 'PROC_PERFORMANCE_THRESHOLD_SEC': 10, }, e2e: { + supportFile: 'cypress/support/index.ts', defaultCommandTimeout: 30000, - supportFile: false, specPattern: 'test/*.test.{js,jsx,ts,tsx}', video: false, screenshotOnRunFailure: false, diff --git a/binding/web/cypress/support/commands.ts b/binding/web/cypress/support/commands.ts new file mode 100644 index 00000000..2c2e7297 --- /dev/null +++ b/binding/web/cypress/support/commands.ts @@ -0,0 +1,9 @@ + +const WAV_HEADER_SIZE = 44; + +Cypress.Commands.add("getFramesFromFile", (path: string) => { + cy.fixture(path, 'base64').then(Cypress.Blob.base64StringToBlob).then(async blob => { + const data = new Int16Array(await blob.arrayBuffer()); + return data.slice(WAV_HEADER_SIZE / Int16Array.BYTES_PER_ELEMENT); + }); +}); diff --git a/binding/web/cypress/support/index.ts b/binding/web/cypress/support/index.ts new file mode 100644 index 00000000..a4db0a5e --- /dev/null +++ b/binding/web/cypress/support/index.ts @@ -0,0 +1,9 @@ +import './commands'; + +declare global { + namespace Cypress { + interface Chainable { + getFramesFromFile(path: string): Chainable; + } + } +} diff --git a/binding/web/package.json b/binding/web/package.json index 2595abfe..cd740bab 100644 --- a/binding/web/package.json +++ b/binding/web/package.json @@ -3,7 +3,7 @@ "description": "Orca Text-to-Speech engine for web browsers (via WebAssembly)", "author": "Picovoice Inc", "license": "Apache-2.0", - "version": "0.1.1", + "version": "0.2.0", "keywords": [ "orca", "web", diff --git a/binding/web/scripts/setup_test.js b/binding/web/scripts/setup_test.js index a9aee392..a68d22d6 100644 --- 
a/binding/web/scripts/setup_test.js +++ b/binding/web/scripts/setup_test.js @@ -4,7 +4,7 @@ const { join } = require('path'); console.log('Copying the orca & leopard models...'); const testDirectory = join(__dirname, '..', 'test'); -const fixturesDirectory = join(__dirname, '..', 'cypress', 'fixtures'); +const fixturesDirectory = join(__dirname, '..', 'cypress', 'fixtures', 'resources'); const paramsSourceDirectory = join( __dirname, @@ -21,16 +21,7 @@ const sourceDirectory = join( '..', '..', 'resources', -); - -const testingModelFilesSourceDirectory = join( - __dirname, - '..', - '..', - '..', - 'resources', '.test', - 'models', ); try { @@ -40,12 +31,12 @@ try { fs.copyFileSync(join(paramsSourceDirectory, file), join(testDirectory, file)); }); - fs.readdirSync(testingModelFilesSourceDirectory).forEach(file => { - fs.copyFileSync(join(testingModelFilesSourceDirectory, file), join(testDirectory, file)); - }); + fs.mkdirSync(join(fixturesDirectory, '.test', 'wav'), { recursive: true }); + fs.copyFileSync(join(sourceDirectory, 'test_data.json'), join(fixturesDirectory, '.test', 'test_data.json')); - fs.mkdirSync(join(fixturesDirectory, '.test'), { recursive: true }); - fs.copyFileSync(join(sourceDirectory, '.test', 'test_data.json'), join(fixturesDirectory, '.test', 'test_data.json')); + fs.readdirSync(join(sourceDirectory, 'wav')).forEach(file => { + fs.copyFileSync(join(sourceDirectory, 'wav', file), join(fixturesDirectory, '.test', 'wav', file)); + }); } catch (error) { console.error(error); } diff --git a/binding/web/src/index.ts b/binding/web/src/index.ts index 728a928c..dcd7a8f0 100644 --- a/binding/web/src/index.ts +++ b/binding/web/src/index.ts @@ -1,8 +1,12 @@ -import { Orca } from './orca'; -import { OrcaWorker } from './orca_worker'; +import { Orca, OrcaStream } from './orca'; +import { OrcaWorker, OrcaStreamWorker } from './orca_worker'; import { OrcaModel, + OrcaSynthesizeParams, + OrcaPhoneme, + OrcaAlignment, + OrcaSynthesizeResult, OrcaWorkerInitRequest, OrcaWorkerSynthesizeRequest, OrcaWorkerReleaseRequest, @@ -26,8 +30,15 @@ OrcaWorker.setWasmSimd(orcaWasmSimd); export { Orca, + OrcaStream, + OrcaErrors, OrcaModel, + OrcaSynthesizeParams, + OrcaPhoneme, + OrcaAlignment, + OrcaSynthesizeResult, OrcaWorker, + OrcaStreamWorker, OrcaWorkerInitRequest, OrcaWorkerSynthesizeRequest, OrcaWorkerReleaseRequest, @@ -37,5 +48,4 @@ export { OrcaWorkerReleaseResponse, OrcaWorkerFailureResponse, OrcaWorkerResponse, - OrcaErrors, }; diff --git a/binding/web/src/orca.ts b/binding/web/src/orca.ts index 40e2282b..07cf52ab 100644 --- a/binding/web/src/orca.ts +++ b/binding/web/src/orca.ts @@ -13,6 +13,8 @@ import { Mutex } from 'async-mutex'; +import { simd } from 'wasm-feature-detect'; + import { aligned_alloc_type, arrayBufferToStringAtIndex, @@ -23,9 +25,15 @@ import { PvError, } from '@picovoice/web-utils'; -import { simd } from 'wasm-feature-detect'; - -import { OrcaModel, PvStatus, SynthesizeParams } from './types'; +import { + OrcaAlignment, + OrcaModel, + OrcaPhoneme, + OrcaStreamSynthesizeResult, + OrcaSynthesizeParams, + OrcaSynthesizeResult, + PvStatus, +} from './types'; import * as OrcaErrors from './orca_errors'; import { pvStatusToException } from './orca_errors'; @@ -38,109 +46,379 @@ type pv_orca_delete_type = (object: number) => Promise; type pv_orca_valid_characters_type = (object: number, numCharacters: number, validCharacters: number) => Promise; type pv_orca_valid_characters_delete_type = (validCharacters: number) => Promise; type pv_orca_sample_rate_type = (object: 
number, sampleRate: number) => Promise; -type pv_orca_max_character_limit_type = () => Promise; +type pv_orca_max_character_limit_type = (object: number, maxCharacterLimit: number) => Promise; type pv_orca_synthesize_params_init_type = (object: number) => Promise; type pv_orca_synthesize_params_delete_type = (object: number) => Promise; type pv_orca_synthesize_params_set_speech_rate_type = (object: number, speechRate: number) => Promise; -type pv_orca_synthesize_type = (object: number, text: number, synthesizeParams: number, numSamples: number, pcm: number) => Promise; -type pv_orca_delete_pcm_type = (object: number) => Promise; +type pv_orca_synthesize_params_set_random_state_type = (object: number, randomState: bigint) => Promise; +type pv_orca_synthesize_type = (object: number, text: number, synthesizeParams: number, numSamples: number, pcm: number, numAlignments: number, alignments: number) => Promise; +type pv_orca_pcm_delete_type = (object: number) => Promise; +type pv_orca_word_alignments_delete_type = (numAlignments: number, alignments: number) => Promise; +type pv_orca_stream_open_type = (object: number, synthesizeParams: number, stream: number) => Promise; +type pv_orca_stream_synthesize_type = (object: number, text: number, numSamples: number, pcm: number) => Promise; +type pv_orca_stream_flush_type = (object: number, numSamples: number, pcm: number) => Promise; +type pv_orca_stream_close_type = (object: number) => Promise; type pv_orca_version_type = () => Promise; -type pv_status_to_string_type = (status: number) => Promise type pv_set_sdk_type = (sdk: number) => Promise; type pv_get_error_stack_type = (messageStack: number, messageStackDepth: number) => Promise; type pv_free_error_stack_type = (messageStack: number) => Promise; -/** - * JavaScript/WebAssembly Binding for Orca - */ - type OrcaWasmOutput = { - alignedAlloc: aligned_alloc_type; - memory: WebAssembly.Memory; - pvFree: pv_free_type; - version: string; sampleRate: number; - maxCharacterLimit: number; validCharacters: string[]; + maxCharacterLimit: number; - objectAddress: number; - inputBufferAddress: number; - synthesizeParamsAddressAddress: number; - speechRateAddress: number; + memory: WebAssembly.Memory; + alignedAlloc: aligned_alloc_type; + pvFree: pv_free_type; + pvGetErrorStack: pv_get_error_stack_type; + pvFreeErrorStack: pv_free_error_stack_type; messageStackAddressAddressAddress: number; messageStackDepthAddress: number; + objectAddress: number; pvOrcaDelete: pv_orca_delete_type; pvOrcaSynthesize: pv_orca_synthesize_type; pvOrcaSynthesizeParamsInit: pv_orca_synthesize_params_init_type; pvOrcaSynthesizeParamsDelete: pv_orca_synthesize_params_delete_type; pvOrcaSynthesizeParamsSetSpeechRate: pv_orca_synthesize_params_set_speech_rate_type - pvOrcaDeletePcm: pv_orca_delete_pcm_type; - pvStatusToString: pv_status_to_string_type; - pvGetErrorStack: pv_get_error_stack_type; - pvFreeErrorStack: pv_free_error_stack_type; + pvOrcaSynthesizeParamsSetRandomState: pv_orca_synthesize_params_set_random_state_type + pvOrcaPcmDelete: pv_orca_pcm_delete_type; + pvOrcaWordAlignmentsDelete: pv_orca_word_alignments_delete_type; + + streamPcmAddressAddress: number; + pvOrcaStreamOpen: pv_orca_stream_open_type; + pvOrcaStreamSynthesize: pv_orca_stream_synthesize_type; + pvOrcaStreamFlush: pv_orca_stream_flush_type; + pvOrcaStreamClose: pv_orca_stream_close_type; }; -const PV_STATUS_SUCCESS = 10000; - -export class Orca { - private readonly _pvOrcaDelete: pv_orca_delete_type; - private readonly _pvOrcaSynthesize: 
pv_orca_synthesize_type; - private readonly _pvOrcaSynthesizeParamsInit: pv_orca_synthesize_params_init_type; - private readonly _pvOrcaSynthesizeParamsDelete: pv_orca_synthesize_params_delete_type; - private readonly _pvOrcaSynthesizeParamsSetSpeechRate: pv_orca_synthesize_params_set_speech_rate_type; - private readonly _pvOrcaDeletePcm: pv_orca_delete_pcm_type; +/** + * OrcaStream object that converts a stream of text to a stream of audio. + */ +class Stream { + private _wasmMemory: WebAssembly.Memory; + private readonly _alignedAlloc: CallableFunction; + private readonly _pvFree: pv_free_type; private readonly _pvGetErrorStack: pv_get_error_stack_type; private readonly _pvFreeErrorStack: pv_free_error_stack_type; + private readonly _messageStackAddressAddressAddress: number; + private readonly _messageStackDepthAddress: number; - private _wasmMemory: WebAssembly.Memory | undefined; + private readonly _functionMutex: Mutex; + private readonly _streamPcmAddressAddress: number; + private readonly _pvOrcaPcmDelete: pv_orca_pcm_delete_type; + private readonly _pvOrcaStreamSynthesize: pv_orca_stream_synthesize_type; + private readonly _pvOrcaStreamFlush: pv_orca_stream_flush_type; + private readonly _pvOrcaStreamClose: pv_orca_stream_close_type; + private readonly _streamAddress: number; + private readonly _getMessageStack: any; + + constructor( + wasmMemory: WebAssembly.Memory, + alignedAlloc: CallableFunction, + pvFree: pv_free_type, + pvGetErrorStack: pv_get_error_stack_type, + pvFreeErrorStack: pv_free_error_stack_type, + messageStackAddressAddressAddress: number, + messageStackDepthAddress: number, + functionMutex: Mutex, + streamPcmAddressAddress: number, + pvOrcaPcmDelete: pv_orca_pcm_delete_type, + pvOrcaStreamSynthesize: pv_orca_stream_synthesize_type, + pvOrcaStreamFlush: pv_orca_stream_flush_type, + pvOrcaStreamClose: pv_orca_stream_close_type, + streamAddress: number, + getMessageStack: any, + ) { + this._wasmMemory = wasmMemory; + this._alignedAlloc = alignedAlloc; + this._pvFree = pvFree; + this._pvGetErrorStack = pvGetErrorStack; + this._pvFreeErrorStack = pvFreeErrorStack; + this._messageStackAddressAddressAddress = messageStackAddressAddressAddress; + this._messageStackDepthAddress = messageStackDepthAddress; + this._functionMutex = functionMutex; + this._streamPcmAddressAddress = streamPcmAddressAddress; + this._pvOrcaPcmDelete = pvOrcaPcmDelete; + this._pvOrcaStreamSynthesize = pvOrcaStreamSynthesize; + this._pvOrcaStreamFlush = pvOrcaStreamFlush; + this._pvOrcaStreamClose = pvOrcaStreamClose; + this._streamAddress = streamAddress; + this._getMessageStack = getMessageStack; + } - private readonly _pvFree: pv_free_type; - private readonly _synthesizeMutex: Mutex; + /** + * Adds a chunk of text to the Stream object and generates audio if enough text has been added. + * This function is expected to be called multiple times with consecutive chunks of text from a text stream. + * The incoming text is buffered as it arrives until there is enough context to convert a chunk of the + * buffered text into audio. The caller needs to use `OrcaStream.flush()` to generate the audio chunk + * for the remaining text that has not yet been synthesized. + * + * @param text A chunk of text from a text input stream, comprised of valid characters. + * Valid characters can be retrieved by calling `validCharacters`. + * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. + * They need to be added in a single call to this function. 
+ * The pronunciation is expressed in ARPAbet format, e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`. + * @return The generated audio as a sequence of 16-bit linearly-encoded integers, `null` if no + * audio chunk has been produced. + */ + public async synthesize(text: string): Promise { + if (typeof text !== 'string') { + throw new OrcaErrors.OrcaInvalidArgumentError( + 'The argument \'text\' must be provided as a string', + ); + } - private readonly _objectAddress: number; - private readonly _alignedAlloc: CallableFunction; - private readonly _inputBufferAddress: number; - private readonly _messageStackAddressAddressAddress: number; - private readonly _messageStackDepthAddress: number; + return new Promise((resolve, reject) => { + this._functionMutex + .runExclusive(async () => { + if (this._wasmMemory === undefined) { + throw new OrcaErrors.OrcaInvalidStateError( + 'Attempted to call Orca stream synthesize after release.', + ); + } + + const memoryBufferText = new Uint8Array(this._wasmMemory.buffer); + const encodedText = new TextEncoder().encode(text); + const textAddress = await this._alignedAlloc( + Uint8Array.BYTES_PER_ELEMENT, + (encodedText.length + 1) * Uint8Array.BYTES_PER_ELEMENT, + ); + if (textAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError( + 'malloc failed: Cannot allocate memory', + ); + } + memoryBufferText.set(encodedText, textAddress); + memoryBufferText[textAddress + encodedText.length] = 0; + + const numSamplesAddress = await this._alignedAlloc( + Int32Array.BYTES_PER_ELEMENT, + Int32Array.BYTES_PER_ELEMENT, + ); + if (numSamplesAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); + } + + const streamSynthesizeStatus = await this._pvOrcaStreamSynthesize( + this._streamAddress, + textAddress, + numSamplesAddress, + this._streamPcmAddressAddress, + ); + await this._pvFree(textAddress); + + const memoryBufferView = new DataView(this._wasmMemory.buffer); + const memoryBufferUint8 = new Uint8Array(this._wasmMemory.buffer); + + if (streamSynthesizeStatus !== PvStatus.SUCCESS) { + const messageStack = await this._getMessageStack( + this._pvGetErrorStack, + this._pvFreeErrorStack, + this._messageStackAddressAddressAddress, + this._messageStackDepthAddress, + memoryBufferView, + memoryBufferUint8, + ); + throw pvStatusToException(streamSynthesizeStatus, 'Stream synthesize failed', messageStack); + } + + const pcmAddress = memoryBufferView.getInt32( + this._streamPcmAddressAddress, + true, + ); + + const numSamples = memoryBufferView.getInt32( + numSamplesAddress, + true, + ); + await this._pvFree(numSamplesAddress); + + const outputMemoryBuffer = new Int16Array(this._wasmMemory.buffer); + const pcm = outputMemoryBuffer.slice( + pcmAddress / Int16Array.BYTES_PER_ELEMENT, + (pcmAddress / Int16Array.BYTES_PER_ELEMENT) + numSamples, + ); + await this._pvOrcaPcmDelete(pcmAddress); + + return pcm.length > 0 ? pcm : null; + }) + .then((result: OrcaStreamSynthesizeResult) => { + resolve(result); + }) + .catch(async (error: any) => { + reject(error); + }); + }); + } + + /** + * Generates audio for all the buffered text that was added to the OrcaStream object + * via `OrcaStream.synthesize()`. + * + * @return The generated audio as a sequence of 16-bit linearly-encoded integers, `null` if no + * audio chunk has been produced. 
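+   *
+   * A minimal usage sketch (assumes `orcaStream` was obtained from `orca.streamOpen()`;
+   * `handlePcm` is a hypothetical playback callback):
+   * ```typescript
+   * const remainingPcm = await orcaStream.flush();
+   * if (remainingPcm !== null) {
+   *   handlePcm(remainingPcm);
+   * }
+   * ```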
+ */ + public async flush(): Promise { + return new Promise((resolve, reject) => { + this._functionMutex + .runExclusive(async () => { + if (this._wasmMemory === undefined) { + throw new OrcaErrors.OrcaInvalidStateError('Attempted to call OrcaStream flush after release.'); + } + + const numSamplesAddress = await this._alignedAlloc( + Int32Array.BYTES_PER_ELEMENT, + Int32Array.BYTES_PER_ELEMENT, + ); + if (numSamplesAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); + } + + const pcmAddressAddress = await this._alignedAlloc( + Int32Array.BYTES_PER_ELEMENT, + Int32Array.BYTES_PER_ELEMENT, + ); + if (pcmAddressAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); + } + + const streamFlushStatus = await this._pvOrcaStreamFlush( + this._streamAddress, + numSamplesAddress, + pcmAddressAddress, + ); + + const memoryBufferView = new DataView(this._wasmMemory.buffer); + const memoryBufferUint8 = new Uint8Array(this._wasmMemory.buffer); + + if (streamFlushStatus !== PvStatus.SUCCESS) { + const messageStack = await this._getMessageStack( + this._pvGetErrorStack, + this._pvFreeErrorStack, + this._messageStackAddressAddressAddress, + this._messageStackDepthAddress, + memoryBufferView, + memoryBufferUint8, + ); + + throw pvStatusToException(streamFlushStatus, 'Flush failed', messageStack); + } + + const pcmAddress = memoryBufferView.getInt32( + pcmAddressAddress, + true, + ); + await this._pvFree(pcmAddressAddress); + + const numSamples = memoryBufferView.getInt32( + numSamplesAddress, + true, + ); + await this._pvFree(numSamplesAddress); + + const outputMemoryBuffer = new Int16Array(this._wasmMemory.buffer); + const pcm = outputMemoryBuffer.slice( + pcmAddress / Int16Array.BYTES_PER_ELEMENT, + (pcmAddress / Int16Array.BYTES_PER_ELEMENT) + numSamples, + ); + await this._pvOrcaPcmDelete(pcmAddress); + + return pcm.length > 0 ? pcm : null; + }) + .then((result: OrcaStreamSynthesizeResult) => { + resolve(result); + }) + .catch(async (error: any) => { + reject(error); + }); + }); + } + + /** + * Releases the resources acquired by the OrcaStream object. 
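+   *
+   * Sketch of the expected teardown order (an assumption based on the API surface shown here):
+   * ```typescript
+   * await orcaStream.flush();  // synthesize any remaining buffered text first
+   * await orcaStream.close();  // then release the stream
+   * ```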
+ */ + public async close(): Promise { + await this._pvOrcaStreamClose(this._streamAddress); + } +} + +export type OrcaStream = Stream + +/** + * JavaScript/WebAssembly Binding for Orca + */ +export class Orca { private static _version: string; private static _sampleRate: number; - private static _maxCharacterLimit: number; private static _validCharacters: string[]; + private static _maxCharacterLimit: number; + + private _wasmMemory?: WebAssembly.Memory; + private readonly _alignedAlloc: CallableFunction; + private readonly _pvFree: pv_free_type; + private readonly _pvGetErrorStack: pv_get_error_stack_type; + private readonly _pvFreeErrorStack: pv_free_error_stack_type; + private readonly _messageStackAddressAddressAddress: number; + private readonly _messageStackDepthAddress: number; + + private readonly _objectAddress: number; + private readonly _pvOrcaDelete: pv_orca_delete_type; + private readonly _pvOrcaSynthesize: pv_orca_synthesize_type; + private readonly _pvOrcaSynthesizeParamsInit: pv_orca_synthesize_params_init_type; + private readonly _pvOrcaSynthesizeParamsDelete: pv_orca_synthesize_params_delete_type; + private readonly _pvOrcaSynthesizeParamsSetSpeechRate: pv_orca_synthesize_params_set_speech_rate_type; + private readonly _pvOrcaSynthesizeParamsSetRandomState: pv_orca_synthesize_params_set_random_state_type; + private readonly _pvOrcaPcmDelete: pv_orca_pcm_delete_type; + private readonly _pvOrcaWordAlignmentsDelete: pv_orca_word_alignments_delete_type; + + private readonly _streamPcmAddressAddress: number; + private readonly _pvOrcaStreamOpen: pv_orca_stream_open_type; + private readonly _pvOrcaStreamSynthesize: pv_orca_stream_synthesize_type; + private readonly _pvOrcaStreamFlush: pv_orca_stream_flush_type; + private readonly _pvOrcaStreamClose: pv_orca_stream_close_type; + private readonly _functionMutex: Mutex; + private static _wasm: string; private static _wasmSimd: string; private static _sdk: string = 'web'; private static _orcaMutex = new Mutex(); - private constructor( - handleWasm: OrcaWasmOutput, - ) { + private constructor(handleWasm: OrcaWasmOutput) { Orca._version = handleWasm.version; Orca._sampleRate = handleWasm.sampleRate; - Orca._maxCharacterLimit = handleWasm.maxCharacterLimit; Orca._validCharacters = handleWasm.validCharacters; + Orca._maxCharacterLimit = handleWasm.maxCharacterLimit; + this._wasmMemory = handleWasm.memory; + this._alignedAlloc = handleWasm.alignedAlloc; + this._pvFree = handleWasm.pvFree; + this._pvGetErrorStack = handleWasm.pvGetErrorStack; + this._pvFreeErrorStack = handleWasm.pvFreeErrorStack; + this._messageStackAddressAddressAddress = handleWasm.messageStackAddressAddressAddress; + this._messageStackDepthAddress = handleWasm.messageStackDepthAddress; + + this._objectAddress = handleWasm.objectAddress; this._pvOrcaDelete = handleWasm.pvOrcaDelete; this._pvOrcaSynthesize = handleWasm.pvOrcaSynthesize; this._pvOrcaSynthesizeParamsInit = handleWasm.pvOrcaSynthesizeParamsInit; this._pvOrcaSynthesizeParamsDelete = handleWasm.pvOrcaSynthesizeParamsDelete; this._pvOrcaSynthesizeParamsSetSpeechRate = handleWasm.pvOrcaSynthesizeParamsSetSpeechRate; - this._pvOrcaDeletePcm = handleWasm.pvOrcaDeletePcm; - this._pvGetErrorStack = handleWasm.pvGetErrorStack; - this._pvFreeErrorStack = handleWasm.pvFreeErrorStack; + this._pvOrcaSynthesizeParamsSetRandomState = handleWasm.pvOrcaSynthesizeParamsSetRandomState; + this._pvOrcaPcmDelete = handleWasm.pvOrcaPcmDelete; + this._pvOrcaWordAlignmentsDelete = handleWasm.pvOrcaWordAlignmentsDelete; - 
this._alignedAlloc = handleWasm.alignedAlloc; - this._wasmMemory = handleWasm.memory; - this._pvFree = handleWasm.pvFree; - this._objectAddress = handleWasm.objectAddress; - this._inputBufferAddress = handleWasm.inputBufferAddress; - this._messageStackAddressAddressAddress = handleWasm.messageStackAddressAddressAddress; - this._messageStackDepthAddress = handleWasm.messageStackDepthAddress; + this._streamPcmAddressAddress = handleWasm.streamPcmAddressAddress; + this._pvOrcaStreamOpen = handleWasm.pvOrcaStreamOpen; + this._pvOrcaStreamSynthesize = handleWasm.pvOrcaStreamSynthesize; + this._pvOrcaStreamFlush = handleWasm.pvOrcaStreamFlush; + this._pvOrcaStreamClose = handleWasm.pvOrcaStreamClose; - this._synthesizeMutex = new Mutex(); + this._functionMutex = new Mutex(); } /** @@ -158,17 +436,17 @@ export class Orca { } /** - * Get maximum character limit. + * Get valid characters. */ - get maxCharacterLimit(): number { - return Orca._maxCharacterLimit; + get validCharacters(): string[] { + return Orca._validCharacters; } /** - * Get valid characters. + * Get maximum character limit. */ - get validCharacters(): string[] { - return Orca._validCharacters; + get maxCharacterLimit(): number { + return Orca._maxCharacterLimit; } /** @@ -218,13 +496,10 @@ export class Orca { const customWritePath = (model.customWritePath) ? model.customWritePath : 'orca_model'; const modelPath = await loadModel({ ...model, customWritePath }); - return Orca._init( - accessKey, - modelPath, - ); + return Orca._init(accessKey, modelPath); } - public static async _init( + public static _init( accessKey: string, modelPath: string, ): Promise { @@ -236,7 +511,7 @@ export class Orca { Orca._orcaMutex .runExclusive(async () => { const isSimd = await simd(); - const wasmOutput = await Orca.initWasm(accessKey.trim(), (isSimd) ? this._wasmSimd : this._wasm, modelPath); + const wasmOutput = await Orca.initWasm(accessKey.trim(), modelPath, (isSimd) ? this._wasmSimd : this._wasm); return new Orca(wasmOutput); }) .then((result: Orca) => { @@ -248,35 +523,50 @@ export class Orca { }); } + /** * Generates audio from text. The returned audio contains the speech representation of the text. - * - * @param text Generates audio from text. The returned audio contains the speech representation of the text. * The maximum number of characters per call to `.synthesize()` is `.maxCharacterLimit`. * Allowed characters are lower-case and upper-case letters and punctuation marks that can be retrieved with `.validCharacters`. * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. * The pronunciation is expressed in ARPAbet format, e.g.: "I {live|L IH V} in {Sevilla|S EH V IY Y AH}". + * + * @param text A string of text. * @param synthesizeParams Optional configuration arguments. * @param synthesizeParams.speechRate Configure the rate of speech of the synthesized speech. + * @param synthesizeParams.randomState Configure the random seed for the synthesized speech. + * + * @return A result object containing the generated audio as a sequence of 16-bit linearly-encoded integers + * and a sequence of OrcaAlignment objects representing the word alignments. 
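+   *
+   * Sketch (assumes an initialized `Orca` instance named `orca`):
+   * ```typescript
+   * const { pcm, alignments } = await orca.synthesize('Hello world');
+   * for (const { word, startSec, endSec } of alignments) {
+   *   console.log(`${word}: ${startSec}s -> ${endSec}s`);
+   * }
+   * ```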
*/ - public async synthesize(text: string, synthesizeParams: SynthesizeParams = {}): Promise { + public async synthesize( + text: string, + synthesizeParams: OrcaSynthesizeParams = { + speechRate: 1.0, + randomState: null, + }, + ): Promise { if (typeof text !== 'string') { - throw new OrcaErrors.OrcaInvalidArgumentError('The argument \'text\' must be provided as a string'); + throw new OrcaErrors.OrcaInvalidArgumentError( + `The argument 'text' must be provided as a string`, + ); } - const { - speechRate = 1.0, - } = synthesizeParams; + if (text.trim().length > Orca._maxCharacterLimit) { + throw new OrcaErrors.OrcaInvalidArgumentError(` + 'text' length must be smaller than ${Orca._maxCharacterLimit} + `); + } - return new Promise((resolve, reject) => { - this._synthesizeMutex + return new Promise((resolve, reject) => { + this._functionMutex .runExclusive(async () => { if (this._wasmMemory === undefined) { - throw new OrcaErrors.OrcaInvalidStateError('Attempted to call Orca synthesize after release.'); + throw new OrcaErrors.OrcaInvalidStateError( + 'Attempted to call Orca synthesize after release.', + ); } - const memoryBufferView = new DataView(this._wasmMemory.buffer); - const memoryBufferText = new Uint8Array(this._wasmMemory.buffer); const encodedText = new TextEncoder().encode(text); const textAddress = await this._alignedAlloc( @@ -299,9 +589,11 @@ export class Orca { throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); } + const memoryBufferView = new DataView(this._wasmMemory.buffer); const memoryBufferUint8 = new Uint8Array(this._wasmMemory.buffer); + const initStatus = await this._pvOrcaSynthesizeParamsInit(synthesizeParamsAddressAddress); - if (initStatus !== PV_STATUS_SUCCESS) { + if (initStatus !== PvStatus.SUCCESS) { const messageStack = await Orca.getMessageStack( this._pvGetErrorStack, this._pvFreeErrorStack, @@ -311,23 +603,47 @@ export class Orca { memoryBufferUint8, ); - throw pvStatusToException(initStatus, 'Synthesizing failed', messageStack); + throw pvStatusToException(initStatus, 'Synthesize failed', messageStack); } const synthesizeParamsAddress = memoryBufferView.getInt32(synthesizeParamsAddressAddress, true); await this._pvFree(synthesizeParamsAddressAddress); - const setSpeechRateStatus = await this._pvOrcaSynthesizeParamsSetSpeechRate(synthesizeParamsAddress, speechRate); - if (setSpeechRateStatus !== PV_STATUS_SUCCESS) { - const messageStack = await Orca.getMessageStack( - this._pvGetErrorStack, - this._pvFreeErrorStack, - this._messageStackAddressAddressAddress, - this._messageStackDepthAddress, - memoryBufferView, - memoryBufferUint8, + + if (synthesizeParams.speechRate !== null && synthesizeParams.speechRate !== undefined) { + const setSpeechRateStatus = await this._pvOrcaSynthesizeParamsSetSpeechRate( + synthesizeParamsAddress, + synthesizeParams.speechRate, ); + if (setSpeechRateStatus !== PvStatus.SUCCESS) { + const messageStack = await Orca.getMessageStack( + this._pvGetErrorStack, + this._pvFreeErrorStack, + this._messageStackAddressAddressAddress, + this._messageStackDepthAddress, + memoryBufferView, + memoryBufferUint8, + ); + + throw pvStatusToException(setSpeechRateStatus, 'Synthesize failed', messageStack); + } + } - throw pvStatusToException(setSpeechRateStatus, 'Synthesizing failed', messageStack); + if (synthesizeParams.randomState !== null && synthesizeParams.randomState !== undefined) { + const setRandomStateStatus = await this._pvOrcaSynthesizeParamsSetRandomState( + synthesizeParamsAddress, + 
BigInt(synthesizeParams.randomState), + ); + if (setRandomStateStatus !== PvStatus.SUCCESS) { + const messageStack = await Orca.getMessageStack( + this._pvGetErrorStack, + this._pvFreeErrorStack, + this._messageStackAddressAddressAddress, + this._messageStackDepthAddress, + memoryBufferView, + memoryBufferUint8, + ); + throw pvStatusToException(setRandomStateStatus, 'Synthesize failed', messageStack); + } } const numSamplesAddress = await this._alignedAlloc( @@ -338,11 +654,27 @@ export class Orca { throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); } - const speechAddressAddress = await this._alignedAlloc( + const pcmAddressAddress = await this._alignedAlloc( Int32Array.BYTES_PER_ELEMENT, Int32Array.BYTES_PER_ELEMENT, ); - if (speechAddressAddress === 0) { + if (pcmAddressAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); + } + + const numAlignmentsAddress = await this._alignedAlloc( + Int32Array.BYTES_PER_ELEMENT, + Int32Array.BYTES_PER_ELEMENT, + ); + if (numAlignmentsAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); + } + + const alignmentsAddressAddressAddress = await this._alignedAlloc( + Int32Array.BYTES_PER_ELEMENT, + Int32Array.BYTES_PER_ELEMENT, + ); + if (alignmentsAddressAddressAddress === 0) { throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); } @@ -351,12 +683,14 @@ export class Orca { textAddress, synthesizeParamsAddress, numSamplesAddress, - speechAddressAddress, + pcmAddressAddress, + numAlignmentsAddress, + alignmentsAddressAddressAddress, ); await this._pvFree(textAddress); await this._pvOrcaSynthesizeParamsDelete(synthesizeParamsAddress); - if (synthesizeStatus !== PV_STATUS_SUCCESS) { + if (synthesizeStatus !== PvStatus.SUCCESS) { const messageStack = await Orca.getMessageStack( this._pvGetErrorStack, this._pvFreeErrorStack, @@ -366,14 +700,14 @@ export class Orca { memoryBufferUint8, ); - throw pvStatusToException(synthesizeStatus, 'Synthesizing failed', messageStack); + throw pvStatusToException(synthesizeStatus, 'Synthesize failed', messageStack); } - const speechAddress = memoryBufferView.getInt32( - speechAddressAddress, + const pcmAddress = memoryBufferView.getInt32( + pcmAddressAddress, true, ); - await this._pvFree(speechAddressAddress); + await this._pvFree(pcmAddressAddress); const numSamples = memoryBufferView.getInt32( numSamplesAddress, @@ -382,14 +716,196 @@ export class Orca { await this._pvFree(numSamplesAddress); const outputMemoryBuffer = new Int16Array(this._wasmMemory.buffer); - const speech = outputMemoryBuffer.slice( - speechAddress / Int16Array.BYTES_PER_ELEMENT, - (speechAddress / Int16Array.BYTES_PER_ELEMENT) + numSamples, + const pcm = outputMemoryBuffer.slice( + pcmAddress / Int16Array.BYTES_PER_ELEMENT, + (pcmAddress / Int16Array.BYTES_PER_ELEMENT) + numSamples, + ); + await this._pvOrcaPcmDelete(pcmAddress); + + const numAlignments = memoryBufferView.getInt32(numAlignmentsAddress, true); + const alignmentsAddressAddress = memoryBufferView.getInt32(alignmentsAddressAddressAddress, true); + + let ptr = memoryBufferView.getInt32(alignmentsAddressAddress, true); + const alignments: OrcaAlignment[] = []; + for (let i = 1; i <= numAlignments; i++) { + const wordAddress = memoryBufferView.getInt32(ptr, true); + const word = arrayBufferToStringAtIndex( + memoryBufferUint8, + wordAddress, + ); + ptr += Uint32Array.BYTES_PER_ELEMENT; + const startSec = 
memoryBufferView.getFloat32(ptr, true); + ptr += Float32Array.BYTES_PER_ELEMENT; + const endSec = memoryBufferView.getFloat32(ptr, true); + ptr += Float32Array.BYTES_PER_ELEMENT; + const numPhonemes = memoryBufferView.getInt32(ptr, true); + ptr += Uint32Array.BYTES_PER_ELEMENT; + const phonemesAddress = memoryBufferView.getInt32(ptr, true); + ptr = memoryBufferView.getInt32(alignmentsAddressAddress + (i * Uint32Array.BYTES_PER_ELEMENT), true); + + let phonemesPtr = memoryBufferView.getInt32(phonemesAddress, true); + const phonemes: OrcaPhoneme[] = []; + for (let j = 1; j <= numPhonemes; j++) { + const phonemeAddress = memoryBufferView.getInt32(phonemesPtr, true); + const phoneme = arrayBufferToStringAtIndex( + memoryBufferUint8, + phonemeAddress, + ); + phonemesPtr += Uint32Array.BYTES_PER_ELEMENT; + const pStartSec = memoryBufferView.getFloat32(phonemesPtr, true); + phonemesPtr += Float32Array.BYTES_PER_ELEMENT; + const pEndSec = memoryBufferView.getFloat32(phonemesPtr, true); + phonemesPtr = memoryBufferView.getInt32(phonemesAddress + (j * Uint32Array.BYTES_PER_ELEMENT), true); + phonemes.push({ phoneme, startSec: pStartSec, endSec: pEndSec }); + } + alignments.push({ word, startSec, endSec, phonemes }); + } + await this._pvFree(numAlignmentsAddress); + await this._pvFree(alignmentsAddressAddressAddress); + await this._pvOrcaWordAlignmentsDelete(numAlignments, alignmentsAddressAddress); + + return { pcm, alignments }; + }) + .then((result: OrcaSynthesizeResult) => { + resolve(result); + }) + .catch(async (error: any) => { + reject(error); + }); + }); + } + + /** + * Opens a stream for streaming text synthesis. + * + * @param synthesizeParams Optional configuration arguments. + * @param synthesizeParams.speechRate Configure the rate of speech of the synthesized speech. + * @param synthesizeParams.randomState Configure the random seed for the synthesized speech. + * + * @returns An instance of OrcaStream. 
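+   *
+   * Sketch (assumes an initialized `Orca` instance named `orca`; the parameter values are
+   * illustrative only):
+   * ```typescript
+   * const orcaStream = await orca.streamOpen({ speechRate: 1.0, randomState: 42 });
+   * ```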
+ */ + public async streamOpen( + synthesizeParams: OrcaSynthesizeParams = { + speechRate: 1.0, + randomState: null, + }, + ): Promise { + return new Promise((resolve, reject) => { + this._functionMutex + .runExclusive(async () => { + if (this._wasmMemory === undefined) { + throw new OrcaErrors.OrcaInvalidStateError('Attempted to call Orca stream open after release.'); + } + + const synthesizeParamsAddressAddress = await this._alignedAlloc( + Int32Array.BYTES_PER_ELEMENT, + Int32Array.BYTES_PER_ELEMENT, + ); + if (synthesizeParamsAddressAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); + } + + const memoryBufferView = new DataView(this._wasmMemory.buffer); + const memoryBufferUint8 = new Uint8Array(this._wasmMemory.buffer); + + const initStatus = await this._pvOrcaSynthesizeParamsInit(synthesizeParamsAddressAddress); + if (initStatus !== PvStatus.SUCCESS) { + const messageStack = await Orca.getMessageStack( + this._pvGetErrorStack, + this._pvFreeErrorStack, + this._messageStackAddressAddressAddress, + this._messageStackDepthAddress, + memoryBufferView, + memoryBufferUint8, + ); + + throw pvStatusToException(initStatus, 'Stream open failed', messageStack); + } + + const synthesizeParamsAddress = memoryBufferView.getInt32(synthesizeParamsAddressAddress, true); + await this._pvFree(synthesizeParamsAddressAddress); + + if (synthesizeParams.speechRate !== null && synthesizeParams.speechRate !== undefined) { + const setSpeechRateStatus = await this._pvOrcaSynthesizeParamsSetSpeechRate(synthesizeParamsAddress, synthesizeParams.speechRate); + if (setSpeechRateStatus !== PvStatus.SUCCESS) { + const messageStack = await Orca.getMessageStack( + this._pvGetErrorStack, + this._pvFreeErrorStack, + this._messageStackAddressAddressAddress, + this._messageStackDepthAddress, + memoryBufferView, + memoryBufferUint8, + ); + + throw pvStatusToException(setSpeechRateStatus, 'Stream open failed', messageStack); + } + } + + if (synthesizeParams.randomState !== null && synthesizeParams.randomState !== undefined) { + const setRandomStateStatus = await this._pvOrcaSynthesizeParamsSetRandomState(synthesizeParamsAddress, BigInt(synthesizeParams.randomState)); + if (setRandomStateStatus !== PvStatus.SUCCESS) { + const messageStack = await Orca.getMessageStack( + this._pvGetErrorStack, + this._pvFreeErrorStack, + this._messageStackAddressAddressAddress, + this._messageStackDepthAddress, + memoryBufferView, + memoryBufferUint8, + ); + + throw pvStatusToException(setRandomStateStatus, 'Stream open failed', messageStack); + } + } + + const streamAddressAddress = await this._alignedAlloc( + Int32Array.BYTES_PER_ELEMENT, + Int32Array.BYTES_PER_ELEMENT, + ); + if (streamAddressAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); + } + + const streamOpenStatus = await this._pvOrcaStreamOpen( + this._objectAddress, + synthesizeParamsAddress, + streamAddressAddress, + ); + await this._pvOrcaSynthesizeParamsDelete(synthesizeParamsAddress); + + if (streamOpenStatus !== PvStatus.SUCCESS) { + const messageStack = await Orca.getMessageStack( + this._pvGetErrorStack, + this._pvFreeErrorStack, + this._messageStackAddressAddressAddress, + this._messageStackDepthAddress, + memoryBufferView, + memoryBufferUint8, + ); + + throw pvStatusToException(streamOpenStatus, 'Stream open failed', messageStack); + } + const streamAddress = memoryBufferView.getInt32(streamAddressAddress, true); + await this._pvFree(streamAddressAddress); + + return 
new Stream( + this._wasmMemory, + this._alignedAlloc, + this._pvFree, + this._pvGetErrorStack, + this._pvFreeErrorStack, + this._messageStackAddressAddressAddress, + this._messageStackDepthAddress, + this._functionMutex, + this._streamPcmAddressAddress, + this._pvOrcaPcmDelete, + this._pvOrcaStreamSynthesize, + this._pvOrcaStreamFlush, + this._pvOrcaStreamClose, + streamAddress, + Orca.getMessageStack, ); - await this._pvOrcaDeletePcm(speechAddress); - return speech; }) - .then((result: Int16Array) => { + .then(result => { resolve(result); }) .catch(async (error: any) => { @@ -405,30 +921,16 @@ export class Orca { await this._pvOrcaDelete(this._objectAddress); await this._pvFree(this._messageStackAddressAddressAddress); await this._pvFree(this._messageStackDepthAddress); - await this._pvFree(this._inputBufferAddress); + await this._pvFree(this._streamPcmAddressAddress); delete this._wasmMemory; this._wasmMemory = undefined; } - async onmessage(e: MessageEvent): Promise { - switch (e.data.command) { - case 'synthesize': - await this.synthesize(e.data.text, e.data.speechRate); - break; - default: - // eslint-disable-next-line no-console - console.warn(`Unrecognized command: ${e.data.command}`); - } - } - - private static async initWasm(accessKey: string, wasmBase64: string, modelPath: string): Promise { + private static async initWasm(accessKey: string, modelPath: string, wasmBase64: string): Promise { // A WebAssembly page has a constant size of 64KiB. -> 1MiB ~= 16 pages const memory = new WebAssembly.Memory({ initial: 7500 }); - const memoryBufferUint8 = new Uint8Array(memory.buffer); - const pvError = new PvError(); - const exports = await buildWasm(memory, wasmBase64, pvError); const aligned_alloc = exports.aligned_alloc as aligned_alloc_type; @@ -442,10 +944,15 @@ export class Orca { const pv_orca_synthesize_params_init = exports.pv_orca_synthesize_params_init as pv_orca_synthesize_params_init_type; const pv_orca_synthesize_params_delete = exports.pv_orca_synthesize_params_delete as pv_orca_synthesize_params_delete_type; const pv_orca_synthesize_params_set_speech_rate = exports.pv_orca_synthesize_params_set_speech_rate as pv_orca_synthesize_params_set_speech_rate_type; + const pv_orca_synthesize_params_set_random_state = exports.pv_orca_synthesize_params_set_random_state as pv_orca_synthesize_params_set_random_state_type; const pv_orca_synthesize = exports.pv_orca_synthesize as pv_orca_synthesize_type; - const pv_orca_delete_pcm = exports.pv_orca_delete_pcm as pv_orca_delete_pcm_type; + const pv_orca_pcm_delete = exports.pv_orca_pcm_delete as pv_orca_pcm_delete_type; + const pv_orca_word_alignments_delete = exports.pv_orca_word_alignments_delete as pv_orca_word_alignments_delete_type; + const pv_orca_stream_open = exports.pv_orca_stream_open as pv_orca_stream_open_type; + const pv_orca_stream_synthesize = exports.pv_orca_stream_synthesize as pv_orca_stream_synthesize_type; + const pv_orca_stream_flush = exports.pv_orca_stream_flush as pv_orca_stream_flush_type; + const pv_orca_stream_close = exports.pv_orca_stream_close as pv_orca_stream_close_type; const pv_orca_version = exports.pv_orca_version as pv_orca_version_type; - const pv_status_to_string = exports.pv_status_to_string_type as pv_status_to_string_type; const pv_set_sdk = exports.pv_set_sdk as pv_set_sdk_type; const pv_get_error_stack = exports.pv_get_error_stack as pv_get_error_stack_type; const pv_free_error_stack = exports.pv_free_error_stack as pv_free_error_stack_type; @@ -518,7 +1025,7 @@ export class Orca { 
objectAddressAddress); await pv_free(accessKeyAddress); await pv_free(modelPathAddress); - if (initStatus !== PV_STATUS_SUCCESS) { + if (initStatus !== PvStatus.SUCCESS) { const messageStack = await Orca.getMessageStack( pv_get_error_stack, pv_free_error_stack, @@ -542,7 +1049,7 @@ export class Orca { throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); } const sampleRateStatus = await pv_orca_sample_rate(objectAddress, sampleRateAddress); - if (sampleRateStatus !== PV_STATUS_SUCCESS) { + if (sampleRateStatus !== PvStatus.SUCCESS) { const messageStack = await Orca.getMessageStack( pv_get_error_stack, pv_free_error_stack, @@ -558,6 +1065,30 @@ export class Orca { const sampleRate = memoryBufferView.getInt32(sampleRateAddress, true); await pv_free(sampleRateAddress); + const maxCharacterLimitAddress = await aligned_alloc( + Int32Array.BYTES_PER_ELEMENT, + Int32Array.BYTES_PER_ELEMENT, + ); + if (maxCharacterLimitAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); + } + const maxCharacterLimitStatus = await pv_orca_max_character_limit(objectAddress, maxCharacterLimitAddress); + if (maxCharacterLimitStatus !== PvStatus.SUCCESS) { + const messageStack = await Orca.getMessageStack( + pv_get_error_stack, + pv_free_error_stack, + messageStackAddressAddressAddress, + messageStackDepthAddress, + memoryBufferView, + memoryBufferUint8, + ); + + throw pvStatusToException(maxCharacterLimitStatus, 'Get max character limit failed', messageStack, pvError); + } + + const maxCharacterLimit = memoryBufferView.getInt32(maxCharacterLimitAddress, true); + await pv_free(maxCharacterLimitAddress); + const numCharactersAddress = await aligned_alloc( Int32Array.BYTES_PER_ELEMENT, Int32Array.BYTES_PER_ELEMENT, @@ -579,7 +1110,7 @@ export class Orca { numCharactersAddress, validCharactersAddressAddressAddress, ); - if (validCharactersStatus !== PV_STATUS_SUCCESS) { + if (validCharactersStatus !== PvStatus.SUCCESS) { const messageStack = await Orca.getMessageStack( pv_get_error_stack, pv_free_error_stack, @@ -610,14 +1141,20 @@ export class Orca { await pv_free(validCharactersAddressAddressAddress); await pv_orca_valid_characters_delete(validCharactersAddressAddress); - const maxCharacterLimit = await pv_orca_max_character_limit(); - const versionAddress = await pv_orca_version(); const version = arrayBufferToStringAtIndex( memoryBufferUint8, versionAddress, ); + const streamPcmAddressAddress = await aligned_alloc( + Int32Array.BYTES_PER_ELEMENT, + Int32Array.BYTES_PER_ELEMENT, + ); + if (streamPcmAddressAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); + } + return { memory: memory, pvFree: pv_free, @@ -628,6 +1165,7 @@ export class Orca { sampleRate: sampleRate, maxCharacterLimit: maxCharacterLimit, validCharacters: validCharacters, + streamPcmAddressAddress: streamPcmAddressAddress, messageStackAddressAddressAddress: messageStackAddressAddressAddress, messageStackDepthAddress: messageStackDepthAddress, @@ -635,9 +1173,14 @@ export class Orca { pvOrcaSynthesizeParamsInit: pv_orca_synthesize_params_init, pvOrcaSynthesizeParamsDelete: pv_orca_synthesize_params_delete, pvOrcaSynthesizeParamsSetSpeechRate: pv_orca_synthesize_params_set_speech_rate, + pvOrcaSynthesizeParamsSetRandomState: pv_orca_synthesize_params_set_random_state, pvOrcaSynthesize: pv_orca_synthesize, - pvStatusToString: pv_status_to_string, - pvOrcaDeletePcm: pv_orca_delete_pcm, + pvOrcaPcmDelete: pv_orca_pcm_delete, + 
pvOrcaWordAlignmentsDelete: pv_orca_word_alignments_delete, + pvOrcaStreamOpen: pv_orca_stream_open, + pvOrcaStreamSynthesize: pv_orca_stream_synthesize, + pvOrcaStreamFlush: pv_orca_stream_flush, + pvOrcaStreamClose: pv_orca_stream_close, pvGetErrorStack: pv_get_error_stack, pvFreeErrorStack: pv_free_error_stack, }; @@ -667,7 +1210,7 @@ export class Orca { messageStack.push(message); } - pv_free_error_stack(messageStackAddressAddress); + await pv_free_error_stack(messageStackAddressAddress); return messageStack; } diff --git a/binding/web/src/orca_worker.ts b/binding/web/src/orca_worker.ts index 743d70e5..8f951a2d 100644 --- a/binding/web/src/orca_worker.ts +++ b/binding/web/src/orca_worker.ts @@ -13,16 +13,171 @@ import PvWorker from 'web-worker:./orca_worker_handler.ts'; import { OrcaModel, + OrcaSynthesizeParams, + OrcaSynthesizeResult, + OrcaStreamSynthesizeResult, OrcaWorkerInitResponse, OrcaWorkerSynthesizeResponse, OrcaWorkerReleaseResponse, + OrcaWorkerStreamOpenResponse, + OrcaWorkerStreamSynthesizeResponse, + OrcaWorkerStreamFlushResponse, + OrcaWorkerStreamCloseResponse, PvStatus, - SynthesizeParams, } from './types'; import { loadModel } from '@picovoice/web-utils'; import { pvStatusToException } from './orca_errors'; +class StreamWorker { + readonly _worker: Worker; + + constructor(orcaWorker: Worker) { + this._worker = orcaWorker; + } + + /** + * Adds a chunk of text to the Stream object in a worker and generates audio if enough text has been added. + * This function is expected to be called multiple times with consecutive chunks of text from a text stream. + * The incoming text is buffered as it arrives until there is enough context to convert a chunk of the + * buffered text into audio. The caller needs to use `OrcaStream.flush()` to generate the audio chunk + * for the remaining text that has not yet been synthesized. + * + * @param text A chunk of text from a text input stream, comprised of valid characters. + * Valid characters can be retrieved by calling `validCharacters`. + * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. + * They need to be added in a single call to this function. + * The pronunciation is expressed in ARPAbet format, e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`. + * @return The generated audio as a sequence of 16-bit linearly-encoded integers, `null` if no + * audio chunk has been produced. + */ + public synthesize(text: string): Promise { + const returnPromise: Promise = new Promise( + (resolve, reject) => { + this._worker.onmessage = ( + event: MessageEvent, + ): void => { + switch (event.data.command) { + case 'ok': + resolve(event.data.result); + break; + case 'failed': + case 'error': + // eslint-disable-next-line no-case-declarations + reject(pvStatusToException( + event.data.status, + event.data.shortMessage, + event.data.messageStack, + )); + break; + default: + reject(pvStatusToException( + PvStatus.RUNTIME_ERROR, + // @ts-ignore + `Unrecognized command: ${event.data.command}`, + )); + } + }; + }, + ); + + this._worker.postMessage( + { + command: 'streamSynthesize', + text: text, + }, + ); + + return returnPromise; + } + + /** + * Generates audio for all the buffered text that was added to the OrcaStream object + * via `OrcaStream.synthesize()`. + * + * @return The generated audio as a sequence of 16-bit linearly-encoded integers, `null` if no + * audio chunk has been produced. 
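+   *
+   * Worker-side lifecycle sketch; the worker stream mirrors the main-thread `OrcaStream` API
+   * (assumes `orcaWorker` was created via `OrcaWorker.create()`; `handlePcm` is a hypothetical
+   * playback callback):
+   * ```typescript
+   * const orcaStream = await orcaWorker.streamOpen();
+   * const pcm = await orcaStream.synthesize('Hello, ');
+   * if (pcm !== null) { handlePcm(pcm); }
+   * const remaining = await orcaStream.flush();
+   * if (remaining !== null) { handlePcm(remaining); }
+   * await orcaStream.close();
+   * ```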
+ */ + public flush(): Promise { + const returnPromise: Promise = new Promise( + (resolve, reject) => { + this._worker.onmessage = ( + event: MessageEvent, + ): void => { + switch (event.data.command) { + case 'ok': + resolve(event.data.result); + break; + case 'failed': + case 'error': + reject(pvStatusToException( + event.data.status, + event.data.shortMessage, + event.data.messageStack, + )); + break; + default: + reject(pvStatusToException( + PvStatus.RUNTIME_ERROR, + // @ts-ignore + `Unrecognized command: ${event.data.command}`, + )); + } + }; + }, + ); + + this._worker.postMessage({ + command: 'streamFlush', + }); + + return returnPromise; + } + + /** + * Releases the resources acquired by the OrcaStream object. + */ + public close(): Promise { + const returnPromise: Promise = new Promise((resolve, reject) => { + this._worker.onmessage = ( + event: MessageEvent, + ): void => { + switch (event.data.command) { + case 'ok': + resolve(); + break; + case 'failed': + case 'error': + reject( + pvStatusToException( + event.data.status, + event.data.shortMessage, + event.data.messageStack, + ), + ); + break; + default: + reject( + pvStatusToException( + PvStatus.RUNTIME_ERROR, + // @ts-ignore + `Unrecognized command: ${event.data.command}`, + ), + ); + } + }; + }); + + this._worker.postMessage({ + command: 'streamClose', + }); + + return returnPromise; + } +} + +export type OrcaStreamWorker = StreamWorker + export class OrcaWorker { private readonly _worker: Worker; private readonly _version: string; @@ -186,22 +341,25 @@ export class OrcaWorker { } /** - * Synthesizes speech in a worker. - * The speech result will be supplied with the callback provided when initializing the worker either - * by 'fromBase64' or 'fromPublicDirectory'. - * Can also send a message directly using 'this.worker.postMessage({command: "synthesize", text: "..."})'. + * Generates audio from text in a worker. The returned audio contains the speech representation of the text. + * The maximum number of characters per call to `.synthesize()` is `.maxCharacterLimit`. + * Allowed characters are lower-case and upper-case letters and punctuation marks that can be retrieved with `.validCharacters`. + * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. + * The pronunciation is expressed in ARPAbet format, e.g.: "I {live|L IH V} in {Sevilla|S EH V IY Y AH}". * - * @param text A string of text. + * @param text A string of text with properties described above. * @param synthesizeParams Optional configuration arguments. * @param synthesizeParams.speechRate Configure the rate of speech of the synthesized speech. + * @param synthesizeParams.randomState Configure the random seed for the synthesized speech. * - * @return An Int16Array. + * @return A result object containing the generated audio as a sequence of 16-bit linearly-encoded integers + * and a sequence of OrcaAlignment objects representing the word alignments. */ - public async synthesize( + public synthesize( text: string, - synthesizeParams: SynthesizeParams = {}, - ): Promise { - const returnPromise: Promise = new Promise( + synthesizeParams: OrcaSynthesizeParams = {}, + ): Promise { + const returnPromise: Promise = new Promise( (resolve, reject) => { this._worker.onmessage = ( event: MessageEvent, @@ -283,6 +441,57 @@ export class OrcaWorker { return returnPromise; } + + /** + * Opens a new OrcaStream object in a worker. + * + * @param synthesizeParams Optional configuration arguments. 
+ * @param synthesizeParams.speechRate Configure the rate of speech of the synthesized speech. + * @param synthesizeParams.randomState Configure the random seed for the synthesized speech. + */ + public streamOpen(synthesizeParams: OrcaSynthesizeParams = {}): Promise { + const returnPromise: Promise = new Promise( + (resolve, reject) => { + this._worker.onmessage = ( + event: MessageEvent, + ): void => { + switch (event.data.command) { + case 'ok': + resolve(new StreamWorker(this._worker)); + break; + case 'failed': + case 'error': + reject( + pvStatusToException( + event.data.status, + event.data.shortMessage, + event.data.messageStack, + ), + ); + break; + default: + reject( + pvStatusToException( + PvStatus.RUNTIME_ERROR, + // @ts-ignore + `Unrecognized command: ${event.data.command}`, + ), + ); + } + }; + }, + ); + + this._worker.postMessage( + { + command: 'streamOpen', + synthesizeParams: synthesizeParams, + }, + ); + + return returnPromise; + } + /** * Terminates the active worker. Stops all requests being handled by worker. */ diff --git a/binding/web/src/orca_worker_handler.ts b/binding/web/src/orca_worker_handler.ts index 398d78eb..96c0c66d 100644 --- a/binding/web/src/orca_worker_handler.ts +++ b/binding/web/src/orca_worker_handler.ts @@ -17,6 +17,7 @@ import { OrcaWorkerRequest, PvStatus } from './types'; import { OrcaError } from './orca_errors'; let orca: Orca | null = null; +let orcaStream: any = null; /** * Orca worker handler. @@ -90,7 +91,7 @@ self.onmessage = async function( } else { self.postMessage({ command: 'error', - status: PvStatus.INVALID_STATE, + status: PvStatus.RUNTIME_ERROR, shortMessage: 'Orca synthesize error', }); } @@ -106,6 +107,121 @@ self.onmessage = async function( command: 'ok', }); break; + case 'streamOpen': + if (orca === null) { + self.postMessage({ + command: 'error', + status: PvStatus.INVALID_STATE, + shortMessage: 'Orca not initialized', + }); + return; + } + try { + orcaStream = await orca.streamOpen(event.data.synthesizeParams); + self.postMessage({ + command: 'ok', + }); + } catch (e: any) { + if (e instanceof OrcaError) { + self.postMessage({ + command: 'error', + status: e.status, + shortMessage: e.shortMessage, + messageStack: e.messageStack, + }); + } else { + self.postMessage({ + command: 'error', + status: PvStatus.RUNTIME_ERROR, + shortMessage: 'Orca stream open error', + }); + } + } + break; + case 'streamSynthesize': + if (orca === null) { + self.postMessage({ + command: 'error', + status: PvStatus.INVALID_STATE, + shortMessage: 'Orca not initialized', + }); + return; + } + if (orcaStream === null) { + self.postMessage({ + command: 'error', + status: PvStatus.INVALID_STATE, + shortMessage: 'Orca stream not initialized', + }); + return; + } + try { + self.postMessage({ + command: 'ok', + result: await orcaStream.synthesize(event.data.text), + }); + } catch (e: any) { + if (e instanceof OrcaError) { + self.postMessage({ + command: 'error', + status: e.status, + shortMessage: e.shortMessage, + messageStack: e.messageStack, + }); + } else { + self.postMessage({ + command: 'error', + status: PvStatus.RUNTIME_ERROR, + shortMessage: 'Orca synthesize error', + }); + } + } + break; + case 'streamFlush': + if (orca === null) { + self.postMessage({ + command: 'error', + status: PvStatus.INVALID_STATE, + shortMessage: 'Orca not initialized', + }); + return; + } + if (orcaStream === null) { + self.postMessage({ + command: 'error', + status: PvStatus.INVALID_STATE, + shortMessage: 'Orca stream not initialized', + }); + return; + } + 
self.postMessage({ + command: 'ok', + result: await orcaStream.flush(), + }); + break; + case 'streamClose': + if (orca === null) { + self.postMessage({ + command: 'error', + status: PvStatus.INVALID_STATE, + shortMessage: 'Orca not initialized', + }); + return; + } + if (orcaStream === null) { + self.postMessage({ + command: 'error', + status: PvStatus.INVALID_STATE, + shortMessage: 'Orca stream not initialized', + }); + return; + } + await orcaStream.close(); + orcaStream = null; + self.postMessage({ + command: 'ok', + }); + break; default: self.postMessage({ command: 'failed', diff --git a/binding/web/src/types.ts b/binding/web/src/types.ts index 7c8aed4a..ccee0060 100644 --- a/binding/web/src/types.ts +++ b/binding/web/src/types.ts @@ -27,15 +27,36 @@ export enum PvStatus { ACTIVATION_REFUSED, } -export type SynthesizeParams = { - speechRate?: number -} - /** * OrcaModel types */ export type OrcaModel = PvModel; +export type OrcaSynthesizeParams = { + speechRate?: number; + randomState?: number | null; +} + +export type OrcaPhoneme = { + phoneme: string; + startSec: number; + endSec: number; +} + +export type OrcaAlignment = { + word: string; + startSec: number; + endSec: number; + phonemes: OrcaPhoneme[]; +} + +export type OrcaSynthesizeResult = { + pcm: Int16Array; + alignments: OrcaAlignment[]; +} + +export type OrcaStreamSynthesizeResult = Int16Array | null + export type OrcaWorkerInitRequest = { command: 'init'; accessKey: string; @@ -48,17 +69,39 @@ export type OrcaWorkerInitRequest = { export type OrcaWorkerSynthesizeRequest = { command: 'synthesize'; text: string; - synthesizeParams?: SynthesizeParams; + synthesizeParams: OrcaSynthesizeParams; }; export type OrcaWorkerReleaseRequest = { command: 'release'; }; +export type OrcaWorkerStreamOpenRequest = { + command: 'streamOpen'; + synthesizeParams: OrcaSynthesizeParams; +} + +export type OrcaWorkerStreamSynthesizeRequest = { + command: 'streamSynthesize'; + text: string; +}; + +export type OrcaWorkerStreamFlushRequest = { + command: 'streamFlush'; +}; + +export type OrcaWorkerStreamCloseRequest = { + command: 'streamClose'; +}; + export type OrcaWorkerRequest = | OrcaWorkerInitRequest | OrcaWorkerSynthesizeRequest - | OrcaWorkerReleaseRequest; + | OrcaWorkerReleaseRequest + | OrcaWorkerStreamOpenRequest + | OrcaWorkerStreamSynthesizeRequest + | OrcaWorkerStreamFlushRequest + | OrcaWorkerStreamCloseRequest; export type OrcaWorkerFailureResponse = { command: 'failed' | 'error'; @@ -82,7 +125,7 @@ export type OrcaWorkerSynthesizeResponse = | OrcaWorkerFailureResponse | { command: 'ok'; - result: Int16Array; + result: OrcaSynthesizeResult; }; export type OrcaWorkerReleaseResponse = @@ -91,7 +134,37 @@ export type OrcaWorkerReleaseResponse = command: 'ok'; }; +export type OrcaWorkerStreamOpenResponse = + | OrcaWorkerFailureResponse + | { + command: 'ok'; + result: any; +}; + +export type OrcaWorkerStreamSynthesizeResponse = + | OrcaWorkerFailureResponse + | { + command: 'ok'; + result: OrcaStreamSynthesizeResult; +}; + +export type OrcaWorkerStreamFlushResponse = + | OrcaWorkerFailureResponse + | { + command: 'ok'; + result: OrcaStreamSynthesizeResult; +}; + +export type OrcaWorkerStreamCloseResponse = + | OrcaWorkerFailureResponse + | { + command: 'ok'; +}; + export type OrcaWorkerResponse = | OrcaWorkerInitResponse | OrcaWorkerSynthesizeResponse - | OrcaWorkerReleaseResponse; + | OrcaWorkerReleaseResponse + | OrcaWorkerStreamOpenResponse + | OrcaWorkerStreamSynthesizeResponse + | OrcaWorkerStreamFlushResponse; diff --git 
a/binding/web/test/orca.test.ts b/binding/web/test/orca.test.ts index 6c3af96c..d531733e 100644 --- a/binding/web/test/orca.test.ts +++ b/binding/web/test/orca.test.ts @@ -1,54 +1,42 @@ -import { LeopardWorker } from '@picovoice/leopard-web'; import { Orca, OrcaWorker } from '../'; import { OrcaError } from '../dist/types/orca_errors'; +import { PvModel } from '@picovoice/web-utils'; // @ts-ignore import orcaParamsMale from './orca_params_male'; + // @ts-ignore import orcaParamsFemale from './orca_params_female'; -import { PvModel } from '@picovoice/web-utils'; -import testData from '../cypress/fixtures/.test/test_data.json'; +/* eslint camelcase: 0 */ + +import testData from '../cypress/fixtures/resources/.test/test_data.json'; const ACCESS_KEY = Cypress.env('ACCESS_KEY'); const EXPECTED_MAX_CHARACTER_LIMIT = 2000; const EXPECTED_SAMPLE_RATE = 22050; const EXPECTED_VALID_CHARACTERS = [ - '.', ':', ',', '"', '?', '!', 'a', 'b', - 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', - 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', - 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', - 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', - 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', - 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', - 'Y', 'Z', '\'', '{', '}', '|', ' ', '-', + '.', ':', ',', '"', '?', '!', 'a', + 'b', 'c', 'd', 'e', 'f', 'g', 'h', + 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', + 'w', 'x', 'y', 'z', 'A', 'B', 'C', + 'D', 'E', 'F', 'G', 'H', 'I', 'J', + 'K', 'L', 'M', 'N', 'O', 'P', 'Q', + 'R', 'S', 'T', 'U', 'V', 'W', 'X', + 'Y', 'Z', '\'', '{', '}', '|', ' ', + '-', '1', '2', '3', '4', '5', '6', + '7', '8', '9', '0', '@', '%', '&', ]; -const levenshteinDistance = (words1: string[], words2: string[]) => { - const res = Array.from(Array(words1.length + 1), () => new Array(words2.length + 1)); - for (let i = 0; i <= words1.length; i++) { - res[i][0] = i; - } - for (let j = 0; j <= words2.length; j++) { - res[0][j] = j; - } - for (let i = 1; i <= words1.length; i++) { - for (let j = 1; j <= words2.length; j++) { - res[i][j] = Math.min( - res[i - 1][j] + 1, - res[i][j - 1] + 1, - res[i - 1][j - 1] + (words1[i - 1].toUpperCase() === words2[j - 1].toUpperCase() ? 0 : 1), - ); - } - } - return res[words1.length][words2.length]; -}; +const EXACT_ALIGNMENT_TEST_MODEL_IDENTIFIER = 'female'; -const wordErrorRate = (reference: string, hypothesis: string, useCER = false): number => { - const splitter = (useCER) ? 
'' : ' '; - const ed = levenshteinDistance(reference.split(splitter), hypothesis.split(splitter)); - return ed / reference.length; +const compareArrays = (arr1: Int16Array, arr2: Int16Array, step: number) => { + expect(arr1.length).eq(arr2.length); + for (let i = 0; i < arr1.length - step; i += step) { + expect(arr1[i]).closeTo(arr2[i], 1); + } }; const runInitTest = async ( @@ -65,19 +53,18 @@ const runInitTest = async ( expectFailure = false, } = params; - let orca = null; + let orca: Orca | OrcaWorker | null = null; let isFailed = false; try { orca = await instance.create(accessKey, model); - - expect(typeof orca.version).to.eq('string'); - expect(orca.version.length).to.be.greaterThan(0); - expect(orca.maxCharacterLimit).to.eq(EXPECTED_MAX_CHARACTER_LIMIT); - expect(orca.sampleRate).to.eq(EXPECTED_SAMPLE_RATE); - expect(orca.validCharacters.length).to.eq(EXPECTED_VALID_CHARACTERS.length); + expect(typeof orca.version).eq('string'); + expect(orca.version.length).gt(0); + expect(orca.maxCharacterLimit).eq(EXPECTED_MAX_CHARACTER_LIMIT); + expect(orca.sampleRate).eq(EXPECTED_SAMPLE_RATE); + expect(orca.validCharacters.length).eq(EXPECTED_VALID_CHARACTERS.length); orca.validCharacters.forEach((symbol: string, i: number) => { - expect(symbol).to.eq(EXPECTED_VALID_CHARACTERS[i]); + expect(symbol).eq(EXPECTED_VALID_CHARACTERS[i]); }); } catch (e) { if (expectFailure) { @@ -85,73 +72,12 @@ const runInitTest = async ( } else { expect(e).to.be.undefined; } - } finally { - if (orca !== null) { - if (orca instanceof OrcaWorker) { - orca.terminate(); - } else { - await orca.release(); - } - } } - if (expectFailure) { - expect(isFailed).to.be.true; - } else { - expect(isFailed).to.be.false; - } -}; - -const runProcTest = async ( - instance: typeof Orca | typeof OrcaWorker, - text: string, - speechRate: number, - params: { - accessKey?: string; - model?: PvModel; - isTestWER?: boolean; - expectFailure?: boolean; - } = {}, -) => { - const { - accessKey = ACCESS_KEY, - model = { publicPath: '/test/orca_params_male.pv', forceWrite: true }, - isTestWER = true, - expectFailure = false, - } = params; - - const checkWER = async (pcm: Int16Array) => { - const leopard = await LeopardWorker.create( - accessKey, - { publicPath: '/test/leopard_params.pv', forceWrite: true }, - ); - - const { transcript } = await leopard.process(pcm); - const wer = wordErrorRate(transcript, testData.test_sentences.text_no_punctuation); - expect(wer).lt(testData.wer_threshold); - leopard.terminate(); - }; - - let isFailed = false; - const orca = await instance.create(accessKey, model); - - try { - const speech = await orca.synthesize(text, { speechRate }); - if (isTestWER) { - await checkWER(speech); - } else if (!expectFailure) { - expect(speech.length).gt(0); - } - } catch (e) { - isFailed = true; - } finally { - if (orca !== null) { - if (orca instanceof OrcaWorker) { - orca.terminate(); - } else if (orca instanceof Orca) { - await orca.release(); - } - } + if (orca instanceof OrcaWorker) { + orca.terminate(); + } else if (orca instanceof Orca) { + await orca.release(); } if (expectFailure) { @@ -163,32 +89,26 @@ const runProcTest = async ( describe('Orca Binding', function() { for (const instance of [Orca, OrcaWorker]) { - const instanceString = instance === OrcaWorker ? 'worker' : 'main'; + const instanceString = instance === Orca ? 
'main' : 'worker'; - it(`should be able to handle invalid public path (${instanceString})`, () => { - cy.wrap(null).then(async () => { - await runInitTest(instance, { - model: { publicPath: 'invalid', forceWrite: true }, - expectFailure: true, - }); + it(`should be able to handle invalid public path (${instanceString})`, async () => { + await runInitTest(instance, { + model: { publicPath: 'invalid', forceWrite: true }, + expectFailure: true, }); }); - it(`should be able to handle invalid base64 (${instanceString})`, () => { - cy.wrap(null).then(async () => { - await runInitTest(instance, { - model: { base64: 'invalid', forceWrite: true }, - expectFailure: true, - }); + it(`should be able to handle invalid base64 (${instanceString})`, async () => { + await runInitTest(instance, { + model: { base64: 'invalid', forceWrite: true }, + expectFailure: true, }); }); - it(`should be able to handle invalid access key (${instanceString})`, () => { - cy.wrap(null).then(async () => { - await runInitTest(instance, { - accessKey: 'invalid', - expectFailure: true, - }); + it(`should be able to handle invalid access key (${instanceString})`, async () => { + await runInitTest(instance, { + accessKey: 'invalid', + expectFailure: true, }); }); @@ -196,105 +116,93 @@ describe('Orca Binding', function() { const publicPath = modelFileSuffix === 'male' ? `/test/orca_params_male.pv` : `/test/orca_params_female.pv`; const base64Path = modelFileSuffix === 'male' ? orcaParamsMale : orcaParamsFemale; - it(`should return process and flush error message stack`, async () => { - const orca = await Orca.create( - ACCESS_KEY, - { publicPath: publicPath, forceWrite: true }, - ); - - // @ts-ignore - const objectAddress = orca._objectAddress; - - // @ts-ignore - orca._objectAddress = 0; - - const errors: OrcaError[] = []; - try { - await orca.synthesize('test'); - } catch (e) { - errors.push(e); - } - - // @ts-ignore - orca._objectAddress = objectAddress; - await orca.release(); - - expect(errors.length).to.be.gte(0); - - for (let i = 0; i < errors.length; i++) { - expect((errors[i] as OrcaError).messageStack.length).to.be.gt(0); - expect((errors[i] as OrcaError).messageStack.length).to.be.lte(8); - } - }); - - it(`should return correct error message stack [${modelFileSuffix}] (${instanceString})`, async () => { - let messageStack = []; - try { - const orca = await instance.create('invalidAccessKey', { - publicPath, - forceWrite: true, - }); - expect(orca).to.be.undefined; - } catch (e: any) { - messageStack = e.messageStack; - } - - expect(messageStack.length).to.be.gt(0); - expect(messageStack.length).to.be.lte(8); - - try { - const orca = await instance.create('invalidAccessKey', { - publicPath, - forceWrite: true, - }); - expect(orca).to.be.undefined; - } catch (e: any) { - expect(messageStack.length).to.be.eq(e.messageStack.length); - } + it(`should be able to init with public path [${modelFileSuffix}] (${instanceString})`, async () => { + await runInitTest(instance, { + model: { publicPath, forceWrite: true }, + }); }); - it(`should be able to init with public path [${modelFileSuffix}] (${instanceString})`, () => { - cy.wrap(null).then(async () => { - await runInitTest(instance, { - model: { - publicPath, - forceWrite: true, - }, - }); + it(`should be able to init with base64 [${modelFileSuffix}] (${instanceString})`, async () => { + await runInitTest(instance, { + model: { base64: base64Path, forceWrite: true }, }); }); - it(`should be able to init with base64 [${modelFileSuffix}] (${instanceString})`, () => { - 
cy.wrap(null).then(async () => { - await runInitTest(instance, { - model: { base64: base64Path, forceWrite: true }, - }); + it(`should be able to handle UTF-8 public path [${modelFileSuffix}] (${instanceString})`, async () => { + await runInitTest(instance, { + model: { publicPath, forceWrite: true, customWritePath: '테스트' }, }); }); - it(`should be able to handle UTF-8 public path [${modelFileSuffix}] (${instanceString})`, () => { - cy.wrap(null).then(async () => { - await runInitTest(instance, { - model: { - publicPath, - forceWrite: true, - customWritePath: '테스트', + it(`should be able to process text streaming [${modelFileSuffix}] (${instanceString})`, () => { + try { + cy.getFramesFromFile(`${testData.audio_data_folder}orca_params_${modelFileSuffix}_stream.wav`).then( + async (rawPcm: Int16Array) => { + const orca = await instance.create( + ACCESS_KEY, + { publicPath, forceWrite: true }, + ); + + try { + const orcaStream = await orca.streamOpen({ randomState: testData.random_state }); + + const streamPcm: number[] = []; + for (const c of testData.test_sentences.text.split('')) { + const pcm = await orcaStream.synthesize(c); + if (pcm !== null) { + streamPcm.push(...pcm); + } + } + + const endPcm = await orcaStream.flush(); + if (endPcm !== null) { + streamPcm.push(...endPcm); + } + + compareArrays(new Int16Array(streamPcm), rawPcm, 500); + await orcaStream.close(); + } catch (e) { + expect(e).to.be.undefined; + } + + if (orca instanceof OrcaWorker) { + orca.terminate(); + } else if (orca instanceof Orca) { + await orca.release(); + } }, - }); - }); + ); + } catch (e) { + expect(e).to.be.undefined; + } }); - it(`should be able to handle different speech rates [${modelFileSuffix}] (${instanceString})`, () => { - cy.wrap(null).then(async () => { + if (modelFileSuffix === EXACT_ALIGNMENT_TEST_MODEL_IDENTIFIER) { + it(`should be able to process alignment exact [${modelFileSuffix}] (${instanceString})`, async () => { try { const orca = await instance.create( ACCESS_KEY, { publicPath, forceWrite: true }, ); - const speechSlow = await orca.synthesize(testData.test_sentences.text, { speechRate: 0.7 }); - const speechFast = await orca.synthesize(testData.test_sentences.text, { speechRate: 1.3 }); - expect(speechSlow.length).gt(speechFast.length); + const { + pcm, + alignments, + } = await orca.synthesize(testData.test_sentences.text_alignment, { randomState: testData.random_state }); + expect(pcm.length).gt(0); + expect(alignments.length).eq(testData.alignments.length); + + alignments.forEach((w, i) => { + const { word, start_sec, end_sec, phonemes } = testData.alignments[i]; + expect(w.word).eq(word); + expect(w.startSec).closeTo(start_sec, 0.01); + expect(w.endSec).closeTo(end_sec, 0.01); + w.phonemes.forEach((p, j) => { + expect(p.phoneme).eq(phonemes[j].phoneme); + expect(p.startSec).closeTo(phonemes[j].start_sec, 0.01); + expect(p.endSec).closeTo(phonemes[j].end_sec, 0.01); + }); + }); if (orca instanceof OrcaWorker) { orca.terminate(); @@ -305,19 +213,33 @@ describe('Orca Binding', function() { expect(e).to.be.undefined; } }); - }); - - it(`should be able to handle max num characters [${modelFileSuffix}] (${instanceString})`, () => { - cy.wrap(null).then(async () => { + } else { + it(`should be able to process alignment [${modelFileSuffix}] (${instanceString})`, async () => { try { const orca = await instance.create( ACCESS_KEY, { publicPath, forceWrite: true }, ); - const maxNumChars = orca.maxCharacterLimit; - const speech = await orca.synthesize('a'.repeat(maxNumChars)); - 
expect(speech.length).gt(0); + const { + pcm, + alignments, + } = await orca.synthesize(testData.test_sentences.text_alignment, { randomState: testData.random_state }); + expect(pcm.length).gt(0); + expect(alignments.length).eq(testData.alignments.length); + + let prevWordEndSec = 0; + let prevPhonemeEndSec = 0; + alignments.forEach(w => { + expect(w.startSec).closeTo(prevWordEndSec, 0.001); + expect(w.endSec).gt(w.startSec); + prevWordEndSec = w.endSec; + w.phonemes.forEach(p => { + expect(p.startSec).closeTo(prevPhonemeEndSec, 0.001); + expect(p.endSec).gt(p.startSec); + prevPhonemeEndSec = p.endSec; + }); + }); if (orca instanceof OrcaWorker) { orca.terminate(); @@ -328,19 +250,28 @@ describe('Orca Binding', function() { expect(e).to.be.undefined; } }); - }); + } - it(`should be able to process - punctuation [${modelFileSuffix}] (${instanceString})`, async () => { + it(`should be able to process text [${modelFileSuffix}] (${instanceString})`, () => { try { - await runProcTest( - instance, - testData.test_sentences.text, - 1.0, - { - model: { - publicPath, - forceWrite: true, - }, + cy.getFramesFromFile(`${testData.audio_data_folder}orca_params_${modelFileSuffix}_single.wav`).then( + async rawPcm => { + const orca = await instance.create( + ACCESS_KEY, + { publicPath, forceWrite: true }, + ); + + const { pcm } = await orca.synthesize( + testData.test_sentences.text, + { speechRate: 1, randomState: testData.random_state }, + ); + compareArrays(pcm, rawPcm, 500); + + if (orca instanceof OrcaWorker) { + orca.terminate(); + } else if (orca instanceof Orca) { + await orca.release(); + } }, ); } catch (e) { @@ -350,62 +281,163 @@ describe('Orca Binding', function() { it(`should be able to process - no punctuation [${modelFileSuffix}] (${instanceString})`, async () => { try { - await runProcTest( - instance, - testData.test_sentences.text_no_punctuation, - 1.0, - { - model: { - publicPath, - forceWrite: true, - }, - }, + const orca = await instance.create( + ACCESS_KEY, + { publicPath, forceWrite: true }, ); + + const { pcm } = await orca.synthesize(testData.test_sentences.text_no_punctuation); + expect(pcm.length).gt(0); + + if (orca instanceof OrcaWorker) { + orca.terminate(); + } else if (orca instanceof Orca) { + await orca.release(); + } } catch (e) { expect(e).to.be.undefined; } }); - it(`should be able to process - custom punctuation [${modelFileSuffix}] (${instanceString})`, async () => { + it(`should be able to process custom punctuation [${modelFileSuffix}] (${instanceString})`, async () => { try { - await runProcTest( - instance, - testData.test_sentences.text_custom_pronunciation, - 1.0, - { - model: { - publicPath, - forceWrite: true, - }, - isTestWER: false, - }, + const orca = await instance.create( + ACCESS_KEY, + { publicPath, forceWrite: true }, ); + + const { pcm } = await orca.synthesize(testData.test_sentences.text_custom_pronunciation); + expect(pcm.length).gt(0); + + if (orca instanceof OrcaWorker) { + orca.terminate(); + } else if (orca instanceof Orca) { + await orca.release(); + } } catch (e) { expect(e).to.be.undefined; } }); - for (const failureCase of testData.test_sentences.text_invalid) { - it(`should handle invalid text (${failureCase}) [${modelFileSuffix}] (${instanceString})`, async () => { + it(`should be able to handle different speech rates [${modelFileSuffix}] (${instanceString})`, async () => { + try { + const orca = await instance.create( + ACCESS_KEY, + { publicPath, forceWrite: true }, + ); + + const { pcm: pcmSlow } = await 
orca.synthesize(testData.test_sentences.text, { speechRate: 0.7 }); + const { pcm: pcmFast } = await orca.synthesize(testData.test_sentences.text, { speechRate: 1.3 }); + expect(pcmSlow.length).gt(pcmFast.length); + + if (orca instanceof OrcaWorker) { + orca.terminate(); + } else if (orca instanceof Orca) { + await orca.release(); + } + } catch (e) { + expect(e).to.be.undefined; + } + }); + + it(`should be able to handle max num characters [${modelFileSuffix}] (${instanceString})`, async () => { + try { + const orca = await instance.create( + ACCESS_KEY, + { publicPath, forceWrite: true }, + ); + + const maxNumChars = orca.maxCharacterLimit; + const { pcm } = await orca.synthesize('a'.repeat(maxNumChars)); + expect(pcm.length).gt(0); + + if (orca instanceof OrcaWorker) { + orca.terminate(); + } else if (orca instanceof Orca) { + await orca.release(); + } + } catch (e) { + expect(e).to.be.undefined; + } + }); + + it(`should handle invalid input [${modelFileSuffix}] (${instanceString})`, async () => { + const orca = await instance.create( + ACCESS_KEY, + { publicPath, forceWrite: true }, + ); + + for (const failureCase of testData.test_sentences.text_invalid) { try { - await runProcTest( - instance, - failureCase, - 1.0, - { - model: { - publicPath, - forceWrite: true, - }, - isTestWER: false, - expectFailure: true, - }, - ); + await orca.synthesize(failureCase); } catch (e) { - expect(e).to.be.undefined; + expect(e).not.to.be.undefined; } - }); - } + } + + if (orca instanceof OrcaWorker) { + orca.terminate(); + } else if (orca instanceof Orca) { + await orca.release(); + } + }); + + it(`should return process and flush error message stack [${modelFileSuffix}] (${instanceString})`, async () => { + const orca = await Orca.create( + ACCESS_KEY, + { publicPath: publicPath, forceWrite: true }, + ); + + // @ts-ignore + const objectAddress = orca._objectAddress; + + // @ts-ignore + orca._objectAddress = 0; + + const errors: OrcaError[] = []; + try { + await orca.synthesize('test'); + } catch (e: any) { + errors.push(e); + } + + // @ts-ignore + orca._objectAddress = objectAddress; + await orca.release(); + + expect(errors.length).to.be.gte(0); + + for (let i = 0; i < errors.length; i++) { + expect((errors[i] as OrcaError).messageStack.length).to.be.gt(0); + expect((errors[i] as OrcaError).messageStack.length).to.be.lte(8); + } + }); + + it(`should return correct error message stack [${modelFileSuffix}] (${instanceString})`, async () => { + let messageStack = []; + try { + const orca = await instance.create('invalidAccessKey', { + publicPath, + forceWrite: true, + }); + expect(orca).to.be.undefined; + } catch (e: any) { + messageStack = e.messageStack; + } + + expect(messageStack.length).to.be.gt(0); + expect(messageStack.length).to.be.lte(8); + + try { + const orca = await instance.create('invalidAccessKey', { + publicPath, + forceWrite: true, + }); + expect(orca).to.be.undefined; + } catch (e: any) { + expect(messageStack.length).to.be.eq(e.messageStack.length); + } + }); } } }); diff --git a/binding/web/test/orca_perf.test.ts b/binding/web/test/orca_perf.test.ts index 172f26d3..67419b7d 100644 --- a/binding/web/test/orca_perf.test.ts +++ b/binding/web/test/orca_perf.test.ts @@ -1,5 +1,5 @@ import { Orca, OrcaWorker } from '../'; -import testData from '../cypress/fixtures/.test/test_data.json'; +import testData from '../cypress/fixtures/resources/.test/test_data.json'; const ACCESS_KEY = Cypress.env('ACCESS_KEY'); const NUM_TEST_ITERATIONS = Number(Cypress.env('NUM_TEST_ITERATIONS')); diff --git 
a/demo/android/OrcaDemo/README.md b/demo/android/OrcaDemo/README.md index 73144c72..6b45dd58 100644 --- a/demo/android/OrcaDemo/README.md +++ b/demo/android/OrcaDemo/README.md @@ -2,13 +2,15 @@ ## AccessKey -Orca requires a valid Picovoice `AccessKey` at initialization. `AccessKey` acts as your credentials when using Orca SDKs. +Orca requires a valid Picovoice `AccessKey` at initialization. `AccessKey` acts as your credentials when using Orca +SDKs. You can get your `AccessKey` for free. Make sure to keep your `AccessKey` secret. Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get your `AccessKey`. ## Setup -Replace `"${YOUR_ACCESS_KEY_HERE}"` inside [MainActivity.java](orca-demo-app/src/main/java/ai/picovoice/orcademo/MainActivity.java) +Replace `"${YOUR_ACCESS_KEY_HERE}"` +inside [MainActivity.java](orca-demo-app/src/main/java/ai/picovoice/orcademo/MainActivity.java) with your AccessKey obtained from [Picovoice Console](https://console.picovoice.ai/). 1. Open the project in Android Studio @@ -16,7 +18,7 @@ with your AccessKey obtained from [Picovoice Console](https://console.picovoice. ## Usage -1. Type a phrase that you'd like to synthesize into the textbox at the top. -2. Press the `Synthesize` button to hear the synthesized speech. -3. Press `Stop` if you wish to stop the playback before it completes on its own. +1. Choose between Streaming Synthesis and Single Synthesis using the switch at the top. +2. Type a phrase that you'd like to synthesize into the textbox. +3. Press the `Synthesize` button to hear the synthesized speech. diff --git a/demo/android/OrcaDemo/orca-demo-app/build.gradle b/demo/android/OrcaDemo/orca-demo-app/build.gradle index 2cdae428..8f8353a7 100644 --- a/demo/android/OrcaDemo/orca-demo-app/build.gradle +++ b/demo/android/OrcaDemo/orca-demo-app/build.gradle @@ -7,8 +7,8 @@ android { applicationId "ai.picovoice.orcademo" minSdkVersion 21 targetSdkVersion defaultTargetSdkVersion - versionCode 1 - versionName "1.0" + versionCode 2 + versionName "2.0" testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner" } @@ -30,11 +30,10 @@ android { } dependencies { - implementation 'androidx.appcompat:appcompat:1.6.1' implementation 'com.google.android.material:material:1.8.0' implementation 'androidx.constraintlayout:constraintlayout:2.1.4' - implementation 'ai.picovoice:orca-android:0.1.0' + implementation 'ai.picovoice:orca-android:0.2.0' } tasks.register('copyParams', Copy) { @@ -43,4 +42,4 @@ tasks.register('copyParams', Copy) { into("${rootDir}/orca-demo-app/src/main/assets") } -preBuild.dependsOn(copyParams) \ No newline at end of file +preBuild.dependsOn(copyParams) diff --git a/demo/android/OrcaDemo/orca-demo-app/src/main/java/ai/picovoice/orcademo/MainActivity.java b/demo/android/OrcaDemo/orca-demo-app/src/main/java/ai/picovoice/orcademo/MainActivity.java index cb5e4b7f..3f9653c5 100644 --- a/demo/android/OrcaDemo/orca-demo-app/src/main/java/ai/picovoice/orcademo/MainActivity.java +++ b/demo/android/OrcaDemo/orca-demo-app/src/main/java/ai/picovoice/orcademo/MainActivity.java @@ -20,19 +20,28 @@ import android.os.Looper; import android.text.Editable; import android.text.TextWatcher; +import android.text.method.ScrollingMovementMethod; import android.view.View; -import android.widget.Button; import android.widget.EditText; import android.widget.ProgressBar; import android.widget.TextView; import android.widget.ToggleButton; +import android.media.AudioFormat; +import android.media.AudioManager; +import android.media.AudioTrack; + 
import androidx.appcompat.app.AppCompatActivity; +import androidx.appcompat.widget.SwitchCompat; +import java.util.ArrayList; import java.util.HashSet; import java.util.Set; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -50,9 +59,12 @@ public class MainActivity extends AppCompatActivity { private static final String ACCESS_KEY = "${YOUR_ACCESS_KEY_HERE}"; private static final String MODEL_FILE = "orca_params_female.pv"; + private static final int STREAMING_NUM_AUDIO_WAIT_CHUNKS = 1; private final Handler mainHandler = new Handler(Looper.getMainLooper()); private final ExecutorService executor = Executors.newSingleThreadExecutor(); + private final ExecutorService executorStreamingSynthesis = Executors.newSingleThreadExecutor(); + private final ExecutorService executorStreamingAudio = Executors.newSingleThreadExecutor(); private String synthesizedFilePath; private MediaPlayer synthesizedPlayer; @@ -63,10 +75,15 @@ public class MainActivity extends AppCompatActivity { private Orca orca; + private Orca.OrcaStream orcaStream = null; + TextView errorText; TextView infoTextView; + TextView streamTextView; + TextView streamSecsTextView; TextView numCharsTextView; EditText synthesizeEditText; + SwitchCompat streamSwitch; ToggleButton synthesizeButton; ProgressBar synthesizeProgress; @@ -77,10 +94,14 @@ protected void onCreate(Bundle savedInstanceState) { setContentView(R.layout.orca_demo); errorText = findViewById(R.id.errorTextView); infoTextView = findViewById(R.id.infoTextView); + streamTextView = findViewById(R.id.streamTextView); + streamSecsTextView = findViewById(R.id.streamSecsTextView); numCharsTextView = findViewById(R.id.numCharsTextView); synthesizeEditText = findViewById(R.id.synthesizeEditText); + streamSwitch = findViewById(R.id.streamSwitch); synthesizeButton = findViewById(R.id.synthesizeButton); synthesizeProgress = findViewById(R.id.synthesizeProgress); + streamTextView.setMovementMethod(new ScrollingMovementMethod()); try { orca = new Orca.Builder() @@ -113,9 +134,9 @@ public void afterTextChanged(Editable s) { public void onTextChanged(CharSequence s, int start, int before, int count) { runOnUiThread(() -> numCharsTextView.setText(String.format( - "%d/%d", - s.toString().length(), - orca.getMaxCharacterLimit())) + "%d/%d", + s.toString().length(), + orca.getMaxCharacterLimit())) ); validateText(s.toString()); } @@ -139,37 +160,65 @@ private void setUIState(UIState state) { case EDIT: infoTextView.setVisibility(View.INVISIBLE); synthesizeButton.setVisibility(View.VISIBLE); + streamSwitch.setEnabled(true); synthesizeButton.setEnabled(true); + synthesizeButton.setChecked(false); synthesizeEditText.setEnabled(true); + synthesizeEditText.setVisibility(View.VISIBLE); synthesizeProgress.setVisibility(View.INVISIBLE); + streamTextView.setVisibility(View.INVISIBLE); break; case PLAYBACK: infoTextView.setVisibility(View.VISIBLE); synthesizeButton.setVisibility(View.VISIBLE); + streamSwitch.setEnabled(false); synthesizeButton.setEnabled(true); synthesizeEditText.setEnabled(false); synthesizeProgress.setVisibility(View.INVISIBLE); + streamTextView.setVisibility(View.INVISIBLE); + streamSecsTextView.setVisibility(View.INVISIBLE); + break; + case STREAMING_PLAYBACK: + infoTextView.setText("Streaming..."); + 
infoTextView.setVisibility(View.VISIBLE); + synthesizeButton.setVisibility(View.VISIBLE); + streamSwitch.setEnabled(false); + synthesizeButton.setEnabled(false); + synthesizeEditText.setEnabled(false); + synthesizeProgress.setVisibility(View.INVISIBLE); + streamTextView.setVisibility(View.VISIBLE); + streamSecsTextView.setVisibility(View.VISIBLE); + synthesizeEditText.setVisibility(View.INVISIBLE); break; case BUSY: infoTextView.setVisibility(View.VISIBLE); synthesizeButton.setVisibility(View.INVISIBLE); + streamSwitch.setEnabled(false); synthesizeButton.setEnabled(false); synthesizeEditText.setEnabled(false); synthesizeProgress.setVisibility(View.VISIBLE); + streamTextView.setVisibility(View.INVISIBLE); + streamSecsTextView.setVisibility(View.INVISIBLE); break; case ERROR: infoTextView.setVisibility(View.VISIBLE); errorText.setVisibility(View.INVISIBLE); + streamSwitch.setEnabled(false); synthesizeButton.setEnabled(false); synthesizeEditText.setEnabled(true); synthesizeProgress.setVisibility(View.INVISIBLE); + streamTextView.setVisibility(View.INVISIBLE); + streamSecsTextView.setVisibility(View.INVISIBLE); break; case FATAL_ERROR: infoTextView.setVisibility(View.INVISIBLE); errorText.setVisibility(View.VISIBLE); + streamSwitch.setEnabled(false); synthesizeButton.setEnabled(false); synthesizeEditText.setEnabled(false); synthesizeProgress.setVisibility(View.INVISIBLE); + streamTextView.setVisibility(View.INVISIBLE); + streamSecsTextView.setVisibility(View.INVISIBLE); break; default: break; @@ -225,13 +274,23 @@ private void validateText(String text) { } sb.append(c); } - runOnUiThread(() -> { - setUIState(UIState.ERROR); - infoTextView.setText(String.format( - "Invalid characters in text: [%s]", - sb - )); - }); + if (orcaStream == null) { + runOnUiThread(() -> { + setUIState(UIState.ERROR); + infoTextView.setText(String.format( + "Invalid characters in text: [%s]", + sb + )); + }); + } else { + runOnUiThread(() -> { + infoTextView.setVisibility(View.VISIBLE); + infoTextView.setText(String.format( + "Invalid characters in text will be ignored: [%s]", + sb + )); + }); + } } } } else { @@ -322,6 +381,34 @@ private void stopPlayback() { runOnUiThread(() -> setUIState(UIState.EDIT)); } + public void onStreamSwitchClick(View view) { + if (orca == null) { + displayError("Orca is not initialized"); + streamSwitch.setChecked(false); + return; + } + + try { + if (orcaStream == null) { + orcaStream = orca.streamOpen(new OrcaSynthesizeParams.Builder().build()); + runOnUiThread(() -> { + synthesizeEditText.setText(""); + streamSecsTextView.setText(""); + streamSecsTextView.setVisibility(View.VISIBLE); + }); + } else { + orcaStream.close(); + orcaStream = null; + runOnUiThread(() -> { + synthesizeEditText.setText(""); + streamSecsTextView.setVisibility(View.INVISIBLE); + }); + } + } catch (OrcaException e) { + onOrcaException(e); + } + } + public void onSynthesizeClick(View view) { if (orca == null) { displayError("Orca is not initialized"); @@ -329,21 +416,150 @@ public void onSynthesizeClick(View view) { return; } - if (synthesizeButton.isChecked()) { - String text = synthesizeEditText.getText().toString(); - if (!previousText.equals(text)) { - runSynthesis(text); + String text = synthesizeEditText.getText().toString(); + if (orcaStream == null) { + if (synthesizeButton.isChecked()) { + if (!previousText.equals(text)) { + runSynthesis(text); + } else { + startPlayback(); + } } else { - startPlayback(); + stopPlayback(); } } else { - stopPlayback(); + runStreamSynthesis(text); } } + private void 
runStreamSynthesis(final String text) {
+        setUIState(UIState.STREAMING_PLAYBACK);
+
+        AtomicBoolean isStreamingText = new AtomicBoolean(false);
+        ArrayList<String> textStream = new ArrayList<>();
+
+        AtomicBoolean isQueueingStreamingPcm = new AtomicBoolean(false);
+        ConcurrentLinkedQueue<short[]> pcmQueue = new ConcurrentLinkedQueue<>();
+        CountDownLatch streamingSynthesisLatch = new CountDownLatch(1);
+        CountDownLatch streamingAudioLatch = new CountDownLatch(1);
+
+        executor.submit(() -> {
+            isStreamingText.set(true);
+            streamingSynthesisLatch.countDown();
+
+            String[] words = text.split(" ");
+            for (String word : words) {
+                word += " ";
+                String finalWord = word;
+                mainHandler.post(() -> {
+                    textStream.add(finalWord);
+                    streamTextView.append(finalWord);
+                });
+                try {
+                    Thread.sleep(100);
+                } catch (InterruptedException ignored) { }
+            }
+
+            isStreamingText.set(false);
+        });
+
+        executorStreamingSynthesis.submit(() -> {
+            try {
+                mainHandler.post(() -> {
+                    streamTextView.setText("");
+                    streamSecsTextView.setText("Seconds of audio synthesized: 0.000s");
+                    synthesizeButton.setEnabled(false);
+                });
+
+                int numIterations = 0;
+                boolean isPcmPlayStarted = false;
+                float secs = 0;
+                isQueueingStreamingPcm.set(true);
+
+                streamingSynthesisLatch.await();
+                while (isStreamingText.get() || !textStream.isEmpty()) {
+                    if (!textStream.isEmpty()) {
+                        String word = textStream.remove(0);
+                        try {
+                            short[] pcm = orcaStream.synthesize(word);
+                            if (pcm != null && pcm.length > 0) {
+                                pcmQueue.add(pcm);
+                                secs += (float) pcm.length / orca.getSampleRate();
+                                float finalSecs = secs;
+                                mainHandler.post(() -> streamSecsTextView.setText(String.format("Seconds of audio synthesized: %.3fs", finalSecs)));
+                                if (numIterations == STREAMING_NUM_AUDIO_WAIT_CHUNKS) {
+                                    streamingAudioLatch.countDown();
+                                    isPcmPlayStarted = true;
+                                }
+                                numIterations++;
+                            }
+                        } catch (OrcaException e) {
+                            mainHandler.post(() -> onOrcaException(e));
+                        }
+                    }
+                }
+
+                try {
+                    short[] flushedPcm = orcaStream.flush();
+                    if (flushedPcm != null && flushedPcm.length > 0) {
+                        pcmQueue.add(flushedPcm);
+                        secs += (float) flushedPcm.length / orca.getSampleRate();
+                        float finalSecs = secs;
+                        mainHandler.post(() -> streamSecsTextView.setText(String.format("Seconds of audio synthesized: %.3fs", finalSecs)));
+                    }
+
+                    if (!isPcmPlayStarted) {
+                        streamingAudioLatch.countDown();
+                    }
+                } catch (OrcaException e) {
+                    mainHandler.post(() -> onOrcaException(e));
+                }
+
+                isQueueingStreamingPcm.set(false);
+            } catch (Exception e) {
+                mainHandler.post(() -> displayError(e.toString()));
+            }
+        });
+
+        executorStreamingAudio.submit(() -> {
+            try {
+                AudioTrack audioTrack = new AudioTrack(
+                        AudioManager.STREAM_MUSIC,
+                        orca.getSampleRate(),
+                        AudioFormat.CHANNEL_OUT_MONO,
+                        AudioFormat.ENCODING_PCM_16BIT,
+                        AudioTrack.getMinBufferSize(
+                                orca.getSampleRate(),
+                                AudioFormat.CHANNEL_OUT_MONO,
+                                AudioFormat.ENCODING_PCM_16BIT),
+                        AudioTrack.MODE_STREAM);
+
+                audioTrack.play();
+
+                streamingAudioLatch.await();
+                while (isQueueingStreamingPcm.get() || !pcmQueue.isEmpty()) {
+                    if (!pcmQueue.isEmpty()) {
+                        short[] pcm = pcmQueue.poll();
+                        if (pcm != null && pcm.length > 0) {
+                            audioTrack.write(pcm, 0, pcm.length);
+                        }
+                    }
+                }
+
+                audioTrack.stop();
+                audioTrack.release();
+
+                mainHandler.post(() -> setUIState(UIState.EDIT));
+            } catch (Exception e) {
+                mainHandler.post(() -> displayError(e.toString()));
+            }
+        });
+    }
+
     private enum UIState {
         EDIT,
         PLAYBACK,
+        STREAMING_PLAYBACK,
         BUSY,
         ERROR,
         FATAL_ERROR
diff --git a/demo/android/OrcaDemo/orca-demo-app/src/main/res/layout/orca_demo.xml b/demo/android/OrcaDemo/orca-demo-app/src/main/res/layout/orca_demo.xml
index 912b82b9..f541b179 100644
--- a/demo/android/OrcaDemo/orca-demo-app/src/main/res/layout/orca_demo.xml
+++ b/demo/android/OrcaDemo/orca-demo-app/src/main/res/layout/orca_demo.xml
@@ -1,5 +1,6 @@
[The XML markup of this hunk was stripped during extraction and cannot be reconstructed verbatim; it updates orca_demo.xml with the views the new MainActivity code references: streamSwitch, streamTextView, and streamSecsTextView.]
diff --git a/demo/c/CMakeLists.txt b/demo/c/CMakeLists.txt
index 12e37908..3efdb691 100644
--- a/demo/c/CMakeLists.txt
+++ b/demo/c/CMakeLists.txt
@@ -1,13 +1,18 @@
 cmake_minimum_required(VERSION 3.13)
 
-project(orca_demo)
+project(orca_demo_c)
 
 set(CMAKE_C_STANDARD 99)
 set(CMAKE_BUILD_TYPE Release)
+set(COMMON_LIBS dl)
 
 include_directories("${PROJECT_SOURCE_DIR}/../../include")
 
 add_executable(orca_demo orca_demo.c)
+add_executable(orca_demo_streaming orca_demo_streaming.c)
+target_include_directories(orca_demo_streaming PRIVATE dr_libs)
+
 if (NOT WIN32)
-    target_link_libraries(orca_demo dl)
+    target_link_libraries(orca_demo ${COMMON_LIBS})
+    target_link_libraries(orca_demo_streaming ${COMMON_LIBS})
 endif()
diff --git a/demo/c/README.md b/demo/c/README.md
index 93e42357..715acb9a 100644
--- a/demo/c/README.md
+++ b/demo/c/README.md
@@ -16,33 +16,63 @@ Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get you
 - The demo requires [CMake](https://cmake.org/) version 3.4 or higher.
 - **For Windows Only**: [MinGW](https://www.mingw-w64.org/) is required to build the demo.
 
-# Speech Synthesis Demo
+# Speech Synthesis Demos
 
+Orca supports two modes of operation: streaming and single synthesis.
+In the streaming synthesis mode, Orca processes an incoming text stream in real-time and generates audio in parallel.
+This is demonstrated in the Orca streaming demo.
+In the single synthesis mode, the text is synthesized in a single call to the Orca engine.
 
 **Note**: the following commands are run from the root of the repo.
 
-## Build
+## Streaming Synthesis Demo
+
+### Build
 
 Use CMake to build the Orca demo target:
 
 ```console
-cmake -S demo/c/ -B demo/c/build && cmake --build demo/c/build --target orca_demo
+cmake -S demo/c/ -B demo/c/build && cmake --build demo/c/build --target orca_demo_streaming
 ```
 
-## Usage
+### Usage
 
 Running the executable without any command-line arguments prints the usage info to the console:
 
 ```console
-Usage: orca_demo [-l LIBRARY_PATH -m MODEL_PATH -a ACCESS_KEY -t TEXT -o OUTPUT_PATH]
+Usage: orca_demo_streaming [-l LIBRARY_PATH -m MODEL_PATH -a ACCESS_KEY -t TEXT -o OUTPUT_PATH]
+```
+
+To run the Orca streaming demo:
+
+```console
+./demo/c/build/orca_demo_streaming -l ${LIBRARY_PATH} -m ${MODEL_PATH} -a ${ACCESS_KEY} -t ${TEXT} -o ${OUTPUT_PATH}
+```
+
+Replace `${LIBRARY_PATH}` with the path to the appropriate library available under [lib](../../lib), `${MODEL_PATH}` with
+a path to any of the model files available under [lib/common](../../lib/common), `${ACCESS_KEY}` with AccessKey
+obtained from [Picovoice Console](https://console.picovoice.ai/), `${TEXT}` with the text to be synthesized,
+and `${WAV_OUTPUT_PATH}` with a path to an output audio file.
+The audio will be stored as a single-channel 16-bit PCM `.wav` file.
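+
+For example, on Linux (x86_64) an invocation might look as follows; the library and model paths below are
+illustrative, so substitute the ones that match your platform and setup:
+
+```console
+./demo/c/build/orca_demo_streaming \
+  -l lib/linux/x86_64/libpv_orca.so \
+  -m lib/common/orca_params_female.pv \
+  -a ${ACCESS_KEY} \
+  -t "Streaming synthesis generates audio while text is still arriving." \
+  -o streaming_output.wav
+```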
+
+## Single Synthesis Demo
+
+### Build
+
+Use CMake to build the Orca demo target:
+
+```console
+cmake -S demo/c/ -B demo/c/build && cmake --build demo/c/build --target orca_demo
+```
+
+### Usage
+
 To run the Orca demo:
 
 ```console
 ./demo/c/build/orca_demo -l ${LIBRARY_PATH} -m ${MODEL_PATH} -a ${ACCESS_KEY} -t ${TEXT} -o ${OUTPUT_PATH}
 ```
 
-Replace `${LIBRARY_PATH}` with path to appropriate library available under [lib](../../lib), `${MODEL_PATH}` with
+Replace `${LIBRARY_PATH}` with the path to the appropriate library available under [lib](../../lib), `${MODEL_PATH}` with
 a path to any of the model files available under [lib/common](../../lib/common), `${ACCESS_KEY}` with AccessKey
 obtained from [Picovoice Console](https://console.picovoice.ai/), `${TEXT}` with the text to be synthesized,
 and `${WAV_OUTPUT_PATH}` with a path to a output audio file.
diff --git a/demo/c/dr_libs b/demo/c/dr_libs
new file mode 160000
index 00000000..da35f9d6
--- /dev/null
+++ b/demo/c/dr_libs
@@ -0,0 +1 @@
+Subproject commit da35f9d6c7374a95353fd1df1d394d44ab66cf01
diff --git a/demo/c/orca_demo.c b/demo/c/orca_demo.c
index 93aa1279..35a0c62c 100644
--- a/demo/c/orca_demo.c
+++ b/demo/c/orca_demo.c
@@ -140,7 +140,7 @@ int picovoice_main(int argc, char **argv) {
 
     void *orca_library = open_dl(library_path);
     if (!orca_library) {
-        fprintf(stderr, "Failed to open library at '%s'.\n", library_path);
+        fprintf(stderr, "Failed to open library at `%s`.\n", library_path);
         exit(EXIT_FAILURE);
     }
 
@@ -164,20 +164,6 @@ int picovoice_main(int argc, char **argv) {
         exit(EXIT_FAILURE);
     }
 
-    pv_status_t (*pv_orca_valid_characters_func)(pv_orca_t *, int32_t *, const char *const **) =
-            load_symbol(orca_library, "pv_orca_valid_characters");
-    if (!pv_orca_valid_characters_func) {
-        print_dl_error("Failed to load 'pv_orca_valid_characters'");
-        exit(EXIT_FAILURE);
-    }
-
-    pv_status_t (*pv_orca_sample_rate_func)(pv_orca_t *, int32_t *) =
-            load_symbol(orca_library, "pv_orca_sample_rate");
-    if (!pv_orca_sample_rate_func) {
-        print_dl_error("Failed to load 'pv_orca_sample_rate'");
-        exit(EXIT_FAILURE);
-    }
-
     pv_status_t (*pv_orca_synthesize_params_init_func)(pv_orca_synthesize_params_t **) =
             load_symbol(orca_library, "pv_orca_synthesize_params_init");
     if (!pv_orca_synthesize_params_init_func) {
@@ -192,23 +178,29 @@ int picovoice_main(int argc, char **argv) {
         exit(EXIT_FAILURE);
     }
 
-    pv_status_t (*pv_orca_synthesize_params_set_speech_rate_func)(pv_orca_synthesize_params_t *, float) =
-            load_symbol(orca_library, "pv_orca_synthesize_params_set_speech_rate");
-    if (!pv_orca_synthesize_params_set_speech_rate_func) {
-        print_dl_error("Failed to load 'pv_orca_synthesize_params_set_speech_rate'");
-        exit(EXIT_FAILURE);
-    }
-
-    pv_status_t (*pv_orca_synthesize_to_file_func)(pv_orca_t *, const char *, const pv_orca_synthesize_params_t *, const char *) =
+    pv_status_t (*pv_orca_synthesize_to_file_func)(
+            pv_orca_t *,
+            const char *,
+            const pv_orca_synthesize_params_t *,
+            const char *,
+            int32_t *num_alignments,
+            pv_orca_word_alignment_t ***alignments) =
             load_symbol(orca_library, "pv_orca_synthesize_to_file");
     if (!pv_orca_synthesize_to_file_func) {
         print_dl_error("Failed to load 'pv_orca_synthesize_to_file'");
         exit(EXIT_FAILURE);
     }
 
-    void (*pv_orca_delete_pcm_func)(int16_t *) = load_symbol(orca_library, "pv_orca_delete_pcm");
-    if (!pv_orca_delete_pcm_func) {
-        print_dl_error("Failed to load 'pv_orca_delete_pcm'");
+    pv_status_t (*pv_orca_word_alignments_delete_func)(int32_t, pv_orca_word_alignment_t **) =
+            load_symbol(orca_library, 
"pv_orca_word_alignments_delete"); + if (!pv_orca_word_alignments_delete_func) { + print_dl_error("Failed to load 'pv_orca_word_alignments_delete'"); + exit(EXIT_FAILURE); + } + + void (*pv_orca_pcm_delete_func)(int16_t *) = load_symbol(orca_library, "pv_orca_pcm_delete"); + if (!pv_orca_pcm_delete_func) { + print_dl_error("Failed to load 'pv_orca_pcm_delete'"); exit(EXIT_FAILURE); } @@ -242,10 +234,10 @@ int picovoice_main(int argc, char **argv) { pv_orca_t *orca = NULL; pv_status_t orca_status = pv_orca_init_func(access_key, model_path, &orca); if (orca_status != PV_STATUS_SUCCESS) { - fprintf(stderr, "Failed to create an instance of Orca with '%s'", pv_status_to_string_func(orca_status)); + fprintf(stderr, "Failed to create an instance of Orca with `%s`", pv_status_to_string_func(orca_status)); error_status = pv_get_error_stack_func(&message_stack, &message_stack_depth); if (error_status != PV_STATUS_SUCCESS) { - fprintf(stderr, ".\nUnable to get Orca error state with '%s'.\n", pv_status_to_string_func(error_status)); + fprintf(stderr, ".\nUnable to get Orca error state with `%s`.\n", pv_status_to_string_func(error_status)); exit(EXIT_FAILURE); } @@ -260,8 +252,9 @@ int picovoice_main(int argc, char **argv) { struct timeval after; gettimeofday(&after, NULL); - double init_sec = ((double) (after.tv_sec - before.tv_sec) + - ((double) (after.tv_usec - before.tv_usec)) * 1e-6); + double init_sec = + ((double) (after.tv_sec - before.tv_sec) + + ((double) (after.tv_usec - before.tv_usec)) * 1e-6); fprintf(stdout, "Initialized Orca in %.1f sec\n", init_sec); pv_orca_synthesize_params_t *synthesize_params = NULL; @@ -269,13 +262,13 @@ int picovoice_main(int argc, char **argv) { if (synthesize_params_status != PV_STATUS_SUCCESS) { fprintf( stderr, - "Failed to create an instance of Orca synthesize params with '%s'", + "Failed to create an instance of Orca synthesize params with `%s`", pv_status_to_string_func(synthesize_params_status)); error_status = pv_get_error_stack_func(&message_stack, &message_stack_depth); if (error_status != PV_STATUS_SUCCESS) { fprintf( stderr, - ".\nUnable to get Orca synthesize params error state with '%s'.\n", + ".\nUnable to get Orca synthesize params error state with `%s`.\n", pv_status_to_string_func(error_status)); exit(EXIT_FAILURE); } @@ -291,23 +284,27 @@ int picovoice_main(int argc, char **argv) { double proc_sec = 0.; gettimeofday(&before, NULL); - fprintf(stdout, "Synthesizing text '%s' ...\n", text); + fprintf(stdout, "\nSynthesizing text `%s`\n", text); + int32_t num_alignments = 0; + pv_orca_word_alignment_t **alignments = NULL; pv_status_t synthesize_status = pv_orca_synthesize_to_file_func( orca, text, synthesize_params, - output_path); + output_path, + &num_alignments, + &alignments); if (synthesize_status != PV_STATUS_SUCCESS) { fprintf( stderr, - "Failed to synthesize text with '%s'", + "Failed to synthesize text with `%s`", pv_status_to_string_func(synthesize_params_status)); error_status = pv_get_error_stack_func(&message_stack, &message_stack_depth); if (error_status != PV_STATUS_SUCCESS) { fprintf( stderr, - ".\nUnable to get Orca synthesize error state with '%s'.\n", + ".\nUnable to get Orca synthesize error state with `%s`.\n", pv_status_to_string_func(error_status)); exit(EXIT_FAILURE); } @@ -322,12 +319,45 @@ int picovoice_main(int argc, char **argv) { gettimeofday(&after, NULL); - proc_sec += ((double) (after.tv_sec - before.tv_sec) + - ((double) (after.tv_usec - before.tv_usec)) * 1e-6); + proc_sec += + ((double) (after.tv_sec - 
before.tv_sec) + + ((double) (after.tv_usec - before.tv_usec)) * 1e-6); + + if (num_alignments > 0) { + fprintf(stdout, "\nWord alignments"); + if (num_alignments > 3) { + fprintf(stdout, " (only showing first 3):\n"); + } else { + fprintf(stdout, ":\n"); + } + int32_t num_alignments_shown = num_alignments > 3 ? 3 : num_alignments; + for (int32_t i = 0; i < num_alignments_shown; i++) { + fprintf( + stdout, + "word=\"%s\", start_sec=%.2f, end_sec=%.2f\n", + alignments[i]->word, + alignments[i]->start_sec, + alignments[i]->end_sec); + for (int32_t j = 0; j < alignments[i]->num_phonemes; j++) { + fprintf( + stdout, + "\tphoneme=\"%s\", start_sec=%.2f, end_sec=%.2f\n", + alignments[i]->phonemes[j]->phoneme, + alignments[i]->phonemes[j]->start_sec, + alignments[i]->phonemes[j]->end_sec); + } + } + } - fprintf(stdout, "Synthesized text in %.1f sec\n", proc_sec); + fprintf(stdout, "\nSynthesized text in %.2f sec\n", proc_sec); fprintf(stdout, "Saved audio to `%s`\n", output_path); + pv_status_t delete_status = pv_orca_word_alignments_delete_func(num_alignments, alignments); + if (delete_status != PV_STATUS_SUCCESS) { + fprintf(stderr, "Failed to delete word alignments with `%s`.\n", pv_status_to_string_func(delete_status)); + exit(EXIT_FAILURE); + } + pv_orca_synthesize_params_delete_func(synthesize_params); pv_orca_delete_func(orca); close_dl(orca_library); diff --git a/demo/c/orca_demo_streaming.c b/demo/c/orca_demo_streaming.c new file mode 100644 index 00000000..238f3918 --- /dev/null +++ b/demo/c/orca_demo_streaming.c @@ -0,0 +1,623 @@ +/* +Copyright 2024 Picovoice Inc. + +You may not use this file except in compliance with the license. A copy of +the license is located in the "LICENSE" file accompanying this source. + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +License for the specific language governing permissions and limitations under +the License. 
+*/
+
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+
+#if !(defined(_WIN32) || defined(_WIN64))
+
+#include <dlfcn.h>
+
+#else
+
+#include <windows.h>
+
+#define UTF8_COMPOSITION_FLAG (0)
+#define NULL_TERMINATED (-1)
+
+#endif
+
+#define DR_WAV_IMPLEMENTATION
+
+#include "dr_wav.h"
+
+#include "pv_orca.h"
+
+#define MAX_NUM_CHUNKS (500)
+#define MAX_NUM_BYTES_PER_CHARACTER (5)
+
+static void *open_dl(const char *dl_path) {
+
+#if defined(_WIN32) || defined(_WIN64)
+
+    return LoadLibrary(dl_path);
+
+#else
+
+    return dlopen(dl_path, RTLD_NOW);
+
+#endif
+}
+
+static void *load_symbol(void *handle, const char *symbol) {
+
+#if defined(_WIN32) || defined(_WIN64)
+
+    return GetProcAddress((HMODULE) handle, symbol);
+
+#else
+
+    return dlsym(handle, symbol);
+
+#endif
+}
+
+static void close_dl(void *handle) {
+
+#if defined(_WIN32) || defined(_WIN64)
+
+    FreeLibrary((HMODULE) handle);
+
+#else
+
+    dlclose(handle);
+
+#endif
+}
+
+static void print_dl_error(const char *message) {
+
+#if defined(_WIN32) || defined(_WIN64)
+
+    fprintf(stderr, "%s with code `%lu`.\n", message, GetLastError());
+
+#else
+
+    fprintf(stderr, "%s with `%s`.\n", message, dlerror());
+
+#endif
+}
+
+static struct option long_options[] = {
+        {"access_key", required_argument, NULL, 'a'},
+        {"library_path", required_argument, NULL, 'l'},
+        {"model_path", required_argument, NULL, 'm'},
+        {"text", required_argument, NULL, 't'},
+        {"output_path", required_argument, NULL, 'o'},
+};
+
+static pv_status_t num_bytes_character(unsigned char c, int32_t *num_bytes) {
+    *num_bytes = 0;
+
+    int32_t nb;
+    if ((c & 0x80) == 0x00) {
+        nb = 1;
+    } else if ((c & 0xE0) == 0xC0) {
+        nb = 2;
+    } else if ((c & 0xF0) == 0xE0) {
+        nb = 3;
+    } else if ((c & 0xF8) == 0xF0) {
+        nb = 4;
+    } else {
+        return PV_STATUS_INVALID_ARGUMENT;
+    }
+
+    *num_bytes = nb;
+
+    return PV_STATUS_SUCCESS;
+}
+
+static double get_time() {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return (double) tv.tv_sec + ((double) tv.tv_usec * 1e-6);
+}
+
+static void print_usage(const char *program_name) {
+    fprintf(
+            stdout,
+            "Usage: %s [-l LIBRARY_PATH -m MODEL_PATH -a ACCESS_KEY -t TEXT -o OUTPUT_PATH]\n",
+            program_name);
+}
+
+typedef struct pcm_chunk pcm_chunk_t;
+
+struct pcm_chunk {
+    int32_t num_samples;
+    int16_t *pcm;
+    pcm_chunk_t *next;
+};
+
+static pv_status_t pcm_chunk_init(
+        int32_t num_samples,
+        int16_t *pcm,
+        pcm_chunk_t **chunk) {
+    *chunk = NULL;
+
+    pcm_chunk_t *c = calloc(1, sizeof(pcm_chunk_t));
+    if (!c) {
+        return PV_STATUS_OUT_OF_MEMORY;
+    }
+
+    c->pcm = pcm;
+    c->num_samples = num_samples;
+    c->next = NULL;
+
+    *chunk = c;
+
+    return PV_STATUS_SUCCESS;
+}
+
+static pv_status_t pcm_chunk_delete(pcm_chunk_t *chunk) {
+    if (chunk) {
+        free(chunk->pcm);
+        free(chunk);
+    }
+    return PV_STATUS_SUCCESS;
+}
+
+void print_error_message(char **message_stack, int32_t message_stack_depth) {
+    for (int32_t i = 0; i < message_stack_depth; i++) {
+        fprintf(stderr, "  [%d] %s\n", i, message_stack[i]);
+    }
+}
+
+void handle_error(
+        char **message_stack,
+        int32_t message_stack_depth,
+        pv_status_t (*pv_get_error_stack_func)(char ***, int32_t *),
+        void (*pv_free_error_stack_func)(char **),
+        const char *(*pv_status_to_string_func)(pv_status_t)) {
+    pv_status_t error_status = pv_get_error_stack_func(&message_stack, &message_stack_depth);
+
+    if (error_status != PV_STATUS_SUCCESS) {
+        fprintf(stderr, ".\nUnable to get Orca error state with '%s'\n", pv_status_to_string_func(error_status));
+        exit(EXIT_FAILURE);
+    }
+
+    if (message_stack_depth > 0) {
+        fprintf(stderr, ":\n");
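+        // print each message in the Orca error stack
+        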
for (int32_t i = 0; i < message_stack_depth; i++) { + fprintf(stderr, " [%d] %s\n", i, message_stack[i]); + } + } + + pv_free_error_stack_func(message_stack); +} + +int32_t picovoice_main(int32_t argc, char **argv) { + const char *library_path = NULL; + const char *model_path = NULL; + const char *access_key = NULL; + const char *text = NULL; + const char *output_path = NULL; + + int32_t c; + while ((c = getopt_long(argc, argv, "l:m:a:t:o:", long_options, NULL)) != -1) { + switch (c) { + case 'l': + library_path = optarg; + break; + case 'm': + model_path = optarg; + break; + case 'a': + access_key = optarg; + break; + case 't': + text = optarg; + break; + case 'o': + output_path = optarg; + break; + default: + exit(EXIT_FAILURE); + } + } + + if (!library_path || !model_path || !access_key || !text || !output_path) { + print_usage(argv[0]); + exit(EXIT_FAILURE); + } + + void *orca_library = open_dl(library_path); + if (!orca_library) { + fprintf(stderr, "Failed to open library at `%s`.\n", library_path); + exit(EXIT_FAILURE); + } + + const char *(*pv_status_to_string_func)(pv_status_t) = + load_symbol(orca_library, "pv_status_to_string"); + if (!pv_status_to_string_func) { + print_dl_error("Failed to load 'pv_status_to_string'"); + exit(EXIT_FAILURE); + } + + pv_status_t (*pv_orca_init_func)(const char *, const char *, pv_orca_t **) = + load_symbol(orca_library, "pv_orca_init"); + if (!pv_orca_init_func) { + print_dl_error("Failed to load 'pv_orca_init'"); + exit(EXIT_FAILURE); + } + + void (*pv_orca_delete_func)(pv_orca_t *) = load_symbol(orca_library, "pv_orca_delete"); + if (!pv_orca_delete_func) { + print_dl_error("Failed to load 'pv_orca_delete'"); + exit(EXIT_FAILURE); + } + + pv_status_t (*pv_orca_sample_rate_func)(pv_orca_t *, int32_t *) = + load_symbol(orca_library, "pv_orca_sample_rate"); + if (!pv_orca_sample_rate_func) { + print_dl_error("Failed to load 'pv_orca_sample_rate'"); + exit(EXIT_FAILURE); + } + + pv_status_t (*pv_orca_synthesize_params_init_func)(pv_orca_synthesize_params_t **) = + load_symbol(orca_library, "pv_orca_synthesize_params_init"); + if (!pv_orca_synthesize_params_init_func) { + print_dl_error("Failed to load 'pv_orca_synthesize_params_init'"); + exit(EXIT_FAILURE); + } + + void (*pv_orca_synthesize_params_delete_func)(pv_orca_synthesize_params_t *) = + load_symbol(orca_library, "pv_orca_synthesize_params_delete"); + if (!pv_orca_synthesize_params_delete_func) { + print_dl_error("Failed to load 'pv_orca_synthesize_params_delete'"); + exit(EXIT_FAILURE); + } + + void (*pv_orca_pcm_delete_func)(int16_t *) = load_symbol(orca_library, "pv_orca_pcm_delete"); + if (!pv_orca_pcm_delete_func) { + print_dl_error("Failed to load 'pv_orca_pcm_delete'"); + exit(EXIT_FAILURE); + } + + pv_status_t (*pv_orca_stream_open_func)( + pv_orca_t *, + const pv_orca_synthesize_params_t *, + pv_orca_stream_t **) = load_symbol(orca_library, "pv_orca_stream_open"); + if (!pv_orca_stream_open_func) { + print_dl_error("Failed to load 'pv_orca_stream_open'"); + exit(EXIT_FAILURE); + } + + pv_status_t (*pv_orca_stream_synthesize_func)( + pv_orca_stream_t *, + const char *, + int32_t *, + int16_t **) = load_symbol(orca_library, "pv_orca_stream_synthesize"); + if (!pv_orca_stream_synthesize_func) { + print_dl_error("Failed to load 'pv_orca_stream_synthesize'"); + exit(EXIT_FAILURE); + } + + pv_status_t (*pv_orca_stream_flush_func)( + pv_orca_stream_t *, + int32_t *, + int16_t **) = load_symbol(orca_library, "pv_orca_stream_flush"); + if (!pv_orca_stream_flush_func) { + 
print_dl_error("Failed to load 'pv_orca_stream_flush'"); + exit(EXIT_FAILURE); + } + + void (*pv_orca_stream_close_func)(pv_orca_stream_t *) = load_symbol(orca_library, "pv_orca_stream_close"); + if (!pv_orca_stream_close_func) { + print_dl_error("Failed to load 'pv_orca_stream_close'"); + exit(EXIT_FAILURE); + } + + const char *(*pv_orca_version_func)() = load_symbol(orca_library, "pv_orca_version"); + if (!pv_orca_version_func) { + print_dl_error("Failed to load 'pv_orca_version'"); + exit(EXIT_FAILURE); + } + + pv_status_t (*pv_get_error_stack_func)(char ***, int32_t *) = load_symbol(orca_library, "pv_get_error_stack"); + if (!pv_get_error_stack_func) { + print_dl_error("Failed to load 'pv_get_error_stack'"); + exit(EXIT_FAILURE); + } + + void (*pv_free_error_stack_func)(char **) = load_symbol(orca_library, "pv_free_error_stack"); + if (!pv_free_error_stack_func) { + print_dl_error("Failed to load 'pv_free_error_stack'"); + exit(EXIT_FAILURE); + } + + char **message_stack = NULL; + int32_t message_stack_depth = 0; + + fprintf(stdout, "Orca version: %s\n\n", pv_orca_version_func()); + + double time_before_init = get_time(); + + pv_orca_t *orca = NULL; + pv_status_t orca_status = pv_orca_init_func(access_key, model_path, &orca); + if (orca_status != PV_STATUS_SUCCESS) { + fprintf(stderr, "Failed to create an instance of Orca with `%s`", pv_status_to_string_func(orca_status)); + handle_error( + message_stack, + message_stack_depth, + pv_get_error_stack_func, + pv_free_error_stack_func, + pv_status_to_string_func); + exit(EXIT_FAILURE); + } + + double init_sec = get_time() - time_before_init; + fprintf(stdout, "Initialized Orca in %.1f sec\n", init_sec); + + int32_t sample_rate = 0; + pv_status_t status = pv_orca_sample_rate_func(orca, &sample_rate); + if (status != PV_STATUS_SUCCESS) { + fprintf(stderr, "Failed to get Orca sample rate with `%s`", pv_status_to_string_func(status)); + handle_error( + message_stack, + message_stack_depth, + pv_get_error_stack_func, + pv_free_error_stack_func, + pv_status_to_string_func); + exit(EXIT_FAILURE); + } + + drwav_data_format format; + format.container = drwav_container_riff; + format.format = DR_WAVE_FORMAT_PCM; + format.channels = 1; + format.sampleRate = sample_rate; + format.bitsPerSample = 16; + + drwav output_file; + +#if defined(_WIN32) || defined(_WIN64) + + int output_path_wchars_num = MultiByteToWideChar(CP_UTF8, UTF8_COMPOSITION_FLAG, output_path, NULL_TERMINATED, NULL, 0); + wchar_t output_path_w[output_path_wchars_num]; + MultiByteToWideChar(CP_UTF8, UTF8_COMPOSITION_FLAG, output_path, NULL_TERMINATED, output_path_w, output_path_wchars_num); + unsigned int drwav_init_file_status = drwav_init_file_write_w(&output_file, output_path_w, &format, NULL); + +#else + + unsigned int drwav_init_file_status = drwav_init_file_write(&output_file, output_path, &format, NULL); + +#endif + + if (!drwav_init_file_status) { + fprintf(stderr, "Failed to open the output wav file at '%s'.", output_path); + exit(EXIT_FAILURE); + } + + pv_orca_synthesize_params_t *synthesize_params = NULL; + pv_status_t synthesize_params_status = pv_orca_synthesize_params_init_func(&synthesize_params); + if (synthesize_params_status != PV_STATUS_SUCCESS) { + fprintf( + stderr, + "Failed to create an instance of Orca synthesize params with `%s`", + pv_status_to_string_func(synthesize_params_status)); + handle_error( + message_stack, + message_stack_depth, + pv_get_error_stack_func, + pv_free_error_stack_func, + pv_status_to_string_func); + exit(EXIT_FAILURE); + } + + 
fprintf(stdout, "\nSynthesizing text `%s` \n", text); + + int32_t num_samples_chunks[MAX_NUM_CHUNKS] = {0}; + double start_chunks[MAX_NUM_CHUNKS] = {0}; + start_chunks[0] = get_time(); + double end_chunks[MAX_NUM_CHUNKS] = {0}; + int32_t num_chunks = 0; + + pcm_chunk_t *pcm_chunk_prev = NULL; + pcm_chunk_t *pcm_chunk_head = NULL; + + pv_orca_stream_t *orca_stream = NULL; + pv_status_t stream_open_status = pv_orca_stream_open_func(orca, synthesize_params, &orca_stream); + if (stream_open_status != PV_STATUS_SUCCESS) { + fprintf(stderr, "Error opening stream"); + handle_error( + message_stack, + message_stack_depth, + pv_get_error_stack_func, + pv_free_error_stack_func, + pv_status_to_string_func); + exit(EXIT_FAILURE); + } + + char character[MAX_NUM_BYTES_PER_CHARACTER] = {0}; + for (int32_t i = 0; i < (int32_t) strlen(text); i++) { + if (num_chunks > (MAX_NUM_CHUNKS - 1)) { + fprintf(stderr, "Trying to synthesize too many chunks. Only `%d` chunks are supported.\n", MAX_NUM_CHUNKS); + exit(EXIT_FAILURE); + } + + int32_t num_bytes = 0; + status = num_bytes_character((unsigned char) text[i], &num_bytes); + if (status != PV_STATUS_SUCCESS) { + fprintf(stderr, "Error getting number of bytes for character: `%c`", text[i]); + exit(EXIT_FAILURE); + } + + for (int32_t j = 0; j < num_bytes; j++) { + character[j] = text[i + j]; + } + character[num_bytes] = '\0'; + + int32_t num_samples_chunk = 0; + int16_t *pcm_chunk = NULL; + status = pv_orca_stream_synthesize_func(orca_stream, character, &num_samples_chunk, &pcm_chunk); + if (status != PV_STATUS_SUCCESS) { + fprintf(stderr, "Error adding token: `%s`", character); + handle_error( + message_stack, + message_stack_depth, + pv_get_error_stack_func, + pv_free_error_stack_func, + pv_status_to_string_func); + exit(EXIT_FAILURE); + } + + if (num_samples_chunk > 0) { + if (pcm_chunk_prev == NULL) { + pcm_chunk_init(num_samples_chunk, pcm_chunk, &pcm_chunk_prev); + pcm_chunk_head = pcm_chunk_prev; + } else { + pcm_chunk_init(num_samples_chunk, pcm_chunk, &(pcm_chunk_prev->next)); + pcm_chunk_prev = pcm_chunk_prev->next; + } + + double timestamp = get_time(); + num_samples_chunks[num_chunks] = num_samples_chunk; + end_chunks[num_chunks++] = timestamp; + start_chunks[num_chunks] = timestamp; + } + } + + int32_t num_samples_chunk = 0; + int16_t *pcm_chunk = NULL; + status = pv_orca_stream_flush_func(orca_stream, &num_samples_chunk, &pcm_chunk); + if (status != PV_STATUS_SUCCESS) { + fprintf(stderr, "Error flushing Orca stream"); + handle_error( + message_stack, + message_stack_depth, + pv_get_error_stack_func, + pv_free_error_stack_func, + pv_status_to_string_func); + exit(EXIT_FAILURE); + } + + if (num_samples_chunk > 0) { + if (pcm_chunk_prev == NULL) { + pcm_chunk_init(num_samples_chunk, pcm_chunk, &pcm_chunk_prev); + pcm_chunk_head = pcm_chunk_prev; + } else { + pcm_chunk_init(num_samples_chunk, pcm_chunk, &(pcm_chunk_prev->next)); + } + + double timestamp = get_time(); + num_samples_chunks[num_chunks] = num_samples_chunk; + end_chunks[num_chunks++] = timestamp; + start_chunks[num_chunks] = timestamp; + } + + pv_orca_stream_close_func(orca_stream); + pv_orca_synthesize_params_delete_func(synthesize_params); + pv_orca_delete_func(orca); + + int32_t num_samples = 0; + pcm_chunk_t *pcm_chunk_iter = pcm_chunk_head; + while (pcm_chunk_iter != NULL) { + num_samples += pcm_chunk_iter->num_samples; + pcm_chunk_iter = pcm_chunk_iter->next; + } + + int16_t *pcm = malloc(num_samples * sizeof(int16_t)); + int32_t offset = 0; + pcm_chunk_iter = pcm_chunk_head; + while 
(pcm_chunk_iter != NULL) { + memcpy(&pcm[offset], pcm_chunk_iter->pcm, pcm_chunk_iter->num_samples * sizeof(int16_t)); + offset += pcm_chunk_iter->num_samples; + pcm_chunk_iter = pcm_chunk_iter->next; + } + + pcm_chunk_iter = pcm_chunk_head; + while (pcm_chunk_iter != NULL) { + pcm_chunk_t *tmp = pcm_chunk_iter; + pcm_chunk_iter = pcm_chunk_iter->next; + pcm_chunk_delete(tmp); + } + + if ((int32_t) drwav_write_pcm_frames(&output_file, num_samples, pcm) != num_samples) { + fprintf(stderr, "Failed to write to output file.\n"); + exit(EXIT_FAILURE); + } + + drwav_uninit(&output_file); + free(pcm); + + fprintf( + stdout, + "\nGenerated %d audio chunk%s in %.2f seconds.\n", + num_chunks, num_chunks == 1 ? "" : "s", + end_chunks[num_chunks - 1] - start_chunks[0]); + + for (int32_t i = 0; i < num_chunks; i++) { + float num_seconds = (float) num_samples_chunks[i] / (float) sample_rate; + double process_time = end_chunks[i] - start_chunks[i]; + fprintf( + stdout, + "Audio chunk #%d: length: %.2f s, processing time %.2f s\n", + i, + num_seconds, + process_time); + } + + fprintf(stdout, "\nSaved final audio to `%s`\n", output_path); + + close_dl(orca_library); + + return EXIT_SUCCESS; +} + +int32_t main(int argc, char *argv[]) { + +#if defined(_WIN32) || defined(_WIN64) + +#define UTF8_COMPOSITION_FLAG (0) +#define NULL_TERMINATED (-1) + + LPWSTR *wargv = CommandLineToArgvW(GetCommandLineW(), &argc); + if (wargv == NULL) { + fprintf(stderr, "CommandLineToArgvW failed\n"); + exit(1); + } + + char *utf8_argv[argc]; + + for (int32_t i = 0; i < argc; ++i) { + // WideCharToMultiByte: + // https://docs.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-widechartomultibyte + int arg_chars_num = + WideCharToMultiByte(CP_UTF8, UTF8_COMPOSITION_FLAG, wargv[i], NULL_TERMINATED, NULL, 0, NULL, NULL); + utf8_argv[i] = (char *) malloc(arg_chars_num * sizeof(char)); + if (!utf8_argv[i]) { + fprintf(stderr, "failed to to allocate memory for converting args"); + } + WideCharToMultiByte(CP_UTF8, UTF8_COMPOSITION_FLAG, wargv[i], NULL_TERMINATED, utf8_argv[i], arg_chars_num, NULL, NULL); + } + + LocalFree(wargv); + argv = utf8_argv; + +#endif + + int result = picovoice_main(argc, argv); + +#if defined(_WIN32) || defined(_WIN64) + + for (int i = 0; i < argc; ++i) { + free(utf8_argv[i]); + } + +#endif + + return result; +} diff --git a/demo/c/test/test_orca_c.py b/demo/c/test/test_orca_c.py index 045a951e..c88728eb 100644 --- a/demo/c/test/test_orca_c.py +++ b/demo/c/test/test_orca_c.py @@ -10,21 +10,29 @@ # import os.path +import platform as pltf import subprocess import sys import unittest from test_util import get_model_paths, get_test_data -test_sentences = get_test_data() +test_data = get_test_data() class OrcaCTestCase(unittest.TestCase): @classmethod def setUpClass(cls): cls._access_key = sys.argv[1] - cls._platform = sys.argv[2] - cls._arch = "" if len(sys.argv) != 4 else sys.argv[3] + platform = sys.argv[2] + if platform == "mac": + if pltf.machine() == "x86_64": + cls._arch = "x86_64" + elif pltf.machine() == "arm64": + cls._arch = "arm64" + else: + cls._arch = "" if len(sys.argv) != 4 else sys.argv[3] + cls._platform = platform cls._root_dir = os.path.join(os.path.dirname(__file__), "../../..") @staticmethod @@ -52,18 +60,43 @@ def run_orca(self, model_path: str) -> None: "-a", self._access_key, "-l", self._get_library_file(), "-m", model_path, - "-t", test_sentences.text, + "-t", test_data.text, "-o", output_path, ] process = subprocess.Popen(args, stderr=subprocess.PIPE, 
stdout=subprocess.PIPE) stdout, stderr = process.communicate() - self.assertEqual(process.poll(), 0) + poll_result = process.poll() + if poll_result != 0: + print(stdout.decode('utf-8')) + print(stderr.decode('utf-8')) + raise RuntimeError("Error running demo. See details above") + + self.assertEqual(poll_result, 0) self.assertEqual(stderr.decode('utf-8'), '') self.assertTrue("Saved audio" in stdout.decode('utf-8')) os.remove(output_path) + def run_orca_streaming(self, model_path: str) -> None: + output_path = os.path.join(os.path.dirname(__file__), "output.wav") + args = [ + os.path.join(os.path.dirname(__file__), "../build/orca_demo_streaming"), + "-a", self._access_key, + "-l", self._get_library_file(), + "-m", model_path, + "-t", test_data.text, + "-o", output_path, + ] + + process = subprocess.Popen(args, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + stdout, stderr = process.communicate() + + self.assertEqual(process.poll(), 0) + self.assertEqual(stderr.decode('utf-8'), '') + self.assertTrue("Saved final audio" in stdout.decode('utf-8')) + os.remove(output_path) + def test_orca(self) -> None: for model_path in get_model_paths(): self.run_orca(model_path=model_path) diff --git a/demo/c/test/test_util.py b/demo/c/test/test_util.py index 430729e8..21acd5de 100644 --- a/demo/c/test/test_util.py +++ b/demo/c/test/test_util.py @@ -29,6 +29,7 @@ def get_test_data() -> TestSentences: with open(data_file_path, encoding="utf8") as data_file: json_test_data = data_file.read() test_data = json.loads(json_test_data)['test_sentences'] + test_data.pop("text_alignment") return TestSentences(**test_data) diff --git a/demo/ios/OrcaDemo/OrcaDemo.xcodeproj/project.pbxproj b/demo/ios/OrcaDemo/OrcaDemo.xcodeproj/project.pbxproj index cf096214..935a9715 100644 --- a/demo/ios/OrcaDemo/OrcaDemo.xcodeproj/project.pbxproj +++ b/demo/ios/OrcaDemo/OrcaDemo.xcodeproj/project.pbxproj @@ -13,7 +13,9 @@ 02A1195F268D3FD600A2AC99 /* ViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 02A1195E268D3FD600A2AC99 /* ViewModel.swift */; }; 1E001B682B76FFE700D8E72D /* AudioPlayer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1E001B672B76FFE700D8E72D /* AudioPlayer.swift */; }; 1E001B6A2B7D451200D8E72D /* orca_params_female.pv in Resources */ = {isa = PBXBuildFile; fileRef = 1E001B692B7D451200D8E72D /* orca_params_female.pv */; }; - 4A00B7EF2D4C1FA9D1C474E1 /* libPods-OrcaDemo.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 7C8FD0A21A7AA22B5EB1EB2D /* libPods-OrcaDemo.a */; }; + B218600C461D96EA568B6D6C /* libPods-OrcaDemo.a in Frameworks */ = {isa = PBXBuildFile; fileRef = A9E91B80C84BF594FCF1FCBD /* libPods-OrcaDemo.a */; }; + E125E1892BE99DCA008B6D56 /* AtomicBool.swift in Sources */ = {isa = PBXBuildFile; fileRef = E125E1882BE99DCA008B6D56 /* AtomicBool.swift */; }; + E1C5A45F2BE587A2002C0C40 /* AudioPlayerStream.swift in Sources */ = {isa = PBXBuildFile; fileRef = E1C5A45E2BE587A2002C0C40 /* AudioPlayerStream.swift */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ @@ -25,12 +27,11 @@ 02A1195E268D3FD600A2AC99 /* ViewModel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ViewModel.swift; sourceTree = ""; }; 1E001B672B76FFE700D8E72D /* AudioPlayer.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AudioPlayer.swift; sourceTree = ""; }; 1E001B692B7D451200D8E72D /* orca_params_female.pv */ = {isa = PBXFileReference; lastKnownFileType = file; name = orca_params_female.pv; path = 
../../../../lib/common/orca_params_female.pv; sourceTree = ""; }; - 25220F02E797CC78BF7E6619 /* libPods-OrcaDemoUITests.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-OrcaDemoUITests.a"; sourceTree = BUILT_PRODUCTS_DIR; }; - 544345CBBDA09211F4620F3E /* Pods-OrcaDemo.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaDemo.release.xcconfig"; path = "Target Support Files/Pods-OrcaDemo/Pods-OrcaDemo.release.xcconfig"; sourceTree = ""; }; - 72F8162D9843C0A1C546BE64 /* Pods-OrcaDemoUITests.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaDemoUITests.debug.xcconfig"; path = "Target Support Files/Pods-OrcaDemoUITests/Pods-OrcaDemoUITests.debug.xcconfig"; sourceTree = ""; }; - 7C8FD0A21A7AA22B5EB1EB2D /* libPods-OrcaDemo.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-OrcaDemo.a"; sourceTree = BUILT_PRODUCTS_DIR; }; - 8DC160B174C3E4AE3F56942D /* Pods-OrcaDemo.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaDemo.debug.xcconfig"; path = "Target Support Files/Pods-OrcaDemo/Pods-OrcaDemo.debug.xcconfig"; sourceTree = ""; }; - D2D9DCA10D9D1AB213098AEF /* Pods-OrcaDemoUITests.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaDemoUITests.release.xcconfig"; path = "Target Support Files/Pods-OrcaDemoUITests/Pods-OrcaDemoUITests.release.xcconfig"; sourceTree = ""; }; + 2C3AE1B63A5DD37711F6DD7E /* Pods-OrcaDemo.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaDemo.debug.xcconfig"; path = "Target Support Files/Pods-OrcaDemo/Pods-OrcaDemo.debug.xcconfig"; sourceTree = ""; }; + 97762F0F3B18F16DC68C5D67 /* Pods-OrcaDemo.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaDemo.release.xcconfig"; path = "Target Support Files/Pods-OrcaDemo/Pods-OrcaDemo.release.xcconfig"; sourceTree = ""; }; + A9E91B80C84BF594FCF1FCBD /* libPods-OrcaDemo.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-OrcaDemo.a"; sourceTree = BUILT_PRODUCTS_DIR; }; + E125E1882BE99DCA008B6D56 /* AtomicBool.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AtomicBool.swift; sourceTree = ""; }; + E1C5A45E2BE587A2002C0C40 /* AudioPlayerStream.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AudioPlayerStream.swift; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -38,7 +39,7 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( - 4A00B7EF2D4C1FA9D1C474E1 /* libPods-OrcaDemo.a in Frameworks */, + B218600C461D96EA568B6D6C /* libPods-OrcaDemo.a in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -50,8 +51,8 @@ children = ( 02A11947268D39A700A2AC99 /* OrcaDemo */, 02A11946268D39A700A2AC99 /* Products */, - 02A11957268D39D100A2AC99 /* Frameworks */, 8DB92FF3DC81AB04D3FF7242 /* Pods */, + 4374BA75AB06EC0D059377CD /* Frameworks */, ); sourceTree = ""; }; @@ -73,15 +74,16 @@ 02A11951268D39AB00A2AC99 /* Info.plist */, 02A1195E268D3FD600A2AC99 /* ViewModel.swift */, 1E001B672B76FFE700D8E72D /* AudioPlayer.swift */, + E1C5A45E2BE587A2002C0C40 /* 
AudioPlayerStream.swift */, + E125E1882BE99DCA008B6D56 /* AtomicBool.swift */, ); path = OrcaDemo; sourceTree = ""; }; - 02A11957268D39D100A2AC99 /* Frameworks */ = { + 4374BA75AB06EC0D059377CD /* Frameworks */ = { isa = PBXGroup; children = ( - 7C8FD0A21A7AA22B5EB1EB2D /* libPods-OrcaDemo.a */, - 25220F02E797CC78BF7E6619 /* libPods-OrcaDemoUITests.a */, + A9E91B80C84BF594FCF1FCBD /* libPods-OrcaDemo.a */, ); name = Frameworks; sourceTree = ""; @@ -89,10 +91,8 @@ 8DB92FF3DC81AB04D3FF7242 /* Pods */ = { isa = PBXGroup; children = ( - 8DC160B174C3E4AE3F56942D /* Pods-OrcaDemo.debug.xcconfig */, - 544345CBBDA09211F4620F3E /* Pods-OrcaDemo.release.xcconfig */, - 72F8162D9843C0A1C546BE64 /* Pods-OrcaDemoUITests.debug.xcconfig */, - D2D9DCA10D9D1AB213098AEF /* Pods-OrcaDemoUITests.release.xcconfig */, + 2C3AE1B63A5DD37711F6DD7E /* Pods-OrcaDemo.debug.xcconfig */, + 97762F0F3B18F16DC68C5D67 /* Pods-OrcaDemo.release.xcconfig */, ); path = Pods; sourceTree = ""; @@ -104,11 +104,11 @@ isa = PBXNativeTarget; buildConfigurationList = 02A11954268D39AB00A2AC99 /* Build configuration list for PBXNativeTarget "OrcaDemo" */; buildPhases = ( - 9E7C8E83BA330F7017CD5C56 /* [CP] Check Pods Manifest.lock */, + E5EA3B129B59D3DF4752D82D /* [CP] Check Pods Manifest.lock */, 02A11941268D39A700A2AC99 /* Sources */, 02A11942268D39A700A2AC99 /* Frameworks */, 02A11943268D39A700A2AC99 /* Resources */, - B387D574F16D312B6FFB5B42 /* [CP] Embed Pods Frameworks */, + E85A144184F1D605DB772089 /* [CP] Embed Pods Frameworks */, ); buildRules = ( ); @@ -164,7 +164,7 @@ /* End PBXResourcesBuildPhase section */ /* Begin PBXShellScriptBuildPhase section */ - 9E7C8E83BA330F7017CD5C56 /* [CP] Check Pods Manifest.lock */ = { + E5EA3B129B59D3DF4752D82D /* [CP] Check Pods Manifest.lock */ = { isa = PBXShellScriptBuildPhase; buildActionMask = 2147483647; files = ( @@ -186,7 +186,7 @@ shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n # print error to STDERR\n echo \"error: The sandbox is not in sync with the Podfile.lock. 
Run 'pod install' or update your CocoaPods installation.\" >&2\n exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n"; showEnvVarsInLog = 0; }; - B387D574F16D312B6FFB5B42 /* [CP] Embed Pods Frameworks */ = { + E85A144184F1D605DB772089 /* [CP] Embed Pods Frameworks */ = { isa = PBXShellScriptBuildPhase; buildActionMask = 2147483647; files = ( @@ -210,6 +210,8 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + E125E1892BE99DCA008B6D56 /* AtomicBool.swift in Sources */, + E1C5A45F2BE587A2002C0C40 /* AudioPlayerStream.swift in Sources */, 02A1194B268D39A700A2AC99 /* ContentView.swift in Sources */, 02A1195F268D3FD600A2AC99 /* ViewModel.swift in Sources */, 02A11949268D39A700A2AC99 /* OrcaDemoApp.swift in Sources */, @@ -338,13 +340,13 @@ }; 02A11955268D39AB00A2AC99 /* Debug */ = { isa = XCBuildConfiguration; - baseConfigurationReference = 8DC160B174C3E4AE3F56942D /* Pods-OrcaDemo.debug.xcconfig */; + baseConfigurationReference = 2C3AE1B63A5DD37711F6DD7E /* Pods-OrcaDemo.debug.xcconfig */; buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CODE_SIGN_STYLE = Automatic; DEVELOPMENT_ASSET_PATHS = ""; - DEVELOPMENT_TEAM = 65723695GD; + DEVELOPMENT_TEAM = Y6S42VUYBV; ENABLE_PREVIEWS = YES; INFOPLIST_FILE = OrcaDemo/Info.plist; IPHONEOS_DEPLOYMENT_TARGET = 14.0; @@ -352,7 +354,7 @@ "$(inherited)", "@executable_path/Frameworks", ); - PRODUCT_BUNDLE_IDENTIFIER = ai.picovoice.OrcaDemo; + PRODUCT_BUNDLE_IDENTIFIER = ai.picovoice.OrcaDemo.albert; PRODUCT_NAME = "$(TARGET_NAME)"; SWIFT_VERSION = 5.0; TARGETED_DEVICE_FAMILY = "1,2"; @@ -361,13 +363,13 @@ }; 02A11956268D39AB00A2AC99 /* Release */ = { isa = XCBuildConfiguration; - baseConfigurationReference = 544345CBBDA09211F4620F3E /* Pods-OrcaDemo.release.xcconfig */; + baseConfigurationReference = 97762F0F3B18F16DC68C5D67 /* Pods-OrcaDemo.release.xcconfig */; buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CODE_SIGN_STYLE = Automatic; DEVELOPMENT_ASSET_PATHS = ""; - DEVELOPMENT_TEAM = 65723695GD; + DEVELOPMENT_TEAM = Y6S42VUYBV; ENABLE_PREVIEWS = YES; INFOPLIST_FILE = OrcaDemo/Info.plist; IPHONEOS_DEPLOYMENT_TARGET = 14.0; @@ -375,7 +377,7 @@ "$(inherited)", "@executable_path/Frameworks", ); - PRODUCT_BUNDLE_IDENTIFIER = ai.picovoice.OrcaDemo; + PRODUCT_BUNDLE_IDENTIFIER = ai.picovoice.OrcaDemo.albert; PRODUCT_NAME = "$(TARGET_NAME)"; SWIFT_VERSION = 5.0; TARGETED_DEVICE_FAMILY = "1,2"; diff --git a/demo/ios/OrcaDemo/OrcaDemo/AtomicBool.swift b/demo/ios/OrcaDemo/OrcaDemo/AtomicBool.swift new file mode 100644 index 00000000..f20c3e98 --- /dev/null +++ b/demo/ios/OrcaDemo/OrcaDemo/AtomicBool.swift @@ -0,0 +1,22 @@ +import Foundation + +class AtomicBool { + private var value: Bool + private let lock = NSLock() + + init(_ value: Bool = false) { + self.value = value + } + + func set(_ newValue: Bool) { + lock.lock() + value = newValue + lock.unlock() + } + + func get() -> Bool { + lock.lock() + defer { lock.unlock() } + return value + } +} diff --git a/demo/ios/OrcaDemo/OrcaDemo/AudioPlayerStream.swift b/demo/ios/OrcaDemo/OrcaDemo/AudioPlayerStream.swift new file mode 100644 index 00000000..f2860eaf --- /dev/null +++ b/demo/ios/OrcaDemo/OrcaDemo/AudioPlayerStream.swift @@ -0,0 +1,71 @@ +import Foundation +import AVFoundation + +class AudioPlayerStream { + private let engine = AVAudioEngine() + 
private let playerNode = AVAudioPlayerNode() + private let mixerNode = AVAudioMixerNode() + + private var pcmBuffers = [[Int16]]() + private var isPlaying = false + + init(sampleRate: Double) throws { + let audioSession = AVAudioSession.sharedInstance() + try audioSession.setCategory(.playback, mode: .default) + try audioSession.setActive(true) + + let format = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: sampleRate, + channels: AVAudioChannelCount(1), + interleaved: false) + + engine.attach(mixerNode) + engine.connect(mixerNode, to: engine.outputNode, format: format) + + engine.attach(playerNode) + engine.connect(playerNode, to: mixerNode, format: format) + + try engine.start() + } + + func playStreamPCM(_ pcmData: [Int16], completion: @escaping (Bool) -> Void) { + pcmBuffers.append(pcmData) + if !isPlaying { + playNextPCMBuffer(completion: completion) + } else { + completion(true) + } + } + + private func playNextPCMBuffer(completion: @escaping (Bool) -> Void) { + guard let pcmData = pcmBuffers.first, !pcmData.isEmpty else { + isPlaying = false + completion(false) + return + } + pcmBuffers.removeFirst() + + let audioBuffer = AVAudioPCMBuffer( + pcmFormat: playerNode.outputFormat(forBus: 0), frameCapacity: AVAudioFrameCount(pcmData.count))! + + audioBuffer.frameLength = audioBuffer.frameCapacity + let buf = audioBuffer.floatChannelData![0] + for (index, sample) in pcmData.enumerated() { + buf[index] = Float32(sample) / Float32(Int16.max) + } + + playerNode.scheduleBuffer(audioBuffer) { [weak self] in + self?.playNextPCMBuffer(completion: completion) + } + + playerNode.play() + isPlaying = true + completion(true) + } + + func stopStreamPCM() { + playerNode.stop() + engine.stop() + } +} diff --git a/demo/ios/OrcaDemo/OrcaDemo/ContentView.swift b/demo/ios/OrcaDemo/OrcaDemo/ContentView.swift index 01453210..cb53f8bf 100644 --- a/demo/ios/OrcaDemo/OrcaDemo/ContentView.swift +++ b/demo/ios/OrcaDemo/OrcaDemo/ContentView.swift @@ -19,47 +19,95 @@ struct ContentView: View { let lightGray = Color(red: 247 / 255, green: 247 / 255, blue: 247 / 255, opacity: 1) var body: some View { + let streamingMode = viewModel.state == .STREAM_OPEN || viewModel.state == .STREAM_PLAYING let interactionDisabled = - !viewModel.errorMessage.isEmpty || viewModel.state == UIState.PROCESSING - || viewModel.state == UIState.INIT || text.isEmpty || !viewModel.invalidTextMessage.isEmpty + !viewModel.errorMessage.isEmpty || viewModel.state == .PROCESSING + || viewModel.state == .INIT || (!streamingMode && !viewModel.invalidTextMessage.isEmpty) + let toggleDisabled = interactionDisabled || viewModel.state == .STREAM_PLAYING + let buttonDisabled = toggleDisabled || text.isEmpty + GeometryReader { _ in VStack(spacing: 10) { - GeometryReader { geometry in - VStack { - ScrollView { - ZStack(alignment: .topLeading) { - TextEditor(text: $text) - .transparentScrolling() - .padding() - .frame(minWidth: 0, - maxWidth: .infinity, - minHeight: geometry.size.height, - maxHeight: .infinity) - .font(.title3) - .background(lightGray) - .onChange(of: text) { _ in - text = String(text.prefix(Int(viewModel.maxCharacterLimit))) - viewModel.isValid(text: text) - } + Toggle( + isOn: Binding( + get: { streamingMode }, + set: { _ in viewModel.toggleStreaming() } + ), + label: { Text("Streaming Synthesis") } + ) + .disabled(toggleDisabled) + .onChange(of: streamingMode) { _ in text = "" } + .foregroundColor(Color.black) - if text.count == 0 { - Text("Enter any text to be synthesized") - .padding(25) + if viewModel.state == 
.STREAM_PLAYING { + GeometryReader { geometry in + ScrollView { + Text(viewModel.textStream) + .transparentScrolling() + .padding() + .frame(minWidth: 0, + maxWidth: .infinity, + minHeight: geometry.size.height, + maxHeight: .infinity, + alignment: .topLeading) + .font(.title3) + .background(lightGray) + .foregroundColor(Color.black) + } + } + } else { + GeometryReader { geometry in + VStack { + ScrollView { + ZStack(alignment: .topLeading) { + TextEditor(text: $text) + .transparentScrolling() + .padding() + .frame(minWidth: 0, + maxWidth: .infinity, + minHeight: geometry.size.height, + maxHeight: .infinity) .font(.title3) - .foregroundColor(Color.gray) + .background(lightGray) + .foregroundColor(Color.black) + .onChange(of: text) { newValue in + let updatedText = String( + newValue.prefix(Int(exactly: viewModel.maxCharacterLimit)!)) + text = updatedText.replacingOccurrences(of: "’", with: "'") + viewModel.isValid(text: text) + } + .disabled(viewModel.state == .PLAYING) + + if text.count == 0 { + Text("Enter any text to be synthesized") + .padding(25) + .font(.title3) + .foregroundColor(Color.gray) + } } } + Text("\(text.count) / \(viewModel.maxCharacterLimit)") + .font(.footnote) + .frame(maxWidth: .infinity, alignment: .trailing) + .foregroundColor(Color.gray) } - - Text("\(text.count) / \(viewModel.maxCharacterLimit)") - .font(.footnote) - .frame(maxWidth: .infinity, alignment: .trailing) - .foregroundColor(Color.gray) } } - if viewModel.state == .INIT || viewModel.state == .READY { + if streamingMode { + if viewModel.state == .STREAM_OPEN && !viewModel.streamInvalidTextMessage.isEmpty { + Text(viewModel.streamInvalidTextMessage) + .padding() + .font(.body) + .foregroundColor(Color.gray) + } else { + Text(viewModel.streamHelperText) + .padding() + .font(.body) + .foregroundColor(Color.black) + } + } else if viewModel.state == .INIT || viewModel.state == .READY { if viewModel.invalidTextMessage.isEmpty { Text("Enter text and press synthesize") .padding() @@ -100,12 +148,12 @@ struct ContentView: View { Text(viewModel.state == .PLAYING ? "Stop" : "Synthesize") .padding() .frame(minWidth: 200) - .background(interactionDisabled ? Color.gray : activeBlue) + .background(buttonDisabled ? Color.gray : activeBlue) .foregroundColor(Color.white) .font(.largeTitle) } ) - .disabled(interactionDisabled) + .disabled(buttonDisabled) } .onReceive( NotificationCenter.default.publisher( @@ -127,7 +175,6 @@ struct ContentView: View { .onTapGesture { hideKeyboard() } - } } } diff --git a/demo/ios/OrcaDemo/OrcaDemo/ViewModel.swift b/demo/ios/OrcaDemo/OrcaDemo/ViewModel.swift index 4e0b4480..1b9ac382 100644 --- a/demo/ios/OrcaDemo/OrcaDemo/ViewModel.swift +++ b/demo/ios/OrcaDemo/OrcaDemo/ViewModel.swift @@ -17,24 +17,34 @@ enum UIState { case PROCESSING case SYNTHESIZED case PLAYING + case STREAM_OPEN + case STREAM_PLAYING case ERROR } class ViewModel: ObservableObject { private let ACCESS_KEY = "{YOUR_ACCESS_KEY_HERE}" // Obtained from Picovoice Console (https://console.picovoice.ai) + private let NUM_AUDIO_WAIT_CHUNKS = 1 + private var orca: Orca! + private var orcaStream: Orca.OrcaStream! private var player: AudioPlayer = AudioPlayer() + private var playerStream: AudioPlayerStream! private var previousText = "" private var subscriptions = Set() private let audioFilePath = "temp.wav" private var audioFile: URL! 
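+    // Published properties observed by ContentView; the stream-specific fields below
+    // (textStream, streamHelperText, streamInvalidTextMessage) back the streaming synthesis UI.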
- @Published var errorMessage = "" @Published var state = UIState.INIT - @Published var maxCharacterLimit = Orca.maxCharacterLimit + @Published var sampleRate: Int32 = 0 + @Published var maxCharacterLimit: Int32 = 0 + @Published var textStream = "" + @Published var streamHelperText = "" + @Published var errorMessage = "" @Published var invalidTextMessage = "" + @Published var streamInvalidTextMessage = "" init() { initialize() @@ -44,6 +54,8 @@ class ViewModel: ObservableObject { state = UIState.INIT do { try orca = Orca(accessKey: ACCESS_KEY, modelPath: "orca_params_female.pv") + maxCharacterLimit = orca.maxCharacterLimit! + sampleRate = orca.sampleRate! state = UIState.READY let audioDir = try FileManager.default.url( @@ -73,7 +85,32 @@ class ViewModel: ObservableObject { orca.delete() } + public func toggleStreaming() { + if state == UIState.READY || state == UIState.STREAM_OPEN { + if orcaStream == nil { + do { + self.textStream = "" + self.streamHelperText = "Enter text and press synthesize" + orcaStream = try orca.streamOpen() + self.state = UIState.STREAM_OPEN + } catch { + self.errorMessage = "\(error.localizedDescription)" + self.state = UIState.ERROR + } + } else { + orcaStream.close() + orcaStream = nil + self.state = UIState.READY + } + } + } + public func toggleSynthesize(text: String) { + if state == UIState.STREAM_OPEN { + runStreamSynthesis(text: text) + return + } + if state == UIState.PLAYING { toggleSynthesizeOff() } else { @@ -81,6 +118,181 @@ class ViewModel: ObservableObject { } } + private func runStreamSynthesis(text: String) { + self.textStream = "" + self.state = UIState.STREAM_PLAYING + + do { + playerStream = try AudioPlayerStream(sampleRate: Double(self.sampleRate)) + } catch { + self.errorMessage = "\(error.localizedDescription)" + self.state = UIState.ERROR + } + + let textStreamQueue = DispatchQueue(label: "text-stream-queue") + let textStreamQueueConcurrent = DispatchQueue(label: "text-stream-queue-concurrent", attributes: .concurrent) + var textStreamArray = [String]() + let isTextStreamQueueActive = AtomicBool(false) + + func isTextStreamEmpty() -> Bool { + return textStreamQueueConcurrent.sync { + textStreamArray.isEmpty + } + } + + func getFromTextStream() -> String? { + var word: String? + textStreamQueueConcurrent.sync { + if !textStreamArray.isEmpty { + word = textStreamArray.removeFirst() + } + } + return word + } + + func addToTextStream(word: String) { + textStreamQueueConcurrent.async(flags: .barrier) { + textStreamArray.append(word) + } + } + + let pcmStreamQueue = DispatchQueue(label: "pcm-stream-queue") + let pcmStreamQueueConcurrent = DispatchQueue(label: "pcm-stream-queue-concurrent", attributes: .concurrent) + var pcmStreamArray = [[Int16]]() + let isPcmStreamQueueActive = AtomicBool(false) + + func isPcmStreamEmpty() -> Bool { + return pcmStreamQueueConcurrent.sync { + pcmStreamArray.isEmpty + } + } + + func getFromPcmStream() -> [Int16]? { + var pcm: [Int16]? 
+ pcmStreamQueueConcurrent.sync { + if !pcmStreamArray.isEmpty { + pcm = pcmStreamArray.removeFirst() + } + } + return pcm + } + + func addToPcmStream(pcm: [Int16]) { + pcmStreamQueueConcurrent.async(flags: .barrier) { + pcmStreamArray.append(pcm) + } + } + + let playStreamQueue = DispatchQueue(label: "play-stream-queue") + let pcmStreamQueueLatch = DispatchSemaphore(value: 0) + let playStreamQueueLatch = DispatchSemaphore(value: 0) + + func getSecsString(secs: Float) -> String { + return "Seconds of audio synthesized: " + String(format: "%.3f", secs) + "s" + } + + textStreamQueue.async { + isTextStreamQueueActive.set(true) + + var isPcmStreamQueueStarted = false + let words = text.split(separator: " ") + for word in words { + let wordWithSpace = String(word) + " " + addToTextStream(word: wordWithSpace) + if isPcmStreamQueueStarted == false { + pcmStreamQueueLatch.signal() + isPcmStreamQueueStarted = true + } + usleep(100 * 1000) + DispatchQueue.main.async { + self.textStream.append(wordWithSpace) + } + } + + isTextStreamQueueActive.set(false) + } + + pcmStreamQueue.async { + isPcmStreamQueueActive.set(true) + + var audioSynthesizedSecs: Float = 0 + var numIterations = 0 + var isPlayStreamQueueStarted = false + + pcmStreamQueueLatch.wait() + DispatchQueue.main.async { + self.streamHelperText = getSecsString(secs: audioSynthesizedSecs) + } + + while isTextStreamQueueActive.get() || !isTextStreamEmpty() { + if !isTextStreamEmpty() { + do { + let word = getFromTextStream() + if word != nil { + let pcm = try self.orcaStream.synthesize(text: word!) + if pcm != nil { + addToPcmStream(pcm: pcm!) + audioSynthesizedSecs += Float(pcm!.count) / Float(self.sampleRate) + DispatchQueue.main.async { + self.streamHelperText = getSecsString(secs: audioSynthesizedSecs) + } + if numIterations == self.NUM_AUDIO_WAIT_CHUNKS { + playStreamQueueLatch.signal() + isPlayStreamQueueStarted = true + } + numIterations += 1 + } + } + } catch { + DispatchQueue.main.async { + self.errorMessage = "\(error.localizedDescription)" + self.state = UIState.ERROR + } + } + } + } + + do { + let pcm = try self.orcaStream.flush() + if pcm != nil { + addToPcmStream(pcm: pcm!) + audioSynthesizedSecs += Float(pcm!.count) / Float(self.sampleRate) + DispatchQueue.main.async { + self.streamHelperText = getSecsString(secs: audioSynthesizedSecs) + } + if !isPlayStreamQueueStarted { + playStreamQueueLatch.signal() + } + } + } catch { + DispatchQueue.main.async { + self.errorMessage = "\(error.localizedDescription)" + self.state = UIState.ERROR + } + } + + isPcmStreamQueueActive.set(false) + } + + playStreamQueue.async { + playStreamQueueLatch.wait() + + while isPcmStreamQueueActive.get() || !isPcmStreamEmpty() { + if !isPcmStreamEmpty() { + let pcm = getFromPcmStream() + self.playerStream.playStreamPCM(pcm!) { isPlaying in + if !isPlaying { + DispatchQueue.main.async { + self.playerStream.stopStreamPCM() + self.state = UIState.STREAM_OPEN + } + } + } + } + } + } + } + public func toggleSynthesizeOff() { player.stop() state = UIState.READY @@ -127,27 +339,21 @@ class ViewModel: ObservableObject { } public func isValid(text: String) { - do { - let characters = try orca.validCharacters - let regex = try NSRegularExpression( - pattern: "[^\(characters.joined(separator: ""))\\s{}|']", - options: .caseInsensitive) - let range = NSRange(text.startIndex.. 
0 {
-                let characterString = unexpectedCharacters.array.map { "\($0)" }.joined(separator: ", ")
-                self.invalidTextMessage = "Text contains the following invalid characters: `\(characterString)`"
-            } else {
-                self.invalidTextMessage = ""
+        var nonAllowedCharacters = [Character]()
+        for i in 0.. 0 {
+            let characterString = nonAllowedCharacters.map { "\($0)" }.joined(separator: ", ")
+            self.invalidTextMessage = "Text contains the following invalid characters: `\(characterString)`"
+            self.streamInvalidTextMessage = "The following characters will be ignored: `\(characterString)`"
+        } else {
+            self.invalidTextMessage = ""
+            self.streamInvalidTextMessage = ""
+        }
+    }
 }
diff --git a/demo/ios/OrcaDemo/Podfile b/demo/ios/OrcaDemo/Podfile
index b888dd18..e6434527 100644
--- a/demo/ios/OrcaDemo/Podfile
+++ b/demo/ios/OrcaDemo/Podfile
@@ -2,5 +2,5 @@ source 'https://cdn.cocoapods.org/'
 platform :ios, '13.0'
 
 target 'OrcaDemo' do
-  pod 'Orca-iOS', '~> 0.1.0'
+  pod 'Orca-iOS', '~> 0.2.0'
 end
diff --git a/demo/ios/OrcaDemo/Podfile.lock b/demo/ios/OrcaDemo/Podfile.lock
index 9a4803dc..ddb3725e 100644
--- a/demo/ios/OrcaDemo/Podfile.lock
+++ b/demo/ios/OrcaDemo/Podfile.lock
@@ -1,16 +1,16 @@
 PODS:
-  - Orca-iOS (0.1.0)
+  - Orca-iOS (0.2.0)
 
 DEPENDENCIES:
-  - Orca-iOS (~> 0.1.0)
+  - Orca-iOS (~> 0.2.0)
 
 SPEC REPOS:
   trunk:
     - Orca-iOS
 
 SPEC CHECKSUMS:
-  Orca-iOS: 808b4c77678454905ea0a0c1408eff8f9255e3ac
+  Orca-iOS: 01bbf44ba52a102104fc09aded6bfda7beb4865e
 
-PODFILE CHECKSUM: b2d1bae4a022122055b4d7532f81ce24a11ade44
+PODFILE CHECKSUM: 7655658323a426ab9a7ed6f7546e15081a877170
 
-COCOAPODS: 1.11.3
+COCOAPODS: 1.15.2
diff --git a/demo/llm_voice_assistant/.gitignore b/demo/llm_voice_assistant/.gitignore
new file mode 100644
index 00000000..ba0430d2
--- /dev/null
+++ b/demo/llm_voice_assistant/.gitignore
@@ -0,0 +1 @@
+__pycache__/
\ No newline at end of file
diff --git a/demo/llm_voice_assistant/README.md b/demo/llm_voice_assistant/README.md
new file mode 100644
index 00000000..ecfdd9d5
--- /dev/null
+++ b/demo/llm_voice_assistant/README.md
@@ -0,0 +1,45 @@
+# LLM Voice Assistant Demo - Talk to ChatGPT in Real-Time
+
+Made in Vancouver, Canada by [Picovoice](https://picovoice.ai)
+
+This demo showcases how [Orca Streaming Text-to-Speech](https://picovoice.ai/platform/orca/) can be seamlessly
+integrated into LLM applications to drastically reduce the audio latency of voice assistants.
+
+## Technologies
+
+In this demo, the user interacts with a voice assistant in real time, powered by GenAI technologies.
+Like the majority of voice assistants today, it is built by chaining together a Speech-to-Text engine, an LLM, and
+a Text-to-Speech engine.
+
+The following technologies are used:
+
+- Speech-to-Text: Picovoice's [Cheetah Streaming Speech-to-Text](https://picovoice.ai/platform/cheetah/)
+- LLM: "ChatGPT" using `gpt-3.5-turbo`
+  with the OpenAI Chat Completions API.
+- TTS:
+  - Picovoice's [Orca Streaming Text-to-Speech](https://picovoice.ai/platform/orca/)
+  - OpenAI TTS
+
+## Compatibility
+
+This demo has been tested on Linux (x86_64) and macOS (x86_64) using Python 3.10.
+
+## Access Keys
+
+To run all features of this demo, access keys are required for:
+
+- Picovoice Console: Get your `AccessKey` for free by signing up or logging in
+  to [Picovoice Console](https://console.picovoice.ai/).
+- OpenAI API: Get your API key from OpenAI.
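+
+## How Streaming Synthesis Cuts Latency
+
+The latency win comes from synthesizing speech while the LLM is still producing tokens, instead of waiting for the
+full LLM response. Below is a minimal sketch of that hand-off using the `pvorca` streaming API (the demo's Orca
+synthesizer wraps these same `stream_open`/`synthesize`/`flush` calls in a worker thread); `llm_tokens()` is a
+hypothetical stand-in for a streaming LLM response:
+
+```python
+import pvorca
+
+
+def llm_tokens():
+    # hypothetical stand-in for a streaming LLM response
+    yield from ["Hello!", " How", " can", " I", " help", " you", " today?"]
+
+
+orca = pvorca.create(access_key="${PV_ACCESS_KEY}")
+stream = orca.stream_open()
+try:
+    for token in llm_tokens():
+        pcm = stream.synthesize(token)  # returns PCM once enough text is buffered, else None
+        if pcm is not None:
+            pass  # hand `pcm` to the audio output here
+    pcm = stream.flush()  # synthesize whatever text is still buffered
+    if pcm is not None:
+        pass  # play the final chunk
+finally:
+    stream.close()
+    orca.delete()
+```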
+
+## Usage
+
+```bash
+python llm_voice_assistant_demo.py --picovoice-access-key ${PV_ACCESS_KEY} --openai-access-key ${OPEN_AI_KEY}
+```
+
+Replace `${PV_ACCESS_KEY}` with your `AccessKey` obtained from Picovoice Console and `${OPEN_AI_KEY}` with your
+API key obtained from OpenAI.
+Toggle between Orca and OpenAI TTS with the `--tts` flag, passing `picovoice_orca` or `openai`, respectively.
+If you don't want to use ChatGPT, set the `--llm` flag to `dummy`.
+This will simulate an LLM response using example sentences that are synthesized by the TTS system.
diff --git a/demo/llm_voice_assistant/llm_voice_assistant_demo.py b/demo/llm_voice_assistant/llm_voice_assistant_demo.py
new file mode 100644
index 00000000..c48b8c3e
--- /dev/null
+++ b/demo/llm_voice_assistant/llm_voice_assistant_demo.py
@@ -0,0 +1,265 @@
+#
+# Copyright 2024 Picovoice Inc.
+#
+# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE"
+# file accompanying this source.
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+
+import argparse
+import time
+from typing import Dict
+
+from pvrecorder import PvRecorder
+
+from src import (
+    LLM,
+    LLMs,
+    Synthesizer,
+    Synthesizers,
+    TimingPrinter,
+    Timer,
+    UserInput,
+    UserInputs,
+    StreamingAudioDevice,
+    Transcribers,
+)
+
+MAX_WAIT_TIME_FIRST_AUDIO = 10
+
+
+def get_user_input_init_kwargs(args: argparse.Namespace) -> Dict[str, str]:
+    kwargs = dict()
+
+    user_input_type = UserInputs(args.user_input)
+    if user_input_type is UserInputs.VOICE:
+        kwargs["audio_device_index"] = args.input_audio_device_index
+
+        kwargs["transcriber"] = Transcribers.PICOVOICE_CHEETAH
+        kwargs["transcriber_params"] = dict()
+        if args.picovoice_access_key is None:
+            raise ValueError("Picovoice access key is required when using voice user input")
+        kwargs["transcriber_params"]["access_key"] = args.picovoice_access_key
+        if args.speech_endpoint_duration_sec is not None:
+            kwargs["transcriber_params"]["endpoint_duration_sec"] = args.speech_endpoint_duration_sec
+
+    elif user_input_type is UserInputs.TEXT:
+        kwargs["llm_type"] = LLMs(args.llm)
+
+    return kwargs
+
+
+def get_llm_init_kwargs(args: argparse.Namespace) -> Dict[str, str]:
+    kwargs = dict()
+    llm_type = LLMs(args.llm)
+
+    if llm_type is LLMs.OPENAI:
+        if args.openai_access_key is None:
+            raise ValueError(
+                f"An OpenAI access key is required when using OpenAI models. 
Specify with `--openai-access-key`.") + if args.tokens_per_second is not None: + raise ValueError(f"Tokens per second is not supported for `{llm_type}`") + + kwargs["access_key"] = args.openai_access_key + if args.system_message is not None: + kwargs["system_message"] = args.system_message + + elif llm_type is LLMs.DUMMY: + if args.tokens_per_second is not None: + kwargs["tokens_per_second"] = args.tokens_per_second + + return kwargs + + +def get_synthesizer_init_kwargs(args: argparse.Namespace) -> Dict[str, str]: + kwargs = dict() + synthesizer_type = Synthesizers(args.synthesizer) + + if synthesizer_type is Synthesizers.PICOVOICE_ORCA: + if args.picovoice_access_key is None: + raise ValueError("Picovoice access key is required when using Picovoice TTS") + kwargs["access_key"] = args.picovoice_access_key + kwargs["model_path"] = args.orca_model_path + kwargs["library_path"] = args.orca_library_path + + elif synthesizer_type is Synthesizers.OPENAI: + if args.openai_access_key is None: + raise ValueError( + f"An OpenAI access key is required when using OpenAI models. Specify with `--openai-access-key`.") + kwargs["access_key"] = args.openai_access_key + + return kwargs + + +def main(args: argparse.Namespace) -> None: + max_num_interactions = args.num_interactions + + user_input_init_kwargs = get_user_input_init_kwargs(args) + user_input = UserInput.create(UserInputs(args.user_input), **user_input_init_kwargs) + + audio_output = StreamingAudioDevice.from_default_device() + + timer = Timer() + + synthesizer_init_kwargs = get_synthesizer_init_kwargs(args) + synthesizer = Synthesizer.create( + Synthesizers(args.synthesizer), + play_audio_callback=audio_output.play, + timer=timer, + **synthesizer_init_kwargs) + + llm_init_kwargs = get_llm_init_kwargs(args) + llm = LLM.create(LLMs(args.llm), **llm_init_kwargs) + + timing_printer = TimingPrinter(llm_string=f"{llm}", synthesizer_string=f"{synthesizer}") + + try: + num_interactions_counter = 0 + while True: + timer.reset() + + audio_output.start(sample_rate=synthesizer.sample_rate) + + text = user_input.get_user_input() + + timer.log_time_llm_request() + text_generator = llm.chat(user_input=text) + + llm_message = "" + printed_stats = False + for token in text_generator: + if token is None: + continue + + if timer.is_first_token: + timer.log_time_first_llm_token() + + llm_message += token + + if synthesizer.text_streamable: + synthesizer.synthesize(token) + + if not timer.before_first_audio and not printed_stats: + timing_printer.print_timing_stats( + num_seconds_first_llm_token=timer.num_seconds_to_first_token(), + num_seconds_first_audio=timer.num_seconds_to_first_audio(), + ) + printed_stats = True + print(f"Answering with {synthesizer} ...") + + timer.increment_num_tokens() + + timer.log_time_last_llm_token() + + if synthesizer.text_streamable: + synthesizer.flush() + else: + synthesizer.synthesize(llm_message) + + wait_start_time = time.time() + while timer.before_first_audio: + if time.time() - wait_start_time > MAX_WAIT_TIME_FIRST_AUDIO: + print( + f"Waited for {MAX_WAIT_TIME_FIRST_AUDIO}s for first audio but did not receive any. 
Exiting") + break + + if not printed_stats: + timing_printer.print_timing_stats( + num_seconds_first_llm_token=timer.num_seconds_to_first_token(), + num_seconds_first_audio=timer.num_seconds_to_first_audio()) + print(f"Answering with {synthesizer} ...") + + audio_output.flush_and_terminate() + + num_interactions_counter += 1 + + if 0 < max_num_interactions == num_interactions_counter: + print("\nDemo complete!") + break + + print() + + except KeyboardInterrupt: + pass + + synthesizer.terminate() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Text-to-speech streaming synthesis") + + parser.add_argument( + "--user-input", + default=UserInputs.VOICE.value, + choices=[u.value for u in UserInputs], + help="Choose type of input type") + parser.add_argument( + "--input-audio-device-index", + type=int, + default=-1, + help="Index of input audio device") + parser.add_argument( + "--speech-endpoint-duration-sec", + type=float, + default=None, + help="Duration in seconds for speechless audio to be considered an endpoint") + parser.add_argument( + "--show-audio-devices", + action="store_true", + help="Only list available devices and exit") + + parser.add_argument( + "--llm", + default=LLMs.OPENAI.value, + choices=[llm.value for llm in LLMs], + help="Choose LLM to use") + parser.add_argument( + "--openai-access-key", + default=None, + help="Open AI access key. Needed when using openai models") + parser.add_argument( + "--system-message", + default=None, + help="The system message to use to prompt the LLM response") + parser.add_argument( + "--tokens-per-second", + default=None, + type=int, + help="Imitated tokens per second to use for Dummy LLM") + + parser.add_argument( + "--tts", + dest="synthesizer", + default=Synthesizers.PICOVOICE_ORCA.value, + choices=[s.value for s in Synthesizers], + help="Choose voice synthesizer to use") + parser.add_argument( + "--picovoice-access-key", + default=None, + help="AccessKey obtained from Picovoice Console") + parser.add_argument( + "--orca-model-path", + default=None, + help="Path to the model parameters file") + parser.add_argument( + "--orca-library-path", + default=None, + help="Path to Orca's dynamic library") + + parser.add_argument( + "--num-interactions", + type=int, + default=-1, + help="Number of interactions with LLM run before completing the demo. Default is -1 (run indefinitely)") + + arg = parser.parse_args() + + if arg.show_audio_devices: + for index, name in enumerate(PvRecorder.get_available_devices()): + print('Device #%d: %s' % (index, name)) + exit(0) + + main(arg) diff --git a/demo/llm_voice_assistant/requirements.txt b/demo/llm_voice_assistant/requirements.txt new file mode 100644 index 00000000..34ec74df --- /dev/null +++ b/demo/llm_voice_assistant/requirements.txt @@ -0,0 +1,16 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+# + +openai==1.17.0 +pvcheetah==2.0.1 +pvrecorder==1.2.2 +sounddevice==0.4.6 +tiktoken==0.6.0 diff --git a/demo/llm_voice_assistant/src/__init__.py b/demo/llm_voice_assistant/src/__init__.py new file mode 100644 index 00000000..d618d01b --- /dev/null +++ b/demo/llm_voice_assistant/src/__init__.py @@ -0,0 +1,17 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# + +from .audio_device import * +from .llm import * +from .synthesizer import * +from .transcriber import * +from .user_input import * +from .util import * diff --git a/demo/llm_voice_assistant/src/audio_device.py b/demo/llm_voice_assistant/src/audio_device.py new file mode 100644 index 00000000..2036776e --- /dev/null +++ b/demo/llm_voice_assistant/src/audio_device.py @@ -0,0 +1,107 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# + +import time +from queue import Queue +from typing import ( + Any, + Optional, + Sequence, + Union, +) + +import numpy as np +from numpy.typing import NDArray +from sounddevice import OutputStream, query_devices + + +class StreamingAudioDevice: + def __init__(self, device_index: int) -> None: + self._device_index = device_index + self._queue: Queue[NDArray] = Queue() + + self._buffer = None + self._stream = None + self._sample_rate = None + self._blocksize = None + + def start(self, sample_rate: int) -> None: + self._sample_rate = sample_rate + self._blocksize = self._sample_rate // 20 + self._stream = OutputStream( + channels=1, + samplerate=self._sample_rate, + dtype=np.int16, + device=self._device_index, + callback=self._callback, + blocksize=self._blocksize) + self._stream.start() + + # noinspection PyShadowingNames + # noinspection PyUnusedLocal + def _callback(self, outdata: NDArray, frames: int, time: Any, status: Any) -> None: + if self._queue.empty(): + outdata[:] = 0 + return + data = self._queue.get() + outdata[:, 0] = data + + def play(self, pcm_chunk: Optional[Union[Sequence[int], NDArray]] = None) -> None: + if self._stream is None: + raise ValueError("Stream is not started. 
Call `start` method first.") + + if pcm_chunk is not None and isinstance(pcm_chunk, list): + pcm_chunk = np.array(pcm_chunk, dtype=np.int16) + + if self._buffer is not None: + pcm_chunk = self._buffer if pcm_chunk is None else np.concatenate([self._buffer, pcm_chunk]) + self._buffer = None + + if pcm_chunk is None: + return + + length = pcm_chunk.shape[0] + for index_block in range(0, length, self._blocksize): + if (length - index_block) < self._blocksize: + self._buffer = pcm_chunk[index_block: index_block + (length - index_block)] + else: + self._queue.put_nowait(pcm_chunk[index_block: index_block + self._blocksize]) + + def flush_and_terminate(self) -> None: + self.flush() + self.terminate() + + def flush(self) -> None: + if self._buffer is not None: + chunk = np.zeros(self._blocksize, dtype=np.int16) + chunk[:self._buffer.shape[0]] = self._buffer + self._queue.put_nowait(chunk) + + time_interval = self._blocksize / self._sample_rate + while not self._queue.empty(): + time.sleep(time_interval) + + time.sleep(time_interval) + + def terminate(self) -> None: + self._stream.stop() + self._stream.close() + + @classmethod + def from_default_device(cls) -> 'StreamingAudioDevice': + device_info = query_devices(kind="output") + device_index = int(device_info["index"]) + return cls(device_index=device_index) + + +__all__ = [ + "StreamingAudioDevice", +] diff --git a/demo/llm_voice_assistant/src/llm.py b/demo/llm_voice_assistant/src/llm.py new file mode 100644 index 00000000..542c9b09 --- /dev/null +++ b/demo/llm_voice_assistant/src/llm.py @@ -0,0 +1,144 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# + +import json +import os +import random +import time +from enum import Enum +from typing import ( + Any, + Generator, + Sequence, +) + +import tiktoken + + +class LLMs(Enum): + DUMMY = "dummy" + OPENAI = "openai" + + +class LLM: + SYSTEM_MESSAGE = """ + You are a friendly voice assistant in customer service of an e-commerce platform. + Use natural, conversational language that are clear and easy to follow (short sentences, simple words). + Only use english letters and punctuation, no special characters. + Keep the conversation flowing naturally. 
+ """ + + def __init__(self, system_message: str = SYSTEM_MESSAGE) -> None: + self._system_message = system_message + + def _chat(self, user_input: str) -> Generator[str, None, None]: + raise NotImplementedError( + f"Method `chat_stream` must be implemented in a subclass of {self.__class__.__name__}") + + def chat(self, user_input: str) -> Generator[str, None, None]: + for token in self._chat(user_input=user_input): + yield token + + @classmethod + def create(cls, llm_type: LLMs, **kwargs) -> 'LLM': + classes = { + LLMs.DUMMY: DummyLLM, + LLMs.OPENAI: OpenAILLM, + } + + if llm_type not in classes: + raise NotImplementedError(f"Cannot create {cls.__name__} of type `{llm_type.value}`") + + return classes[llm_type](**kwargs) + + def __str__(self) -> str: + raise NotImplementedError() + + +class OpenAILLM(LLM): + MODEL_NAME = "gpt-3.5-turbo" + RANDOM_SEED = 7777 + + def __init__( + self, + access_key: str, + model_name: str = MODEL_NAME, + **kwargs: Any, + ) -> None: + super().__init__(**kwargs) + + from openai import OpenAI + self._model_name = model_name + self._client = OpenAI(api_key=access_key) + + self._history = [{"role": "system", "content": self._system_message}] + + def _append_user_message(self, message: str) -> None: + self._history.append({"role": "user", "content": message}) + + def _append_assistant_message(self, message: str) -> None: + self._history.append({"role": "assistant", "content": message}) + + def _chat(self, user_input: str) -> Generator[str, None, None]: + self._append_user_message(user_input) + stream = self._client.chat.completions.create( + model=self._model_name, + messages=self._history, + seed=self.RANDOM_SEED, + temperature=0, + top_p=0.05, + stream=True) + assistant_message = "" + for chunk in stream: + token = chunk.choices[0].delta.content + yield token + if token is not None: + assistant_message += token + self._append_assistant_message(assistant_message) + + def __str__(self) -> str: + return f"ChatGPT ({self._model_name})" + + +class DummyLLM(LLM): + TOKENS_PER_SECOND = 25 + + def __init__(self, tokens_per_second: int = TOKENS_PER_SECOND) -> None: + super().__init__(system_message="") + + self._encoder = tiktoken.encoding_for_model("gpt-4") + self._tokens_delay = 1 / tokens_per_second + + data_file_path = os.path.join(os.path.dirname(__file__), "../../../resources/demo/demo_data.json") + with open(data_file_path, encoding="utf8") as data_file: + self._sentences = json.loads(data_file.read())["demo_sentences"] + + random.seed(7777) + + def _tokenize(self, text: str) -> Sequence[str]: + tokens = [self._encoder.decode([i]) for i in self._encoder.encode(text)] + return tokens + + def _chat(self, user_input: str) -> Generator[str, None, None]: + sentence = self._sentences[random.randint(0, len(self._sentences) - 1)] + + for i in self._tokenize(text=sentence): + time.sleep(self._tokens_delay) + yield i + + def __str__(self) -> str: + return "Dummy LLM" + + +__all__ = [ + "LLMs", + "LLM", +] diff --git a/demo/llm_voice_assistant/src/synthesizer.py b/demo/llm_voice_assistant/src/synthesizer.py new file mode 100644 index 00000000..f48b4345 --- /dev/null +++ b/demo/llm_voice_assistant/src/synthesizer.py @@ -0,0 +1,241 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. 
+# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# + +import threading +import time +from dataclasses import dataclass +from enum import Enum +from io import BytesIO +from queue import Queue +from typing import ( + Any, + Callable, + Literal, + Optional, + Sequence, + Union, +) + +import numpy as np +import pvorca +from numpy.typing import NDArray +from openai import OpenAI +from pvorca import OrcaActivationLimitError + +from .util import Timer + + +class Synthesizers(Enum): + OPENAI = "openai" + PICOVOICE_ORCA = "picovoice_orca" + + +class Synthesizer: + def __init__( + self, + sample_rate: int, + play_audio_callback: Callable[[Union[Sequence[int], NDArray]], None], + timer: Timer, + text_streamable: bool = False, + ) -> None: + self.sample_rate = sample_rate + self.text_streamable = text_streamable + + self._play_audio_callback = play_audio_callback + self._timer = timer + + def synthesize(self, text: str) -> None: + raise NotImplementedError( + f"Method `synthesize` must be implemented in a subclass of {self.__class__.__name__}") + + @property + def info(self) -> str: + raise NotImplementedError( + f"Method `info` must be implemented in a subclass of {self.__class__.__name__}") + + def flush(self) -> None: + pass + + def terminate(self) -> None: + pass + + @classmethod + def create(cls, engine: Synthesizers, **kwargs: Any) -> 'Synthesizer': + subclasses = { + Synthesizers.PICOVOICE_ORCA: PicovoiceOrcaSynthesizer, + Synthesizers.OPENAI: OpenAISynthesizer, + } + + if engine not in subclasses: + raise NotImplementedError(f"Cannot create {cls.__name__} of type `{engine.value}`") + + return subclasses[engine](**kwargs) + + def __str__(self) -> str: + raise NotImplementedError() + + +class OpenAISynthesizer(Synthesizer): + SAMPLE_RATE = 24000 + NAME = "OpenAI TTS" + + DEFAULT_MODEL_NAME = "tts-1" + DEFAULT_VOICE_NAME = "shimmer" + + def __init__( + self, + access_key: str, + model_name: str = DEFAULT_MODEL_NAME, + voice_name: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"] = DEFAULT_VOICE_NAME, + **kwargs: Any + ) -> None: + super().__init__(sample_rate=self.SAMPLE_RATE, **kwargs) + + self._model_name = model_name + self._voice_name = voice_name + self._client = OpenAI(api_key=access_key) + + @staticmethod + def _decode(b: bytes) -> NDArray: + pcm = np.frombuffer(BytesIO(b).read(), dtype=np.int16) + return pcm + + def synthesize(self, text: str) -> None: + self._timer.maybe_log_time_first_synthesis_request() + + response = self._client.audio.speech.create( + model=self._model_name, + voice=self._voice_name, + response_format="pcm", + input=text) + + for chunk in response.iter_bytes(chunk_size=1024): + self._timer.maybe_log_time_first_audio() + + pcm = self._decode(chunk) + self._play_audio_callback(pcm) + + @property + def info(self) -> str: + return f"{self.NAME} (model: {self.DEFAULT_MODEL_NAME}, voice: {self.DEFAULT_VOICE_NAME})" + + def __str__(self) -> str: + return f"{self.NAME}" + + +class PicovoiceOrcaSynthesizer(Synthesizer): + NUM_TOKENS_PER_PCM_CHUNK = 4 + + @dataclass + class OrcaTextInput: + text: str + flush: bool + + def __init__( + self, + play_audio_callback: Callable[[Union[Sequence[int], NDArray]], None], + timer: Timer, + access_key: str, + model_path: Optional[str] = None, + library_path: 
Optional[str] = None, + ) -> None: + self._orca = pvorca.create(access_key=access_key, model_path=model_path, library_path=library_path) + super().__init__( + sample_rate=self._orca.sample_rate, + play_audio_callback=play_audio_callback, + timer=timer, + text_streamable=True) + + self._orca_stream = self._orca.stream_open() + + self._queue: Queue[Optional[PicovoiceOrcaSynthesizer.OrcaTextInput]] = Queue() + + self._num_tokens = 0 + + self._thread = None + self._start_thread() + + def _start_thread(self) -> None: + self._thread = threading.Thread(target=self._run) + self._thread.start() + + def _close_thread_blocking(self): + self._queue.put_nowait(None) + self._thread.join() + + def _reset_state(self) -> None: + self._num_tokens = 0 + + def _compute_first_audio_delay(self, pcm: Sequence[int], processing_time: float) -> float: + seconds_audio = len(pcm) / self.sample_rate + tokens_per_sec = self._num_tokens / (time.time() - self._timer.time_first_synthesis_request) + llm_delay_seconds = (self.NUM_TOKENS_PER_PCM_CHUNK / (tokens_per_sec + 1e-4)) + orca_delay_seconds = 3 * processing_time + delay_seconds = max(llm_delay_seconds + orca_delay_seconds - seconds_audio, 0) + return delay_seconds + + def _run(self) -> None: + while True: + orca_input = self._queue.get() + if orca_input is None: + break + + self._timer.maybe_log_time_first_synthesis_request() + + self._num_tokens += 1 + + start = time.time() + try: + if not orca_input.flush: + pcm = self._orca_stream.synthesize(orca_input.text) + else: + pcm = self._orca_stream.flush() + except OrcaActivationLimitError: + raise ValueError("Orca activation limit reached.") + processing_time = time.time() - start + + if pcm is not None: + if self._timer.before_first_audio: + self._timer.maybe_log_time_first_audio() + + initial_audio_delay = self._compute_first_audio_delay(pcm=pcm, processing_time=processing_time) + self._timer.set_initial_audio_delay(initial_audio_delay) + + time.sleep(initial_audio_delay) + + self._play_audio_callback(pcm) + + def synthesize(self, text: str) -> None: + self._queue.put_nowait(self.OrcaTextInput(text=text, flush=False)) + + def flush(self) -> None: + self._queue.put_nowait(self.OrcaTextInput(text="", flush=True)) + self._close_thread_blocking() + self._reset_state() + self._start_thread() + + def terminate(self): + self._close_thread_blocking() + self._orca_stream.close() + self._orca.delete() + + @property + def info(self) -> str: + return f"Picovoice Orca v{self._orca.version}" + + def __str__(self) -> str: + return "Picovoice Orca" + + +__all__ = [ + "Synthesizers", + "Synthesizer", +] diff --git a/demo/llm_voice_assistant/src/transcriber.py b/demo/llm_voice_assistant/src/transcriber.py new file mode 100644 index 00000000..f03ae432 --- /dev/null +++ b/demo/llm_voice_assistant/src/transcriber.py @@ -0,0 +1,87 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+# + +from enum import Enum +from typing import ( + Any, + Optional, + Sequence, + Tuple, +) + +from pvcheetah import CheetahActivationLimitError, create + + +class Transcribers(Enum): + PICOVOICE_CHEETAH = "picovoice_cheetah" + + +class Transcriber: + def process(self, pcm_frame: Sequence[int]) -> Tuple[str, bool]: + raise NotImplementedError() + + def flush(self) -> str: + raise NotImplementedError() + + @property + def frame_length(self) -> int: + raise NotImplementedError() + + @classmethod + def create(cls, x: Transcribers, **kwargs: Any) -> 'Transcriber': + subclasses = { + Transcribers.PICOVOICE_CHEETAH: PicovoiceCheetahTranscriber, + } + + if x not in subclasses: + raise NotImplementedError(f"Cannot create {cls.__name__} of type `{x.value}`") + + return subclasses[x](**kwargs) + + +class PicovoiceCheetahTranscriber(Transcriber): + def __init__( + self, + access_key: str, + library_path: Optional[str] = None, + model_path: Optional[str] = None, + endpoint_duration_sec: float = 1.0, + enable_automatic_punctuation: bool = True + ) -> None: + self._cheetah = create( + access_key=access_key, + library_path=library_path, + model_path=model_path, + endpoint_duration_sec=endpoint_duration_sec, + enable_automatic_punctuation=enable_automatic_punctuation) + + def process(self, pcm_frame: Sequence[int]) -> Tuple[str, bool]: + try: + partial_transcript, is_endpoint = self._cheetah.process(pcm_frame) + except CheetahActivationLimitError: + raise ValueError("Cheetah activation limit reached.") + return partial_transcript, is_endpoint + + def flush(self) -> str: + try: + return self._cheetah.flush() + except CheetahActivationLimitError: + raise ValueError("Cheetah activation limit reached.") + + @property + def frame_length(self) -> int: + return self._cheetah.frame_length + + +__all__ = [ + "Transcriber", + "Transcribers", +] diff --git a/demo/llm_voice_assistant/src/user_input.py b/demo/llm_voice_assistant/src/user_input.py new file mode 100644 index 00000000..791943e8 --- /dev/null +++ b/demo/llm_voice_assistant/src/user_input.py @@ -0,0 +1,94 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+# + +from enum import Enum +from typing import ( + Any, + Dict, + Optional, +) + +from pvrecorder import PvRecorder + +from .llm import LLMs +from .transcriber import Transcriber, Transcribers + + +class UserInputs(Enum): + VOICE = "voice" + TEXT = "text" + + +class UserInput: + def get_user_input(self) -> str: + raise NotImplementedError() + + @classmethod + def create(cls, x: UserInputs, **kwargs: Any) -> 'UserInput': + subclasses = { + UserInputs.VOICE: VoiceUserInput, + UserInputs.TEXT: TextUserInput, + } + + if x not in subclasses: + raise NotImplementedError(f"Cannot create {cls.__name__} of type `{x.value}`") + + return subclasses[x](**kwargs) + + +class VoiceUserInput(UserInput): + def __init__( + self, + audio_device_index: int, + transcriber: Transcribers, + transcriber_params: Dict[str, Any], + ) -> None: + self._transcriber = Transcriber.create(transcriber, **transcriber_params) + self._recorder = PvRecorder(frame_length=self._transcriber.frame_length, device_index=audio_device_index) + + def get_user_input(self) -> str: + print("Listening ...") + if not self._recorder.is_recording: + self._recorder.start() + + transcript = "" + try: + while True: + partial_transcript, is_endpoint = self._transcriber.process(self._recorder.read()) + transcript += partial_transcript + if is_endpoint: + final_transcript = self._transcriber.flush() + transcript += final_transcript + self._recorder.stop() + return transcript + except Exception as e: + self._recorder.stop() + raise e + + +class TextUserInput(UserInput): + USER_PROMPT = "Your question: " + USER_PROMPT_DUMMY_LLM = "Press ENTER to generate a demo LLM response " + + def __init__(self, llm_type: LLMs, prompt: Optional[str] = None) -> None: + if prompt is not None: + self._prompt = prompt + else: + self._prompt = self.USER_PROMPT_DUMMY_LLM if llm_type is LLMs.DUMMY else self.USER_PROMPT + + def get_user_input(self) -> str: + return input(self._prompt) + + +__all__ = [ + "UserInput", + "UserInputs", +] diff --git a/demo/llm_voice_assistant/src/util.py b/demo/llm_voice_assistant/src/util.py new file mode 100644 index 00000000..79fb7966 --- /dev/null +++ b/demo/llm_voice_assistant/src/util.py @@ -0,0 +1,169 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+# + +import time +from dataclasses import dataclass +from typing import Tuple + + +@dataclass +class Colors: + GREEN = "\033[92m" + RESET = "\033[0m" + BOLD = "\033[1m" + + +@dataclass +class Timer: + time_llm_request: float = -1.0 + time_first_llm_token: float = -1.0 + time_last_llm_token: float = -1.0 + time_first_synthesis_request: float = -1.0 + time_first_audio: float = -1.0 + initial_audio_delay: float = 0.0 + + before_first_audio: bool = True + _is_first_synthesis_request: bool = True + _num_tokens: int = 0 + + @staticmethod + def _get_time() -> float: + return time.time() + + def log_time_llm_request(self) -> None: + self.time_llm_request = self._get_time() + + def log_time_first_llm_token(self) -> None: + self.time_first_llm_token = self._get_time() + + def log_time_last_llm_token(self) -> None: + self.time_last_llm_token = self._get_time() + + def maybe_log_time_first_synthesis_request(self) -> None: + if self._is_first_synthesis_request: + self.time_first_synthesis_request = self._get_time() + self._is_first_synthesis_request = False + + def maybe_log_time_first_audio(self) -> None: + if self.before_first_audio: + self.time_first_audio = self._get_time() + self.before_first_audio = False + + def increment_num_tokens(self) -> None: + self._num_tokens += 1 + + @property + def is_first_token(self) -> bool: + return self._num_tokens == 0 + + def set_initial_audio_delay(self, delay: float) -> None: + self.initial_audio_delay = delay + + def num_seconds_to_first_audio(self) -> float: + return self.time_first_audio - self.time_first_llm_token + + def num_seconds_to_first_token(self) -> float: + return self.time_first_llm_token - self.time_llm_request + + def reset(self) -> None: + self.time_llm_request = -1.0 + self.time_first_llm_token = -1.0 + self.time_last_llm_token = -1.0 + self.time_first_synthesis_request = -1.0 + self.time_first_audio = -1.0 + self.initial_audio_delay = 0.0 + + self._is_first_synthesis_request = True + self.before_first_audio = True + + self._num_tokens = 0 + + +class TimingPrinter: + TIMER_MESSAGE = "Time to wait for" + + TIMER_BAR_MAX_RED_SECONDS = 2.0 + TIMER_BAR_SYMBOLS_PER_SECONDS = 40 + TIMER_BAR_SYMBOL = ">" + + MAX_GREEN_VALUE = 0.6 + MAX_RED_VALUE = 0.75 + + def __init__( + self, + llm_string: str, + synthesizer_string: str, + timer_bar_max_red_seconds: float = TIMER_BAR_MAX_RED_SECONDS, + timer_bar_symbols_per_second: float = TIMER_BAR_SYMBOLS_PER_SECONDS, + timer_bar_symbol: str = TIMER_BAR_SYMBOL, + ) -> None: + max_length = len(llm_string) if len(llm_string) > len(synthesizer_string) else len(synthesizer_string) + llm_info_string = llm_string.ljust(max_length) + synthesizer_info_string = synthesizer_string.ljust(max_length) + + self._timer_message_llm = f"{self.TIMER_MESSAGE} {llm_info_string} : " + self._timer_message_tts = f"{self.TIMER_MESSAGE} {synthesizer_info_string} : " + + self._progress_bar_color_max = timer_bar_max_red_seconds * timer_bar_symbols_per_second + self._progress_bar_symbols_per_second = timer_bar_symbols_per_second + self._progress_bar_symbol = timer_bar_symbol + + @staticmethod + def _colored_string(text: str, red: float, green: float, blue: float, bold: bool = False) -> str: + s = Colors.BOLD if bold else "" + s = f"{s}\033[38;2;{int(red * 255)};{int(green * 255)};{int(blue * 255)}m{text}{Colors.RESET}" + return s + + def _print_colored_progress_bar(self, num_seconds: float, bold: bool = False) -> Tuple[float, float, float]: + red = 0 + green = self.MAX_GREEN_VALUE + blue = 0 + + half_max_length = 
self._progress_bar_color_max // 2 + + length = int(num_seconds * self._progress_bar_symbols_per_second) + for i in range(length): + if i < half_max_length: + red = min(i / (half_max_length - 1), self.MAX_RED_VALUE) + else: + green = max(0.5 - (i - half_max_length) / (half_max_length - 1), 0) + + print(f"{self._colored_string(self._progress_bar_symbol, red, green, blue, bold=bold)}", end="") + + return red, green, blue + + def _print_timer_bar_llm(self, num_seconds_first_llm_token: float) -> None: + print(self._colored_string(self._timer_message_llm, 0, self.MAX_GREEN_VALUE, 0), end="") + + red, green, blue = self._print_colored_progress_bar(num_seconds_first_llm_token) + + num_seconds_string = f"{round(num_seconds_first_llm_token, 1):.1f}s" + print(f" {self._colored_string(num_seconds_string, red, green, blue)}", flush=True) + + def _print_timer_bar_tts(self, num_seconds_first_audio: float) -> None: + print(self._colored_string(self._timer_message_tts, 0, self.MAX_GREEN_VALUE, 0, bold=True), end="") + + red, green, blue = self._print_colored_progress_bar(num_seconds_first_audio, bold=True) + + num_seconds_string = f"{round(num_seconds_first_audio, 1):.1f}s" + print(f" {self._colored_string(num_seconds_string, red, green, blue, bold=True)}", flush=True) + + def print_timing_stats(self, num_seconds_first_llm_token: float, num_seconds_first_audio: float) -> None: + print() + self._print_timer_bar_llm(num_seconds_first_llm_token) + self._print_timer_bar_tts(num_seconds_first_audio) + + +__all__ = [ + "Colors", + "TimingPrinter", + "Timer", +] diff --git a/demo/python/.gitignore b/demo/python/.gitignore index 2ce46fe0..1e1bea03 100644 --- a/demo/python/.gitignore +++ b/demo/python/.gitignore @@ -3,3 +3,4 @@ dist MANIFEST.in pvorcademo pvorcademo.egg-info +__pycache__/ \ No newline at end of file diff --git a/demo/python/README.md b/demo/python/README.md index fa86a25e..f8d11e8d 100644 --- a/demo/python/README.md +++ b/demo/python/README.md @@ -1,10 +1,11 @@ -# Orca Text-to-Speech Engine Demo +# Orca Streaming Text-to-Speech Engine Python Demo Made in Vancouver, Canada by [Picovoice](https://picovoice.ai) ## Orca -Orca is an on-device text-to-speech engine producing high-quality, realistic, spoken audio with zero latency. Orca is: +Orca is an on-device streaming text-to-speech engine that is designed for use with LLMs, enabling zero-latency +voice assistants. Orca is: - Private; All voice processing runs locally. - Cross-Platform: @@ -15,7 +16,7 @@ Orca is an on-device text-to-speech engine producing high-quality, realistic, sp ## Compatibility -- Python 3.7+ +- Python 3.8+ - Runs on Linux (x86_64), macOS (x86_64, arm64), Windows (x86_64), Raspberry Pi (5, 4, 3), and NVIDIA Jetson Nano. ## Installation @@ -32,12 +33,35 @@ Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get you ## Usage -To synthesize speech, run the following: +Orca supports two modes of operation: streaming and single synthesis. + +In the streaming synthesis mode, Orca processes an incoming text stream in real-time and generates audio in parallel. +This is demonstrated in the Orca streaming demo. + +In the single synthesis mode, the text is synthesized in a single call to the Orca engine. + +### Streaming synthesis demo + +In this demo, we simulate a response from a language model by creating a text stream from a user-defined text. +We stream that text to Orca and play the synthesized audio as soon as it gets generated. 
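+
+The core of the demo is a loop like the sketch below. This is a minimal illustration rather than the demo itself: it
+uses only `pvorca` calls that appear elsewhere in this change (`create`, `stream_open`, `synthesize`, `flush`,
+`close`, `delete`), a fixed token list stands in for the simulated LLM stream, and the collected chunks would be
+played back as they arrive (the demo does this with a `sounddevice` output stream):
+
+```python
+import pvorca
+
+orca = pvorca.create(access_key="${ACCESS_KEY}")
+stream = orca.stream_open()
+
+pcm_chunks = []
+for token in ["Streaming ", "text ", "to ", "Orca."]:  # stand-in for LLM tokens
+    pcm = stream.synthesize(token)  # returns an audio chunk once enough text is buffered, else None
+    if pcm is not None:
+        pcm_chunks.append(pcm)  # the demo plays each chunk as soon as it is produced
+
+pcm = stream.flush()  # synthesize whatever text is still buffered
+if pcm is not None:
+    pcm_chunks.append(pcm)
+
+stream.close()
+orca.delete()
+```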
+ +To run it, execute the following: + +```console +orca_demo_streaming --access_key ${ACCESS_KEY} --text_to_stream ${TEXT} +``` + +Replace `${ACCESS_KEY}` with your `AccessKey` obtained from Picovoice Console and `${TEXT}` with your text to be +streamed to Orca. Please note that this demo was not tested on macOS. + +### Single synthesis demo + +To synthesize speech in a single call to Orca and without audio playback, run the following: ```console orca_demo --access_key ${ACCESS_KEY} --text ${TEXT} --output_path ${WAV_OUTPUT_PATH} ``` -Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console, `${TEXT}` with your text to be synthesized, -and `${WAV_OUTPUT_PATH}` with a path to a `.wav` file where the generated audio will be stored as a single-channel, +Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console, `${TEXT}` with your text to be synthesized, +and `${WAV_OUTPUT_PATH}` with a path to a `.wav` file where the generated audio will be stored as a single-channel, 16-bit PCM `.wav` file. diff --git a/demo/python/orca_demo.py b/demo/python/orca_demo.py index 628c816a..1f2a0cbe 100644 --- a/demo/python/orca_demo.py +++ b/demo/python/orca_demo.py @@ -11,53 +11,75 @@ import argparse import struct +import time import wave from pvorca import create, OrcaActivationLimitError -def main(): +def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( '--access_key', + '-a', required=True, help='AccessKey obtained from Picovoice Console (https://console.picovoice.ai/)') parser.add_argument( '--text', + '-t', required=True, help='Text to be synthesized') parser.add_argument( '--output_path', + '-o', required=True, help='Absolute path to .wav file where the generated audio will be stored') parser.add_argument( '--library_path', + '-l', help='Absolute path to dynamic library. Default: using the library provided by `pvorca`') parser.add_argument( '--model_path', + '-m', help='Absolute path to Orca model. Default: using the model provided by `pvorca`') args = parser.parse_args() - if not args.output_path.lower().endswith('.wav'): + access_key = args.access_key + model_path = args.model_path + library_path = args.library_path + output_path = args.output_path + text = args.text + + if not output_path.lower().endswith('.wav'): raise ValueError('Given argument --output_path must have WAV file extension') - orca = create(access_key=args.access_key, model_path=args.model_path, library_path=args.library_path) + orca = create(access_key=access_key, model_path=model_path, library_path=library_path) try: - print('Orca version: %s' % orca.version) - pcm = orca.synthesize(args.text) + print(f"Orca version: {orca.version}") + + start = time.time() + + pcm, alignments = orca.synthesize(text) + + processing_time = time.time() - start length_sec = len(pcm) / orca.sample_rate - with wave.open(args.output_path, 'wb') as output_file: + + with wave.open(output_path, "wb") as output_file: output_file.setnchannels(1) output_file.setsampwidth(2) output_file.setframerate(orca.sample_rate) - output_file.writeframes(struct.pack('%dh' % len(pcm), *pcm)) - print('%.2f seconds of audio were written to `%s`.' 
% (length_sec, args.output_path)) + output_file.writeframes(struct.pack(f"{len(pcm)}h", *pcm)) + + print( + f"Orca took {processing_time:.2f} seconds to synthesize {length_sec:.2f} seconds of speech which is " + f"~{length_sec / processing_time:.0f} times faster than real-time.") + print(f"Audio written to `{output_path}`.") except OrcaActivationLimitError: - print('AccessKey has reached its processing limit') + print("AccessKey has reached its processing limit") finally: orca.delete() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/demo/python/orca_demo_streaming.py b/demo/python/orca_demo_streaming.py new file mode 100644 index 00000000..05b0d92e --- /dev/null +++ b/demo/python/orca_demo_streaming.py @@ -0,0 +1,399 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# + +import argparse +import platform +import re +import subprocess +import threading +import time +import traceback +from dataclasses import dataclass +from queue import Queue +from typing import ( + Any, + Callable, + Dict, + Optional, + Sequence, +) + +import numpy as np +import pvorca +import tiktoken +from numpy.typing import NDArray +from pvorca import OrcaActivationLimitError, OrcaInvalidArgumentError +from sounddevice import ( + OutputStream, + query_devices, + PortAudioError, +) + +CUSTOM_PRON_PATTERN = r"\{(.*?\|.*?)\}" +CUSTOM_PRON_PATTERN_NO_WHITESPACE = r"\{(.*?\|.*?)\}(?!\s)" + + +class StreamingAudioDevice: + def __init__(self, device_index: Optional[int] = None) -> None: + if device_index is None: + device_info = query_devices(kind="output") + device_index = int(device_info["index"]) + + self._device_index = device_index + self._queue: Queue[Sequence[int]] = Queue() + + self._buffer = None + self._stream = None + self._sample_rate = None + self._blocksize = None + + def start(self, sample_rate: int) -> None: + self._sample_rate = sample_rate + self._blocksize = self._sample_rate // 20 + self._stream = OutputStream( + channels=1, + samplerate=self._sample_rate, + dtype=np.int16, + device=self._device_index, + callback=self._callback, + blocksize=self._blocksize) + self._stream.start() + + # noinspection PyShadowingNames + # noinspection PyUnusedLocal + def _callback(self, outdata: NDArray, frames: int, time: Any, status: Any) -> None: + if self._queue.empty(): + outdata[:] = 0 + return + + pcm = self._queue.get() + outdata[:, 0] = pcm + + def play(self, pcm_chunk: Sequence[int]) -> None: + if self._stream is None: + raise ValueError("Stream is not started. 
Call `start` method first.") + + pcm_chunk = np.array(pcm_chunk, dtype=np.int16) + + if self._buffer is not None: + if pcm_chunk is not None: + pcm_chunk = np.concatenate([self._buffer, pcm_chunk]) + else: + pcm_chunk = self._buffer + self._buffer = None + + length = pcm_chunk.shape[0] + for index_block in range(0, length, self._blocksize): + if (length - index_block) < self._blocksize: + self._buffer = pcm_chunk[index_block: index_block + (length - index_block)] + else: + self._queue.put_nowait(pcm_chunk[index_block: index_block + self._blocksize]) + + def flush_and_terminate(self) -> None: + self.flush() + self.terminate() + + def flush(self) -> None: + if self._buffer is not None: + chunk = np.zeros(self._blocksize, dtype=np.int16) + chunk[:self._buffer.shape[0]] = self._buffer + self._queue.put_nowait(chunk) + + time_interval = self._blocksize / self._sample_rate + while not self._queue.empty(): + time.sleep(time_interval) + + time.sleep(time_interval) + + def terminate(self) -> None: + self._stream.stop() + self._stream.close() + + @staticmethod + def list_output_devices() -> Dict[str, Any]: + return query_devices(kind="output") + + +def linux_machine() -> str: + machine = platform.machine() + if machine == "x86_64": + return machine + elif machine in ["aarch64", "armv7l"]: + arch_info = ("-" + machine) if "64bit" in platform.architecture()[0] else "" + else: + raise NotImplementedError("Unsupported CPU architecture: `%s`" % machine) + + cpu_info = "" + try: + cpu_info = subprocess.check_output(["cat", "/proc/cpuinfo"]).decode("utf-8") + cpu_part_list = [x for x in cpu_info.split("\n") if "CPU part" in x] + cpu_part = cpu_part_list[0].split(" ")[-1].lower() + except Exception as e: + raise RuntimeError("Failed to identify the CPU with `%s`\nCPU info: `%s`" % (e, cpu_info)) + + if "0xd03" == cpu_part: + return "cortex-a53" + arch_info + elif "0xd07" == cpu_part: + return "cortex-a57" + arch_info + elif "0xd08" == cpu_part: + return "cortex-a72" + arch_info + elif "0xd0b" == cpu_part: + return "cortex-a76" + arch_info + else: + raise NotImplementedError("Unsupported CPU: `%s`." 
% cpu_part) + + +class OrcaThread: + @dataclass + class OrcaInput: + text: str + flush: bool + + def __init__( + self, + play_audio_callback: Callable[[Sequence[int]], None], + access_key: str, + num_tokens_per_second: int, + model_path: Optional[str] = None, + library_path: Optional[str] = None, + audio_wait_chunks: Optional[int] = None, + ) -> None: + + self._orca = pvorca.create(access_key=access_key, model_path=model_path, library_path=library_path) + self._orca_stream = self._orca.stream_open() + self._sample_rate = self._orca.sample_rate + + self._play_audio_callback = play_audio_callback + self._num_tokens_per_second = num_tokens_per_second + assert self._num_tokens_per_second > 0 + + self._queue: Queue[Optional[OrcaThread.OrcaInput]] = Queue() + self._thread = None + + self._time_first_audio_available = -1 + self._pcm_buffer: Queue[Sequence[int]] = Queue() + + self._wait_chunks = audio_wait_chunks or self._get_first_audio_wait_chunks() + self._num_pcm_chunks_processed = 0 + + @staticmethod + def _get_first_audio_wait_chunks() -> int: + wait_chunks = 0 + if platform.system() == "Linux": + machine = linux_machine() + if "cortex" in machine: + wait_chunks = 1 + return wait_chunks + + def _run(self) -> None: + while True: + orca_input = self._queue.get() + if orca_input is None: + while not self._pcm_buffer.empty(): + self._play_audio_callback(self._pcm_buffer.get()) + break + + try: + if not orca_input.flush: + pcm = self._orca_stream.synthesize(orca_input.text) + else: + pcm = self._orca_stream.flush() + except OrcaInvalidArgumentError as e: + raise ValueError(f"Orca could not synthesize text input `{orca_input.text}`: `{e}`") + + if pcm is not None: + if self._num_pcm_chunks_processed < self._wait_chunks: + self._pcm_buffer.put_nowait(pcm) + else: + while not self._pcm_buffer.empty(): + self._play_audio_callback(self._pcm_buffer.get()) + self._play_audio_callback(pcm) + + if self._num_pcm_chunks_processed == 0: + self._time_first_audio_available = time.time() + + self._num_pcm_chunks_processed += 1 + + def _close_thread_blocking(self): + self._queue.put_nowait(None) + self._thread.join() + + def start(self) -> None: + self._thread = threading.Thread(target=self._run) + self._thread.start() + + def synthesize(self, text: str) -> None: + self._queue.put_nowait(self.OrcaInput(text=text, flush=False)) + + def flush(self) -> None: + self._queue.put_nowait(self.OrcaInput(text="", flush=True)) + self._close_thread_blocking() + self.start() + + def delete(self) -> None: + self._close_thread_blocking() + self._orca_stream.close() + self._orca.delete() + + def get_time_first_audio_available(self) -> float: + return self._time_first_audio_available + + @property + def sample_rate(self) -> int: + return self._sample_rate + + @property + def version(self) -> str: + return self._orca.version + + +def tokenize_text(text: str) -> Sequence[str]: + text = re.sub(CUSTOM_PRON_PATTERN_NO_WHITESPACE, r'{\1} ', text) + + custom_pronunciations = re.findall(CUSTOM_PRON_PATTERN, text) + custom_pronunciations = set(["{" + pron + "}" for pron in custom_pronunciations]) + + encoder = tiktoken.encoding_for_model("gpt-4") + tokens_raw = [encoder.decode([i]) for i in encoder.encode(text)] + + custom_pron = "" + tokens_with_custom_pronunciations = [] + for i, token in enumerate(tokens_raw): + in_custom_pron = False + for pron in custom_pronunciations: + in_custom_pron_global = len(custom_pron) > 0 + current_match = token.strip() if not in_custom_pron_global else custom_pron + token + if 
pron.startswith(current_match): + custom_pron += token.strip() if not in_custom_pron_global else token + in_custom_pron = True + + if not in_custom_pron: + if custom_pron != "": + tokens_with_custom_pronunciations.append(f" {custom_pron}" if i != 0 else custom_pron) + custom_pron = "" + tokens_with_custom_pronunciations.append(token) + + return tokens_with_custom_pronunciations + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "--access_key", + "-a", + required=True, + help="AccessKey obtained from Picovoice Console (https://console.picovoice.ai/)") + parser.add_argument( + "--library_path", + "-l", + help="Absolute path to dynamic library. Default: using the library provided by `pvorca`") + parser.add_argument( + "--model_path", + "-m", + help="Absolute path to Orca model. Default: using the model provided by `pvorca`") + parser.add_argument( + "--text_to_stream", + "-t", + required=True, + help="Text to be streamed to Orca") + parser.add_argument( + "--tokens_per_second", + type=int, + default=15, + help="Number of tokens per second to be streamed to Orca, simulating an LLM response.") + parser.add_argument( + "--audio_wait_chunks", + type=int, + default=None, + help="Number of PCM chunks to wait before starting to play audio. Default: system-dependent.") + parser.add_argument( + "--show_audio_devices", + action="store_true", + help="Only list available audio output devices and exit") + parser.add_argument('--audio-device-index', type=int, default=None, help='Index of input audio device') + args = parser.parse_args() + + if args.show_audio_devices: + print(StreamingAudioDevice.list_output_devices()) + exit(0) + + access_key = args.access_key + model_path = args.model_path + library_path = args.library_path + text = args.text_to_stream + tokens_per_second = args.tokens_per_second + audio_wait_chunks = args.audio_wait_chunks + audio_device_index = args.audio_device_index + + try: + audio_device = StreamingAudioDevice(device_index=audio_device_index) + # Some systems may have issues with PortAudio only when starting the audio device. Test it here. + audio_device.start(sample_rate=16000) + audio_device.terminate() + play_audio_callback = audio_device.play + except PortAudioError: + print(traceback.format_exc()) + print( + "WARNING: Failed to initialize audio device, see details above. 
Falling back to running " + "the demo without audio playback.\n") + audio_device = None + + # noinspection PyUnusedLocal + def play_audio_callback(pcm: Sequence[int]): + pass + + orca = OrcaThread( + play_audio_callback=play_audio_callback, + num_tokens_per_second=tokens_per_second, + access_key=access_key, + model_path=model_path, + library_path=library_path, + audio_wait_chunks=audio_wait_chunks, + ) + + orca.start() + if audio_device is not None: + audio_device.start(sample_rate=orca.sample_rate) + + try: + print(f"Orca version: {orca.version}\n") + + print(f"Simulated text stream:") + tokens = tokenize_text(text=text) + + time_start_text_stream = time.time() + for token in tokens: + print(f"{token}", end="", flush=True) + + orca.synthesize(text=token) + + time.sleep(1 / tokens_per_second) + + text_stream_duration_seconds = time.time() - time_start_text_stream + + orca.flush() + + first_audio_available_seconds = orca.get_time_first_audio_available() - time_start_text_stream + print(f"\n\nTime to finish text stream: {text_stream_duration_seconds:.2f} seconds") + print(f"Time to receive first audio: {first_audio_available_seconds:.2f} seconds after text stream started\n") + + if audio_device is not None: + print("Waiting for audio to finish ...") + audio_device.flush_and_terminate() + + except OrcaActivationLimitError: + print("AccessKey has reached its processing limit") + finally: + orca.delete() + + +if __name__ == "__main__": + main() diff --git a/demo/python/requirements.txt b/demo/python/requirements.txt index af97749e..af73a886 100644 --- a/demo/python/requirements.txt +++ b/demo/python/requirements.txt @@ -1 +1,4 @@ -pvorca==0.1.4 +numpy>=1.24.0 +pvorca==0.2.1 +sounddevice==0.4.6 +tiktoken==0.6.0 diff --git a/demo/python/setup.py b/demo/python/setup.py index fc8cc2d2..a83f2029 100644 --- a/demo/python/setup.py +++ b/demo/python/setup.py @@ -3,12 +3,14 @@ import setuptools +INCLUDE_FILES = [ + "../../LICENSE", + "orca_demo.py", + "orca_demo_streaming.py"] -INCLUDE_FILES = ('../../LICENSE', 'orca_demo.py') +os.system("git clean -dfx") -os.system('git clean -dfx') - -package_folder = os.path.join(os.path.dirname(__file__), 'pvorcademo') +package_folder = os.path.join(os.path.dirname(__file__), "pvorcademo") os.mkdir(package_folder) manifest_in = "" @@ -16,23 +18,23 @@ shutil.copy(os.path.join(os.path.dirname(__file__), rel_path), package_folder) manifest_in += "include pvorcademo/%s\n" % os.path.basename(rel_path) -with open(os.path.join(os.path.dirname(__file__), 'MANIFEST.in'), 'w') as f: +with open(os.path.join(os.path.dirname(__file__), "MANIFEST.in"), "w") as f: f.write(manifest_in) -with open(os.path.join(os.path.dirname(__file__), 'README.md'), 'r') as f: +with open(os.path.join(os.path.dirname(__file__), "README.md"), "r") as f: long_description = f.read() setuptools.setup( name="pvorcademo", - version="0.1.3", + version="0.2.1", author="Picovoice", author_email="hello@picovoice.ai", - description="Orca Text-to-Speech Engine demos", + description="Orca Streaming Text-to-Speech Engine demos", long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/Picovoice/orca", packages=["pvorcademo"], - install_requires=["pvorca==0.1.4"], + install_requires=["numpy>=1.24.0", "pvorca==0.2.1", "sounddevice==0.4.6", "tiktoken==0.6.0"], include_package_data=True, classifiers=[ "Development Status :: 4 - Beta", @@ -44,9 +46,10 @@ ], entry_points=dict( console_scripts=[ - 'orca_demo=pvorcademo.orca_demo:main', + 
"orca_demo=pvorcademo.orca_demo:main", + "orca_demo_streaming=pvorcademo.orca_demo_streaming:main", ], ), - python_requires='>=3.7', - keywords="Text-to-Speech, TTS, Speech Synthesis, Voice Generation, Speech Engine", + python_requires=">=3.8", + keywords="Streaming Text-to-Speech, TTS, Speech Synthesis, Voice Generation, Speech Engine", ) diff --git a/demo/web/README.md b/demo/web/README.md index 1af0b486..88164656 100644 --- a/demo/web/README.md +++ b/demo/web/README.md @@ -34,10 +34,33 @@ Available on: Hit CTRL-C to stop the server ``` -Wait until Orca has initialized. Type in any text (in English only), and optionally select a desired speech rate. Click -synthesize, and once Orca has finished synthesizing your text, click play and listen for the speech. +Copy in your AccessKey from Picovoice Console, and click "Start Orca". -**Optional**: If you wish, you may replace the model file in the `index.html` with a male model file for a male +## Usage + + +Orca supports two modes of operation: streaming and single synthesis. +In the streaming synthesis mode, Orca processes an incoming text stream in real-time and generates audio in parallel. +In the single synthesis mode, the complete text needs to be known in advance and is synthesized in a single call to +the Orca engine. + +Click on either "Streaming Synthesis" or "Single Synthesis" to continue. + +### Streaming Synthesis + +1. Choose desired speech rate (or keep the default) +2. Click "Open Stream" +3. Type in any text (in English only). +4. When you're done, click "Run Streaming Synthesis" to run streaming synthesis on a simulated text stream. + +### Single Synthesis + +1. Type in any text (in English only) +2. Change the speech rate (or keep the default) +3. Click "Synthesize" +4. Click "Play" and listen for the generated speech. 
+ +**Optional**: If you wish, you may replace the model file in the `index.html` with the male model file for a male voice: ```html diff --git a/demo/web/index.html b/demo/web/index.html index 60d737c2..ff3dfaed 100644 --- a/demo/web/index.html +++ b/demo/web/index.html @@ -8,6 +8,8 @@ let orca = null; let pcm = null; + let alignments = null; + let orcaStream = null; function writeMessage(message) { console.log(message); @@ -36,24 +38,41 @@ return buffer; } - let isPlaying = false; - let originalAudioSource; + const chooseBtnsEl = document.getElementById('choose-btns'); + const chooseSingleBtnEl = document.getElementById('choose-single-btn'); + const chooseStreamBtnEl = document.getElementById('choose-stream-btn'); + const singleSynthesisEl = document.getElementById('single-synthesis'); + const streamSynthesisEl = document.getElementById('stream-synthesis'); + + chooseSingleBtnEl.addEventListener('click', () => { + chooseBtnsEl.style.display = 'none'; + singleSynthesisEl.style.display = 'block'; + }); + + chooseStreamBtnEl.addEventListener('click', () => { + chooseBtnsEl.style.display = 'none'; + streamSynthesisEl.style.display = 'block'; + }); + const textToSynthesizeEl = document.getElementById('text-to-synthesize'); + const textToSynthesizeNumCharsEl = document.getElementById('text-to-synthesize-num-chars'); const textToSynthesizeErrorEl = document.getElementById('text-to-synthesize-error'); const speechRateSliderEl = document.getElementById('speech-rate'); + const speechRateDisplayEl = document.getElementById('speech-rate-display'); const synthesizeBtnEl = document.getElementById('synthesize-btn'); const controlBtnEl = document.getElementById('control-btn'); const downloadBtnEl = document.getElementById('download-btn'); - const speechRateDisplayEl = document.getElementById('speech-rate-display'); + const alignmentsTableEl = document.getElementById('alignments-table'); - function onSynthesizeParamChange() { - if (orca !== null && isPlaying === false) { - synthesizeBtnEl.disabled = false; - controlBtnEl.disabled = true; - downloadBtnEl.disabled = true; - controlBtnEl.innerText = 'Play'; - } - } + const streamTextToSynthesizeEl = document.getElementById('stream-text-to-synthesize'); + const streamTextDisplayEl = document.getElementById('stream-text-display'); + const streamSecondsDisplayEl = document.getElementById('stream-seconds-display'); + const streamTextToSynthesizeErrorEl = document.getElementById('stream-text-to-synthesize-error'); + const streamSpeechRateSliderEl = document.getElementById('stream-speech-rate'); + const streamSpeechRateDisplayEl = document.getElementById('stream-speech-rate-display'); + const streamOpenBtnEl = document.getElementById('stream-open-btn'); + const streamPlayBtnEl = document.getElementById('stream-play-btn'); + const streamCloseBtnEl = document.getElementById('stream-close-btn'); function validateInput(input, validChars) { let nonAllowedCharacters = []; @@ -66,32 +85,130 @@ if (nonAllowedCharacters.length > 0) { textToSynthesizeErrorEl.innerText = `Error: Characters ${JSON.stringify(nonAllowedCharacters)} are not allowed.`; + streamTextToSynthesizeErrorEl.innerText = `Characters ${JSON.stringify(nonAllowedCharacters)} will be ignored.`; synthesizeBtnEl.disabled = true; } else { - textToSynthesizeErrorEl.innerHTML = ' '; + const text = ' '; + textToSynthesizeErrorEl.innerHTML = text; + streamTextToSynthesizeErrorEl.innerHTML = text; synthesizeBtnEl.disabled = false; } } + // Single Synthesis + let isPlaying = false; + let originalAudioSource; + + 
textToSynthesizeEl.addEventListener('input', (e) => { + textToSynthesizeNumCharsEl.innerText = e.target.value.trim().length.toString(); + }); + + function onSynthesizeParamChange() { + if (orca !== null && isPlaying === false) { + synthesizeBtnEl.disabled = false; + controlBtnEl.disabled = true; + downloadBtnEl.disabled = true; + controlBtnEl.innerText = 'Play'; + } + } + + function setAlignmentsTable(alignments) { + if (alignments === null) { + alignmentsTableEl.style.display = 'none'; + return; + } + + alignmentsTableEl.style.display = 'block'; + const rowCount = alignmentsTableEl.rows.length; + for (let i = 1; i < rowCount; i++) { + alignmentsTableEl.deleteRow(1); + } + + alignments.forEach((a) => { + const row = alignmentsTableEl.insertRow(-1); + row.style.verticalAlign = 'top'; + const word = row.insertCell(0); + const start = row.insertCell(1); + const end = row.insertCell(2); + const phonemes = row.insertCell(3); + + word.innerHTML = `${a.word}`; + start.innerHTML = `${a.startSec.toFixed(3)}`; + end.innerHTML = `${a.endSec.toFixed(3)}`; + const phonemesInnerHTML = a.phonemes.map(p => { + return ` + ${p.phoneme} + [${p.startSec.toFixed(3)} - ${p.endSec.toFixed(3)}s] + `; + }).join(''); + phonemes.innerHTML = ` + + + + + + ${phonemesInnerHTML} +
+ `; + }); + } + + async function synthesize() { + const text = textToSynthesizeEl.value.trim(); + if (text === '') return; + + writeMessage('Synthesizing. Please wait...'); + try { + textToSynthesizeEl.disabled = true; + speechRateSliderEl.disabled = true; + synthesizeBtnEl.disabled = true; + controlBtnEl.disabled = true; + downloadBtnEl.disabled = true; + + const result = await orca.synthesize( + text, + { speechRate: speechRateSliderEl.value }, + ); + + pcm = result.pcm; + setAlignmentsTable(result.alignments); + writeMessage('Synthesizing complete!'); + + controlBtnEl.disabled = false; + downloadBtnEl.disabled = false; + } catch (err) { + writeMessage(err); + } finally { + textToSynthesizeEl.disabled = false; + speechRateSliderEl.disabled = false; + } + } + + function onAudioStop() { + isPlaying = false; + controlBtnEl.innerText = 'Play'; + textToSynthesizeEl.disabled = false; + speechRateSliderEl.disabled = false; + synthesizeBtnEl.disabled = false; + } + textToSynthesizeEl.addEventListener('input', (e) => { onSynthesizeParamChange(); if (orca !== null) { validateInput(e.target.value, orca.validCharacters); } }); + speechRateSliderEl.addEventListener('change', () => { onSynthesizeParamChange(); speechRateDisplayEl.innerText = speechRateSliderEl.value; }); - function onAudioStop() { - isPlaying = false; - controlBtnEl.innerText = 'Play'; - textToSynthesizeEl.disabled = false; - speechRateSliderEl.disabled = false; - } + synthesizeBtnEl.addEventListener('click', async () => await synthesize()); controlBtnEl.addEventListener('click', () => { + if (pcm === null) return; + if (!isPlaying) { originalAudioSource = audioContext.createBufferSource(); originalAudioSource.addEventListener('ended', onAudioStop); @@ -109,56 +226,164 @@ onAudioStop(); } }); + + // Streaming Synthesis + let isPlayingStream = false; + const audioBuffer = []; + let streamSource; + + async function playStream() { + if (isPlayingStream) return; + + if (audioBuffer.length === 0) { + streamPlayBtnEl.disabled = false; + streamCloseBtnEl.disabled = false; + return; + } else { + streamPlayBtnEl.disabled = true; + streamCloseBtnEl.disabled = true; + } + + streamSource = audioContext.createBufferSource(); + + streamSource.buffer = audioBuffer.shift(); + streamSource.connect(originalAudioGain); + + streamSource.onended = async () => { + isPlayingStream = false; + await playStream(); + }; + + streamSource.start(); + isPlayingStream = true; + } + + async function streamOpen() { + writeMessage('Opening stream. Please wait...'); + try { + streamTextToSynthesizeEl.disabled = true; + streamSpeechRateSliderEl.disabled = true; + streamOpenBtnEl.disabled = true; + + orcaStream = await orca.streamOpen({ + speechRate: streamSpeechRateSliderEl.value, + }); + + streamTextToSynthesizeEl.disabled = false; + streamCloseBtnEl.disabled = false; + streamPlayBtnEl.disabled = false; + + writeMessage('Stream opened. Type in the input field!'); + } catch (err) { + writeMessage(err); + } + } + + async function streamPlay() { + writeMessage('Synthesizing and playing speech! 
Please listen for audio.'); + try { + streamTextDisplayEl.innerText = ''; + streamSecondsDisplayEl.innerText = '0'; + + const text = streamTextToSynthesizeEl.value; + const words = text.split(' ').map(str => `${str} `); + let numIterations = 0; + + for (const word of words) { + streamTextDisplayEl.innerText += word; + const wordPcm = await orcaStream.synthesize(word); + if (wordPcm !== null) { + const curSecs = parseFloat(streamSecondsDisplayEl.innerText); + const newSecs = wordPcm.length / orca.sampleRate; + const time = curSecs + newSecs; + streamSecondsDisplayEl.innerText = time.toFixed(3); + audioBuffer.push(createBuffer(wordPcm)); + if (numIterations === 1) { + await playStream(); + } + numIterations++; + } + await new Promise(r => setTimeout(r, 100)); + } + + const flushPcm = await orcaStream.flush(); + if (flushPcm !== null) { + const curSecs = parseFloat(streamSecondsDisplayEl.innerText); + const newSecs = flushPcm.length / orca.sampleRate; + const time = curSecs + newSecs; + streamSecondsDisplayEl.innerText = time.toFixed(3); + audioBuffer.push(createBuffer(flushPcm)); + await playStream(); + } + } catch (err) { + writeMessage(err); + } + } + + async function streamClose() { + writeMessage('Closing stream. Please wait...'); + try { + streamTextToSynthesizeEl.disabled = true; + if (streamSource) { + streamSource.stop(); + } + + await orcaStream.close(); + orcaStream = null; + + streamSpeechRateSliderEl.disabled = false; + streamOpenBtnEl.disabled = false; + streamPlayBtnEl.disabled = true; + streamCloseBtnEl.disabled = true; + streamTextToSynthesizeEl.value = ''; + writeMessage('Stream closed! Click "Open Stream" to begin.'); + } catch (err) { + writeMessage(err); + } + } + + streamOpenBtnEl.addEventListener('click', async () => await streamOpen()); + streamPlayBtnEl.addEventListener('click', async () => await streamPlay()); + streamCloseBtnEl.addEventListener('click', async () => await streamClose()); + + streamTextToSynthesizeEl.addEventListener('input', (e) => { + if (orca !== null) { + validateInput(e.target.value, orca.validCharacters); + } + }); + + streamSpeechRateSliderEl.addEventListener('change', () => { + streamSpeechRateDisplayEl.innerText = streamSpeechRateSliderEl.value; + }); }; async function startOrca(accessKey) { writeMessage('Orca is loading. Please wait...'); try { document.getElementById('start-orca').disabled = true; - document.getElementById('text-to-synthesize').disabled = true; orca = await OrcaWeb.OrcaWorker.create( accessKey, { base64: modelParams, forceWrite: true }, ); - document.getElementById('text-to-synthesize').disabled = false; - document.getElementById('speech-rate').disabled = false; + document.getElementById('choose-btns').style.display = 'block'; + const maxCharacterLimit = orca.maxCharacterLimit.toString(); + document.getElementById('max-char-limit').innerText = maxCharacterLimit; + document.getElementById('text-to-synthesize').maxLength = maxCharacterLimit; + document.getElementById('stream-text-to-synthesize').maxLength = maxCharacterLimit; writeMessage('Orca worker ready!'); } catch (err) { writeMessage(err); } } - async function synthesize() { - writeMessage('Synthesizing. 
Please wait...'); - try { - document.getElementById('text-to-synthesize').disabled = true; - document.getElementById('speech-rate').disabled = true; - document.getElementById('synthesize-btn').disabled = true; - document.getElementById('control-btn').disabled = true; - document.getElementById('download-btn').disabled = true; - const text = document.getElementById('text-to-synthesize').value; - const speechRate = document.getElementById('speech-rate').value; - pcm = await orca.synthesize(text, { speechRate }); - writeMessage('Synthesizing complete!'); - document.getElementById('control-btn').disabled = false; - document.getElementById('download-btn').disabled = false; - } catch (err) { - writeMessage(err); - } finally { - document.getElementById('text-to-synthesize').disabled = false; - document.getElementById('speech-rate').disabled = false; - } - } - function downloadDumpAudio() { let blob = new Blob([pcm]); let a = document.createElement('a'); a.download = 'orca_speech_audio.pcm'; a.href = window.URL.createObjectURL(blob); a.click(); - document.removeChild(a); } @@ -185,31 +410,93 @@

[The remaining hunk of the `demo/web/index.html` diff — the page body markup — is garbled here (HTML tags were lost in extraction). Recoverable content: the page title "Orca Web Demo"; a start button wired to `startOrca(document.getElementById('accessKey').value)`; removal of the old single text-box, speech-rate, synthesize/play/download controls; and new markup for the "Single Synthesis" and "Streaming Synthesis" panels, including a character counter ("0/" followed by the `max-char-limit` value), speech-rate sliders, stream open/play/close buttons, and the word/phoneme alignments table populated by the script above.]
diff --git a/demo/web/package.json b/demo/web/package.json index 76f0cfd9..78931cc8 100644 --- a/demo/web/package.json +++ b/demo/web/package.json @@ -1,6 +1,6 @@ { "name": "orca-web-demo", - "version": "0.1.1", + "version": "0.2.0", "description": "A basic demo to show how to use Orca for web browsers, using the IIFE version of the library", "main": "index.js", "private": true, @@ -18,7 +18,7 @@ "author": "Picovoice Inc", "license": "Apache-2.0", "dependencies": { - "@picovoice/orca-web": "~0.1.1" + "@picovoice/orca-web": "~0.2.0" }, "devDependencies": { "http-server": "^14.0.0" diff --git a/demo/web/yarn.lock b/demo/web/yarn.lock index 240377cc..718e4442 100644 --- a/demo/web/yarn.lock +++ b/demo/web/yarn.lock @@ -2,10 +2,10 @@ # yarn lockfile v1 -"@picovoice/orca-web@~0.1.1": - version "0.1.1" - resolved "https://registry.yarnpkg.com/@picovoice/orca-web/-/orca-web-0.1.1.tgz#28915f34d470714ecbfdec544d8d908afa8b7e5f" - integrity sha512-/3Q7lVfN8Pb1LmRlbZApvyeo9sq5iaYsaeExXWNp276pfqXi1NbUxf1d0b1707+i9qZfdUpAkXmvGrbrD4zkqg== +"@picovoice/orca-web@~0.2.0": + version "0.2.0" + resolved "https://registry.yarnpkg.com/@picovoice/orca-web/-/orca-web-0.2.0.tgz#b5567dcfe132333fd7f5f8cf2361a06c95d69eb5" + integrity sha512-mIgqAxrxUf84N7SgTRhaZ/qxltmofx5TgKfRslGHQzd391DhCrQE598YCQWZ8lsZIzzevGDce96LGeJH6vbfww== dependencies: "@picovoice/web-utils" "=1.3.4" diff --git a/include/pv_orca.h b/include/pv_orca.h index 44351ac5..f7f7d9d5 100644 --- a/include/pv_orca.h +++ b/include/pv_orca.h @@ -12,6 +12,7 @@ #ifndef PV_ORCA_H #define PV_ORCA_H +#include #include #include "picovoice.h" @@ -24,6 +25,15 @@ extern "C" { /** * Forward declaration for Orca text-to-speech engine. Orca converts text to spoken audio without network latency. + * It has two modes of operation. + * 1) Single synthesis: converts a given text to audio. Function `pv_orca_synthesize()` returns the raw audio data, + * function `pv_orca_synthesize_to_file()` saves the audio to a file. + * 2) Streaming synthesis: Converts a stream of text to a stream of audio. An OrcaStream object can be opened with + * `pv_orca_stream_open()` and text can be added with `pv_orca_stream_synthesize()`. The audio is + * generated in chunks whenever enough text has been buffered. When the text stream is finalized, + * the caller needs to use `pv_orca_stream_flush()` to generate the audio for the remaining text that has + * not been synthesized. The stream can be closed with `pv_orca_stream_close()`. + * Single synthesis functions cannot be called while a stream is open. */ typedef struct pv_orca pv_orca_t; @@ -33,15 +43,9 @@ typedef struct pv_orca pv_orca_t; * @param access_key AccessKey obtained from Picovoice Console (https://console.picovoice.ai/) * @param model_path Absolute path to the file containing Orca's model parameters. * @param[out] object Constructed instance of Orca. - * @return A status code indicating the result of the initialization. Possible values include: - * - `PV_STATUS_OUT_OF_MEMORY`: Memory allocation failure. - * - `PV_STATUS_IO_ERROR`: Input/output error. - * - `PV_STATUS_INVALID_ARGUMENT`: Invalid input argument. - * - `PV_STATUS_RUNTIME_ERROR`: Error during runtime. - * - `PV_STATUS_ACTIVATION_ERROR`: Activation-related error. - * - `PV_STATUS_ACTIVATION_LIMIT_REACHED`: Activation limit reached. - * - `PV_STATUS_ACTIVATION_THROTTLED`: Activation throttled. - * - `PV_STATUS_ACTIVATION_REFUSED`: Activation refused. + * @return Status code. 
Returns `PV_STATUS_OUT_OF_MEMORY`, `PV_STATUS_IO_ERROR`, `PV_STATUS_INVALID_ARGUMENT`, + * `PV_STATUS_RUNTIME_ERROR`, `PV_STATUS_ACTIVATION_ERROR`, `PV_STATUS_ACTIVATION_LIMIT_REACHED`, + * `PV_STATUS_ACTIVATION_THROTTLED`, or `PV_STATUS_ACTIVATION_REFUSED` on failure. */ PV_API pv_status_t pv_orca_init( const char *access_key, @@ -56,24 +60,24 @@ PV_API pv_status_t pv_orca_init( PV_API void pv_orca_delete(pv_orca_t *object); /** - * Gets an array of characters that are accepted as input to Orca synthesize functions. + * Returns an array of characters that are accepted as input to Orca synthesize functions. * * @param object Constructed instance of Orca. * @param[out] num_characters Number of valid characters. * @param[out] characters An array of valid characters for Orca. - * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT` on failure. + * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT` or `PV_STATUS_OUT_OF_MEMORY` on failure. */ PV_API pv_status_t pv_orca_valid_characters( const pv_orca_t *object, int32_t *num_characters, - const char *const **characters); + const char ***characters); /** * Deletes the characters previously created by `pv_orca_valid_characters()`. * - * @param characters The characters returned from `pv_orca_valid_characters()`. + * @param characters The characters returned from `pv_orca_valid_characters()`. */ -PV_API void pv_orca_valid_characters_delete(const char *const *characters); +PV_API void pv_orca_valid_characters_delete(const char **characters); /** * Gets the sampling rate of the audio produced by Orca. @@ -87,21 +91,23 @@ PV_API pv_status_t pv_orca_sample_rate(const pv_orca_t *object, int32_t *sample_ /** * Gets the maximum number of characters that can be synthesized at once. * - * @return Maximum character limit + * @param object Constructed instance of Orca. + * @param[out] max_character_limit Maximum number of characters that can be synthesized at once. + * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT` on failure. */ -PV_API int32_t pv_orca_max_character_limit(void); +PV_API pv_status_t pv_orca_max_character_limit(const pv_orca_t *object, int32_t *max_character_limit); /** * Forward declaration for pv_orca_synthesize_params object. This object can be passed to Orca synthesize functions to - * control the synthesized audio. An instance can be created with `pv_orca_synthesize_params_init` and deleted with - * `pv_orca_synthesize_params_delete`. The object's properties can be set with `pv_orca_synthesize_params_set_*` - * and returned with `pv_orca_synthesize_params_get_*`. + * control the synthesized audio. An instance can be created with `pv_orca_synthesize_params_init()` and deleted with + * `pv_orca_synthesize_params_delete()`. The object's properties can be set with `pv_orca_synthesize_params_set_*()` + * and returned with `pv_orca_synthesize_params_get_*()`. */ typedef struct pv_orca_synthesize_params pv_orca_synthesize_params_t; /** * Constructor for the pv_orca_synthesize_params object. - * + * * @param[out] object Constructed instance of pv_orca_synthesize_params. + * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT` or `PV_STATUS_OUT_OF_MEMORY` on failure. */ @@ -109,14 +115,14 @@ PV_API pv_status_t pv_orca_synthesize_params_init(pv_orca_synthesize_params_t ** /** * Destructor for the pv_orca_synthesize_params object. - * + * * @param object The pv_orca_synthesize_params object. */ PV_API void pv_orca_synthesize_params_delete(pv_orca_synthesize_params_t *object); /** * Setter for the speech rate.
- * + * * @param object Constructed instance of pv_orca_synthesize_params. * @param speech_rate The pace of the speech. Valid values are within [0.7, 1.3]. * @return Returns `PV_STATUS_INVALID_ARGUMENT` on failure. @@ -127,7 +133,7 @@ PV_API pv_status_t pv_orca_synthesize_params_set_speech_rate( /** * Getter for the speech rate. - * + * * @param object Constructed instance of pv_orca_synthesize_params. * @param[out] speech_rate The pace of the speech. * @return Returns `PV_STATUS_INVALID_ARGUMENT` on failure. @@ -136,56 +142,195 @@ PV_API pv_status_t pv_orca_synthesize_params_get_speech_rate( const pv_orca_synthesize_params_t *object, float *speech_rate); +/** + * Setter for the random state used in synthesize functions. + * + * @param object Constructed instance of pv_orca_synthesize_params. + * @param random_state The random state used in synthesize functions. + * @return Returns `PV_STATUS_INVALID_ARGUMENT` on failure. + */ +PV_API pv_status_t pv_orca_synthesize_params_set_random_state( + pv_orca_synthesize_params_t *object, + int64_t random_state); + +/** + * Getter for random state used in synthesize functions. If no state has been set via + * `pv_orca_synthesize_params_set_random_state()`, the default value of the state is -1, which means a + * random state is used in the synthesize functions. + * + * @param object Constructed instance of pv_orca_synthesize_params. + * @param[out] random_state The random state used in synthesize functions. + * @return Returns `PV_STATUS_INVALID_ARGUMENT` on failure. + */ +PV_API pv_status_t pv_orca_synthesize_params_get_random_state( + const pv_orca_synthesize_params_t *object, + int64_t *random_state); + +/** + * A synthesized phoneme and its associated metadata. + */ +typedef struct { + char *phoneme; /** Synthesized phoneme. */ + float start_sec; /** Start of phoneme in seconds. */ + float end_sec; /** End of phoneme in seconds. */ +} pv_orca_phoneme_alignment_t; + +/** + * A synthesized word and its associated metadata. + */ +typedef struct { + char *word; /** Synthesized word. */ + float start_sec; /** Start of word in seconds. */ + float end_sec; /** End of word in seconds. */ + + int32_t num_phonemes; /** Number of phonemes in the word. */ + pv_orca_phoneme_alignment_t **phonemes; /** Array of phonemes in the word. */ +} pv_orca_word_alignment_t; + /** * Generates audio from text. The returned audio contains the speech representation of the text. - * The memory of the returned audio is allocated by Orca and can be deleted with `pv_orca_delete_pcm` + * This function returns `PV_STATUS_INVALID_STATE` if an OrcaStream object is open. + * The memory of the returned audio and the alignment metadata is allocated by Orca and can be deleted with + * `pv_orca_pcm_delete()` and `pv_orca_word_alignments_delete()`, respectively. * * @param object The Orca object. * @param text Text to be converted to audio. The maximum length can be attained by calling * `pv_orca_max_character_limit()`. Allowed characters can be retrieved by calling `pv_orca_valid_characters()`. * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. * The pronunciation is expressed in ARPAbet format, e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`. - * @param synthesize_params Global parameters for synthesized text. See 'pv_orca_synthesize_text_params_t' for details. + * @param synthesize_params Global parameters for synthesized text. See 'pv_orca_synthesize_params_t' for details. * @param[out] num_samples The length of the pcm. 
* @param[out] pcm The output audio. + * @param[out] num_alignments Number of returned alignments. + * @param[out] alignments Alignments of synthesized words, phonemes, and their associated metadata. * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT` or `PV_STATUS_OUT_OF_MEMORY`, * `PV_STATUS_RUNTIME_ERROR`, `PV_STATUS_ACTIVATION_ERROR`, `PV_STATUS_ACTIVATION_LIMIT_REACHED`, * `PV_STATUS_ACTIVATION_THROTTLED`, or `PV_STATUS_ACTIVATION_REFUSED` on failure. + * Returns `PV_STATUS_INVALID_STATE` if an OrcaStream object is open. */ PV_API pv_status_t pv_orca_synthesize( const pv_orca_t *object, const char *text, const pv_orca_synthesize_params_t *synthesize_params, int32_t *num_samples, - int16_t **pcm); + int16_t **pcm, + int32_t *num_alignments, + pv_orca_word_alignment_t ***alignments); /** * Generates audio from text and saves it to a file. The file contains the speech representation of the text. + * This function returns `PV_STATUS_INVALID_STATE` if an OrcaStream object is open. + * The memory of the returned alignment metadata is allocated by Orca and can be deleted with + * `pv_orca_word_alignments_delete()`. * * @param object The Orca object. * @param text Text to be converted to audio. The maximum length can be attained by calling * `pv_orca_max_character_limit()`. Allowed characters can be retrieved by calling `pv_orca_valid_characters()`. * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. * The pronunciation is expressed in ARPAbet format, e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`. - * @param synthesize_params Global parameters for synthesized text. See 'pv_orca_synthesize_text_params_t()' for details. + * @param synthesize_params Global parameters for synthesized text. See 'pv_orca_synthesize_params_t' for details. * @param output_path Absolute path to the output audio file. The output file is saved as `WAV (.wav)` * and consists of a single mono channel. + * @param[out] num_alignments Number of returned alignments. + * @param[out] alignments Alignments of synthesized words, phonemes, and their associated metadata. * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT` or `PV_STATUS_OUT_OF_MEMORY`, * `PV_STATUS_RUNTIME_ERROR`, `PV_STATUS_ACTIVATION_ERROR`, `PV_STATUS_ACTIVATION_LIMIT_REACHED`, * `PV_STATUS_ACTIVATION_THROTTLED`, or `PV_STATUS_ACTIVATION_REFUSED` on failure. + * Returns `PV_STATUS_INVALID_STATE` if an OrcaStream object is open. */ PV_API pv_status_t pv_orca_synthesize_to_file( const pv_orca_t *object, const char *text, const pv_orca_synthesize_params_t *synthesize_params, - const char *output_path); + const char *output_path, + int32_t *num_alignments, + pv_orca_word_alignment_t ***alignments); + +/** + * Forward declaration for OrcaStream object for converting a text stream into a spoken audio stream. + */ +typedef struct pv_orca_stream pv_orca_stream_t; + +/** + * Opens a new OrcaStream object. + * + * @param object The Orca object. + * @param synthesize_params Global parameters for synthesized text. See 'pv_orca_synthesize_params_t' for details. + * @param[out] stream The OrcaStream object. + * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT` or `PV_STATUS_OUT_OF_MEMORY` on failure. + */ +PV_API pv_status_t pv_orca_stream_open( + pv_orca_t *object, + const pv_orca_synthesize_params_t *synthesize_params, + pv_orca_stream_t **stream); + +/** + * Adds a chunk of text to the OrcaStream object and generates audio if enough text has been added. 
+ * This function is expected to be called multiple times with consecutive chunks of text from a text stream. + * The incoming text is buffered as it arrives until there is enough context to convert a chunk of the buffered + * text into audio. The caller needs to use `pv_orca_stream_flush()` to generate the audio chunk for the remaining + * text that has not yet been synthesized. + * The caller is responsible for deleting the generated audio with `pv_orca_pcm_delete()`. + * + * @param object The OrcaStream object. + * @param text A chunk of text from a text input stream, comprised of valid characters. + * Valid characters can be retrieved by calling `pv_orca_valid_characters()`. + * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. They need to be + * added in a single call to this function. The pronunciation is expressed in ARPAbet format, + * e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`. + * @param[out] num_samples The length of the pcm produced, `0` if no audio chunk has been produced. + * @param[out] pcm The output audio chunk, `NULL` if no audio chunk has been produced. + * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT`, `PV_STATUS_OUT_OF_MEMORY`, + * `PV_STATUS_RUNTIME_ERROR`, `PV_STATUS_ACTIVATION_ERROR`, `PV_STATUS_ACTIVATION_LIMIT_REACHED`, + * `PV_STATUS_ACTIVATION_THROTTLED`, `PV_STATUS_ACTIVATION_REFUSED`, or `PV_STATUS_INVALID_STATE` on failure. + */ +PV_API pv_status_t pv_orca_stream_synthesize( + pv_orca_stream_t *object, + const char *text, + int32_t *num_samples, + int16_t **pcm); + +/** + * Generates audio for all of the buffered text that was added to the OrcaStream object + * via `pv_orca_stream_synthesize()`. + * The caller is responsible for deleting the generated audio with `pv_orca_pcm_delete()`. + * + * @param object The OrcaStream object. + * @param[out] num_samples The length of the pcm, `0` if no audio chunk has been produced. + * @param[out] pcm The output audio, `NULL` if no audio chunk has been produced. + * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT`, `PV_STATUS_OUT_OF_MEMORY`, + * `PV_STATUS_RUNTIME_ERROR`, `PV_STATUS_ACTIVATION_ERROR`, `PV_STATUS_ACTIVATION_LIMIT_REACHED`, + * `PV_STATUS_ACTIVATION_THROTTLED`, `PV_STATUS_ACTIVATION_REFUSED`, or `PV_STATUS_INVALID_STATE` on failure. + */ +PV_API pv_status_t pv_orca_stream_flush( + pv_orca_stream_t *object, + int32_t *num_samples, + int16_t **pcm); + +/** + * Deletes the OrcaStream object. + * + * @param object The OrcaStream object. + */ +PV_API void pv_orca_stream_close(pv_orca_stream_t *object); /** * Deletes the audio previously generated by the Orca synthesize functions. * - * @param object The pcm generated by orca synthesize functions. + * @param object The pcm generated by Orca synthesize functions. + */ +PV_API void pv_orca_pcm_delete(int16_t *pcm); + +/** + * Deletes word alignments returned from Orca synthesize functions. + * + * @param num_alignments Number of alignments. + * @param alignments Alignments returned from Orca synthesize functions. + * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT` on failure. */ -PV_API void pv_orca_delete_pcm(int16_t *pcm); +PV_API pv_status_t pv_orca_word_alignments_delete( + int32_t num_alignments, + pv_orca_word_alignment_t **alignments); /** * Getter for version. 
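Taken together, the single-synthesis half of this header follows an init, synthesize, delete lifecycle. The following is a minimal sketch of that flow, not part of the diff itself: the AccessKey and model path are placeholders, and error handling is reduced to status checks.

#include <stdio.h>
#include <stdlib.h>

#include "pv_orca.h"

int main(void) {
    pv_orca_t *orca = NULL;
    /* "${ACCESS_KEY}" and "${MODEL_PATH}" are placeholders, not real values. */
    pv_status_t status = pv_orca_init("${ACCESS_KEY}", "${MODEL_PATH}", &orca);
    if (status != PV_STATUS_SUCCESS) {
        return EXIT_FAILURE;
    }

    pv_orca_synthesize_params_t *params = NULL;
    status = pv_orca_synthesize_params_init(&params);
    if (status != PV_STATUS_SUCCESS) {
        pv_orca_delete(orca);
        return EXIT_FAILURE;
    }

    int32_t num_samples = 0;
    int16_t *pcm = NULL;
    int32_t num_alignments = 0;
    pv_orca_word_alignment_t **alignments = NULL;
    status = pv_orca_synthesize(
            orca,
            "Hello from Orca!",
            params,
            &num_samples,
            &pcm,
            &num_alignments,
            &alignments);
    if (status == PV_STATUS_SUCCESS) {
        printf("generated %d samples\n", (int) num_samples);
        /* Audio and alignment metadata are allocated by Orca; release them explicitly. */
        pv_orca_pcm_delete(pcm);
        pv_orca_word_alignments_delete(num_alignments, alignments);
    }

    pv_orca_synthesize_params_delete(params);
    pv_orca_delete(orca);
    return (status == PV_STATUS_SUCCESS) ? EXIT_SUCCESS : EXIT_FAILURE;
}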
diff --git a/lib/android/arm64-v8a/libpv_orca.so b/lib/android/arm64-v8a/libpv_orca.so
index ccf3dade..4f673db0 100755
Binary files a/lib/android/arm64-v8a/libpv_orca.so and b/lib/android/arm64-v8a/libpv_orca.so differ
diff --git a/lib/android/armeabi-v7a/libpv_orca.so b/lib/android/armeabi-v7a/libpv_orca.so
index 23d224e4..1fcea8d8 100755
Binary files a/lib/android/armeabi-v7a/libpv_orca.so and b/lib/android/armeabi-v7a/libpv_orca.so differ
diff --git a/lib/android/x86/libpv_orca.so b/lib/android/x86/libpv_orca.so
index 29721586..5a3bd5ed 100755
Binary files a/lib/android/x86/libpv_orca.so and b/lib/android/x86/libpv_orca.so differ
diff --git a/lib/android/x86_64/libpv_orca.so b/lib/android/x86_64/libpv_orca.so
index a466696b..f3af786a 100755
Binary files a/lib/android/x86_64/libpv_orca.so and b/lib/android/x86_64/libpv_orca.so differ
diff --git a/lib/common/orca_params_female.pv b/lib/common/orca_params_female.pv
index c6a6fd44..674f9f54 100644
Binary files a/lib/common/orca_params_female.pv and b/lib/common/orca_params_female.pv differ
diff --git a/lib/common/orca_params_male.pv b/lib/common/orca_params_male.pv
index 0a0f0eaf..8262b8f8 100644
Binary files a/lib/common/orca_params_male.pv and b/lib/common/orca_params_male.pv differ
diff --git a/lib/ios/PvOrca.xcframework/Info.plist b/lib/ios/PvOrca.xcframework/Info.plist
index ef4eadaa..0ee9469b 100644
--- a/lib/ios/PvOrca.xcframework/Info.plist
+++ b/lib/ios/PvOrca.xcframework/Info.plist
@@ -6,30 +6,30 @@
 		<dict>
 			<key>LibraryIdentifier</key>
-			<string>ios-arm64_x86_64-simulator</string>
+			<string>ios-arm64</string>
 			<key>LibraryPath</key>
 			<string>PvOrca.framework</string>
 			<key>SupportedArchitectures</key>
 			<array>
 				<string>arm64</string>
-				<string>x86_64</string>
 			</array>
 			<key>SupportedPlatform</key>
 			<string>ios</string>
-			<key>SupportedPlatformVariant</key>
-			<string>simulator</string>
 		</dict>
 		<dict>
 			<key>LibraryIdentifier</key>
-			<string>ios-arm64</string>
+			<string>ios-arm64_x86_64-simulator</string>
 			<key>LibraryPath</key>
 			<string>PvOrca.framework</string>
 			<key>SupportedArchitectures</key>
 			<array>
 				<string>arm64</string>
+				<string>x86_64</string>
 			</array>
 			<key>SupportedPlatform</key>
 			<string>ios</string>
+			<key>SupportedPlatformVariant</key>
+			<string>simulator</string>
 		</dict>
 	</array>
 	<key>CFBundlePackageType</key>
diff --git a/lib/ios/PvOrca.xcframework/ios-arm64/PvOrca.framework/Headers/picovoice.h b/lib/ios/PvOrca.xcframework/ios-arm64/PvOrca.framework/Headers/picovoice.h
index a89365dc..d2388709 100644
--- a/lib/ios/PvOrca.xcframework/ios-arm64/PvOrca.framework/Headers/picovoice.h
+++ b/lib/ios/PvOrca.xcframework/ios-arm64/PvOrca.framework/Headers/picovoice.h
@@ -1,5 +1,5 @@
 /*
-    Copyright 2024 Picovoice Inc.
+    Copyright 2018-2023 Picovoice Inc.

    You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE"
    file accompanying this source.
diff --git a/lib/ios/PvOrca.xcframework/ios-arm64/PvOrca.framework/Headers/pv_orca.h b/lib/ios/PvOrca.xcframework/ios-arm64/PvOrca.framework/Headers/pv_orca.h
index ef582786..75e429ff 100644
--- a/lib/ios/PvOrca.xcframework/ios-arm64/PvOrca.framework/Headers/pv_orca.h
+++ b/lib/ios/PvOrca.xcframework/ios-arm64/PvOrca.framework/Headers/pv_orca.h
@@ -12,6 +12,7 @@
 #ifndef PV_ORCA_H
 #define PV_ORCA_H

+#include <stdbool.h>
 #include <stdint.h>

 #include "picovoice.h"
@@ -24,6 +25,15 @@ extern "C" {

 /**
 * Forward declaration for Orca text-to-speech engine. Orca converts text to spoken audio without network latency.
+ * It has two modes of operation.
+ * 1) Single synthesis: converts a given text to audio. Function `pv_orca_synthesize()` returns the raw audio data,
+ * function `pv_orca_synthesize_to_file()` saves the audio to a file.
+ * 2) Streaming synthesis: converts a stream of text to a stream of audio. An OrcaStream object can be opened with
+ * `pv_orca_stream_open()` and text can be added with `pv_orca_stream_synthesize()`. The audio is
+ * generated in chunks whenever enough text has been buffered. When the text stream is finalized,
+ * the caller needs to use `pv_orca_stream_flush()` to generate the audio for the remaining text that has
+ * not been synthesized. The stream can be closed with `pv_orca_stream_close()`.
+ * Single synthesis functions cannot be called while a stream is open.
 */
 typedef struct pv_orca pv_orca_t;
@@ -33,15 +43,9 @@ typedef struct pv_orca pv_orca_t;
 * @param access_key AccessKey obtained from Picovoice Console (https://console.picovoice.ai/)
 * @param model_path Absolute path to the file containing Orca's model parameters.
 * @param[out] object Constructed instance of Orca.
- * @return A status code indicating the result of the initialization. Possible values include:
- * - `PV_STATUS_OUT_OF_MEMORY`: Memory allocation failure.
- * - `PV_STATUS_IO_ERROR`: Input/output error.
- * - `PV_STATUS_INVALID_ARGUMENT`: Invalid input argument.
- * - `PV_STATUS_RUNTIME_ERROR`: Error during runtime.
- * - `PV_STATUS_ACTIVATION_ERROR`: Activation-related error.
- * - `PV_STATUS_ACTIVATION_LIMIT_REACHED`: Activation limit reached.
- * - `PV_STATUS_ACTIVATION_THROTTLED`: Activation throttled.
- * - `PV_STATUS_ACTIVATION_REFUSED`: Activation refused.
+ * @return Status code. Returns `PV_STATUS_OUT_OF_MEMORY`, `PV_STATUS_IO_ERROR`, `PV_STATUS_INVALID_ARGUMENT`,
+ * `PV_STATUS_RUNTIME_ERROR`, `PV_STATUS_ACTIVATION_ERROR`, `PV_STATUS_ACTIVATION_LIMIT_REACHED`,
+ * `PV_STATUS_ACTIVATION_THROTTLED`, or `PV_STATUS_ACTIVATION_REFUSED` on failure.
 */
 PV_API pv_status_t pv_orca_init(
         const char *access_key,
@@ -56,24 +60,24 @@ PV_API pv_status_t pv_orca_init(
 PV_API void pv_orca_delete(pv_orca_t *object);

 /**
- * Gets an array of characters that are accepted as input to Orca synthesize functions.
+ * Returns an array of characters that are accepted as input to Orca synthesize functions.
 *
 * @param object Constructed instance of Orca.
 * @param[out] num_characters Number of valid characters.
 * @param[out] characters An array of valid characters for Orca.
- * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT` on failure.
+ * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT` or `PV_STATUS_OUT_OF_MEMORY` on failure.
 */
 PV_API pv_status_t pv_orca_valid_characters(
         const pv_orca_t *object,
         int32_t *num_characters,
-        const char *const **characters);
+        const char ***characters);

 /**
 * Deletes the characters previously created by `pv_orca_valid_characters()`.
 *
- * @param characters The characters returned from `pv_orca_valid_characters()`.
+ * @param characters The characters returned from `pv_orca_valid_characters()`.
 */
-PV_API void pv_orca_valid_characters_delete(const char *const *characters);
+PV_API void pv_orca_valid_characters_delete(const char **characters);

 /**
 * Gets the sampling rate of the audio produced by Orca.
@@ -87,15 +91,17 @@ PV_API pv_status_t pv_orca_sample_rate(const pv_orca_t *object, int32_t *sample_
 /**
 * Gets the maximum number of characters that can be synthesized at once.
 *
- * @return Maximum character limit
+ * @param object Constructed instance of Orca.
+ * @param[out] max_character_limit Maximum number of characters that can be synthesized at once.
+ * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT` on failure.
 */
-PV_API int32_t pv_orca_max_character_limit(void);
+PV_API pv_status_t pv_orca_max_character_limit(const pv_orca_t *object, int32_t *max_character_limit);

 /**
 * Forward declaration for pv_orca_synthesize_params object. This object can be passed to Orca synthesize functions to
- * control the synthesized audio. An instance can be created with `pv_orca_synthesize_params_init` and deleted with
- * `pv_orca_synthesize_params_delete`. The object's properties can be set with `pv_orca_synthesize_params_set_*`
- * and returned with `pv_orca_synthesize_params_get_*`.
+ * control the synthesized audio. An instance can be created with `pv_orca_synthesize_params_init()` and deleted with
+ * `pv_orca_synthesize_params_delete()`. The object's properties can be set with `pv_orca_synthesize_params_set_*()`
+ * and returned with `pv_orca_synthesize_params_get_*()`.
 */
 typedef struct pv_orca_synthesize_params pv_orca_synthesize_params_t;
@@ -136,56 +142,192 @@ PV_API pv_status_t pv_orca_synthesize_params_get_speech_rate(
         const pv_orca_synthesize_params_t *object,
         float *speech_rate);

+/**
+ * Setter for the random state used in synthesize functions.
+ *
+ * @param object Constructed instance of pv_orca_synthesize_params.
+ * @param random_state The random state used in synthesize functions.
+ * @return Returns `PV_STATUS_INVALID_ARGUMENT` on failure.
+ */
+PV_API pv_status_t pv_orca_synthesize_params_set_random_state(
+        pv_orca_synthesize_params_t *object,
+        int64_t random_state);
+
+/**
+ * Getter for the random state used in synthesize functions. If no state has been set via
+ * `pv_orca_synthesize_params_set_random_state()`, the default value of the state is -1, which means a
+ * random state is used in the synthesize functions.
+ *
+ * @param object Constructed instance of pv_orca_synthesize_params.
+ * @param[out] random_state The random state used in synthesize functions.
+ * @return Returns `PV_STATUS_INVALID_ARGUMENT` on failure.
+ */
+PV_API pv_status_t pv_orca_synthesize_params_get_random_state(
+        const pv_orca_synthesize_params_t *object,
+        int64_t *random_state);
+
+/**
+ * A synthesized phoneme and its associated metadata.
+ */
+typedef struct {
+    char *phoneme;   /** Synthesized phoneme. */
+    float start_sec; /** Start of phoneme in seconds. */
+    float end_sec;   /** End of phoneme in seconds. */
+} pv_orca_phoneme_alignment_t;
+
+/**
+ * A synthesized word and its associated metadata.
+ */
+typedef struct {
+    char *word;      /** Synthesized word. */
+    float start_sec; /** Start of word in seconds. */
+    float end_sec;   /** End of word in seconds. */
+
+    int32_t num_phonemes;                   /** Number of phonemes in the word. */
+    pv_orca_phoneme_alignment_t **phonemes; /** Array of phonemes in the word. */
+} pv_orca_word_alignment_t;
+
 /**
 * Generates audio from text. The returned audio contains the speech representation of the text.
- * The memory of the returned audio is allocated by Orca and can be deleted with `pv_orca_delete_pcm`
+ * This function returns `PV_STATUS_INVALID_STATE` if an OrcaStream object is open.
+ * The memory of the returned audio is allocated by Orca and can be deleted with `pv_orca_pcm_delete()`.
 *
 * @param object The Orca object.
 * @param text Text to be converted to audio. The maximum length can be obtained by calling
 * `pv_orca_max_character_limit()`. Allowed characters can be retrieved by calling `pv_orca_valid_characters()`.
 * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`.
 * The pronunciation is expressed in ARPAbet format, e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`.
- * @param synthesize_params Global parameters for synthesized text. See 'pv_orca_synthesize_text_params_t' for details.
+ * @param synthesize_params Global parameters for synthesized text. See 'pv_orca_synthesize_params_t' for details.
 * @param[out] num_samples The length of the pcm.
 * @param[out] pcm The output audio.
+ * @param[out] num_alignments Number of returned alignments.
+ * @param[out] alignments Alignments of synthesized words, phonemes, and their associated metadata.
 * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT`, `PV_STATUS_OUT_OF_MEMORY`,
 * `PV_STATUS_RUNTIME_ERROR`, `PV_STATUS_ACTIVATION_ERROR`, `PV_STATUS_ACTIVATION_LIMIT_REACHED`,
 * `PV_STATUS_ACTIVATION_THROTTLED`, or `PV_STATUS_ACTIVATION_REFUSED` on failure.
+ * Returns `PV_STATUS_INVALID_STATE` if an OrcaStream object is open.
 */
 PV_API pv_status_t pv_orca_synthesize(
         const pv_orca_t *object,
         const char *text,
         const pv_orca_synthesize_params_t *synthesize_params,
         int32_t *num_samples,
-        int16_t **pcm);
+        int16_t **pcm,
+        int32_t *num_alignments,
+        pv_orca_word_alignment_t ***alignments);

 /**
 * Generates audio from text and saves it to a file. The file contains the speech representation of the text.
+ * This function returns `PV_STATUS_INVALID_STATE` if an OrcaStream object is open.
 *
 * @param object The Orca object.
 * @param text Text to be converted to audio. The maximum length can be obtained by calling
 * `pv_orca_max_character_limit()`. Allowed characters can be retrieved by calling `pv_orca_valid_characters()`.
 * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`.
 * The pronunciation is expressed in ARPAbet format, e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`.
- * @param synthesize_params Global parameters for synthesized text. See 'pv_orca_synthesize_text_params_t()' for details.
+ * @param synthesize_params Global parameters for synthesized text. See 'pv_orca_synthesize_params_t' for details.
 * @param output_path Absolute path to the output audio file. The output file is saved as `WAV (.wav)`
 * and consists of a single mono channel.
+ * @param[out] num_alignments Number of returned alignments.
+ * @param[out] alignments Alignments of synthesized words, phonemes, and their associated metadata.
 * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT`, `PV_STATUS_OUT_OF_MEMORY`,
 * `PV_STATUS_RUNTIME_ERROR`, `PV_STATUS_ACTIVATION_ERROR`, `PV_STATUS_ACTIVATION_LIMIT_REACHED`,
 * `PV_STATUS_ACTIVATION_THROTTLED`, or `PV_STATUS_ACTIVATION_REFUSED` on failure.
+ * Returns `PV_STATUS_INVALID_STATE` if an OrcaStream object is open.
 */
 PV_API pv_status_t pv_orca_synthesize_to_file(
         const pv_orca_t *object,
         const char *text,
         const pv_orca_synthesize_params_t *synthesize_params,
-        const char *output_path);
+        const char *output_path,
+        int32_t *num_alignments,
+        pv_orca_word_alignment_t ***alignments);
+
+/**
+ * Forward declaration for OrcaStream object for converting a text stream into a spoken audio stream.
+ */
+typedef struct pv_orca_stream pv_orca_stream_t;
+
+/**
+ * Opens a new OrcaStream object.
+ *
+ * @param object The Orca object.
+ * @param synthesize_params Global parameters for synthesized text. See 'pv_orca_synthesize_params_t' for details.
+ * @param[out] stream The OrcaStream object.
+ * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT` or `PV_STATUS_OUT_OF_MEMORY` on failure.
+ */
+PV_API pv_status_t pv_orca_stream_open(
+        pv_orca_t *object,
+        const pv_orca_synthesize_params_t *synthesize_params,
+        pv_orca_stream_t **stream);
+
+/**
+ * Adds a chunk of text to the OrcaStream object and generates audio if enough text has been added.
+ * This function is expected to be called multiple times with consecutive chunks of text from a text stream.
+ * The incoming text is buffered as it arrives until there is enough context to convert a chunk of the buffered
+ * text into audio. The caller needs to use `pv_orca_stream_flush()` to generate the audio chunk for the remaining
+ * text that has not yet been synthesized.
+ * The caller is responsible for deleting the generated audio with `pv_orca_pcm_delete()`.
+ *
+ * @param object The OrcaStream object.
+ * @param text A chunk of text from a text input stream. Characters not supported by Orca will be ignored.
+ * Valid characters can be retrieved by calling `pv_orca_valid_characters()`.
+ * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. They need to be
+ * added in a single call to this function. The pronunciation is expressed in ARPAbet format,
+ * e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`.
+ * @param[out] num_samples The length of the pcm produced, `0` if no audio chunk has been produced.
+ * @param[out] pcm The output audio chunk, `NULL` if no audio chunk has been produced.
+ * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT`, `PV_STATUS_OUT_OF_MEMORY`,
+ * `PV_STATUS_RUNTIME_ERROR`, `PV_STATUS_ACTIVATION_ERROR`, `PV_STATUS_ACTIVATION_LIMIT_REACHED`,
+ * `PV_STATUS_ACTIVATION_THROTTLED`, `PV_STATUS_ACTIVATION_REFUSED`, or `PV_STATUS_INVALID_STATE` on failure.
+ */
+PV_API pv_status_t pv_orca_stream_synthesize(
+        pv_orca_stream_t *object,
+        const char *text,
+        int32_t *num_samples,
+        int16_t **pcm);
+
+/**
+ * Generates audio for all of the buffered text that was added to the OrcaStream object
+ * via `pv_orca_stream_synthesize()`.
+ * The caller is responsible for deleting the generated audio with `pv_orca_pcm_delete()`.
+ *
+ * @param object The OrcaStream object.
+ * @param[out] num_samples The length of the pcm, `0` if no audio chunk has been produced.
+ * @param[out] pcm The output audio, `NULL` if no audio chunk has been produced.
+ * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT`, `PV_STATUS_OUT_OF_MEMORY`,
+ * `PV_STATUS_RUNTIME_ERROR`, `PV_STATUS_ACTIVATION_ERROR`, `PV_STATUS_ACTIVATION_LIMIT_REACHED`,
+ * `PV_STATUS_ACTIVATION_THROTTLED`, `PV_STATUS_ACTIVATION_REFUSED`, or `PV_STATUS_INVALID_STATE` on failure.
+ */
+PV_API pv_status_t pv_orca_stream_flush(
+        pv_orca_stream_t *object,
+        int32_t *num_samples,
+        int16_t **pcm);
+
+/**
+ * Deletes the OrcaStream object.
+ *
+ * @param object The OrcaStream object.
+ */
+PV_API void pv_orca_stream_close(pv_orca_stream_t *object);

 /**
 * Deletes the audio previously generated by the Orca synthesize functions.
 *
- * @param object The pcm generated by orca synthesize functions.
+ * @param object The pcm generated by Orca synthesize functions.
+ */
+PV_API void pv_orca_pcm_delete(int16_t *pcm);
+
+/**
+ * Deletes word alignments returned from Orca synthesize functions.
+ *
+ * @param num_alignments Number of alignments.
+ * @param alignments Alignments returned from Orca synthesize functions.
+ * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT` on failure.
 */
-PV_API void pv_orca_delete_pcm(int16_t *pcm);
+PV_API pv_status_t pv_orca_word_alignments_delete(
+        int32_t num_alignments,
+        pv_orca_word_alignment_t **alignments);

 /**
 * Getter for version.
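The streaming API introduced here composes as open, synthesize per chunk, flush, close. A hedged sketch of that loop follows; `next_chunk()` and `play()` are hypothetical stand-ins for a text source and an audio sink (not part of the Orca API), and `orca`/`params` are assumed to be initialized as in the earlier sketch.

#include "pv_orca.h"

/* Hypothetical helpers (assumptions, not part of the Orca API): a text source
 * that returns NULL when exhausted, and an audio sink. */
extern const char *next_chunk(void);
extern void play(const int16_t *pcm, int32_t num_samples);

static pv_status_t stream_text(pv_orca_t *orca, const pv_orca_synthesize_params_t *params) {
    pv_orca_stream_t *stream = NULL;
    pv_status_t status = pv_orca_stream_open(orca, params, &stream);
    if (status != PV_STATUS_SUCCESS) {
        return status;
    }

    const char *chunk = NULL;
    while ((chunk = next_chunk()) != NULL) {
        int32_t num_samples = 0;
        int16_t *pcm = NULL;
        status = pv_orca_stream_synthesize(stream, chunk, &num_samples, &pcm);
        if (status != PV_STATUS_SUCCESS) {
            break;
        }
        if (num_samples > 0) { /* audio is only produced once enough text is buffered */
            play(pcm, num_samples);
            pv_orca_pcm_delete(pcm);
        }
    }

    if (status == PV_STATUS_SUCCESS) {
        int32_t num_samples = 0;
        int16_t *pcm = NULL;
        /* Drain whatever buffered text has not been synthesized yet. */
        status = pv_orca_stream_flush(stream, &num_samples, &pcm);
        if ((status == PV_STATUS_SUCCESS) && (num_samples > 0)) {
            play(pcm, num_samples);
            pv_orca_pcm_delete(pcm);
        }
    }

    pv_orca_stream_close(stream);
    return status;
}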
diff --git a/lib/ios/PvOrca.xcframework/ios-arm64/PvOrca.framework/Info.plist b/lib/ios/PvOrca.xcframework/ios-arm64/PvOrca.framework/Info.plist
index acea2a6f..82b0263d 100644
Binary files a/lib/ios/PvOrca.xcframework/ios-arm64/PvOrca.framework/Info.plist and b/lib/ios/PvOrca.xcframework/ios-arm64/PvOrca.framework/Info.plist differ
diff --git a/lib/ios/PvOrca.xcframework/ios-arm64/PvOrca.framework/PvOrca b/lib/ios/PvOrca.xcframework/ios-arm64/PvOrca.framework/PvOrca
index d14e1562..edd32d45 100755
Binary files a/lib/ios/PvOrca.xcframework/ios-arm64/PvOrca.framework/PvOrca and b/lib/ios/PvOrca.xcframework/ios-arm64/PvOrca.framework/PvOrca differ
diff --git a/lib/ios/PvOrca.xcframework/ios-arm64_x86_64-simulator/PvOrca.framework/Headers/picovoice.h b/lib/ios/PvOrca.xcframework/ios-arm64_x86_64-simulator/PvOrca.framework/Headers/picovoice.h
index a89365dc..d2388709 100644
--- a/lib/ios/PvOrca.xcframework/ios-arm64_x86_64-simulator/PvOrca.framework/Headers/picovoice.h
+++ b/lib/ios/PvOrca.xcframework/ios-arm64_x86_64-simulator/PvOrca.framework/Headers/picovoice.h
@@ -1,5 +1,5 @@
 /*
-    Copyright 2024 Picovoice Inc.
+    Copyright 2018-2023 Picovoice Inc.

    You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE"
    file accompanying this source.
diff --git a/lib/ios/PvOrca.xcframework/ios-arm64_x86_64-simulator/PvOrca.framework/Headers/pv_orca.h b/lib/ios/PvOrca.xcframework/ios-arm64_x86_64-simulator/PvOrca.framework/Headers/pv_orca.h
index ef582786..75e429ff 100644
--- a/lib/ios/PvOrca.xcframework/ios-arm64_x86_64-simulator/PvOrca.framework/Headers/pv_orca.h
+++ b/lib/ios/PvOrca.xcframework/ios-arm64_x86_64-simulator/PvOrca.framework/Headers/pv_orca.h
@@ -12,6 +12,7 @@
 #ifndef PV_ORCA_H
 #define PV_ORCA_H

+#include <stdbool.h>
 #include <stdint.h>

 #include "picovoice.h"
@@ -24,6 +25,15 @@ extern "C" {

 /**
 * Forward declaration for Orca text-to-speech engine. Orca converts text to spoken audio without network latency.
+ * It has two modes of operation.
+ * 1) Single synthesis: converts a given text to audio. Function `pv_orca_synthesize()` returns the raw audio data,
+ * function `pv_orca_synthesize_to_file()` saves the audio to a file.
+ * 2) Streaming synthesis: converts a stream of text to a stream of audio. An OrcaStream object can be opened with
+ * `pv_orca_stream_open()` and text can be added with `pv_orca_stream_synthesize()`. The audio is
+ * generated in chunks whenever enough text has been buffered. When the text stream is finalized,
+ * the caller needs to use `pv_orca_stream_flush()` to generate the audio for the remaining text that has
+ * not been synthesized. The stream can be closed with `pv_orca_stream_close()`.
+ * Single synthesis functions cannot be called while a stream is open.
 */
 typedef struct pv_orca pv_orca_t;
@@ -33,15 +43,9 @@ typedef struct pv_orca pv_orca_t;
 * @param access_key AccessKey obtained from Picovoice Console (https://console.picovoice.ai/)
 * @param model_path Absolute path to the file containing Orca's model parameters.
 * @param[out] object Constructed instance of Orca.
- * @return A status code indicating the result of the initialization. Possible values include:
- * - `PV_STATUS_OUT_OF_MEMORY`: Memory allocation failure.
- * - `PV_STATUS_IO_ERROR`: Input/output error.
- * - `PV_STATUS_INVALID_ARGUMENT`: Invalid input argument.
- * - `PV_STATUS_RUNTIME_ERROR`: Error during runtime.
- * - `PV_STATUS_ACTIVATION_ERROR`: Activation-related error.
- * - `PV_STATUS_ACTIVATION_LIMIT_REACHED`: Activation limit reached.
- * - `PV_STATUS_ACTIVATION_THROTTLED`: Activation throttled.
- * - `PV_STATUS_ACTIVATION_REFUSED`: Activation refused.
+ * @return Status code. Returns `PV_STATUS_OUT_OF_MEMORY`, `PV_STATUS_IO_ERROR`, `PV_STATUS_INVALID_ARGUMENT`,
+ * `PV_STATUS_RUNTIME_ERROR`, `PV_STATUS_ACTIVATION_ERROR`, `PV_STATUS_ACTIVATION_LIMIT_REACHED`,
+ * `PV_STATUS_ACTIVATION_THROTTLED`, or `PV_STATUS_ACTIVATION_REFUSED` on failure.
 */
 PV_API pv_status_t pv_orca_init(
         const char *access_key,
@@ -56,24 +60,24 @@ PV_API pv_status_t pv_orca_init(
 PV_API void pv_orca_delete(pv_orca_t *object);

 /**
- * Gets an array of characters that are accepted as input to Orca synthesize functions.
+ * Returns an array of characters that are accepted as input to Orca synthesize functions.
 *
 * @param object Constructed instance of Orca.
 * @param[out] num_characters Number of valid characters.
 * @param[out] characters An array of valid characters for Orca.
- * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT` on failure.
+ * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT` or `PV_STATUS_OUT_OF_MEMORY` on failure.
 */
 PV_API pv_status_t pv_orca_valid_characters(
         const pv_orca_t *object,
         int32_t *num_characters,
-        const char *const **characters);
+        const char ***characters);

 /**
 * Deletes the characters previously created by `pv_orca_valid_characters()`.
 *
- * @param characters The characters returned from `pv_orca_valid_characters()`.
+ * @param characters The characters returned from `pv_orca_valid_characters()`.
 */
-PV_API void pv_orca_valid_characters_delete(const char *const *characters);
+PV_API void pv_orca_valid_characters_delete(const char **characters);

 /**
 * Gets the sampling rate of the audio produced by Orca.
@@ -87,15 +91,17 @@ PV_API pv_status_t pv_orca_sample_rate(const pv_orca_t *object, int32_t *sample_
 /**
 * Gets the maximum number of characters that can be synthesized at once.
 *
- * @return Maximum character limit
+ * @param object Constructed instance of Orca.
+ * @param[out] max_character_limit Maximum number of characters that can be synthesized at once.
+ * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT` on failure.
 */
-PV_API int32_t pv_orca_max_character_limit(void);
+PV_API pv_status_t pv_orca_max_character_limit(const pv_orca_t *object, int32_t *max_character_limit);

 /**
 * Forward declaration for pv_orca_synthesize_params object. This object can be passed to Orca synthesize functions to
- * control the synthesized audio. An instance can be created with `pv_orca_synthesize_params_init` and deleted with
- * `pv_orca_synthesize_params_delete`. The object's properties can be set with `pv_orca_synthesize_params_set_*`
- * and returned with `pv_orca_synthesize_params_get_*`.
+ * control the synthesized audio. An instance can be created with `pv_orca_synthesize_params_init()` and deleted with
+ * `pv_orca_synthesize_params_delete()`. The object's properties can be set with `pv_orca_synthesize_params_set_*()`
+ * and returned with `pv_orca_synthesize_params_get_*()`.
 */
 typedef struct pv_orca_synthesize_params pv_orca_synthesize_params_t;
@@ -136,56 +142,192 @@ PV_API pv_status_t pv_orca_synthesize_params_get_speech_rate(
         const pv_orca_synthesize_params_t *object,
         float *speech_rate);

+/**
+ * Setter for the random state used in synthesize functions.
+ *
+ * @param object Constructed instance of pv_orca_synthesize_params.
+ * @param random_state The random state used in synthesize functions.
+ * @return Returns `PV_STATUS_INVALID_ARGUMENT` on failure.
+ */
+PV_API pv_status_t pv_orca_synthesize_params_set_random_state(
+        pv_orca_synthesize_params_t *object,
+        int64_t random_state);
+
+/**
+ * Getter for the random state used in synthesize functions. If no state has been set via
+ * `pv_orca_synthesize_params_set_random_state()`, the default value of the state is -1, which means a
+ * random state is used in the synthesize functions.
+ *
+ * @param object Constructed instance of pv_orca_synthesize_params.
+ * @param[out] random_state The random state used in synthesize functions.
+ * @return Returns `PV_STATUS_INVALID_ARGUMENT` on failure.
+ */
+PV_API pv_status_t pv_orca_synthesize_params_get_random_state(
+        const pv_orca_synthesize_params_t *object,
+        int64_t *random_state);
+
+/**
+ * A synthesized phoneme and its associated metadata.
+ */
+typedef struct {
+    char *phoneme;   /** Synthesized phoneme. */
+    float start_sec; /** Start of phoneme in seconds. */
+    float end_sec;   /** End of phoneme in seconds. */
+} pv_orca_phoneme_alignment_t;
+
+/**
+ * A synthesized word and its associated metadata.
+ */
+typedef struct {
+    char *word;      /** Synthesized word. */
+    float start_sec; /** Start of word in seconds. */
+    float end_sec;   /** End of word in seconds. */
+
+    int32_t num_phonemes;                   /** Number of phonemes in the word. */
+    pv_orca_phoneme_alignment_t **phonemes; /** Array of phonemes in the word. */
+} pv_orca_word_alignment_t;
+
 /**
 * Generates audio from text. The returned audio contains the speech representation of the text.
- * The memory of the returned audio is allocated by Orca and can be deleted with `pv_orca_delete_pcm`
+ * This function returns `PV_STATUS_INVALID_STATE` if an OrcaStream object is open.
+ * The memory of the returned audio is allocated by Orca and can be deleted with `pv_orca_pcm_delete()`.
 *
 * @param object The Orca object.
 * @param text Text to be converted to audio. The maximum length can be obtained by calling
 * `pv_orca_max_character_limit()`. Allowed characters can be retrieved by calling `pv_orca_valid_characters()`.
 * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`.
 * The pronunciation is expressed in ARPAbet format, e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`.
- * @param synthesize_params Global parameters for synthesized text. See 'pv_orca_synthesize_text_params_t' for details.
+ * @param synthesize_params Global parameters for synthesized text. See 'pv_orca_synthesize_params_t' for details.
 * @param[out] num_samples The length of the pcm.
 * @param[out] pcm The output audio.
+ * @param[out] num_alignments Number of returned alignments.
+ * @param[out] alignments Alignments of synthesized words, phonemes, and their associated metadata.
 * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT`, `PV_STATUS_OUT_OF_MEMORY`,
 * `PV_STATUS_RUNTIME_ERROR`, `PV_STATUS_ACTIVATION_ERROR`, `PV_STATUS_ACTIVATION_LIMIT_REACHED`,
 * `PV_STATUS_ACTIVATION_THROTTLED`, or `PV_STATUS_ACTIVATION_REFUSED` on failure.
+ * Returns `PV_STATUS_INVALID_STATE` if an OrcaStream object is open.
 */
 PV_API pv_status_t pv_orca_synthesize(
         const pv_orca_t *object,
         const char *text,
         const pv_orca_synthesize_params_t *synthesize_params,
         int32_t *num_samples,
-        int16_t **pcm);
+        int16_t **pcm,
+        int32_t *num_alignments,
+        pv_orca_word_alignment_t ***alignments);

 /**
 * Generates audio from text and saves it to a file. The file contains the speech representation of the text.
+ * This function returns `PV_STATUS_INVALID_STATE` if an OrcaStream object is open.
 *
 * @param object The Orca object.
 * @param text Text to be converted to audio. The maximum length can be obtained by calling
 * `pv_orca_max_character_limit()`. Allowed characters can be retrieved by calling `pv_orca_valid_characters()`.
 * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`.
 * The pronunciation is expressed in ARPAbet format, e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`.
- * @param synthesize_params Global parameters for synthesized text. See 'pv_orca_synthesize_text_params_t()' for details.
+ * @param synthesize_params Global parameters for synthesized text. See 'pv_orca_synthesize_params_t' for details.
 * @param output_path Absolute path to the output audio file. The output file is saved as `WAV (.wav)`
 * and consists of a single mono channel.
+ * @param[out] num_alignments Number of returned alignments.
+ * @param[out] alignments Alignments of synthesized words, phonemes, and their associated metadata.
 * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT`, `PV_STATUS_OUT_OF_MEMORY`,
 * `PV_STATUS_RUNTIME_ERROR`, `PV_STATUS_ACTIVATION_ERROR`, `PV_STATUS_ACTIVATION_LIMIT_REACHED`,
 * `PV_STATUS_ACTIVATION_THROTTLED`, or `PV_STATUS_ACTIVATION_REFUSED` on failure.
+ * Returns `PV_STATUS_INVALID_STATE` if an OrcaStream object is open.
 */
 PV_API pv_status_t pv_orca_synthesize_to_file(
         const pv_orca_t *object,
         const char *text,
         const pv_orca_synthesize_params_t *synthesize_params,
-        const char *output_path);
+        const char *output_path,
+        int32_t *num_alignments,
+        pv_orca_word_alignment_t ***alignments);
+
+/**
+ * Forward declaration for OrcaStream object for converting a text stream into a spoken audio stream.
+ */
+typedef struct pv_orca_stream pv_orca_stream_t;
+
+/**
+ * Opens a new OrcaStream object.
+ *
+ * @param object The Orca object.
+ * @param synthesize_params Global parameters for synthesized text. See 'pv_orca_synthesize_params_t' for details.
+ * @param[out] stream The OrcaStream object.
+ * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT` or `PV_STATUS_OUT_OF_MEMORY` on failure.
+ */
+PV_API pv_status_t pv_orca_stream_open(
+        pv_orca_t *object,
+        const pv_orca_synthesize_params_t *synthesize_params,
+        pv_orca_stream_t **stream);
+
+/**
+ * Adds a chunk of text to the OrcaStream object and generates audio if enough text has been added.
+ * This function is expected to be called multiple times with consecutive chunks of text from a text stream.
+ * The incoming text is buffered as it arrives until there is enough context to convert a chunk of the buffered
+ * text into audio. The caller needs to use `pv_orca_stream_flush()` to generate the audio chunk for the remaining
+ * text that has not yet been synthesized.
+ * The caller is responsible for deleting the generated audio with `pv_orca_pcm_delete()`.
+ *
+ * @param object The OrcaStream object.
+ * @param text A chunk of text from a text input stream. Characters not supported by Orca will be ignored.
+ * Valid characters can be retrieved by calling `pv_orca_valid_characters()`.
+ * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. They need to be
+ * added in a single call to this function. The pronunciation is expressed in ARPAbet format,
+ * e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`.
+ * @param[out] num_samples The length of the pcm produced, `0` if no audio chunk has been produced.
+ * @param[out] pcm The output audio chunk, `NULL` if no audio chunk has been produced.
+ * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT`, `PV_STATUS_OUT_OF_MEMORY`,
+ * `PV_STATUS_RUNTIME_ERROR`, `PV_STATUS_ACTIVATION_ERROR`, `PV_STATUS_ACTIVATION_LIMIT_REACHED`,
+ * `PV_STATUS_ACTIVATION_THROTTLED`, `PV_STATUS_ACTIVATION_REFUSED`, or `PV_STATUS_INVALID_STATE` on failure.
+ */
+PV_API pv_status_t pv_orca_stream_synthesize(
+        pv_orca_stream_t *object,
+        const char *text,
+        int32_t *num_samples,
+        int16_t **pcm);
+
+/**
+ * Generates audio for all of the buffered text that was added to the OrcaStream object
+ * via `pv_orca_stream_synthesize()`.
+ * The caller is responsible for deleting the generated audio with `pv_orca_pcm_delete()`.
+ *
+ * @param object The OrcaStream object.
+ * @param[out] num_samples The length of the pcm, `0` if no audio chunk has been produced.
+ * @param[out] pcm The output audio, `NULL` if no audio chunk has been produced.
+ * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT`, `PV_STATUS_OUT_OF_MEMORY`,
+ * `PV_STATUS_RUNTIME_ERROR`, `PV_STATUS_ACTIVATION_ERROR`, `PV_STATUS_ACTIVATION_LIMIT_REACHED`,
+ * `PV_STATUS_ACTIVATION_THROTTLED`, `PV_STATUS_ACTIVATION_REFUSED`, or `PV_STATUS_INVALID_STATE` on failure.
+ */
+PV_API pv_status_t pv_orca_stream_flush(
+        pv_orca_stream_t *object,
+        int32_t *num_samples,
+        int16_t **pcm);
+
+/**
+ * Deletes the OrcaStream object.
+ *
+ * @param object The OrcaStream object.
+ */
+PV_API void pv_orca_stream_close(pv_orca_stream_t *object);

 /**
 * Deletes the audio previously generated by the Orca synthesize functions.
 *
- * @param object The pcm generated by orca synthesize functions.
+ * @param object The pcm generated by Orca synthesize functions.
+ */
+PV_API void pv_orca_pcm_delete(int16_t *pcm);
+
+/**
+ * Deletes word alignments returned from Orca synthesize functions.
+ *
+ * @param num_alignments Number of alignments.
+ * @param alignments Alignments returned from Orca synthesize functions.
+ * @return Status code. Returns `PV_STATUS_INVALID_ARGUMENT` on failure.
 */
-PV_API void pv_orca_delete_pcm(int16_t *pcm);
+PV_API pv_status_t pv_orca_word_alignments_delete(
+        int32_t num_alignments,
+        pv_orca_word_alignment_t **alignments);

 /**
 * Getter for version.
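The alignment metadata is nested two levels deep: an array of word pointers, each holding an array of phoneme pointers. A short sketch of walking the structures filled by `pv_orca_synthesize()`:

#include <stdio.h>

#include "pv_orca.h"

/* Prints word and phoneme timings; `alignments`/`num_alignments` are the
 * out-parameters filled by pv_orca_synthesize(). */
static void print_alignments(int32_t num_alignments, pv_orca_word_alignment_t **alignments) {
    for (int32_t i = 0; i < num_alignments; i++) {
        const pv_orca_word_alignment_t *word = alignments[i];
        printf("%s [%.2f s -> %.2f s]\n", word->word, word->start_sec, word->end_sec);
        for (int32_t j = 0; j < word->num_phonemes; j++) {
            const pv_orca_phoneme_alignment_t *phoneme = word->phonemes[j];
            printf("  %s [%.2f s -> %.2f s]\n", phoneme->phoneme, phoneme->start_sec, phoneme->end_sec);
        }
    }
}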
diff --git a/lib/ios/PvOrca.xcframework/ios-arm64_x86_64-simulator/PvOrca.framework/Info.plist b/lib/ios/PvOrca.xcframework/ios-arm64_x86_64-simulator/PvOrca.framework/Info.plist
index 864bd74a..1ae31c30 100644
Binary files a/lib/ios/PvOrca.xcframework/ios-arm64_x86_64-simulator/PvOrca.framework/Info.plist and b/lib/ios/PvOrca.xcframework/ios-arm64_x86_64-simulator/PvOrca.framework/Info.plist differ
diff --git a/lib/ios/PvOrca.xcframework/ios-arm64_x86_64-simulator/PvOrca.framework/PvOrca b/lib/ios/PvOrca.xcframework/ios-arm64_x86_64-simulator/PvOrca.framework/PvOrca
index 50fde6b5..39a4865b 100755
Binary files a/lib/ios/PvOrca.xcframework/ios-arm64_x86_64-simulator/PvOrca.framework/PvOrca and b/lib/ios/PvOrca.xcframework/ios-arm64_x86_64-simulator/PvOrca.framework/PvOrca differ
diff --git a/lib/java/jetson/cortex-a57-aarch64/libpv_orca_jni.so b/lib/java/jetson/cortex-a57-aarch64/libpv_orca_jni.so
index cd94da30..ab9501cd 100755
Binary files a/lib/java/jetson/cortex-a57-aarch64/libpv_orca_jni.so and b/lib/java/jetson/cortex-a57-aarch64/libpv_orca_jni.so differ
diff --git a/lib/java/linux/x86_64/libpv_orca_jni.so b/lib/java/linux/x86_64/libpv_orca_jni.so
index 0b32c7b1..5b066ded 100755
Binary files a/lib/java/linux/x86_64/libpv_orca_jni.so and b/lib/java/linux/x86_64/libpv_orca_jni.so differ
diff --git a/lib/java/mac/arm64/libpv_orca_jni.dylib b/lib/java/mac/arm64/libpv_orca_jni.dylib
index fac8c380..85b7272a 100755
Binary files a/lib/java/mac/arm64/libpv_orca_jni.dylib and b/lib/java/mac/arm64/libpv_orca_jni.dylib differ
diff --git a/lib/java/mac/x86_64/libpv_orca_jni.dylib b/lib/java/mac/x86_64/libpv_orca_jni.dylib
index 80922102..6f04f053 100755
Binary files a/lib/java/mac/x86_64/libpv_orca_jni.dylib and b/lib/java/mac/x86_64/libpv_orca_jni.dylib differ
diff --git a/lib/java/raspberry-pi/cortex-a53-aarch64/libpv_orca_jni.so b/lib/java/raspberry-pi/cortex-a53-aarch64/libpv_orca_jni.so
index d53619e6..58c74b7a 100755
Binary files a/lib/java/raspberry-pi/cortex-a53-aarch64/libpv_orca_jni.so and b/lib/java/raspberry-pi/cortex-a53-aarch64/libpv_orca_jni.so differ
diff --git a/lib/java/raspberry-pi/cortex-a53/libpv_orca_jni.so b/lib/java/raspberry-pi/cortex-a53/libpv_orca_jni.so
index 38587648..0dadcd29 100755
Binary files a/lib/java/raspberry-pi/cortex-a53/libpv_orca_jni.so and b/lib/java/raspberry-pi/cortex-a53/libpv_orca_jni.so differ
diff --git a/lib/java/raspberry-pi/cortex-a72-aarch64/libpv_orca_jni.so b/lib/java/raspberry-pi/cortex-a72-aarch64/libpv_orca_jni.so
index 1f847dd3..73495a43 100755
Binary files a/lib/java/raspberry-pi/cortex-a72-aarch64/libpv_orca_jni.so and b/lib/java/raspberry-pi/cortex-a72-aarch64/libpv_orca_jni.so differ
diff --git a/lib/java/raspberry-pi/cortex-a72/libpv_orca_jni.so b/lib/java/raspberry-pi/cortex-a72/libpv_orca_jni.so
index 8ffe85f1..16ac9d59 100755
Binary files a/lib/java/raspberry-pi/cortex-a72/libpv_orca_jni.so and b/lib/java/raspberry-pi/cortex-a72/libpv_orca_jni.so differ
diff --git a/lib/java/raspberry-pi/cortex-a76-aarch64/libpv_orca_jni.so b/lib/java/raspberry-pi/cortex-a76-aarch64/libpv_orca_jni.so
index df3eaf13..119a4b1b 100755
Binary files a/lib/java/raspberry-pi/cortex-a76-aarch64/libpv_orca_jni.so and b/lib/java/raspberry-pi/cortex-a76-aarch64/libpv_orca_jni.so differ
diff --git a/lib/java/raspberry-pi/cortex-a76/libpv_orca_jni.so b/lib/java/raspberry-pi/cortex-a76/libpv_orca_jni.so
index f4a9b7c7..a76a6b9d 100755
Binary files a/lib/java/raspberry-pi/cortex-a76/libpv_orca_jni.so and b/lib/java/raspberry-pi/cortex-a76/libpv_orca_jni.so differ
diff --git a/lib/java/windows/amd64/pv_orca_jni.dll b/lib/java/windows/amd64/pv_orca_jni.dll
index 6f173148..3f70fd16 100644
Binary files a/lib/java/windows/amd64/pv_orca_jni.dll and b/lib/java/windows/amd64/pv_orca_jni.dll differ
diff --git a/lib/jetson/cortex-a57-aarch64/libpv_orca.so b/lib/jetson/cortex-a57-aarch64/libpv_orca.so
index 62737ec3..a3837e3b 100755
Binary files a/lib/jetson/cortex-a57-aarch64/libpv_orca.so and b/lib/jetson/cortex-a57-aarch64/libpv_orca.so differ
diff --git a/lib/linux/x86_64/libpv_orca.so b/lib/linux/x86_64/libpv_orca.so
index 4916c5b5..91445b06 100755
Binary files a/lib/linux/x86_64/libpv_orca.so and b/lib/linux/x86_64/libpv_orca.so differ
diff --git a/lib/mac/arm64/libpv_orca.dylib b/lib/mac/arm64/libpv_orca.dylib
index ff398bf0..7bc11453 100755
Binary files a/lib/mac/arm64/libpv_orca.dylib and b/lib/mac/arm64/libpv_orca.dylib differ
diff --git a/lib/mac/x86_64/libpv_orca.dylib b/lib/mac/x86_64/libpv_orca.dylib
index 486c7d1a..eb87bef0 100755
Binary files a/lib/mac/x86_64/libpv_orca.dylib and b/lib/mac/x86_64/libpv_orca.dylib differ
diff --git a/lib/raspberry-pi/cortex-a53-aarch64/libpv_orca.so b/lib/raspberry-pi/cortex-a53-aarch64/libpv_orca.so
index d110fab1..6280599a 100755
Binary files a/lib/raspberry-pi/cortex-a53-aarch64/libpv_orca.so and b/lib/raspberry-pi/cortex-a53-aarch64/libpv_orca.so differ
diff --git a/lib/raspberry-pi/cortex-a53/libpv_orca.so b/lib/raspberry-pi/cortex-a53/libpv_orca.so
index c7ccea7c..217c7775 100755
Binary files a/lib/raspberry-pi/cortex-a53/libpv_orca.so and b/lib/raspberry-pi/cortex-a53/libpv_orca.so differ
diff --git a/lib/raspberry-pi/cortex-a72-aarch64/libpv_orca.so b/lib/raspberry-pi/cortex-a72-aarch64/libpv_orca.so
index 355b3151..8feb1497 100755
Binary files a/lib/raspberry-pi/cortex-a72-aarch64/libpv_orca.so and b/lib/raspberry-pi/cortex-a72-aarch64/libpv_orca.so differ
diff --git a/lib/raspberry-pi/cortex-a72/libpv_orca.so b/lib/raspberry-pi/cortex-a72/libpv_orca.so
index 1c4d63d8..2deb70be 100755
Binary files a/lib/raspberry-pi/cortex-a72/libpv_orca.so and b/lib/raspberry-pi/cortex-a72/libpv_orca.so differ
diff --git a/lib/raspberry-pi/cortex-a76-aarch64/libpv_orca.so b/lib/raspberry-pi/cortex-a76-aarch64/libpv_orca.so
index 4ef59ba3..27c95c97 100755
Binary files a/lib/raspberry-pi/cortex-a76-aarch64/libpv_orca.so and b/lib/raspberry-pi/cortex-a76-aarch64/libpv_orca.so differ
diff --git a/lib/raspberry-pi/cortex-a76/libpv_orca.so b/lib/raspberry-pi/cortex-a76/libpv_orca.so
index a1efe7fb..2f3e13a6 100755
Binary files a/lib/raspberry-pi/cortex-a76/libpv_orca.so and b/lib/raspberry-pi/cortex-a76/libpv_orca.so differ
diff --git a/lib/wasm/pv_orca.wasm b/lib/wasm/pv_orca.wasm
index 0f31d36a..ae633328 100755
Binary files a/lib/wasm/pv_orca.wasm and b/lib/wasm/pv_orca.wasm differ
diff --git a/lib/wasm/pv_orca_simd.wasm b/lib/wasm/pv_orca_simd.wasm
index eb455d10..2e2c6c8f 100755
Binary files a/lib/wasm/pv_orca_simd.wasm and b/lib/wasm/pv_orca_simd.wasm differ
diff --git a/lib/windows/amd64/libpv_orca.dll b/lib/windows/amd64/libpv_orca.dll
index 4e07be42..7d22db17 100644
Binary files a/lib/windows/amd64/libpv_orca.dll and b/lib/windows/amd64/libpv_orca.dll differ
diff --git a/resources/.lint/spell-check/dict.txt b/resources/.lint/spell-check/dict.txt
index 81f953e9..e1b1675f 100644
--- a/resources/.lint/spell-check/dict.txt
+++ b/resources/.lint/spell-check/dict.txt
@@ -40,3 +40,21 @@ Sevilla
 editdistance
 pvleopard
 xcworkspace
+sounddevice
+tiktoken
+samplerate
+dtype
+outdata
+AILLM
+streamable
+frombuffer
+drwav
+wchars
+pvrecorder
+pvcheetah
+itok
+numpy
+btns
+Btns
+pltf
+usleep
diff --git a/resources/.lint/swift/.swiftlint.yml b/resources/.lint/swift/.swiftlint.yml
index 14b07450..4d45b6e5 100644
--- a/resources/.lint/swift/.swiftlint.yml
+++ b/resources/.lint/swift/.swiftlint.yml
@@ -5,5 +5,7 @@ disabled_rules:
   - implicit_getter
   - cyclomatic_complexity
   - function_parameter_count
+  - file_length
+  - type_body_length
 excluded:
-  - ${PWD}/**/Pods
\ No newline at end of file
+  - ${PWD}/**/Pods
diff --git a/resources/.test/models/leopard_params.pv b/resources/.test/models/leopard_params.pv
deleted file mode 100644
index 2a490dfd..00000000
Binary files a/resources/.test/models/leopard_params.pv and /dev/null differ
diff --git a/resources/.test/test_data.json b/resources/.test/test_data.json
index 34b3786c..2415de59 100644
--- a/resources/.test/test_data.json
+++ b/resources/.test/test_data.json
@@ -3,12 +3,9 @@
     "text": "It doesn't matter how slowly you go, as long as you do not stop!",
     "text_no_punctuation": "It doesn't matter how slowly you go as long as you do not stop",
     "text_custom_pronunciation": "I {live|L IH V} in {Sevilla|S EH V IY Y AH}. We have great {live|L AY V} sports!",
+    "text_alignment": "Test alignment.",
     "text_invalid": [
-      "Numbers 1992",
-      "7.38",
-      "",
-      "Symbols *&$",
-      "reiner@gmail.com",
+      "Symbols *$",
       "Escape characters \n",
       "\"ی\", \"ء\"",
       "ॐÁ hindi and spanish",
@@ -24,5 +21,94 @@
       "Wrong {custom|K AH S T AM } pron"
     ]
   },
-  "wer_threshold": 0.2
+  "random_state": 42,
+  "alignments": [
+    {
+      "word": "Test",
+      "start_sec": 0.000,
+      "end_sec": 0.32,
+      "phonemes": [
+        {
+          "phoneme": "T",
+          "start_sec": 0.00,
+          "end_sec": 0.09
+        },
+        {
+          "phoneme": "EH",
+          "start_sec": 0.093,
+          "end_sec": 0.19
+        },
+        {
+          "phoneme": "S",
+          "start_sec": 0.19,
+          "end_sec": 0.25
+        },
+        {
+          "phoneme": "T",
+          "start_sec": 0.25,
+          "end_sec": 0.32
+        }
+      ]
+    },
+    {
+      "word": "alignment",
+      "start_sec": 0.32,
+      "end_sec": 0.85,
+      "phonemes": [
+        {
+          "phoneme": "AH",
+          "start_sec": 0.32,
+          "end_sec": 0.37
+        },
+        {
+          "phoneme": "L",
+          "start_sec": 0.37,
+          "end_sec": 0.44
+        },
+        {
+          "phoneme": "AY",
+          "start_sec": 0.44,
+          "end_sec": 0.53
+        },
+        {
+          "phoneme": "N",
+          "start_sec": 0.53,
+          "end_sec": 0.60
+        },
+        {
+          "phoneme": "M",
+          "start_sec": 0.60,
+          "end_sec": 0.65
+        },
+        {
+          "phoneme": "AH",
+          "start_sec": 0.65,
+          "end_sec": 0.72
+        },
+        {
+          "phoneme": "N",
+          "start_sec": 0.72,
+          "end_sec": 0.78
+        },
+        {
+          "phoneme": "T",
+          "start_sec": 0.78,
+          "end_sec": 0.85
+        }
+      ]
+    },
+    {
+      "word": ".",
+      "start_sec": 0.85,
+      "end_sec": 0.94,
+      "phonemes": [
+        {
+          "phoneme": ".",
+          "start_sec": 0.85,
+          "end_sec": 0.94
+        }
+      ]
+    }
+  ],
+  "audio_data_folder": "resources/.test/wav/"
 }
diff --git a/resources/.test/wav/orca_params_female_single.wav b/resources/.test/wav/orca_params_female_single.wav
new file mode 100644
index 00000000..39ac1f25
Binary files /dev/null and b/resources/.test/wav/orca_params_female_single.wav differ
diff --git a/resources/.test/wav/orca_params_female_stream.wav b/resources/.test/wav/orca_params_female_stream.wav
new file mode 100644
index 00000000..5f17839d
Binary files /dev/null and b/resources/.test/wav/orca_params_female_stream.wav differ
diff --git a/resources/.test/wav/orca_params_male_single.wav b/resources/.test/wav/orca_params_male_single.wav
new file mode 100644
index 00000000..744fc6b7
Binary files /dev/null and b/resources/.test/wav/orca_params_male_single.wav differ
diff --git a/resources/.test/wav/orca_params_male_stream.wav b/resources/.test/wav/orca_params_male_stream.wav
new file mode 100644
index 00000000..89ea01cf
Binary files /dev/null and b/resources/.test/wav/orca_params_male_stream.wav differ
diff --git a/resources/assets/orca_streaming_animation.gif b/resources/assets/orca_streaming_animation.gif
new file mode 100644
index 00000000..87c4187e
Binary files /dev/null and b/resources/assets/orca_streaming_animation.gif differ
diff --git a/resources/demo/demo_data.json b/resources/demo/demo_data.json
new file mode 100644
index 00000000..846d61a7
--- /dev/null
+++ b/resources/demo/demo_data.json
@@ -0,0 +1,12 @@
+{
+  "demo_sentences": [
+    "Your flight to Paris is scheduled for tomorrow morning at nine AM. Don't forget to pack your passport, and arrive at the airport at least two hours before departure to allow time for security checks.",
+    "Your meeting with the marketing team has been rescheduled to Friday at two PM. Be sure to review the updated agenda and prepare any necessary materials beforehand.",
+    "The new restaurant in town, Culinary Delights, has received excellent reviews for its fusion cuisine. Would you like me to make a reservation for dinner?",
+    "Your fitness tracker data indicates that you've achieved your weekly step goal. Congratulations!",
+    "Your car's maintenance reminder just popped up. It's time for an oil change and tire rotation. Shall I schedule an appointment with an auto service center?",
+    "Your meditation app membership expires in three days. Would you like to renew it, or shall I explore other mindfulness resources for you?",
+    "Your credit card statement shows a higher-than-usual balance this month. Shall I provide a breakdown of your expenses to help you identify any unnecessary purchases?",
+    "Your favorite book club is hosting a discussion on the latest bestseller next Thursday. Would you like me to add it to your calendar, or perhaps you'd prefer to join virtually?"
+  ]
+}
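The updated test_data.json above pins `random_state` to 42 and records expected word and phoneme alignments, which only makes sense if a fixed random state yields reproducible synthesis. A hedged sketch of how a test could use that, assuming `orca` and `params` are already initialized:

#include "pv_orca.h"

/* Fixing the random state should make synthesis deterministic, which is what
 * allows the expected alignments in test_data.json to be compared against output. */
static pv_status_t synthesize_reference(pv_orca_t *orca, pv_orca_synthesize_params_t *params) {
    pv_status_t status = pv_orca_synthesize_params_set_random_state(params, 42);
    if (status != PV_STATUS_SUCCESS) {
        return status;
    }

    int32_t num_samples = 0;
    int16_t *pcm = NULL;
    int32_t num_alignments = 0;
    pv_orca_word_alignment_t **alignments = NULL;
    status = pv_orca_synthesize(
            orca,
            "Test alignment.", /* "text_alignment" from test_data.json */
            params,
            &num_samples,
            &pcm,
            &num_alignments,
            &alignments);
    if (status == PV_STATUS_SUCCESS) {
        /* A test would compare `alignments` against the expected values here,
         * with a small tolerance on start_sec/end_sec, before releasing buffers. */
        pv_orca_pcm_delete(pcm);
        pv_orca_word_alignments_delete(num_alignments, alignments);
    }
    return status;
}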